# Sentiment Classification Project

In [3]:
import numpy as np
import pandas as pd

# Load data

In [5]:
training_data = pd.read_csv('./data/training.csv',index_col = 0)

In [6]:
# Encode the Labels (either as 0,1,2 for classification or -1,0,1 for regression)
label_mapping = {'negative': -1, 'neutral': 0, 'positive': 1}
training_data['label_encoded'] = training_data['label'].map(label_mapping)

In [7]:
training_data

Unnamed: 0_level_0,sentence,label,label_encoded
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Those 2 drinks are part of the HK culture and ...,negative,-1
1,I was told by the repair company that was doin...,negative,-1
2,It is there to give them a good time .,neutral,0
3,Like leafing through an album of photos accomp...,negative,-1
4,Johnny was a talker and liked to have fun.,positive,1
...,...,...,...
102092,I thought this place was supposed to be good.,negative,-1
102093,They claim it's because people didn't like it ...,negative,-1
102094,There is also another marbled-out full bathroo...,neutral,0
102095,You put in your cell phone number & select a d...,neutral,0


# Build Validation Set
We use 90% of the reviews for training, and the remaining 10% for validation

In [9]:
from sklearn.model_selection import train_test_split
sentences = training_data['sentence']
labels = training_data['label_encoded']

In [10]:
# Fix Random Seed for Reproducibility
random_seed = 42
train_sentences, val_sentences, train_labels, val_labels = train_test_split(sentences,labels, test_size=0.1, stratify=labels, random_state=random_seed)

# Bag-of-words + Logistic Regression baseline

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# We only keep the 10'000 most frequent words and bigrams (i.e. word pairs)
# This is both to reduce the computational cost and reduce potential overfitting

vectorizer = CountVectorizer(ngram_range= (1,2),max_features=10000)

# An Example of another BoW Vectorizer
other_vectorizer = CountVectorizer(ngram_range=(1, 2),stop_words='english',max_features=10000,min_df = 10, max_df = 0.9)

# Important: we call fit_transform on the training set, and only transform on the validation set
X_train = vectorizer.fit_transform(train_sentences)
X_val = vectorizer.transform(val_sentences)

Y_train = train_labels
Y_val = val_labels

In [12]:
X_train

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 1419964 stored elements and shape (91887, 10000)>

Now we train a logistic regression classifier...

In [14]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression Classifier, C: Inverse of regularization strength , max_iter: Maximum number of training iterations

model = LogisticRegression(C=1.0, max_iter=100)
model.fit(X_train, Y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
Y_train_pred = model.predict(X_train)
# Predict on Validation Set
Y_val_pred = model.predict(X_val)

In [18]:
from sklearn.metrics import mean_absolute_error
# Score on Training Set
mae_train = mean_absolute_error(Y_train, Y_train_pred)
L_score_train = 0.5 * (2 - mae_train)
# Score on Validation Set
mae_val = mean_absolute_error(Y_val, Y_val_pred)
L_score_val = 0.5 * (2 - mae_val)

In [19]:
print(f'Evaluation Score (training set): {L_score_train:.05f}')
print(f'Evaluation Score (validation set): {L_score_val:.05f}')

Evaluation Score (training set): 0.85408
Evaluation Score (validation set): 0.79917


# Test Data

In [20]:
test_data = pd.read_csv('./data/test.csv',index_col = 0)

In [21]:
test_data

Unnamed: 0_level_0,sentence
id,Unnamed: 1_level_1
0,Found Thai Spoon on the Vegan Pittsburgh website.
1,Our bill came out to around $27 and we ate lik...
2,State Farm broke down the costs for me of the ...
3,The only con for this resto is the wait to get...
4,We could hear the people above us stomping aro...
...,...
11946,I went back in to ask for cilantro dressing th...
11947,"Here , Adrian Lyne comes as close to profundit..."
11948,The actors are so terrific at conveying their ...
11949,It should be mentioned that the set design and...


In [22]:
X_test = vectorizer.transform(test_data['sentence'])
X_test

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 190560 stored elements and shape (11951, 10000)>

In [23]:
y_test = model.predict(X_test)

In [24]:
# Save predictions in the correct format
y_labels = pd.Series(y_test).map({-1: 'negative', 0: 'neutral', 1: 'positive'})
submission = pd.DataFrame({'id': test_data.index, 'label': y_labels})
submission.to_csv('test_predictions.csv', index=False) # Update filename and path as needed
print("Test predictions saved to 'test_predictions.csv'")

Test predictions saved to 'test_predictions.csv'


# Model Interpretation

In [None]:
# Top N most Important Words & Word Pairs per Output Class (Pos, Neutral, Negative)
feature_names = vectorizer.get_feature_names_out() # get names of all tokens from vectorizer
coefs = model.coef_  # Weights per Feature for each Output Class; Shape: (Num_Output_Classes, Num_Features)

# Get Top_n Features by Weight for each Class
def get_top_features(class_index, top_n=10):
    class_coef = coefs[class_index]
    top_indices = np.argsort(class_coef)[-top_n:]
    return [feature_names[i] for i in reversed(top_indices)]

print("Top words & bigrams for negative (-1):", get_top_features(0))
print("Top words & bigrams for positive (1):", get_top_features(2))
print("Top words & bigrams for neutral (0):", get_top_features(1))

Top words & bigrams for negative (-1): ['sucked', 'sucks', 'hated', 'not good', 'pissed', 'overpriced', 'stale', 'awful', 'disappointing', 'meh']
Top words & bigrams for positive (1): ['not bad', 'fabulous', 'bomb', 'terrific', 'beautifully', 'refreshing', 'awesome', 'delicious', 'amazing', 'pleased']
Top words & bigrams for neutral (0): ['because my', 'just like', 'fast forward', 'contacted', 'the left', 'one is', 'assume', 'and wanted', 'you the', 'leave the']


In [25]:
# Confusion Matrix - Negative, Neutral, Positive
from sklearn.metrics import confusion_matrix

conf_matrix = confusion_matrix(Y_val,Y_val_pred, labels=[-1, 0, 1])
print(conf_matrix)

[[ 989  812  390]
 [ 408 3991  516]
 [ 301  983 1820]]


In [26]:
# Examples of Misclassified Sentences
number_examples = 5
label_map = {-1: 'negative', 0: 'neutral', 1: 'positive'}

misclassified = [
    (label_map[true], label_map[pred], text)
    for true, pred, text in zip(Y_val, Y_val_pred, val_sentences)
    if true != pred
]

import random
for true, pred, text in random.sample(misclassified, number_examples):
    print(f"True: {true}, Pred: {pred} → {text}")

True: negative, Pred: positive → The tickets were a touch steep for what I would have expected ($9), but that would be my only gripe.
True: neutral, Pred: positive → honey is the predominant flavor and miso is in there but if I didn't know there was miso in there, I probably couldn't pinpoint the flavor.
True: negative, Pred: neutral → When we paid we paid for double the amount that she thought it was
True: negative, Pred: positive → I've used other services before and have seen "big-name" companies just rush in and out without saying a word using unskilled workers.
True: negative, Pred: neutral → Dude, this is not brain surgery.
