In [1]:
import numpy as np
import pandas as pd
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# read the data
df = pd.read_csv("data/labelled_movie_reviews.csv")

# shuffle the rows
# NOTE: we will only use a fraction of the dataset for this lab
# so that you don't have to wait so long for the model to train
df = df.sample(frac=0.3, random_state=123).reset_index(drop=True)

In [3]:
# convert pandas to lists
Xr = df["text"].tolist()
Yr = df["label"].tolist()

# compute the train, val, test splits
train_frac, val_frac, test_frac = 0.7, 0.1, 0.2
train_end = int(train_frac*len(Xr))
val_end = int((train_frac + val_frac)*len(Xr))

# store the train val test splits
X_train = Xr[0:train_end]
Y_train = Yr[0:train_end]
X_val = Xr[train_end:val_end]
Y_val = Yr[train_end:val_end]
X_test = Xr[val_end:]
Y_test = Yr[val_end:]

In [4]:
def document_to_vector(X):
    # CountVectorizer is used to convert the text into sparse tf vectors
    # https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
    count_vect = CountVectorizer()
    X_tf = count_vect.fit_transform(X)
    return X_tf.toarray(), count_vect

In [5]:
# fit a linear model
# Xtr - A list of training documents provided as text
# Ytr - A list of training class labels
# C - the regularization parameter
def fit_model(Xtr, Ytr, C):
    #TODO: write model fitting code using CountVectorizer and LogisticRegression
   
    X_tf, count_vec = document_to_vector(Xtr)
    
    # LogisticRegression will train the classifier using these vectors
    # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
    log_reg = LogisticRegression(C=C)
    model = log_reg.fit(X_tf, Ytr)

    # return the model and CountVectorizer
    # Note: we need to return the CountVectorizer because 
    # it stores a mapping from words -> ids which we need for testing
    return model, count_vec

In [6]:
# test a fitted linear model
# Xtst - A list of test or validation documents
# Ytst - A list of test or validation class labels
def test_model(Xtst, Ytst, model, count_vec):
    # TODO: write code to test a fitted linear model and return accuracy
    # Hint: the function accuracy_score from sklearn may be helpful
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html 
    X_tf = count_vec.transform(Xtst)
    score = model.score(X_tf, Ytst)
    return score

In [7]:
# TODO: search for the best C parameter by 
# training on the training set and testing on the validation set
# you should use fit_model and test_model
K = [-5, -4, -3, -2, -1, 0 , 1, 2, 3, 4, 5]
for k in K:
    C = math.pow(3, k)
    model, count_vec = fit_model(X_train, Y_train, C)
    score = test_model(X_val, Y_val, model, count_vec)
    print("K: " + str(k) + " Score: " + str(score))



K: -5 Score: 0.8692461641094062




K: -4 Score: 0.8812541694462975




K: -3 Score: 0.885256837891928




K: -2 Score: 0.8865910607071381




K: -1 Score: 0.8832555036691128




K: 0 Score: 0.875917278185457




K: 1 Score: 0.8745830553702468




K: 2 Score: 0.875250166777852




K: 3 Score: 0.8699132755170114




K: 4 Score: 0.8692461641094062




K: 5 Score: 0.8692461641094062


In [8]:
# TODO: fit the model to the concatenated training and validation set
# you should use the best C parameter 
# test on the test set and print the result

# compute the train, val, test splits
tr_frac = 0.8
tr_end = int(tr_frac*len(Xr))


# store the train test splits
X_tr = Xr[0:tr_end]
Y_tr = Yr[0:tr_end]
X_ts = Xr[tr_end:]
Y_ts = Yr[tr_end:]

C = math.pow(3, -2)
model, count_vec = fit_model(X_tr, Y_tr, C)
score = test_model(X_ts, Y_ts, model, count_vec)
print("K: " + str(-2) + " Score: " + str(score))



K: -2 Score: 0.8903333333333333


In [34]:
# TODO: find the words corresponding to the 5 largest and 5 smallest co-efficients of the linear model
# Hint: a fitted LogisticRegression model in sklearn has a coef_ attribute which stores the co-efficients
# Hint2: CountVectorizer has a vocabulary_ attribute that stores a 
#X_tf = count_vec.transform(X_tr)

n=5
indices = (-model.coef_[0]).argsort()[:n]
print("Top 5 positive words: ")
for idx in indices:
    print(list(count_vec.vocabulary_.keys())[list(count_vec.vocabulary_.values()).index(idx)])
    
n= 5
indices = (model.coef_[0]).argsort()[:n]
print("Top 5 negative words: ")
for idx in indices:
    print(list(count_vec.vocabulary_.keys())[list(count_vec.vocabulary_.values()).index(idx)])

Top 5 positive words: 
excellent
perfect
loved
amazing
great
Top 5 negative words: 
worst
waste
awful
boring
terrible
