In [1]:
import numpy as np
import pandas as pd
import nltk
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
url='C:/Users/tassi/Downloads/CMT307/Coursework1/datasets_coursework1/IMDb/'

#Load positive reviews train
path= url+'train/imdb_train_pos.txt'
df_train_pos=pd.read_csv(path,sep='\n')

#Load negative reviews train
path= url+'train/imdb_train_neg.txt'
df_train_neg=pd.read_csv(path,sep='\n')

#Load positive reviews dev
path= url+'dev/imdb_dev_pos.txt'
df_dev_pos=pd.read_csv(path,sep='\n')

#Load negative reviews dev
path= url+'dev/imdb_dev_neg.txt'
df_dev_neg=pd.read_csv(path,sep='\n')

#Load positive reviews test
path= url+'test/imdb_test_pos.txt'
df_test_pos=pd.read_csv(path,sep='\n')

#Load negative reviews test
path= url+'test/imdb_test_neg.txt'
df_test_neg=pd.read_csv(path,sep='\n')

# Data Pre processing

Most of the pre processing will happen when vectorizing. 
Here we call the stopwords from existing list and lemmatize the reviews


In [3]:
#function read stop words in file
def get_stop_words(stop_file_path):
    """load stop words """
    
    with open(stop_file_path, 'r',) as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        return frozenset(stop_set)
 
 # load a set of stop words
stopwords=get_stop_words(url+'stopwords-en.txt')


In [4]:
# Naming review column for datasets
list_df=[df_train_pos,df_train_neg,df_dev_pos,df_dev_neg,df_test_pos,df_test_neg]

for df in list_df:
    df.columns=['text']


In [5]:
from nltk.stem import WordNetLemmatizer
lemmer=WordNetLemmatizer()

# lemmatization of words in the reviews
for df in list_df:
    df['text']=[' '.join([lemmer.lemmatize(word.lower(),'v') for word in text.split(' ')]) for text in df['text']]

In [6]:
print(df_train_neg[:5])

                                                text
0  well i guess i know the answer to that questio...
1  i really like the movie 'the emporer's new gro...
2  thats what this movie really takes. a big piec...
3  i be look for a documentary of the same journa...
4  do anyone care about any of the character in t...


# Validation

This is where we will tune the model to get better results

In [7]:
# Merging positive and negative reviews into list X_train and adding label into Y_train to train model
X_train=[]
Y_train=[]
for instance in df_train_pos['text']:
    X_train.append(instance)
    Y_train.append(1)
for instance in df_train_neg['text']:
    X_train.append(instance)
    Y_train.append(0)

#It is recommended to work with numpy arrays instead of Python lists.
X_train_sentanalysis=np.asarray(X_train)
Y_train_sentanalysis=np.asarray(Y_train)

In [8]:
# Merging positive and negative reviews into list X_dev and adding label into Y_train to validate model
X_dev=[]
Y_dev=[]
for instance in df_dev_pos['text']:
    X_dev.append(instance)
    Y_dev.append(1)
for instance in df_dev_neg['text']:
    X_dev.append(instance)
    Y_dev.append(0)

#It is recommended to work with numpy arrays instead of Python lists.
X_test=np.asarray(X_dev)
Y_test=np.asarray(Y_dev)

In [9]:
# A test to confirm shape
#X_train_sentanalysis.shape

In [24]:
# function to train SVM Classifier
def train_svm_classifier(X_train_sentanalysis, Y_train_sentanalysis, num_features, kBest):
  
    # FeatureUnion will apply parallel feature extraction and vectorization to the train set
    # Feature extraction selected: Bag of Words, BiGram Bag of words and Tfidf
    feature_union = FeatureUnion([
        ('bow',CountVectorizer(max_features=num_features, stop_words=stopwords,)),
        ('bigram_bow',CountVectorizer(max_features=num_features,ngram_range=(2,2),stop_words=stopwords)),  
        ('tfidf', TfidfVectorizer(max_features=num_features, stop_words=stopwords)), 
    ])

    # Pipeline to fit the training set to SVM, starting with feauture extraction and Chi2 feature selection
    pipeline = Pipeline([
    ('features',feature_union),
    ('reducer', SelectKBest(chi2, k=kBest)), 
    ('classifier', sklearn.svm.SVC(kernel="linear",gamma='auto')),  # train on vectors w/ classifier
    ])
  
    pipeline.fit(X_train_sentanalysis,Y_train_sentanalysis) #fit the data to model
    return pipeline

In [11]:
list_num_features=[1000,5000,10000,15000,20000] # tried with list 1000,5000,10000,15000, 20000,
best_accuracy_dev=0.0
for num_features in list_num_features:

    # Tested kBest= 5000(acc: 86.1), 3000(acc: 86.7)--Best Result found , 2000(acc: 86.3)
    svm=train_svm_classifier(X_train_sentanalysis,Y_train_sentanalysis, num_features, kBest=3000) 
    predictions = svm.predict(X_test)
  
    # if a list of number of features is used for validation, the following code will print the Best overall accuracy
    accuracy_dev=accuracy_score(Y_test,predictions)
    print ("Accuracy with "+str(num_features)+" features: "+str(round(accuracy_dev,3)))
    if accuracy_dev>=best_accuracy_dev:
        best_accuracy_dev=accuracy_dev
        best_num_features=num_features
print ("\n Best accuracy overall in the dev set is "+str(round(best_accuracy_dev,3))+" with "+str(best_num_features)+" features.")

Accuracy with 1000 features: 0.83
Accuracy with 5000 features: 0.86
Accuracy with 10000 features: 0.863
Accuracy with 15000 features: 0.863
Accuracy with 20000 features: 0.864

 Best accuracy overall in the dev set is 0.864 with 20000 features.


In [12]:
# A test to confirm number of features
# svm.named_steps['classifier'].support_vectors_.shape

# Test Model

In [13]:
# Merging positive and negative reviews X_test and adding label
X_test=[]
Y_test=[]
for instance in df_test_pos['text']:
    X_test.append(instance)
    Y_test.append(1) # 1 for positive
for instance in df_test_neg['text']:
    X_test.append(instance)
    Y_test.append(0) # 0 for negative

X_test_sentianalysis=np.asarray(X_test)
Y_test_sentianalysis=np.asarray(Y_test)

In [25]:
#training model with tuned parameter
svm_clf=train_svm_classifier(X_train_sentanalysis,Y_train_sentanalysis, num_features=20000, kBest=3000) 

In [27]:
Predicted_Y_test=svm_clf.predict(X_test_sentianalysis) # Predicting Y_test using the tuned model

In [28]:
# Model evaluation
print('Accuracy: '+str(accuracy_score(Y_test_sentianalysis, Predicted_Y_test).round(4)))
print('Precision: '+str(precision_score(Y_test_sentianalysis, Predicted_Y_test).round(4)))
print('Recall: '+str(recall_score(Y_test_sentianalysis, Predicted_Y_test).round(4)))
print('F1-score: '+str(f1_score(Y_test_sentianalysis, Predicted_Y_test).round(4)))

Accuracy: 0.8565
Precision: 0.8561
Recall: 0.8571
F1-score: 0.8566
