####Problem 3: Sentiment Analysis

In [24]:
# Importing all the necessary Libraries
from tensorflow.keras.utils import get_file
import tarfile
from glob import glob
import os,re,string
import numpy as np
from tensorflow import keras
from sklearn.model_selection import train_test_split

In [25]:
# Downloading Data from "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
data_directory = get_file('aclImdb_v1.tar.gz', 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz', cache_subdir = "datasets",hash_algorithm = "auto", extract = True, archive_format = "auto")
tar_file = tarfile.open(data_directory)
# specifying the folder we need to extract
tar_file.extractall('./data/') 
tar_file.close()

In [26]:
# Extracting data from downloaded files and loading the dataset for sentiment analysis
#Specyfing the file path as per the problem statement
file_path ='./data/aclImdb/'
#specifying both the postive and negative files required for the analysis
files = ['neg','pos']

def load_dataset(file_path, folders):
    texts,labels = [],[]
    for i,label in enumerate(folders):
        for fname in glob(os.path.join(file_path, label, '*.*')):
            texts.append(open(fname, 'r').read())
            labels.append(i)
    
    return texts, np.array(labels).astype(np.int64)

x_train,y_train = load_dataset(f'{file_path}train',files)
x_test,y_test = load_dataset(f'{file_path}test',files)

In [4]:
#Displaying the length of the training dataset 
print(len(x_train))
print(len(y_train))

25000
25000


In [5]:
'''
Performing the data preprocessing steps:

In order to facilitate the data interpretation raw texts obtained are preprocessed. First, elements such as punctuations, line breaks, numbers, and stop words like ‘a’, ‘the’, and ‘of’ are removed since they provide
little information about the user’s impression towards a movie. Then, all the words are converted to lower cases and normalized to its true root.

'''

REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "

def preprocess_reviews(reviews):
    
    reviews = [REPLACE_NO_SPACE.sub(NO_SPACE, line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(SPACE, line) for line in reviews]
    
    return reviews

reviews_train_clean = preprocess_reviews(x_train)
reviews_test_clean = preprocess_reviews(x_test)

In [6]:
#veryfing the clean training dataset

print(len(reviews_train_clean))
print(reviews_train_clean[:1])



25000
["after having seen and loved postal yes i actually loved postal i decided to try another uwe boll film and i picked out seed because i happened to stumble on it in a local dvd store and it's supposed to be one of his better films while the first  to  minutes of the film were very promising and seemed like the beginning of a not too mainstream psychological thriller it soon went downhill from there and eventually degraded into one of the most generic slasher films i've seen so far including a massive amount of plot holes unrealistic emotional responses and sub par acting it seems like boll tried his best to come up with a decent plot but after a while just gave up on it maybe he should stick to comedy the few good things about this film is that he does manage to create an overall creepy atmosphere that the special effects are better than i expected and the soundtrack does go well with the overall atmosphere but the unbalanced pacing of this film combined with the utter generic na

In [32]:
# Model - SVM n-gram Vectorization

'''
SVMs acknowledge the particular properties of text: (a) high dimensional feature spaces, (b) few irrelevant features (dense concept vector), and (c) sparse instance vectors.  SVMs consistently achieve good performance on text categorization tasks, outperforming existing methods substantially and signicantly. With their ability to generalize well in high dimensional
feature spaces, SVMs eliminate the need for feature selection, making the application of text categorization considerably easier. Hence SVM is used.

Vectorization is the process of transforming the text data into numeric representations so that the data can be understandable by machine learning algorithms. 
Instead of just single-word tokens (1-gram/unigram) we can also include word pairs. n gram Vectorization along with SVM have been proven to work better for text classification problems apart from other pre trained models. So that model is considered for this problem.
'''

#Importing the required libraries for the model

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

#By removing stop words, we remove the low-level information from our text in order to give more focus to the important information.
stop_words = ['in', 'of', 'at', 'a', 'the']

ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3), stop_words=stop_words)
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

#Spliting the data set into training and validation set
X_train, X_val, y_training, y_val = train_test_split(X, y_train, test_size=0.5)

# Regularisation parameter optimises the model. So trying to find the right value of c to get most accuracy
for c in [0.001, 0.005, 0.01, 0.05, 0.1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_training)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))
        
# Model with final selection of Regularization parameter   
final = LinearSVC(C=0.01)
final.fit(X_train, y_training)
print ("Final Accuracy of training set: %s" 
       % accuracy_score(y_train, final.predict(X)))
#Accuracy of test set
final.fit(X, y_test)
print ("Final Accuracy of test set: %s" 
       % accuracy_score(y_test, final.predict(X_test)))


Accuracy for C=0.001: 0.87816
Accuracy for C=0.005: 0.88424
Accuracy for C=0.01: 0.88504
Accuracy for C=0.05: 0.88536
Accuracy for C=0.1: 0.8852
Final Accuracy of training set: 0.94252
Final Accuracy of test set: 0.90064


In [34]:
#saving the model in model folder 

import pickle
file_name = "models/Group46_NLP_model.h5"
pickle.dump(svm, open(file_name, 'wb'))
pickle.dump(final, open(file_name, 'wb'))

#loading the model
svm = pickle.load(open(file_name,'rb'))
final = pickle.load(open(file_name,'rb'))



In [36]:

print ("Final Accuracy of test set: %s" 
       % accuracy_score(y_test, final.predict(X_test)))


Final Accuracy of test set: 0.90064


Thus SVM Linear SVC classifier with n-gram Vectorisation provides 94% accuracy for the IMDB large movie review dataset when 50% train test split is performed with C=0.01 and n - gram range between 1 to 3. Whereas, the models provides an accuracy of 90% for the test dataset. 