# Prototype 2 for Sentiment Analysis with full scale data

This is an improvement from the first prototype for the Sentiment analysis. In this version, we will train using the entire dataset. The notebook assumes the [Split dataest into multiple files.ipynb](Split%20dataset%20into%20multiple%20files.ipynb) notebook was executed to split the dataset into multiple files of 10000 lines each. The training will do mini-batch training using the split chunks.

### Define the imports

In [1]:
import pandas as pd
import numpy as np
import bz2
import os
import matplotlib.pyplot as plt
import re
import nltk

### Load database required for removing stopword and lemmatization

In [2]:
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))

nltk.download('wordnet')
lemmatizer = nltk.stem.WordNetLemmatizer()

# Downloads all english dictionary words
nltk.download('words')
english_words = set(nltk.corpus.words.words())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\siva.alagarsamy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\siva.alagarsamy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\siva.alagarsamy\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


### Define a function to divide the large dataset into multiple chunks

In [3]:
def split_file(txt_bz_file, output_path=".", ):
    
    if not os.path.exists(output_path):
        os.makedirs(output_path)
        
    file_name = txt_bz_file.split("/")[-1]
    file_name_prefix = file_name.split(".")[0]
    row_number = 0
    chunk = 1
    out_file_name = f"{output_path}/{file_name_prefix}_{chunk}.txt.bz2"
    out_file = bz2.open(out_file_name, "wt", encoding='utf-8')
    
    with bz2.open(txt_bz_file, "rt", encoding='utf-8') as bz_file:
        for line in bz_file:
            out_file.write(line)
            row_number += 1
            if row_number == 10000 :
                out_file.close()
                chunk += 1
                out_file_name = f"{output_path}/{file_name_prefix}_{chunk}.txt.bz2"
                out_file = bz2.open(out_file_name, "wt", encoding='utf-8')
                row_number = 0
    out_file.close()
    
    # If the last file was an empty file, delete it. 
    if row_number == 0 :
        os.remove(out_file_name)

### Divide the training dataset

In [4]:
split_file('data/train.ft.txt.bz2', output_path='data/train_chunks')

### Divide the test dataset

In [5]:
split_file('data/test.ft.txt.bz2', output_path='data/test_chunks')

### Define a function to normalize words in a sentence
We do the following
+ Convert all words to lower case, so we are doing not analyzing words with different case as different words
+ Drop any stop words like I, me, this, is ...
+ Remove words that are not in english dictionary. 
+ Remove punctuations
+ Lemmatize words. This is converting different forms of a word to a base form.  E.g convert word like caring to care, bats to bat

In [6]:
punctuations = "!@#$%^&*()_-+={[}]|\:;'<,>.?/~`"

def to_words(text):
    words = []
    tokens = re.findall('\w+', text)
    for w in tokens:
        # Convert to lower
        w = w.lower()
        
        # Remove punctuations
        w = "".join([char for char in w if char not in punctuations])
        
        # Don't add word if it is a stopword
        if w not in stop_words:      
            
            # Make sure it is valid english word
            if w in english_words:
                # Lemmatize word
                w = lemmatizer.lemmatize(w, 'v')  #Assume most of the review is verb part of the speech (POS)
                words.append(w)
            
    return words

### Define a function that will load the reviews file and convert it to normalized words and return the sentiment labels and words as array

In [7]:
def load_data(txt_bz_file):
    sentiments = []
    reviews = []
    
    with bz2.open(txt_bz_file, "rt", encoding='utf-8') as bz_file:
        for line in bz_file:
            # Label and review are separated by space
            label, review = line.split(' ', maxsplit=1)
            
            # label has a format __label__2  we just need the last number
            sentiments.append(int(label[9:]))
            
            # The title and the body are separated by :, so we split them 
            title, body = review.split(':', maxsplit=1)
            
            title_part = " ".join(to_words(title))
            body_part = " ".join(to_words(body))
            
            sentence = " ".join([title_part, body_part])
            reviews.append(sentence)
    return sentiments, reviews

### Define methods for TfidfVectorizer to be updated over multiple dataset

Sklearn doesn't provide a partial_fit method fo TfidfVectorizer, but someone has a patch that will allow the TfidfVectorizer to be updated in batch.  [Here is the link to the Stackoverflow page](https://stackoverflow.com/questions/39109743/adding-new-text-to-sklearn-tfidif-vectorizer-python)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse.dia import dia_matrix

def tfidf_first_fit(self, X):
    self.fit(X)
    self.n_docs = len(X)
    
def tfidf_update_fit(self, X):
    max_idx = max(self.vocabulary_.values())
    for a in X:
        #update vocabulary_
        if self.lowercase: a = a.lower()
        tokens = re.findall(self.token_pattern, a)
        for w in tokens:
            if w not in self.vocabulary_:
                max_idx += 1
                self.vocabulary_[w] = max_idx

        #update idf_
        df = (self.n_docs + self.smooth_idf)/np.exp(self.idf_ - 1) - self.smooth_idf
        self.n_docs += 1
        df.resize(len(self.vocabulary_))
        for w in tokens:
            df[self.vocabulary_[w]] += 1
        idf = np.log((self.n_docs + self.smooth_idf)/(df + self.smooth_idf)) + 1
        self._tfidf._idf_diag = dia_matrix((idf, 0), shape=(len(idf), len(idf)))

TfidfVectorizer.first_fit = tfidf_first_fit
TfidfVectorizer.update_fit = tfidf_update_fit

### Fit the TfidVectorizer on all of the train datasets

In [9]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.85, min_df=5)

# Go over the training chunck files 1 - 360
for chunk in range(1,361):
    file_name = f"data/train_chunks/train_{chunk}.txt.bz2"
    sentiments, reviews = load_data(file_name)
    
    if chunk == 1 :
        tfidf_vectorizer.first_fit(reviews)
    else:
        tfidf_vectorizer.update_fit(reviews)


### Train SGDClassifier using partial_fit 

In [10]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

clf = SGDClassifier(max_iter=5000, random_state=7)

# Go over the training chunck files 1 - 360
for chunk in range(1,361):
    file_name = f"data/train_chunks/train_{chunk}.txt.bz2"
    sentiments, reviews = load_data(file_name)
    train_tfidf = tfidf_vectorizer.transform(reviews)
    train_df = pd.DataFrame(train_tfidf.toarray(), 
                            columns=tfidf_vectorizer.get_feature_names())
    
    if chunk == 1 :
        # classes are 1 & 2, 1 = Bad, 2 = Good
        clf.partial_fit(train_df, sentiments, classes=[1, 2])
    else :
        clf.partial_fit(train_df, sentiments)
   

### Test the model

Test the model by going through the test chucks

In [11]:
# Go over the test chunck files

accuracy_sum = 0
f1_score_sum = 0
conf_sum = [[0, 0], [0,0]]
num_chunks = 0

result_file = open("Prototype2_result.txt", "w")

# Chunks 1 - 40
for chunk in range(1,41):
    file_name = f"data/test_chunks/test_{chunk}.txt.bz2"
    sentiments, reviews = load_data(file_name)
    test_tfidf = tfidf_vectorizer.transform(reviews)
    test_df = pd.DataFrame(test_tfidf.toarray(), 
             columns=tfidf_vectorizer.get_feature_names())

    sentiments_predicted = clf.predict(test_df)

    accuracy = accuracy_score(sentiments, sentiments_predicted)
    confusion = confusion_matrix(sentiments, sentiments_predicted)
    f1 = f1_score(sentiments, sentiments_predicted)
    
    accuracy_sum += accuracy
    f1_score_sum += f1
    conf_sum += confusion
    num_chunks += 1
    
    # Print accuracy score and confusion matrix
    result_file.write(f'{file_name}\r\n')
    result_file.write(f'Accuracy score  = {accuracy}\r\n')
    result_file.write(f'Confusion Matrix = {confusion}\r\n')
    result_file.write(f'F1 Score = {f1}\r\n' )
    result_file.write('----------------------------------------------------\r\n')

accuracy_avg = accuracy_sum / num_chunks
f1_avg = f1_score_sum / num_chunks

result_file.write(f'Average Acuuracy = {accuracy_avg}\r\n')
result_file.write(f'Average F1 Score = {f1_avg}\r\n')
result_file.write(f'Confusion Matrix = {conf_sum}\r\n')
result_file.close()

print('Average Accuracy = ', accuracy_avg)
print('Average F1 Score = ', f1_avg)
print('Confusion Matrix = ', conf_sum)


Average Accuracy =  0.8690049999999999
Average F1 Score =  0.8684925138756938
Confusion Matrix =  [[173091  26909]
 [ 25489 174511]]


## Results

The prototype has an accuracy of 86%. This will be improved using deep learning in the later iteration of the prototype.