## MFIT5010 Statistical Machine Learning Project

In [3]:
#load the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import glob
import spacy
import re,string,unicodedata
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from bs4 import BeautifulSoup
from textblob import TextBlob
from textblob import Word
from wordcloud import WordCloud,STOPWORDS

## Data processing

In [4]:
#### import data #####
#add a new column with value = 1 for train positive  
train_pos = pd.read_csv('desktop/ML project/train_pos.csv')
train_pos["sentiment"] = "1"

#add a new column with value = 0 for train negative 
train_neg = pd.read_csv('desktop/ML project/train_neg.csv')
train_neg["sentiment"] = "0"

#add a new column with value = 1 for test positive 
test_pos = pd.read_csv('desktop/ML project/test_pos.csv')
test_pos["sentiment"] = "1"

#add a new column with value = 0 for test negative 
test_neg = pd.read_csv('desktop/ML project/test_neg.csv')
test_neg["sentiment"] = "0"

In [5]:
#view data
train_pos

Unnamed: 0.1,Unnamed: 0,Review,sentiment
0,0,Bromwell High is a cartoon comedy. It ran at t...,1
1,1,Homelessness (or Houselessness as George Carli...,1
2,2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,3,This is easily the most underrated film inn th...,1
4,4,This is not the typical Mel Brooks film. It wa...,1
...,...,...,...
12495,12495,"Seeing as the vote average was pretty low, and...",1
12496,12496,"The plot had some wretched, unbelievable twist...",1
12497,12497,I am amazed at how this movie(and most others ...,1
12498,12498,A Christmas Together actually came before my t...,1


In [6]:
test_pos

Unnamed: 0.1,Unnamed: 0,Review,sentiment
0,0,I went and saw this movie last night after bei...,1
1,1,Actor turned director Bill Paxton follows up h...,1
2,2,As a recreational golfer with some knowledge o...,1
3,3,"I saw this film in a sneak preview, and it is ...",1
4,4,Bill Paxton has taken the true story of the 19...,1
...,...,...,...
12495,12495,I was extraordinarily impressed by this film. ...,1
12496,12496,"Although I'm not a golf fan, I attended a snea...",1
12497,12497,"From the start of ""The Edge Of Love"", the view...",1
12498,12498,"This movie, with all its complexity and subtle...",1


In [7]:
train_neg

Unnamed: 0.1,Unnamed: 0,Review,sentiment
0,0,Story of a man who has unnatural feelings for ...,0
1,1,Airport '77 starts as a brand new luxury 747 p...,0
2,2,This film lacked something I couldn't put my f...,0
3,3,"Sorry everyone,,, I know this is supposed to b...",0
4,4,When I was little my parents took me along to ...,0
...,...,...,...
12495,12495,"Towards the end of the movie, I felt it was to...",0
12496,12496,This is the kind of movie that my enemies cont...,0
12497,12497,I saw 'Descent' last night at the Stockholm Fi...,0
12498,12498,Some films that you pick up for a pound turn o...,0


In [8]:
test_neg

Unnamed: 0.1,Unnamed: 0,Review,sentiment
0,0,Once again Mr. Costner has dragged out a movie...,0
1,1,This is an example of why the majority of acti...,0
2,2,"First of all I hate those moronic rappers, who...",0
3,3,Not even the Beatles could write songs everyon...,0
4,4,Brass pictures (movies is not a fitting word f...,0
...,...,...,...
12495,12495,I occasionally let my kids watch this garbage ...,0
12496,12496,When all we have anymore is pretty much realit...,0
12497,12497,The basic genre is a thriller intercut with an...,0
12498,12498,Four things intrigued me as to this film - fir...,0


In [9]:
# combine the train positive and train negative data 
train = [train_pos, train_neg]
train = pd.concat(train, ignore_index = True)

# combine the test positive and test negative data 
test = [test_pos, test_neg]
test = pd.concat(test, ignore_index = True)

In [10]:
print(train.shape)

(25000, 3)


In [11]:
print(test.shape)

(25000, 3)


In [12]:
#### sentiment count #####
train['sentiment'].value_counts() 

0    12500
1    12500
Name: sentiment, dtype: int64

In [13]:
test['sentiment'].value_counts() # We can see that the dataset is balanced.

0    12500
1    12500
Name: sentiment, dtype: int64

We can see that the both the training dataset and test dataset are balanced, with 12500 positives and 12500 negatives
in each dataset perspectively.

In [14]:
pip install --user -U nltk

Requirement already up-to-date: nltk in /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages (3.5)
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [15]:
pip install --user -U numpy

Requirement already up-to-date: numpy in /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages (1.18.4)
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [16]:
##### text normalization #####
#Tokenization of text
tokenizer = ToktokTokenizer()

In [17]:
##### Removing html strips and noise text #####
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

In [18]:
#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

In [19]:
#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text

In [20]:
#Apply function on review column
train['Review'] = train['Review'].apply(denoise_text)
test['Review'] = test['Review'].apply(denoise_text)

In [21]:
##### Removing special characters #####
#Define function for removing special characters 
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text
#Apply function on review column
train['Review'] = train['Review'].apply(remove_special_characters)
test['Review'] = test['Review'].apply(remove_special_characters)

In [22]:
##### removing stopwords ##### 
#define stopword list 
stopword_list = ['that', 'when', 'did', 'now', 'is', 'ain', 'its', "she's", 'having', 'to',
                 're', 'we', 'further', 'they', 'why', "needn't", 'being', 'hasn', 'after',
                 'there', 'until', "it's", 'own', 'as', "shan't", 'or', 'himself', 'these',
                 'here', 'a', 'those', 'wasn', 'myself', 'not', 'again', 'have', 'ourselves',
                 'nor', 'itself', 'both', "you'll", "aren't", 'because', "wouldn't", 'other',
                 'at', 'her', "didn't", 'all', 'above', 'didn', "mightn't", 'what', 'any',
                 'y', 'him', 'doesn', 'how', 'do', 'on', 'than', 'under', 'shan', 'me', 'few',
                 'no', 'can', "you'd", 'out', 'couldn', 'needn', "hadn't", 'ma', 'd', 'our', 
                 'of', 'with', 'wouldn', 'hers', 'against', 'for', 'theirs', 'has', 'yourselves',
                 'before', 'who', 'too', 's', 'each', 'had', 'ours', 'below', "isn't", 'while',
                 'don', 'yourself', 'into', "hasn't", "you're", 'their', "don't", 'over', "wasn't",
                 'yours', "should've", 'only', "weren't", 'between', 'themselves', 'once', 'be',
                 'from', 'about', 'them', 'been', 'o', 'your', 'doing', 'am', 'should', 'does',
                 'whom', 'my', 'he', 'it', 'an', 'in', 'same', 'very', 'his', 'and', 'most',
                 've', "haven't", 'isn', "mustn't", 'weren', 't', 'aren', 'if', 'then', 'won',
                 'through', 'haven', 'the', 'where', 'i', 'off', 'some', "shouldn't", 'mightn',
                 'down', 'm', 'shouldn', 'which', "that'll", 'more', 'such', "won't", 'up', 'she',
                 'was', 'herself', 'but', 'by', 'so', 'you', 'this', 'just', 'will', 'are', 'mustn',
                 'during', "doesn't", 'hadn', 'll',"you've", 'were', "couldn't"]

In [23]:
#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

In [27]:
##### Bags of words model ##### 
#Count vectorizer for bag of words
cv = CountVectorizer(min_df= 0, max_df = 1, binary = False, ngram_range = (1,3))
#transformed train reviews
cv_train_reviews = cv.fit_transform(train['Review'])
#transformed test reviews
cv_test_reviews = cv.transform(test['Review'])
print('BOW_cv_train:', cv_train_reviews.shape) 
print('BOW_cv_test:', cv_test_reviews.shape)

BOW_cv_train: (25000, 4516501)
BOW_cv_test: (25000, 4516501)


In [25]:
##### Term Frequency-Inverse Document Frequency model (TFIDF) ##### 
#Tfidf vectorizer
tv = TfidfVectorizer(min_df = 0, max_df = 1, use_idf = True, ngram_range = (1,3))
#transformed train reviews
tv_train_reviews = tv.fit_transform(train['Review'])
#transformed test reviews
tv_test_reviews = tv.transform(test['Review'])
print('Tfidf_train:',tv_train_reviews.shape) 
print('Tfidf_test:',tv_test_reviews.shape)

Tfidf_train: (25000, 4516501)
Tfidf_test: (25000, 4516501)


In [30]:
##### Labeling the sentiment text ##### 
#labeling the sentient data
lb = LabelBinarizer()
#transformed sentiment train data 
sentiment_data = lb.fit_transform(train['sentiment'])
print(sentiment_data.shape)
#transformed sentiment test data 
sentiment_data = lb.fit_transform(test['sentiment'])
print(sentiment_data.shape)

(25000, 1)
(25000, 1)


In [31]:
#Spliting the sentiment data 
sentiment_train = train['sentiment']
sentiment_test = test['sentiment']

# Modeling the dataset

In [32]:
##### logistic regression model for both bag of words and tfidf features #####

In [33]:
#training the model
logit = LogisticRegression(penalty = 'l2', max_iter = 500, C = 1, random_state = 42)
#Fitting the model for Bag of words
logit_bow = logit.fit(cv_train_reviews, sentiment_train)
print(logit_bow)
#Fitting the model for tfidf features
logit_tfidf = logit.fit(tv_train_reviews, sentiment_train)
print(logit_tfidf)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


In [34]:
##### Logistic regression model performance on test dataset #####
#Predicting the model for bag of words
logit_bow_predict = logit.predict(cv_test_reviews)
print(logit_bow_predict)
##Predicting the model for tfidf features
logit_tfidf_predict = logit.predict(tv_test_reviews)
print(logit_tfidf_predict)

['0' '1' '1' ... '0' '0' '1']
['0' '1' '1' ... '0' '0' '1']


In [35]:
##### Accuracy of the model #####
#bag of words
logit_bow_score = accuracy_score(sentiment_test, logit_bow_predict)
print("logit_bow_score :", logit_bow_score)
#tfidf 
logit_tfidf_score = accuracy_score(sentiment_test, logit_tfidf_predict)
print("logit_tfidf_score :", logit_tfidf_score)

logit_bow_score : 0.71356
logit_tfidf_score : 0.71304


In [36]:
##### classification report #####
#bag of words 
logit_bow_report = classification_report(sentiment_test, logit_bow_predict, target_names=['1','0'])
print(logit_bow_report)

              precision    recall  f1-score   support

           1       0.70      0.74      0.72     12500
           0       0.72      0.69      0.71     12500

    accuracy                           0.71     25000
   macro avg       0.71      0.71      0.71     25000
weighted avg       0.71      0.71      0.71     25000



In [37]:
#tfidf
logit_tfidf_report = classification_report(sentiment_test, logit_tfidf_predict, target_names=['1','0'])
print(logit_tfidf_report)

              precision    recall  f1-score   support

           1       0.70      0.74      0.72     12500
           0       0.73      0.68      0.70     12500

    accuracy                           0.71     25000
   macro avg       0.71      0.71      0.71     25000
weighted avg       0.71      0.71      0.71     25000



In [374]:
#Confusion matrix
#confusion matrix for bag of words
cnfmat_bow = confusion_matrix(sentiment_test, logit_bow_predict)
print(cnfmat_bow)
#confusion matrix for tfidf features
cnfmat_tfidf = confusion_matrix(sentiment_test, logit_tfidf_predict)
print(cnfmat_tfidf)

[[9200 3300]
 [3861 8639]]
[[9297 3203]
 [3971 8529]]


In [2]:
##### Linear support vector machines for bag of words and tfidf features #####

In [376]:
#training the linear svm
svm = SGDClassifier(loss = 'hinge', max_iter = 500, random_state = 42)
#fitting the svm for bag of words
svm_bow = svm.fit(cv_train_reviews, sentiment_train)
print(svm_bow)
#fitting the svm for tfidf features
svm_tfidf = svm.fit(tv_train_reviews, sentiment_train)
print(svm_tfidf)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=500, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=42, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=500, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=42, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)


In [378]:
##### Model performance on test data #####
#bag of words
svm_bow_predict = svm.predict(cv_test_reviews)
print(svm_bow_predict)
#tfidf
svm_tfidf_predict = svm.predict(tv_test_reviews)
print(svm_tfidf_predict)

['1' '1' '1' ... '1' '0' '1']
['1' '1' '1' ... '1' '1' '1']


In [379]:
##Accuracy of the model
#bag of words
svm_bow_score = accuracy_score(sentiment_test, svm_bow_predict)
print("svm_bow_score :", svm_bow_score)
#tfidf
svm_tfidf_score = accuracy_score(sentiment_test, svm_tfidf_predict)
print("svm_tfidf_score :", svm_tfidf_score)

svm_bow_score : 0.65712
svm_tfidf_score : 0.50392


In [380]:
##Classificiation report 
#Classification report for bag of words 
svm_bow_report=classification_report(sentiment_test, svm_bow_predict, target_names=['1','0'])
print(svm_bow_report)
#Classification report for tfidf features
svm_tfidf_report=classification_report(sentiment_test, svm_tfidf_predict, target_names=['1','0'])
print(svm_tfidf_report)

              precision    recall  f1-score   support

           1       0.83      0.39      0.53     12500
           0       0.60      0.92      0.73     12500

    accuracy                           0.66     25000
   macro avg       0.72      0.66      0.63     25000
weighted avg       0.72      0.66      0.63     25000

              precision    recall  f1-score   support

           1       1.00      0.01      0.02     12500
           0       0.50      1.00      0.67     12500

    accuracy                           0.50     25000
   macro avg       0.75      0.50      0.34     25000
weighted avg       0.75      0.50      0.34     25000



In [381]:
#confusion matrix for bag of words
cnfmat_bow = confusion_matrix(sentiment_test, svm_bow_predict)
print(cnfmat_bow)
#confusion matrix for tfidf features
cnfmat_tfidf = confusion_matrix(sentiment_test, svm_tfidf_predict)
print(cnfmat_tfidf)

[[ 4923  7577]
 [  995 11505]]
[[   98 12402]
 [    0 12500]]
