### COMP 551 P2: Supervised learning models

#### Group  47
#### Authors : Humayun Khan Kakar, Boury Mbodj & Michael Segev
#### Date : Feb 20 2019

##### Subject: The given file contains the different supervised learning models(logistic regression, decision trees support vector machines and neural networks) that we tried during the project as well as their respective performance.

In [1]:
import pandas as pd 
import numpy as np
import os

In [2]:
import re
from bs4 import BeautifulSoup
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer

In [3]:
import mglearn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer
from sklearn.model_selection import GridSearchCV

In [5]:
pos= [x for x in os.listdir("train/pos/") if x.endswith(".txt")]
neg= [x for x in os.listdir("train/neg/") if x.endswith(".txt")]
test= [x for x in os.listdir("test/") if x.endswith(".txt")]

In [6]:
# Data processing
posReviews=[]
for txt in pos:
    with open("train/pos/"+txt, encoding="ISO-8859-1") as f:
        posReviews.append(f.read())
negReviews=[]        
for txt in neg:
    with open("train/neg/"+txt, encoding="ISO-8859-1") as f:
        negReviews.append(f.read())
testReviews=[]        
for txt in test:
    with open("test/"+txt, encoding="ISO-8859-1") as f:
        testReviews.append(f.read())

In [7]:
# end data processing 
reviews = pd.concat([
    pd.DataFrame({"file":pos,"review":posReviews, "label":1}),
    pd.DataFrame({"file":neg,"review":negReviews, "label":0}),
    pd.DataFrame({"file":test,"review":testReviews, "label":-1})
], ignore_index=True).sample(frac=1, random_state=1)

In [8]:
# Examine firts 10 rows
reviews["file"]= reviews["file"].str.split("_", n = 1, expand = True)
reviews["file"]= reviews["file"].str.split(".", n = 1, expand = True)
#reviews.set_index('file',inplace=True)
reviews.head(10)

Unnamed: 0,file,review,label
26247,11119,This movie is a desperate attempt to ride the ...,-1
35067,19058,The first time I ever saw this movie was when ...,-1
34590,18629,"This movie will send chills down your spine, e...",-1
16668,2501,I saw this on TV the other nightÂ or rather I...,0
12196,9728,I am a huge fan of Simon Pegg and have watched...,1
2600,12340,There is indeed much to complain about this mo...,1
9047,6894,"The men can slaver over Lollo, if they like (o...",1
2206,11987,Since it has been some years since I reviewed ...,1
25607,10543,"Wow, what exciting visual effects. I also love...",-1
11606,9197,"This is actually a groovy-neat little flick, m...",1


In [9]:
# Examine the class ditribution 
reviews.label.value_counts()

-1    25000
 1    12500
 0    12500
Name: label, dtype: int64

In [10]:
# Here we process the text
# We use BeautifulSoup library to remove the HTML/XML tags (e.g., <br />) 
lemmatizer = WordNetLemmatizer()

def process_text(text):
    text = re.sub(r'[^\w\s]','',text, re.UNICODE)
    text = BeautifulSoup(text).get_text()
    #text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = " ".join(text)
    return text

reviews['review'] = reviews.review.apply(lambda x: process_text(x))

In [11]:
#Examine the text after feature extractrion
reviews.head(10)

Unnamed: 0,file,review,label
26247,11119,This movie be a desperate attempt to ride the ...,-1
35067,19058,The first time I ever saw this movie wa when I...,-1
34590,18629,This movie will send chill down your spine eve...,-1
16668,2501,I saw this on TV the other nightÂ or rather I...,0
12196,9728,I be a huge fan of Simon Pegg and have watch p...,1
2600,12340,There be indeed much to complain about this mo...,1
9047,6894,The men can slaver over Lollo if they like or ...,1
2206,11987,Since it ha be some year since I review this c...,1
25607,10543,Wow what excite visual effect I also love the ...,-1
11606,9197,This be actually a groovyneat little flick mak...,1


In [12]:
# Define X and y from the review dataset  for use with Countvectorizer
X= reviews[reviews.label!=-1].review
y= reviews[reviews.label!=-1].label
print (X.shape)
print (y.shape)

(25000,)
(25000,)


In [13]:
# Split X and y into training and testing/validation sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(22500,)
(2500,)
(22500,)
(2500,)


In [14]:
#Instantiate the vectorizer for n-garms with TD-IDF with l2 regularization
vect = TfidfVectorizer(min_df=5, max_df = 0.85, sublinear_tf=True, use_idf =True, ngram_range=(1, 4), norm='l2')

In [15]:
#Learn training data vocabulary fit then use to create document term matrix
X_train_dtm= vect.fit_transform(X_train)

In [16]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

<2500x247528 sparse matrix of type '<class 'numpy.float64'>'
	with 807961 stored elements in Compressed Sparse Row format>

In [17]:
# Logitic regression model
from sklearn.linear_model import LogisticRegression

In [18]:
logmodel=LogisticRegression()

In [19]:
logmodel.fit(X_train_dtm, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [20]:
y_pred_log=logmodel.predict(X_test_dtm)

In [21]:
# Decison tree model
from sklearn.tree import DecisionTreeClassifier

In [22]:
dtree= DecisionTreeClassifier()

In [23]:
dtree.fit(X_train_dtm, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [24]:
y_pred_dtree=dtree.predict(X_test_dtm)

In [25]:
# Support Vector machines 
from sklearn.svm import SVC

In [26]:
svmmodel = SVC(kernel='rbf', C = 10.0, gamma=0.1)

In [27]:
svmmodel.fit(X_train_dtm, y_train)

SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [28]:
y_pred_svm=svmmodel.predict(X_test_dtm)

In [29]:
#Neural network
from sklearn.neural_network import MLPClassifier

In [30]:
mlpcmodel = MLPClassifier(hidden_layer_sizes=(30,30,30))

In [31]:
mlpcmodel.fit(X_train_dtm, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(30, 30, 30), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [32]:
y_pred_mlpc=mlpcmodel.predict(X_test_dtm)

In [33]:
# Import metrics to calculate accuracy
from sklearn import metrics
#Now for the classification task we are going to import classification report
from sklearn.metrics import classification_report

In [34]:
# Print accuracy, classification report and confusion metrics for Logistic regr
print("Accuracy score for logistic regression :")
print(metrics.accuracy_score(y_test,y_pred_log))
print("Classification score for logistic regression :")
print(classification_report(y_test,y_pred_log))
print("Confusion matrix for logistic regression :")
print(metrics.confusion_matrix(y_test,y_pred_log))

Accuracy score for logistic regression :
0.9036
Classification score for logistic regression :
              precision    recall  f1-score   support

           0       0.91      0.90      0.90      1248
           1       0.90      0.91      0.90      1252

   micro avg       0.90      0.90      0.90      2500
   macro avg       0.90      0.90      0.90      2500
weighted avg       0.90      0.90      0.90      2500

Confusion matrix for logistic regression :
[[1118  130]
 [ 111 1141]]


In [35]:
# Print accuracy, classification report and confusion metrics for model 2
print("Accuracy score for decision trees :")
print(metrics.accuracy_score(y_test,y_pred_dtree))
print("Classification score for decision trees  :")
print(classification_report(y_test,y_pred_dtree))
print("Confusion matrix for decision trees  :")
print(metrics.confusion_matrix(y_test,y_pred_dtree))

Accuracy score for decision trees :
0.7196
Classification score for decision trees  :
              precision    recall  f1-score   support

           0       0.72      0.72      0.72      1248
           1       0.72      0.72      0.72      1252

   micro avg       0.72      0.72      0.72      2500
   macro avg       0.72      0.72      0.72      2500
weighted avg       0.72      0.72      0.72      2500

Confusion matrix for decision trees  :
[[896 352]
 [349 903]]


In [36]:
# Print accuracy, classification report and confusion metrics for model 3
print("Accuracy score for support vector machines :")
print(metrics.accuracy_score(y_test,y_pred_svm))
print("Classification score support vector machines :")
print(classification_report(y_test,y_pred_svm))
print("Confusion matrix support vector machines :")
print(metrics.confusion_matrix(y_test,y_pred_svm))

Accuracy score for support vector machines :
0.9168
Classification score support vector machines :
              precision    recall  f1-score   support

           0       0.92      0.92      0.92      1248
           1       0.92      0.92      0.92      1252

   micro avg       0.92      0.92      0.92      2500
   macro avg       0.92      0.92      0.92      2500
weighted avg       0.92      0.92      0.92      2500

Confusion matrix support vector machines :
[[1143  105]
 [ 103 1149]]


In [37]:
# Print accuracy, classification report and confusion metrics for model 4
print("Accuracy score for neural networks :")
print(metrics.accuracy_score(y_test,y_pred_mlpc))
print("Classification score neural networks :")
print(classification_report(y_test,y_pred_mlpc))
print("Confusion matrix neural networks  :")
print(metrics.confusion_matrix(y_test,y_pred_mlpc))

Accuracy score for neural networks :
0.9176
Classification score neural networks :
              precision    recall  f1-score   support

           0       0.92      0.92      0.92      1248
           1       0.92      0.92      0.92      1252

   micro avg       0.92      0.92      0.92      2500
   macro avg       0.92      0.92      0.92      2500
weighted avg       0.92      0.92      0.92      2500

Confusion matrix neural networks  :
[[1147  101]
 [ 105 1147]]
