### COMP 551 P2: Feature extraction and validation models

#### Group  47
#### Authors : Humayun Khan Kakar , Boury Mbodj & Michael Segev 
#### Date : Feb 20 2019

##### Subject: The given file contains the different feature extraction models (binary , td-idf) and validation (held out , k-fold validation) models that we  tried during the project as well as their respective performance.

In [1]:
import pandas as pd 
import numpy as np
import os

In [2]:
import re
from bs4 import BeautifulSoup
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer

In [3]:
import mglearn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer
from sklearn.model_selection import GridSearchCV

In [5]:
pos= [x for x in os.listdir("train/pos/") if x.endswith(".txt")]
neg= [x for x in os.listdir("train/neg/") if x.endswith(".txt")]
test= [x for x in os.listdir("test/") if x.endswith(".txt")]

In [6]:
# Data processing
posReviews=[]
for txt in pos:
    with open("train/pos/"+txt, encoding="ISO-8859-1") as f:
        posReviews.append(f.read())
negReviews=[]        
for txt in neg:
    with open("train/neg/"+txt, encoding="ISO-8859-1") as f:
        negReviews.append(f.read())
testReviews=[]        
for txt in test:
    with open("test/"+txt, encoding="ISO-8859-1") as f:
        testReviews.append(f.read())

In [7]:
# end data processing 
reviews = pd.concat([
    pd.DataFrame({"file":pos,"review":posReviews, "label":1}),
    pd.DataFrame({"file":neg,"review":negReviews, "label":0}),
    pd.DataFrame({"file":test,"review":testReviews, "label":-1})
], ignore_index=True).sample(frac=1, random_state=1)

In [8]:
# Examine firts 10 rows
reviews["file"]= reviews["file"].str.split("_", n = 1, expand = True)
reviews["file"]= reviews["file"].str.split(".", n = 1, expand = True)
#reviews.set_index('file',inplace=True)
reviews.head(10)

Unnamed: 0,file,review,label
26247,11119,This movie is a desperate attempt to ride the ...,-1
35067,19058,The first time I ever saw this movie was when ...,-1
34590,18629,"This movie will send chills down your spine, e...",-1
16668,2501,I saw this on TV the other nightÂ or rather I...,0
12196,9728,I am a huge fan of Simon Pegg and have watched...,1
2600,12340,There is indeed much to complain about this mo...,1
9047,6894,"The men can slaver over Lollo, if they like (o...",1
2206,11987,Since it has been some years since I reviewed ...,1
25607,10543,"Wow, what exciting visual effects. I also love...",-1
11606,9197,"This is actually a groovy-neat little flick, m...",1


In [9]:
# Examine the class ditribution 
reviews.label.value_counts()

-1    25000
 1    12500
 0    12500
Name: label, dtype: int64

In [10]:
# Here we process the text
# We use BeautifulSoup library to remove the HTML/XML tags (e.g., <br />) 
lemmatizer = WordNetLemmatizer()

def process_text(text):
    text = re.sub(r'[^\w\s]','',text, re.UNICODE)
    text = BeautifulSoup(text).get_text()
    #text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = " ".join(text)
    return text

reviews['review'] = reviews.review.apply(lambda x: process_text(x))

In [11]:
#Examine the text after feature extractrion
reviews.head(10)

Unnamed: 0,file,review,label
26247,11119,This movie be a desperate attempt to ride the ...,-1
35067,19058,The first time I ever saw this movie wa when I...,-1
34590,18629,This movie will send chill down your spine eve...,-1
16668,2501,I saw this on TV the other nightÂ or rather I...,0
12196,9728,I be a huge fan of Simon Pegg and have watch p...,1
2600,12340,There be indeed much to complain about this mo...,1
9047,6894,The men can slaver over Lollo if they like or ...,1
2206,11987,Since it ha be some year since I review this c...,1
25607,10543,Wow what excite visual effect I also love the ...,-1
11606,9197,This be actually a groovyneat little flick mak...,1


In [12]:
# Define X and y from the review dataset  for use with Countvectorizer
X= reviews[reviews.label!=-1].review
y= reviews[reviews.label!=-1].label
print (X.shape)
print (y.shape)

(25000,)
(25000,)


In [13]:
# Split X and y into training and testing/validation sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(22500,)
(2500,)
(22500,)
(2500,)


In [14]:
#Instantiate the vectorizer  for binary features
vect = CountVectorizer(min_df=5, max_df = 0.85, ngram_range=(1, 4), strip_accents='unicode', binary=True)

In [15]:
#Learn training data vocabulary fit then use to create document term matrix
X_train_dtm= vect.fit_transform(X_train)

In [16]:
# examine the document-term matrix
X_train_dtm

<22500x247549 sparse matrix of type '<class 'numpy.int64'>'
	with 7447438 stored elements in Compressed Sparse Row format>

In [17]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

<2500x247549 sparse matrix of type '<class 'numpy.int64'>'
	with 807993 stored elements in Compressed Sparse Row format>

In [18]:
#Instantiate the vectorizer for n-garms without binary 
vect2 = CountVectorizer(min_df=5, max_df = 0.85, ngram_range=(1, 4))

In [19]:
#Learn training data vocabulary fit then use to create document term matrix
X_train_dtm2= vect2.fit_transform(X_train)

In [20]:
# examine the document-term matrix for the second feature design
X_train_dtm2

<22500x247528 sparse matrix of type '<class 'numpy.int64'>'
	with 7447142 stored elements in Compressed Sparse Row format>

In [21]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm2 = vect2.transform(X_test)
X_test_dtm2

<2500x247528 sparse matrix of type '<class 'numpy.int64'>'
	with 807961 stored elements in Compressed Sparse Row format>

In [22]:
#Instantiate the vectorizer for n-garms with TD-IDF 
vect3 = TfidfVectorizer(min_df=5, max_df = 0.85, use_idf =True, ngram_range=(1, 4), norm='l2',strip_accents='unicode')

In [23]:
#Learn training data vocabulary fit then use to create document term matrix
X_train_dtm3= vect3.fit_transform(X_train)

In [24]:
# examine the document-term matrix for the third feature design
X_train_dtm3

<22500x247549 sparse matrix of type '<class 'numpy.float64'>'
	with 7447438 stored elements in Compressed Sparse Row format>

In [25]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm3 = vect3.transform(X_test)
X_test_dtm3

<2500x247549 sparse matrix of type '<class 'numpy.float64'>'
	with 807993 stored elements in Compressed Sparse Row format>

In [26]:
# Logitic regression model
from sklearn.linear_model import LogisticRegression

In [27]:
logmodel=LogisticRegression()

In [28]:
logmodel.fit(X_train_dtm, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [29]:
y_pred_log=logmodel.predict(X_test_dtm)

In [30]:
logmodel2=LogisticRegression()

In [31]:
logmodel2.fit(X_train_dtm2, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [32]:
y_pred_log2=logmodel2.predict(X_test_dtm2)

In [33]:
logmodel3=LogisticRegression()

In [34]:
logmodel3.fit(X_train_dtm3, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [35]:
y_pred_log3=logmodel3.predict(X_test_dtm)

In [36]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(penalty="l2", C=1), param_grid, cv=5)
grid.fit(X_train_dtm, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [37]:
y_pred_grid=grid.predict(X_test_dtm)

In [38]:
grid2 = GridSearchCV(LogisticRegression(penalty="l2", C=1), param_grid, cv=5)
grid2.fit(X_train_dtm2, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [39]:
y_pred_grid2=grid2.predict(X_test_dtm2)

In [40]:
grid3 = GridSearchCV(LogisticRegression(penalty="l2", C=1), param_grid, cv=5)
grid3.fit(X_train_dtm3, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [41]:
y_pred_grid3=grid3.predict(X_test_dtm3)

In [42]:
# Import metrics to calculate accuracy
from sklearn import metrics
#Now for the classification task we are going to import classification report
from sklearn.metrics import classification_report

In [43]:
# Print accuracy, classification report and confusion metrics for model 1
print("Accuracy score for logistic regression :")
print(metrics.accuracy_score(y_test,y_pred_log))
print("Classification score for logistic regression :")
print(classification_report(y_test,y_pred_log))
print("Confusion matrix for logistic regression :")
print(metrics.confusion_matrix(y_test,y_pred_log))

Accuracy score for logistic regression :
0.8964
Classification score for logistic regression :
              precision    recall  f1-score   support

           0       0.90      0.89      0.90      1248
           1       0.89      0.90      0.90      1252

   micro avg       0.90      0.90      0.90      2500
   macro avg       0.90      0.90      0.90      2500
weighted avg       0.90      0.90      0.90      2500

Confusion matrix for logistic regression :
[[1116  132]
 [ 127 1125]]


In [44]:
# Print accuracy, classification report and confusion metrics for model 2
print("Accuracy score for logistic regression :")
print(metrics.accuracy_score(y_test,y_pred_log2))
print("Classification score for logistic regression :")
print(classification_report(y_test,y_pred_log2))
print("Confusion matrix for logistic regression :")
print(metrics.confusion_matrix(y_test,y_pred_log2))

Accuracy score for logistic regression :
0.9004
Classification score for logistic regression :
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      1248
           1       0.90      0.90      0.90      1252

   micro avg       0.90      0.90      0.90      2500
   macro avg       0.90      0.90      0.90      2500
weighted avg       0.90      0.90      0.90      2500

Confusion matrix for logistic regression :
[[1129  119]
 [ 130 1122]]


In [45]:
# Print accuracy, classification report and confusion metrics for model 3
print("Accuracy score for logistic regression :")
print(metrics.accuracy_score(y_test,y_pred_log3))
print("Classification score for logistic regression :")
print(classification_report(y_test,y_pred_log3))
print("Confusion matrix for logistic regression :")
print(metrics.confusion_matrix(y_test,y_pred_log3))

Accuracy score for logistic regression :
0.8696
Classification score for logistic regression :
              precision    recall  f1-score   support

           0       0.83      0.93      0.88      1248
           1       0.92      0.81      0.86      1252

   micro avg       0.87      0.87      0.87      2500
   macro avg       0.87      0.87      0.87      2500
weighted avg       0.87      0.87      0.87      2500

Confusion matrix for logistic regression :
[[1155   93]
 [ 233 1019]]


In [46]:
# Print accuracy, classification report and confusion metrics for model 3
print("Accuracy score for logistic regression :")
print(metrics.accuracy_score(y_test,y_pred_grid))
print("Classification score for logistic regression :")
print(classification_report(y_test,y_pred_grid))
print("Confusion matrix for logistic regression :")
print(metrics.confusion_matrix(y_test,y_pred_grid))

Accuracy score for logistic regression :
0.8964
Classification score for logistic regression :
              precision    recall  f1-score   support

           0       0.90      0.89      0.90      1248
           1       0.89      0.90      0.90      1252

   micro avg       0.90      0.90      0.90      2500
   macro avg       0.90      0.90      0.90      2500
weighted avg       0.90      0.90      0.90      2500

Confusion matrix for logistic regression :
[[1116  132]
 [ 127 1125]]


In [47]:
# Print accuracy, classification report and confusion metrics for model 3
print("Accuracy score for logistic regression :")
print(metrics.accuracy_score(y_test,y_pred_grid2))
print("Classification score for logistic regression :")
print(classification_report(y_test,y_pred_grid2))
print("Confusion matrix for logistic regression :")
print(metrics.confusion_matrix(y_test,y_pred_grid2))

Accuracy score for logistic regression :
0.9044
Classification score for logistic regression :
              precision    recall  f1-score   support

           0       0.91      0.90      0.90      1248
           1       0.90      0.91      0.90      1252

   micro avg       0.90      0.90      0.90      2500
   macro avg       0.90      0.90      0.90      2500
weighted avg       0.90      0.90      0.90      2500

Confusion matrix for logistic regression :
[[1126  122]
 [ 117 1135]]


In [48]:
# Print accuracy, classification report and confusion metrics for model 3
print("Accuracy score for logistic regression :")
print(metrics.accuracy_score(y_test,y_pred_grid3))
print("Classification score for logistic regression :")
print(classification_report(y_test,y_pred_grid3))
print("Confusion matrix for logistic regression :")
print(metrics.confusion_matrix(y_test,y_pred_grid3))

Accuracy score for logistic regression :
0.9152
Classification score for logistic regression :
              precision    recall  f1-score   support

           0       0.92      0.91      0.91      1248
           1       0.91      0.92      0.92      1252

   micro avg       0.92      0.92      0.92      2500
   macro avg       0.92      0.92      0.92      2500
weighted avg       0.92      0.92      0.92      2500

Confusion matrix for logistic regression :
[[1140  108]
 [ 104 1148]]
