In [1]:
#Importing necessary modules

import pandas as pd
import numpy as np

import re

import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/promiskhan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/promiskhan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Task 1 - data loading and processing

In [2]:
#Loading the data
x_train = pd.read_csv('x_train.csv',header=None)
x_test = pd.read_csv('x_test.csv',header=None)
y_train = pd.read_csv('y_train.csv',header=None)
y_test = pd.read_csv('y_test.csv',header=None)

#### Data Preprocessing

In [3]:
#Setting column headers
x_train.columns=['domain','review']
x_test.columns=['domain','review']
y_train.columns=['sentiment']
y_test.columns=['sentiment']

In [4]:
#Removing unnecessary characters from reviews
x_train['review']= x_train['review'].str.replace('[^A-Za-z0-9\s]+', '')
x_test['review']= x_test['review'].str.replace('[^A-Za-z0-9\s]+', '')

In [5]:
#Converting all the letters to lowercase from reviews
x_train['review']= x_train['review'].str.lower()
x_test['review']= x_test['review'].str.lower()

In [6]:
#For tokenizing the data on whitespace
tokenizer = nltk.tokenize.WhitespaceTokenizer()
#For lemmatizing using wordnet
lemmatizer = nltk.stem.WordNetLemmatizer()
#Function to tokenize and lemmatize data
def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(w) for w in tokenizer.tokenize(text)])
x_train.review = x_train['review'].apply(lemmatize_text)
x_test.review = x_test['review'].apply(lemmatize_text)

## Task 2 - feature representation

In [7]:
#Vocabulary size
vect = CountVectorizer()
X_train_1 = vect.fit_transform(x_train.review)
print("Vocabulary size: {}".format(len(vect.vocabulary_)))
X_test_1 = vect.transform(x_test.review)

Vocabulary size: 4313


## Task 3 - classification and evaluation

In [8]:
#Creating function for accuracy
def print_acc(model):
    predicted = model.predict(x_test.review)
    target_names = ['positive','negative']
    print('Accuracy: %.2f' % (accuracy_score(predicted,y_test.sentiment)*100))
    print('-'*30)
    print('Confusion matrix: ')
    print(confusion_matrix(predicted, y_test.sentiment))
    print('-'*30)
    print('Classification Report:')
    print(classification_report(predicted,y_test.sentiment, target_names=target_names))

In [9]:
#Importing common words
stop_words = list(stopwords.words('english'))     

In [10]:
#Pipelining count vectorizer with logistic regression 
text_clf = Pipeline([('vect', CountVectorizer(max_df=0.5)),
                     ('clf', LinearSVC())])
#Preparing parameters for grid search cross validation
tuned_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (2,2)],
    'vect__stop_words' : [None, stop_words],
    'clf__C': [0.01, 0.1, 1, 10, 100]
}
grid = GridSearchCV(text_clf, tuned_parameters, cv=ShuffleSplit(n_splits=10, random_state=123))
#Fitting best parameters to build classifier
svc = grid.fit(x_train.review, y_train.sentiment)
print("Best cross-validation score: {:0.4f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)
#Performance on test set
print_acc(svc)

Best cross-validation score: 0.8221
Best parameters:  {'clf__C': 10, 'vect__ngram_range': (1, 2), 'vect__stop_words': None}
Accuracy: 79.67
------------------------------
Confusion matrix: 
[[253  75]
 [ 47 225]]
------------------------------
Classification Report:
              precision    recall  f1-score   support

    positive       0.84      0.77      0.81       328
    negative       0.75      0.83      0.79       272

    accuracy                           0.80       600
   macro avg       0.80      0.80      0.80       600
weighted avg       0.80      0.80      0.80       600



In [11]:
#Pipelining Tfidf vectorizer with logistic regression 
text_clf = Pipeline([('vect', CountVectorizer(max_df=0.5)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LinearSVC())])
#Preparing parameters for grid search cross validation
tuned_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (2,2)],
    'vect__stop_words' : [None, stop_words],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__C': [0.01, 0.1, 1, 10, 100]
}
grid = GridSearchCV(text_clf, tuned_parameters, cv=ShuffleSplit(n_splits=5, random_state=123))
#Fitting best parameters to build classifier
svc = grid.fit(x_train.review, y_train.sentiment)
print("Best cross-validation score: {:0.4f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)
#Performance on test set
print_acc(svc)

Best cross-validation score: 0.8225
Best parameters:  {'clf__C': 10, 'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2), 'vect__stop_words': None}
Accuracy: 83.17
------------------------------
Confusion matrix: 
[[260  61]
 [ 40 239]]
------------------------------
Classification Report:
              precision    recall  f1-score   support

    positive       0.87      0.81      0.84       321
    negative       0.80      0.86      0.83       279

    accuracy                           0.83       600
   macro avg       0.83      0.83      0.83       600
weighted avg       0.83      0.83      0.83       600



In [12]:
#Final Vocabulary size
vect = CountVectorizer(max_df=0.5, stop_words=stop_words)
X_train_1 = vect.fit_transform(x_train.review)
print("Vocabulary size: {}".format(len(vect.vocabulary_)))
X_test_1 = vect.transform(x_test.review)

Vocabulary size: 4201


### Analyzing prediction for the classifier

In [13]:
#Predictions for short simple sentences
review = "This movie is good"
print(svc.predict([review]))
review = "This movie is bad"
print(svc.predict([review]))
review = "This movie is not good"
print(svc.predict([review]))

[1]
[0]
[0]


In [14]:
#Predictions for amazon
print("Review:",x_test.review[0])
print("Sentiment: ",y_test.sentiment[0])
print("Predicted Sentiment: ",svc.predict([x_test.review[0]]))
print("Review:",x_test.review[2])
print("Sentiment: ",y_test.sentiment[2])
print("Predicted Sentiment: ",svc.predict([x_test.review[2]]))
print("Review:",x_test.review[165])
print("Sentiment: ",y_test.sentiment[165])
print("Predicted Sentiment: ",svc.predict([x_test.review[165]]))

Review: it only recognizes the phone a it storage device
Sentiment:  0
Predicted Sentiment:  [1]
Review: the one big drawback of the mp3 player is that the button on the phone front cover that let you pause and skip song lock out after a few second
Sentiment:  0
Predicted Sentiment:  [0]
Review: this phone is pretty sturdy and ive never had any large problem with it
Sentiment:  1
Predicted Sentiment:  [1]


In [15]:
#Predictions for imdb
print("Review:",x_test.review[205])
print("Sentiment: ",y_test.sentiment[205])
print("Predicted Sentiment: ",svc.predict([x_test.review[205]]))
print("Review:",x_test.review[275])
print("Sentiment: ",y_test.sentiment[275])
print("Predicted Sentiment: ",svc.predict([x_test.review[275]]))
print("Review:",x_test.review[388])
print("Sentiment: ",y_test.sentiment[388])
print("Predicted Sentiment: ",svc.predict([x_test.review[388]]))

Review: the result is a film that just dont look right
Sentiment:  0
Predicted Sentiment:  [0]
Review: it came free with a dvd player i bought but i still turned the thing off halfway through because i wa embarrassed for howell
Sentiment:  0
Predicted Sentiment:  [0]
Review: but it wasnt until i watched this film that i realised how great he actually wa
Sentiment:  1
Predicted Sentiment:  [1]


In [16]:
#Predictions for yelp
print("Review:",x_test.review[455])
print("Sentiment: ",y_test.sentiment[455])
print("Predicted Sentiment: ",svc.predict([x_test.review[455]]))
print("Review:",x_test.review[505])
print("Sentiment: ",y_test.sentiment[505])
print("Predicted Sentiment: ",svc.predict([x_test.review[505]]))
print("Review:",x_test.review[595])
print("Sentiment: ",y_test.sentiment[595])
print("Predicted Sentiment: ",svc.predict([x_test.review[595]]))

Review: kind of hard to mess up a steak but they did
Sentiment:  0
Predicted Sentiment:  [1]
Review: i really do recommend this place you can go wrong with this donut place
Sentiment:  1
Predicted Sentiment:  [1]
Review: everything wa fresh and delicious
Sentiment:  1
Predicted Sentiment:  [1]
