### Import Packages and Libraries

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
from tensorflow.keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
import re
import gensim
from gensim import corpora
from gensim import similarities
from gensim import models
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import warnings
warnings.filterwarnings("ignore")
from datetime import datetime

Using TensorFlow backend.


### Import and Clean the Data File

In [2]:
# Read the review dataset into a DataFrame. The path is relative to the
# notebook's working directory. The modelling cells below read the
# 'Review_splitted' (text) and 'Polarity' (label) columns from it.
data_file = "../Data/AllData_Sentiment.csv"
data = pd.read_csv(filepath_or_buffer=data_file)

# Uncomment for a quick look at the column dtypes and the first rows:
# data.info()
# data.head()

### Building a Model - Count Vector
1. Multinomial NB
2. Bernoulli NB
3. Logistic Regression
4. Support Vector Machine

In [3]:
### done
# Benchmark four classifiers on raw term-count (bag-of-words) features.
# The timestamps printed first and last bracket the cell's wall-clock cost.
print(datetime.now())

# Each display name is stored next to its estimator, so the label printed in
# the report cannot drift from the model it describes if the list is edited.
named_classifiers = [
    ("Multinomial Naive Bayes", MultinomialNB()),
    ("Bernoulli Naive Bayes", BernoulliNB()),
    ("Logistic Regression", LogisticRegression()),
    ("Support Vector Machine", SVC()),
]
# Kept under its original name in case other cells refer to it.
Classifiers = [estimator for _name, estimator in named_classifiers]

# Hyper-parameter grid searched for the SVM only. A wider grid, left disabled:
# parameters = {'C':[1,2,3,4,5,6,7,8,14], 'gamma':[0.1, 0.01, 0.001, 0.0001], 'kernel':['linear', 'poly', 'rbf'], 'degree': [1,2,3,4,5]}
parameters = {'C':[1,2,3], 'gamma':[0.1, 0.01], 'kernel':['linear', 'poly', 'rbf'], 'degree': [1,2]}

reviews = data['Review_splitted']
polarity = data['Polarity']
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train_text, X_test_text, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=42)

# min_df=4 drops terms seen in fewer than 4 training documents; max_df=0.85
# drops terms present in more than 85% of them. The vocabulary is fitted on
# the training texts only and then reused, unchanged, for the test texts.
countVectorizer = CountVectorizer(min_df = 4, max_df=0.85)
X_train = countVectorizer.fit_transform(X_train_text)
X_test = countVectorizer.transform(X_test_text)

for clf_name, estimator in named_classifiers:
    # Only the SVM is tuned; the other models are fitted as constructed above.
    if isinstance(estimator, SVC):
        clf = GridSearchCV(estimator = estimator, param_grid = parameters)
    else:
        clf = estimator

    clf.fit(X_train, y_train)
    clf_ypred = clf.predict(X_test)

    # Both scores are computed on the held-out test split and shown as percentages.
    f1_clf = f1_score(y_test, clf_ypred, average = 'weighted')
    accuracy_clf = accuracy_score(y_test, clf_ypred)
    print("F1-score of", clf_name, "with Count Vector is:", f1_clf*100)
    print("Accuracy of", clf_name, "with Count Vector is:", accuracy_clf*100)

    # Report the winning grid point for the model that went through the search.
    if isinstance(clf, GridSearchCV):
        print(clf.best_params_)

print(datetime.now())

2020-04-01 13:00:10.892592
F1-score of Multinomial Naive Bayes with Count Vector is: 80.5997418277316
Accuracy of Multinomial Naive Bayes with Count Vector is: 79.90260348379846
F1-score of Bernoulli Naive Bayes with Count Vector is: 73.80429412845307
Accuracy of Bernoulli Naive Bayes with Count Vector is: 72.41056377598801
F1-score of Logistic Regression with Count Vector is: 93.23397712919302
Accuracy of Logistic Regression with Count Vector is: 93.46319535493538
F1-score of Support Vector Machine with Count Vector is: 94.250110680416
Accuracy of Support Vector Machine with Count Vector is: 94.39970031841169
{'C': 3, 'degree': 1, 'gamma': 0.1, 'kernel': 'rbf'}
2020-04-01 13:39:05.248449


### Building a Model - TFIDF (use_idf = False)
1. Multinomial NB
2. Bernoulli NB
3. Logistic Regression
4. Support Vector Machine

In [4]:
### done
# Benchmark four classifiers on TfidfVectorizer features with IDF reweighting
# switched off (use_idf = False).
# The timestamps printed first and last bracket the cell's wall-clock cost.
print(datetime.now())

# Each display name is stored next to its estimator, so the label printed in
# the report cannot drift from the model it describes if the list is edited.
named_classifiers = [
    ("Multinomial Naive Bayes", MultinomialNB()),
    ("Bernoulli Naive Bayes", BernoulliNB()),
    ("Logistic Regression", LogisticRegression()),
    ("Support Vector Machine", SVC()),
]
# Kept under its original name in case other cells refer to it.
Classifiers = [estimator for _name, estimator in named_classifiers]

# Hyper-parameter grid searched for the SVM only. A wider grid, left disabled:
# parameters = {'C':[1,2,3,4,5,6,7,8,14], 'gamma':[0.1, 0.01, 0.001, 0.0001], 'kernel':['linear', 'poly', 'rbf'], 'degree': [1,2,3,4,5]}
parameters = {'C':[1,2,3], 'gamma':[0.1, 0.01], 'kernel':['linear', 'poly', 'rbf'], 'degree': [1,2]}

reviews = data['Review_splitted']
polarity = data['Polarity']
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train_text, X_test_text, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=42)

# min_df=4 drops terms seen in fewer than 4 training documents; max_df=0.85
# drops terms present in more than 85% of them. The vocabulary is fitted on
# the training texts only and then reused, unchanged, for the test texts.
tf_Vectorizer = TfidfVectorizer(use_idf = False, min_df = 4, max_df=0.85)
X_train = tf_Vectorizer.fit_transform(X_train_text)
X_test = tf_Vectorizer.transform(X_test_text)

for clf_name, estimator in named_classifiers:
    # Only the SVM is tuned; the other models are fitted as constructed above.
    if isinstance(estimator, SVC):
        clf = GridSearchCV(estimator = estimator, param_grid = parameters)
    else:
        clf = estimator

    clf.fit(X_train, y_train)
    clf_ypred = clf.predict(X_test)

    # Both scores are computed on the held-out test split and shown as percentages.
    f1_clf = f1_score(y_test, clf_ypred, average = 'weighted')
    accuracy_clf = accuracy_score(y_test, clf_ypred)
    print("F1-score of", clf_name, "with TFIDF (use_idf = False) is:", f1_clf*100)
    print("Accuracy of", clf_name, "with TFIDF (use_idf = False) is:", accuracy_clf*100)

    # Report the winning grid point for the model that went through the search.
    if isinstance(clf, GridSearchCV):
        print(clf.best_params_)

print(datetime.now())

2020-04-01 13:39:05.267400
F1-score of Multinomial Naive Bayes with TFIDF (use_idf = False) is: 80.40566768548723
Accuracy of Multinomial Naive Bayes with TFIDF (use_idf = False) is: 81.2886308297434
F1-score of Bernoulli Naive Bayes with TFIDF (use_idf = False) is: 73.80429412845307
Accuracy of Bernoulli Naive Bayes with TFIDF (use_idf = False) is: 72.41056377598801
F1-score of Logistic Regression with TFIDF (use_idf = False) is: 90.10659445145508
Accuracy of Logistic Regression with TFIDF (use_idf = False) is: 90.82225135793219
F1-score of Support Vector Machine with TFIDF (use_idf = False) is: 93.9175918646446
Accuracy of Support Vector Machine with TFIDF (use_idf = False) is: 94.0438284322907
{'C': 3, 'degree': 1, 'gamma': 0.1, 'kernel': 'linear'}
2020-04-01 14:01:27.879198


### Building a Model - TFIDF (use_idf = True)
1. Multinomial NB
2. Bernoulli NB
3. Logistic Regression
4. Support Vector Machine

In [5]:
### done
# Benchmark four classifiers on TfidfVectorizer features with IDF reweighting
# switched on (use_idf = True).
# The timestamps printed first and last bracket the cell's wall-clock cost.
print(datetime.now())

# Each display name is stored next to its estimator, so the label printed in
# the report cannot drift from the model it describes if the list is edited.
named_classifiers = [
    ("Multinomial Naive Bayes", MultinomialNB()),
    ("Bernoulli Naive Bayes", BernoulliNB()),
    ("Logistic Regression", LogisticRegression()),
    ("Support Vector Machine", SVC()),
]
# Kept under its original name in case other cells refer to it.
Classifiers = [estimator for _name, estimator in named_classifiers]

# Hyper-parameter grid searched for the SVM only. A wider grid, left disabled:
# parameters = {'C':[1,2,3,4,5,6,7,8,14], 'gamma':[0.1, 0.01, 0.001, 0.0001], 'kernel':['linear', 'poly', 'rbf'], 'degree': [1,2,3,4,5]}
parameters = {'C':[1,2,3], 'gamma':[0.1, 0.01], 'kernel':['linear', 'poly', 'rbf'], 'degree': [1,2]}

reviews = data['Review_splitted']
polarity = data['Polarity']
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train_text, X_test_text, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=42)

# min_df=4 drops terms seen in fewer than 4 training documents; max_df=0.85
# drops terms present in more than 85% of them. The vocabulary and IDF weights
# are fitted on the training texts only and then reused for the test texts.
tfidfVectorizer = TfidfVectorizer(use_idf = True, min_df = 4, max_df=0.85)
X_train = tfidfVectorizer.fit_transform(X_train_text)
X_test = tfidfVectorizer.transform(X_test_text)

for clf_name, estimator in named_classifiers:
    # Only the SVM is tuned; the other models are fitted as constructed above.
    if isinstance(estimator, SVC):
        clf = GridSearchCV(estimator = estimator, param_grid = parameters)
    else:
        clf = estimator

    clf.fit(X_train, y_train)
    clf_ypred = clf.predict(X_test)

    # Both scores are computed on the held-out test split and shown as percentages.
    f1_clf = f1_score(y_test, clf_ypred, average = 'weighted')
    accuracy_clf = accuracy_score(y_test, clf_ypred)
    print("F1-score of", clf_name, "with TFIDF (use_idf = True) is:", f1_clf*100)
    print("Accuracy of", clf_name, "with TFIDF (use_idf = True) is:", accuracy_clf*100)

    # Report the winning grid point for the model that went through the search.
    if isinstance(clf, GridSearchCV):
        print(clf.best_params_)

print(datetime.now())

2020-04-01 14:01:27.894158
F1-score of Multinomial Naive Bayes with TFIDF (use_idf = True) is: 82.08793976199105
Accuracy of Multinomial Naive Bayes with TFIDF (use_idf = True) is: 82.71211837422739
F1-score of Bernoulli Naive Bayes with TFIDF (use_idf = True) is: 73.80429412845307
Accuracy of Bernoulli Naive Bayes with TFIDF (use_idf = True) is: 72.41056377598801
F1-score of Logistic Regression with TFIDF (use_idf = True) is: 91.37239199102267
Accuracy of Logistic Regression with TFIDF (use_idf = True) is: 92.02097771118187
F1-score of Support Vector Machine with TFIDF (use_idf = True) is: 94.2607705165996
Accuracy of Support Vector Machine with TFIDF (use_idf = True) is: 94.32477992133357
{'C': 3, 'degree': 1, 'gamma': 0.1, 'kernel': 'linear'}
2020-04-01 14:23:09.804081
