In [1]:
# import modules
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
from scipy.stats import norm
import numpy as np
import scipy as sp
import pickle

# Import scikit-learn tools, vectorizers, transformer, and classifiers
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# import CountVectorizer and TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# import Multinomial Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB

# import Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# import Logistic Regression CV Classifier
from sklearn.linear_model import LogisticRegressionCV

# import LinearSVC classifier
from sklearn.svm import LinearSVC

In [3]:
# Load in data
with open('dfML.pickle', 'rb') as b:
    df = pickle.load(b)

In [4]:
df.head()

Unnamed: 0,name,rating,text,title,clean_text_stem,clean_text_lem,word_count,clean_title_stem,clean_title_lem,word_count_title,...,word_yell,word_yes,word_yet,word_yogurt,word_york,word_you,word_young,word_zero,polarity,subjectivity
28,Hampton Inn Suites National HarborAlexandria Area,positive,Hotel is in the perfect spot at the perfect pr...,THE DC TRIP,hotel perfect spot perfect price not perfect v...,hotel perfect spot perfect price not perfect v...,277,dc trip,dc trip,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.250893,0.458185
29,Hampton Inn Suites National HarborAlexandria Area,positive,Excellent experience. Will come again and book...,Nice location,excel experience come book stay futur,excellent experience come book stay future,11,nice locat,nice location,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5625
30,Hampton Inn Suites National HarborAlexandria Area,negative,"heat in room did not work properly, tv remote ...",Hampton Inn,heat room not work properly tv remot wa broken...,heat room not work properly tv remote break ex...,13,hampton inn,hampton inn,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,0.533333
31,Hampton Inn Suites National HarborAlexandria Area,positive,"Even though we were having problems, i.e. Feat...",Gracious and helpful staff,even though problems e feather allergy flat ti...,even though problem e feather allergy flat tir...,22,graciou help staff,gracious helpful staff,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0125,0.5625
32,Hampton Inn Suites National HarborAlexandria Area,positive,Brand new hotel in brand new retail area on th...,"beautiful, convenient location",brand new hotel brand new retail area water ea...,brand new hotel brand new retail area water ea...,109,beautiful conveni locat,beautiful convenient location,3,...,0.0,0.0,0.0,0.0,0.0,0.178919,0.0,0.0,0.112746,0.637174


In [14]:
reviews = [
    (df['rating'] == 'positive'),
    (df['rating'] == 'negative')
]
rating = [0, 1]
df['rating'] = np.select(reviews, rating, default=2)
df.rating.value_counts()

0    3029
2     552
1     395
Name: rating, dtype: int64

In [15]:
# Store tweet dataset into feature matrix and response vector
X_words = df['clean_text_lem']
y_words = df['rating']

# Instantiate CountVectorizer and TfidfVectorizer
count_vect = CountVectorizer(min_df=1, ngram_range=(1, 2)) 
tfidf_vect = TfidfVectorizer(min_df=1, ngram_range=(1, 2))


# Apply CountVectorizer 
X_count = count_vect.fit_transform(df['clean_text_lem'].apply(str))
X_count = X_count.tocsc() 

# Apply TfidfVectorizer
X_tfidf = tfidf_vect.fit_transform(df['clean_text_lem'].apply(str))
X_tfidf = X_tfidf.tocsc()


# Split train/test data for all data
Xtrain_count, Xtest_count, ytrain_count, ytest_count = train_test_split(X_count, y_words, random_state=17)
Xtrain_tfidf, Xtest_tfidf, ytrain_tfidf, ytest_tfidf = train_test_split(X_tfidf, y_words, random_state=17)

In [16]:
def evaluate_model(xtest, ytest, clf):
    """ 
    This function evaluates the effectiveness of a ML model and outputs F1 Scores, AUC score and Confusion Matrix
    """
    # Make predictions for Xtest
    y_pred = clf.predict(xtest)
    
    # Confusion matrix
    cm = metrics.confusion_matrix(ytest, y_pred)
    
    print(classification_report(ytest, y_pred))
    print('\nConfusion Matrix:\n', cm)

In [17]:
# Instantiate multinomialNB()
nb_words_count = MultinomialNB(alpha=1, fit_prior=True)
nb_words_tfidf = MultinomialNB(alpha=1, fit_prior=True)

# Train model
nb_words_count.fit(Xtrain_count, ytrain_count)
nb_words_tfidf.fit(Xtrain_tfidf, ytrain_tfidf)

In [18]:
evaluate_model(Xtest_count, ytest_count, nb_words_count)


              precision    recall  f1-score   support

           0       0.85      0.98      0.91       783
           1       0.78      0.30      0.43        93
           2       0.40      0.17      0.24       118

    accuracy                           0.82       994
   macro avg       0.68      0.48      0.53       994
weighted avg       0.79      0.82      0.79       994


Confusion Matrix:
 [[770   5   8]
 [ 43  28  22]
 [ 95   3  20]]


In [19]:
evaluate_model(Xtest_tfidf, ytest_tfidf, nb_words_tfidf)


              precision    recall  f1-score   support

           0       0.79      1.00      0.88       783
           1       0.00      0.00      0.00        93
           2       0.00      0.00      0.00       118

    accuracy                           0.79       994
   macro avg       0.26      0.33      0.29       994
weighted avg       0.62      0.79      0.69       994


Confusion Matrix:
 [[783   0   0]
 [ 93   0   0]
 [118   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
# Instantiate and fit training data to Logistic Regression Model (CountVec)
log_clf_count = LogisticRegressionCV(scoring='accuracy', 
                                     class_weight='balanced', 
                                     cv=5, max_iter=1000).fit(Xtrain_count, ytrain_count)

# Instantiate and fit training data to Logistic Regression Model (TFIDF Vec)
log_clf_tfidf = LogisticRegressionCV(scoring='accuracy', 
                                     class_weight='balanced', 
                                     cv=5, max_iter=1000).fit(Xtrain_tfidf, ytrain_tfidf)

In [21]:
evaluate_model(Xtest_count, ytest_count, log_clf_count)

              precision    recall  f1-score   support

           0       0.89      0.95      0.92       783
           1       0.64      0.53      0.58        93
           2       0.41      0.27      0.33       118

    accuracy                           0.83       994
   macro avg       0.65      0.58      0.61       994
weighted avg       0.81      0.83      0.82       994


Confusion Matrix:
 [[747  12  24]
 [ 22  49  22]
 [ 70  16  32]]


In [22]:
evaluate_model(Xtest_tfidf, ytest_tfidf, log_clf_tfidf)

              precision    recall  f1-score   support

           0       0.90      0.96      0.93       783
           1       0.65      0.56      0.60        93
           2       0.44      0.31      0.36       118

    accuracy                           0.84       994
   macro avg       0.66      0.61      0.63       994
weighted avg       0.82      0.84      0.83       994


Confusion Matrix:
 [[749  12  22]
 [ 17  52  24]
 [ 66  16  36]]


In [23]:
# Instantiate and fit training data to Random Forest Model (CountVec)
forest_clf_count = RandomForestClassifier(class_weight='balanced',
                                     n_estimators=100).fit(Xtrain_count, ytrain_count)

# Instantiate and fit training data to Random Forest Model (TFIDF Vec)
forest_clf_tfidf = RandomForestClassifier(class_weight='balanced',
                                     n_estimators=100).fit(Xtrain_tfidf, ytrain_tfidf)

In [24]:
evaluate_model(Xtest_count, ytest_count, forest_clf_count)

              precision    recall  f1-score   support

           0       0.79      1.00      0.88       783
           1       0.00      0.00      0.00        93
           2       0.00      0.00      0.00       118

    accuracy                           0.78       994
   macro avg       0.26      0.33      0.29       994
weighted avg       0.62      0.78      0.69       994


Confusion Matrix:
 [[780   0   3]
 [ 93   0   0]
 [118   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
evaluate_model(Xtest_tfidf, ytest_tfidf, forest_clf_tfidf)

              precision    recall  f1-score   support

           0       0.79      0.99      0.88       783
           1       1.00      0.01      0.02        93
           2       0.00      0.00      0.00       118

    accuracy                           0.78       994
   macro avg       0.60      0.33      0.30       994
weighted avg       0.71      0.78      0.69       994


Confusion Matrix:
 [[778   0   5]
 [ 92   1   0]
 [118   0   0]]


In [26]:
# Instantiate and fit training data to Random Forest Model (CountVec)
svc_count = LinearSVC().fit(Xtrain_count, ytrain_count)

# Instantiate and fit training data to Random Forest Model (TFIDF Vec)
svc_tfidf = LinearSVC().fit(Xtrain_tfidf, ytrain_tfidf)

In [27]:
evaluate_model(Xtest_count, ytest_count, svc_count)

              precision    recall  f1-score   support

           0       0.87      0.95      0.91       783
           1       0.71      0.48      0.58        93
           2       0.42      0.28      0.34       118

    accuracy                           0.83       994
   macro avg       0.67      0.57      0.61       994
weighted avg       0.81      0.83      0.81       994


Confusion Matrix:
 [[746   9  28]
 [ 31  45  17]
 [ 76   9  33]]


In [28]:
evaluate_model(Xtest_count, ytest_count, svc_tfidf)

              precision    recall  f1-score   support

           0       0.93      0.77      0.84       783
           1       0.36      0.71      0.48        93
           2       0.26      0.35      0.29       118

    accuracy                           0.72       994
   macro avg       0.51      0.61      0.54       994
weighted avg       0.80      0.72      0.74       994


Confusion Matrix:
 [[604  83  96]
 [  4  66  23]
 [ 42  35  41]]
