# Sentiment Modeling

## Imports

In [1]:
import os

import numpy as np
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt 

from scipy.sparse import save_npz, load_npz

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import (
    MultinomialNB, ComplementNB
)
from sklearn.ensemble import (
    RandomForestClassifier, ExtraTreesClassifier
)

import sys
sys.path.append("../")
from src.sentiment_modeling import * 
from src.feature_engineering import *
from src.utility import *

## Loading Config

## Loading data

In [2]:
# Load the feature-engineered IMDB review frame from the interim data directory.
# NOTE(review): load_pickle is a project helper; if it wraps pickle.load, only
# point it at trusted files, since unpickling can execute arbitrary code.
df = load_pickle("../data/interim/IMDB_feature_engineered.pkl")
# Preview the first rows to sanity-check the available columns.
df.head()

Unnamed: 0,review,sentiment,review_charecters_len,review_word_len,has_html,cleaned_review,tokens,cleaned_review_charecter_len,cleaned_review_word_len,cleaned_review_has_html,positive_tokens,negative_tokens,positive_tokens_len,negative_tokens_len
0,One of the other reviewers has mentioned that ...,1,1377,320,True,one reviewer mentioned watching oz episode hoo...,"[one, reviewer, mentioned, watching, oz, episo...",931,162,False,"[right, right, trust, regard, classic, appeal,...","[struck, brutality, faint, timid, punch, priso...",13,20
1,A wonderful little production. <br /><br />The...,1,793,166,True,wonderful little production filming technique ...,"[wonderful, little, production, filming, techn...",557,84,False,"[wonderful, comforting, well, seamless, well, ...",[terribly],11,1
2,I thought this was a wonderful way to spend ti...,1,721,172,True,thought wonderful way spend time hot summer we...,"[thought, wonderful, way, spend, time, hot, su...",495,83,False,"[wonderful, hot, witty, likable, well, impress...","[plot, simplistic, killer, disappointed, risk,...",11,6
3,Basically there's a family where a little boy ...,0,569,141,True,basically family little boy jake think zombie ...,"[basically, family, little, boy, jake, think, ...",362,62,False,"[like, well]","[zombie, slower, kill, ruin, meaningless, ignore]",2,6
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,1032,236,True,petter matteis love time money visually stunni...,"[petter, matteis, love, time, money, visually,...",725,123,False,"[love, stunning, vivid, success, stylishly, so...","[loneliness, anxiously]",15,2


In [3]:
# Name of the label column (holds 0/1 values in the preview of df).
target = "sentiment"

# Numeric length/count columns fed to the models.
# The raw-review length columns ("review_charecters_len", "review_word_len")
# are currently excluded. The "charecter" spelling matches the DataFrame's
# actual column names, so it must not be corrected here.
features = [
    "cleaned_review_charecter_len",
    "cleaned_review_word_len",
    "positive_tokens_len",
    "negative_tokens_len",
]

In [4]:
# Load the pre-computed count and TF-IDF matrices (SciPy sparse .npz files).
# The "reviwes" spelling is the filename this notebook reads; do not correct
# it here without renaming the files on disk as well.
countVectorized, tfidfVectorized = [
    load_npz(npz_path)
    for npz_path in (
        "../data/interim/count_vectorized_reviwes.npz",
        "../data/interim/tfidf_vectorized_reviwes.npz",
    )
]

In [5]:
# Show both sparse matrices' reprs to confirm dtype, stored-element count and shape.
countVectorized, tfidfVectorized

(<Compressed Sparse Row sparse matrix of dtype 'float32'
 	with 5044360 stored elements and shape (50000, 20000)>,
 <Compressed Sparse Row sparse matrix of dtype 'float32'
 	with 5044360 stored elements and shape (50000, 20000)>)

## Splitting the data

In [6]:
# Fraction of rows held out as the test set by the train_test_split calls in this section.
test_size = 0.2

In [7]:
# Separate the numeric feature columns and the label column in one step.
df_X, y = df[features], df[target]

In [8]:
# Stratified hold-out split of the numeric features.
# random_state is pinned so the split (and every metric derived from it) is
# reproducible across kernel restarts; 42 matches the seed used for the models.
# NOTE(review): splits that share the same seed, labels and test_size select
# the same rows, which keeps results comparable across feature representations.
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(
    df_X, y, test_size=test_size, stratify=y, random_state=42
)

In [9]:
# Sanity-check the split: row counts should follow test_size and the column count should be unchanged.
df_X_train.shape, df_X_test.shape, df_y_train.shape, df_y_test.shape

((40000, 4), (10000, 4), (40000,), (10000,))

In [10]:
# Stratified hold-out split of the count-vectorized reviews.
# random_state is pinned so the split (and every metric derived from it) is
# reproducible across kernel restarts; 42 matches the seed used for the models.
# NOTE(review): splits that share the same seed, labels and test_size select
# the same rows, which keeps results comparable across feature representations.
count_X_train, count_X_test, count_y_train, count_y_test = train_test_split(
    countVectorized, y, test_size=test_size, stratify=y, random_state=42
)

In [11]:
# Sanity-check the split: row counts should follow test_size and the vocabulary width should be unchanged.
count_X_train.shape, count_X_test.shape, count_y_train.shape, count_y_test.shape

((40000, 20000), (10000, 20000), (40000,), (10000,))

In [12]:
# Stratified hold-out split of the TF-IDF-vectorized reviews.
# random_state is pinned so the split (and every metric derived from it) is
# reproducible across kernel restarts; 42 matches the seed used for the models.
# NOTE(review): splits that share the same seed, labels and test_size select
# the same rows, which keeps results comparable across feature representations.
tfidf_X_train, tfidf_X_test, tfidf_y_train, tfidf_y_test = train_test_split(
    tfidfVectorized, y, test_size=test_size, stratify=y, random_state=42
)

In [13]:
# Sanity-check the split: row counts should follow test_size and the vocabulary width should be unchanged.
tfidf_X_train.shape, tfidf_X_test.shape, tfidf_y_train.shape, tfidf_y_test.shape

((40000, 20000), (10000, 20000), (40000,), (10000,))

## Models

In [14]:
# Candidate classifiers, keyed by the name shown in the "Model" column of the
# result tables. Every estimator that is given a seed uses random_state=42.
models = {
    "LogisticRegression": LogisticRegression(random_state=42, max_iter=1000),
    "LinearSVC": LinearSVC(random_state=42, max_iter=5000),
    "MultinomialNB": MultinomialNB(),
    "ComplementNB": ComplementNB(),
    "RandomForestClassifier": RandomForestClassifier(random_state=42, n_jobs=-1),
    "ExtraTreesClassifier": ExtraTreesClassifier(random_state=42, n_jobs=-1),
}

# Metric names in scikit-learn's scorer-string format.
# NOTE(review): this list is not passed to train_and_eval_models in any call
# visible in this notebook section; confirm it is still needed.
scoring_metrics = [
    'accuracy', 'precision', 'recall', 'f1', 'roc_auc'
]

# Number of cross-validation folds; the single source of truth for the splitter.
# NOTE(review): this used to be declared as 5 while the splitter hard-coded
# n_splits=10. The value is set to 10 because that is what the splitter
# actually used; confirm 10 (not 5) is the intended fold count.
folds = 10
cv_strategy = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

## Training models on numeric columns in df

In [15]:
# Fit and score every candidate model on the hold-out split of the numeric features.
# NOTE(review): models.copy() is a shallow dict copy, so the estimator objects
# themselves are shared with `models` and with the other train_and_eval_models
# calls in this notebook; confirm the helper clones them before fitting.
df_models, df_results = train_and_eval_models(
    data=(df_X_train, df_X_test, df_y_train, df_y_test),
    models=models.copy()
)
# Persist the metrics table and the first return value (df_models) to disk.
save_data_csv(df_results, "../results/metrics/df_results.csv")
save_pickle(df_models, "../models/df_models.pkl")

Evaluating LogisticRegression...
Evaluating LinearSVC...
Evaluating MultinomialNB...
Evaluating ComplementNB...
Evaluating RandomForestClassifier...
Evaluating ExtraTreesClassifier...

Evaluation Complete.


In [16]:
# Display train/test metrics (one row per model) for the numeric-feature hold-out run.
df_results

Unnamed: 0,Model,Fit_Time_sec,Train_Accuracy,Train_Precision,Train_Recall,Train_F1,Train_Roc_auc,Test_Accuracy,Test_Precision,Test_Recall,Test_F1,Test_Roc_auc
0,LogisticRegression,0.321393,0.7322,0.731598,0.7335,0.732548,0.798751,0.7355,0.734983,0.7366,0.735791,0.796612
1,LinearSVC,0.067192,0.7321,0.731452,0.7335,0.732475,0.798658,0.7353,0.734877,0.7362,0.735538,0.796604
2,MultinomialNB,0.007934,0.730175,0.719685,0.75405,0.736467,0.796235,0.7293,0.720015,0.7504,0.734894,0.793526
3,ComplementNB,0.009014,0.730175,0.719685,0.75405,0.736467,0.796235,0.7293,0.720015,0.7504,0.734894,0.793526
4,RandomForestClassifier,3.428587,0.99595,0.996198,0.9957,0.995949,0.999944,0.7019,0.706738,0.6902,0.698371,0.770325
5,ExtraTreesClassifier,2.358495,0.995975,0.999899,0.99205,0.995959,0.999968,0.685,0.689705,0.6726,0.681045,0.748989


In [17]:
# Re-evaluate every candidate model on the full numeric feature set, this time
# passing cv_strategy (a stratified, shuffled K-fold splitter) instead of a
# pre-made train/test split.
# NOTE(review): models.copy() is a shallow dict copy, so the estimator objects
# themselves are shared with `models`; confirm the helper clones them before fitting.
df_cv_models, df_cv_results = train_and_eval_models(
    data=(df_X, y),
    models=models.copy(),
    cv_strategy=cv_strategy
)
# Persist the metrics table and the first return value (df_cv_models) to disk.
save_data_csv(df_cv_results, "../results/metrics/df_cv_results.csv")
save_pickle(df_cv_models, "../models/df_cv_models.pkl")

Evaluating LogisticRegression...
Evaluating LinearSVC...
Evaluating MultinomialNB...
Evaluating ComplementNB...
Evaluating RandomForestClassifier...
Evaluating ExtraTreesClassifier...

Evaluation Complete.


In [18]:
# Display train/test metrics (one row per model) for the numeric-feature cross-validation run.
df_cv_results

Unnamed: 0,Model,Fit_Time_sec,Train_Accuracy,Train_Precision,Train_Recall,Train_F1,Train_Roc_auc,Test_Accuracy,Test_Precision,Test_Recall,Test_F1,Test_Roc_auc
0,LogisticRegression,0.367471,0.732591,0.732472,0.732849,0.73266,0.798258,0.73272,0.732596,0.733,0.732784,0.798209
1,LinearSVC,0.172744,0.732356,0.732468,0.732116,0.732291,0.798171,0.73258,0.732651,0.73244,0.73253,0.798123
2,MultinomialNB,0.025101,0.730098,0.719887,0.753316,0.736222,0.795684,0.73002,0.719767,0.75336,0.736172,0.795656
3,ComplementNB,0.021932,0.730098,0.719887,0.753316,0.736222,0.795684,0.73002,0.719767,0.75336,0.736172,0.795656
4,RandomForestClassifier,14.488067,0.995296,0.995452,0.995138,0.995295,0.999922,0.69966,0.705607,0.68524,0.695261,0.767708
5,ExtraTreesClassifier,10.076627,0.995358,0.999807,0.990907,0.995337,0.999957,0.6861,0.690795,0.67384,0.682178,0.747613


## Training models on count vectors

In [19]:
# Fit and score every candidate model on the hold-out split of the count vectors.
# NOTE(review): models.copy() is a shallow dict copy, so the estimator objects
# themselves are shared with `models`; confirm the helper clones them before fitting.
count_models, count_results = train_and_eval_models(
    data=(count_X_train, count_X_test, count_y_train, count_y_test),
    models=models.copy()
)
# Persist the metrics table and the first return value (count_models) to disk.
save_data_csv(count_results, "../results/metrics/count_results.csv")
save_pickle(count_models, "../models/count_models.pkl")

Evaluating LogisticRegression...
Evaluating LinearSVC...
Evaluating MultinomialNB...
Evaluating ComplementNB...
Evaluating RandomForestClassifier...
Evaluating ExtraTreesClassifier...

Evaluation Complete.


In [20]:
# Display train/test metrics (one row per model) for the count-vector hold-out run.
count_results

Unnamed: 0,Model,Fit_Time_sec,Train_Accuracy,Train_Precision,Train_Recall,Train_F1,Train_Roc_auc,Test_Accuracy,Test_Precision,Test_Recall,Test_F1,Test_Roc_auc
0,LogisticRegression,19.190822,0.99755,0.997252,0.99785,0.997551,0.99991,0.8822,0.873534,0.8938,0.883551,0.946014
1,LinearSVC,73.512671,1.0,1.0,1.0,1.0,1.0,0.8615,0.855737,0.8696,0.862613,0.930814
2,MultinomialNB,0.040512,0.88125,0.879315,0.8838,0.881552,0.94354,0.8662,0.858037,0.8776,0.867708,0.929543
3,ComplementNB,0.05169,0.88125,0.879315,0.8838,0.881552,0.943541,0.8662,0.858037,0.8776,0.867708,0.929543
4,RandomForestClassifier,84.281877,1.0,1.0,1.0,1.0,1.0,0.8525,0.851166,0.8544,0.85278,0.926828
5,ExtraTreesClassifier,117.579944,1.0,1.0,1.0,1.0,1.0,0.8692,0.86861,0.87,0.869305,0.940544


In [21]:
# Re-evaluate every candidate model on the full count-vector matrix, passing
# cv_strategy (a stratified, shuffled K-fold splitter) instead of a pre-made split.
# NOTE(review): models.copy() is a shallow dict copy, so the estimator objects
# themselves are shared with `models`; confirm the helper clones them before fitting.
count_cv_models, count_cv_results = train_and_eval_models(
    data=(countVectorized, y),
    models=models.copy(),
    cv_strategy=cv_strategy
)
# Persist the metrics table and the first return value (count_cv_models) to disk.
save_data_csv(count_cv_results, "../results/metrics/count_cv_results.csv")
save_pickle(count_cv_models, "../models/count_cv_models.pkl")

Evaluating LogisticRegression...
Evaluating LinearSVC...
Evaluating MultinomialNB...
Evaluating ComplementNB...
Evaluating RandomForestClassifier...
Evaluating ExtraTreesClassifier...

Evaluation Complete.


In [22]:
# Display train/test metrics (one row per model) for the count-vector cross-validation run.
count_cv_results

Unnamed: 0,Model,Fit_Time_sec,Train_Accuracy,Train_Precision,Train_Recall,Train_F1,Train_Roc_auc,Test_Accuracy,Test_Precision,Test_Recall,Test_F1,Test_Roc_auc
0,LogisticRegression,11.886929,0.996071,0.995521,0.996627,0.996073,0.999864,0.88134,0.878941,0.8846,0.881722,0.946948
1,LinearSVC,132.02237,1.0,1.0,1.0,1.0,1.0,0.86008,0.859869,0.86044,0.860115,0.92978
2,MultinomialNB,0.170873,0.880358,0.877934,0.883564,0.88074,0.942489,0.86554,0.862454,0.8698,0.866087,0.92963
3,ComplementNB,0.168179,0.880358,0.877934,0.883564,0.88074,0.942488,0.86554,0.862454,0.8698,0.866087,0.92963
4,RandomForestClassifier,376.245873,1.0,1.0,1.0,1.0,1.0,0.856,0.854219,0.85856,0.856368,0.931241
5,ExtraTreesClassifier,513.715366,1.0,1.0,1.0,1.0,1.0,0.87492,0.876682,0.8726,0.874625,0.94394


## Training models on TF-IDF vectors

In [23]:
# Fit and score every candidate model on the hold-out split of the TF-IDF vectors.
# NOTE(review): models.copy() is a shallow dict copy, so the estimator objects
# themselves are shared with `models`; confirm the helper clones them before fitting.
tfidf_models, tfidf_results = train_and_eval_models(
    data=(tfidf_X_train, tfidf_X_test, tfidf_y_train, tfidf_y_test),
    models=models.copy()
)
# Persist the metrics table and the first return value (tfidf_models) to disk.
save_data_csv(tfidf_results, "../results/metrics/tfidf_results.csv")
save_pickle(tfidf_models, "../models/tfidf_models.pkl")

Evaluating LogisticRegression...
Evaluating LinearSVC...
Evaluating MultinomialNB...
Evaluating ComplementNB...
Evaluating RandomForestClassifier...
Evaluating ExtraTreesClassifier...

Evaluation Complete.


In [24]:
# Display train/test metrics (one row per model) for the TF-IDF hold-out run.
tfidf_results

Unnamed: 0,Model,Fit_Time_sec,Train_Accuracy,Train_Precision,Train_Recall,Train_F1,Train_Roc_auc,Test_Accuracy,Test_Precision,Test_Recall,Test_F1,Test_Roc_auc
0,LogisticRegression,1.600286,0.9313,0.924633,0.93915,0.931835,0.980914,0.8965,0.887586,0.908,0.897677,0.962865
1,LinearSVC,1.683593,0.9815,0.979391,0.9837,0.981541,0.99814,0.89,0.888756,0.8916,0.890176,0.959033
2,MultinomialNB,0.040874,0.89435,0.886088,0.90505,0.895468,0.959475,0.8728,0.862223,0.8874,0.87463,0.945757
3,ComplementNB,0.041525,0.89435,0.886088,0.90505,0.895468,0.959475,0.8728,0.862223,0.8874,0.87463,0.945757
4,RandomForestClassifier,78.765965,1.0,1.0,1.0,1.0,1.0,0.8588,0.858657,0.859,0.858828,0.93471
5,ExtraTreesClassifier,117.173415,1.0,1.0,1.0,1.0,1.0,0.8698,0.869504,0.8702,0.869852,0.941463


In [25]:
# Re-evaluate every candidate model on the full TF-IDF matrix, passing
# cv_strategy (a stratified, shuffled K-fold splitter) instead of a pre-made split.
# NOTE(review): models.copy() is a shallow dict copy, so the estimator objects
# themselves are shared with `models`; confirm the helper clones them before fitting.
tfidf_cv_models, tfidf_cv_results = train_and_eval_models(
    data=(tfidfVectorized, y),
    models=models.copy(),
    cv_strategy=cv_strategy
)
# Persist the metrics table and the first return value (tfidf_cv_models) to disk.
save_data_csv(tfidf_cv_results, "../results/metrics/tfidf_cv_results.csv")
save_pickle(tfidf_cv_models, "../models/tfidf_cv_models.pkl")

Evaluating LogisticRegression...
Evaluating LinearSVC...
Evaluating MultinomialNB...
Evaluating ComplementNB...
Evaluating RandomForestClassifier...
Evaluating ExtraTreesClassifier...





Evaluation Complete.


In [26]:
# Display train/test metrics (one row per model) for the TF-IDF cross-validation run.
tfidf_cv_results

Unnamed: 0,Model,Fit_Time_sec,Train_Accuracy,Train_Precision,Train_Recall,Train_F1,Train_Roc_auc,Test_Accuracy,Test_Precision,Test_Recall,Test_F1,Test_Roc_auc
0,LogisticRegression,1.548119,0.930924,0.924049,0.939031,0.93148,0.980964,0.8988,0.890782,0.90908,0.899826,0.962854
1,LinearSVC,4.233737,0.979162,0.976867,0.981569,0.979212,0.997744,0.89308,0.888821,0.8986,0.893658,0.959203
2,MultinomialNB,0.183561,0.892602,0.883037,0.905089,0.893927,0.958285,0.87484,0.864539,0.889,0.876574,0.945481
3,ComplementNB,0.174376,0.892602,0.883037,0.905089,0.893927,0.958285,0.87484,0.864539,0.889,0.876574,0.945481
4,RandomForestClassifier,350.379317,1.0,1.0,1.0,1.0,1.0,0.8556,0.855997,0.85508,0.855525,0.932736
5,ExtraTreesClassifier,517.930759,1.0,1.0,1.0,1.0,1.0,0.86846,0.867887,0.86928,0.868557,0.940811


## Train on w2v