# Sentiment Data Test Model Vector Space Model (Full)

## Load Data

In [2]:
import numpy as np
import pandas as pd

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
moviereaction = pd.read_json('data/MovieReactionDS.json')

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from gensim.parsing.preprocessing import preprocess_string
from sklearn import utils
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from tqdm import tqdm
import multiprocessing

In [6]:
class Doc2VecTransformer(BaseEstimator): 
    def __init__(self, vector_size=100, learning_rate=0.02, epochs=20):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self._model = None
        self.vector_size = vector_size
        self.workers = multiprocessing.cpu_count() 

    def fit(self, df_x, df_y=None):
        tagged_x = [TaggedDocument(preprocess_string(row['input']), [index]) for index, row in df_x.iterrows()]
        model = Doc2Vec(documents=tagged_x, vector_size=self.vector_size, workers=self.workers) 

        for epoch in range(self.epochs):
            model.train(utils.shuffle([x for x in tqdm(tagged_x)]), 
        total_examples=len(tagged_x), epochs=1)
        model.alpha -= self.learning_rate
        model.min_alpha = model.alpha 

        self._model = model
        return self 
    
    def transform(self, df_x):
        return np.asmatrix(np.array([self._model.infer_vector(preprocess_string(row['input'])) for index, row in df_x.iterrows()]))

In [7]:
def train_and_build_model():
    all_reviews_df = moviereaction
    train_x_df, test_x_df, train_y_df, test_y_df = train_test_split(all_reviews_df[['input']], all_reviews_df[['output']])

    pl = Pipeline(steps=[('doc2vec', Doc2VecTransformer(vector_size=220)),('pca', PCA(n_components=100)),('logistic', LogisticRegression())])
    pl.fit(train_x_df[['input']], train_y_df[['output']])
    predictions_y = pl.predict(test_x_df[['input']])
    print('Accuracy: ', accuracy_score(y_true=test_y_df[['output']], y_pred=predictions_y))
    print('\n')
    print('Confusion Matrix:', '\n',confusion_matrix(y_true=test_y_df[['output']], y_pred=predictions_y))
    print('Classification Report:', '\n',classification_report(y_true=test_y_df[['output']], y_pred=predictions_y)) 

In [8]:
def train_short_range_grid_search():
    all_reviews_df = moviereaction
    train_x_df, test_x_df, train_y_df, test_y_df = train_test_split(all_reviews_df[['input']], all_reviews_df[['output']])

    pl = Pipeline(steps=[('doc2vec', Doc2VecTransformer()),
    ('pca', PCA()),
    ('logistic', LogisticRegression())])

    param_grid = {
    'doc2vec__vector_size': [200, 220, 250],
    'pca__n_components': [50, 75, 100]
    }
    gs_cv = GridSearchCV(estimator=pl, param_grid=param_grid, cv=3, n_jobs=-1,
    scoring="accuracy")
    gs_cv.fit(train_x_df[['input']], train_y_df[['output']])

    print("Best parameter (CV score=%0.3f):" % gs_cv.best_score_)
    print(gs_cv.best_params_)
    predictions_y = gs_cv.predict(test_x_df[['input']])
    print('Accuracy: ', accuracy_score(y_true=test_y_df[['output']], y_pred=predictions_y))
    print('\n')
    print('Confusion Matrix:', '\n',confusion_matrix(y_true=test_y_df[['output']], y_pred=predictions_y))
    print('Classification Report:', '\n',classification_report(y_true=test_y_df[['output']], y_pred=predictions_y)) 

In [9]:
def train_long_range_grid_search():
    all_reviews_df = moviereaction
    train_x_df, test_x_df, train_y_df, test_y_df = train_test_split(all_reviews_df[['input']], all_reviews_df[['output']]) 
    
    pl = Pipeline(steps=[('doc2vec', Doc2VecTransformer()),('pca', PCA()),('logistic', LogisticRegression())]) 
    
    param_grid = {
    'doc2vec__vector_size': [x for x in range(100, 250)],
    'pca__n_components': [x for x in range(1, 50)]
    }
    gs_cv = GridSearchCV(estimator=pl, param_grid=param_grid, cv=5, n_jobs=-1,
    scoring="accuracy")
    gs_cv.fit(train_x_df[['input']], train_y_df[['output']]) 
    
    print("Best parameter (CV score=%0.3f):" % gs_cv.best_score_)
    print(gs_cv.best_params_)
    predictions_y = gs_cv.predict(test_x_df[['input']])
    print('Accuracy: ', accuracy_score(y_true=test_y_df[['output']], y_pred=predictions_y))
    print('\n')
    print('Confusion Matrix:', '\n',confusion_matrix(y_true=test_y_df[['output']], y_pred=predictions_y))
    print('Classification Report:', '\n',classification_report(y_true=test_y_df[['output']], y_pred=predictions_y))

In [10]:
train_and_build_model()

100%|██████████| 18750/18750 [00:00<00:00, 6243010.24it/s]
100%|██████████| 18750/18750 [00:00<00:00, 4683373.03it/s]
100%|██████████| 18750/18750 [00:00<00:00, 6244497.38it/s]
100%|██████████| 18750/18750 [00:00<00:00, 6244497.38it/s]
100%|██████████| 18750/18750 [00:00<00:00, 6242514.68it/s]
100%|██████████| 18750/18750 [00:00<00:00, 6243505.87it/s]
100%|██████████| 18750/18750 [00:00<00:00, 6243505.87it/s]
100%|██████████| 18750/18750 [00:00<00:00, 6244001.59it/s]
100%|██████████| 18750/18750 [00:00<00:00, 4683373.03it/s]
100%|██████████| 18750/18750 [00:00<00:00, 6244497.38it/s]
100%|██████████| 18750/18750 [00:00<00:00, 6245489.20it/s]
100%|██████████| 18750/18750 [00:00<00:00, 4683651.96it/s]
100%|██████████| 18750/18750 [00:00<00:00, 6245489.20it/s]
100%|██████████| 18750/18750 [00:00<00:00, 6243505.87it/s]
100%|██████████| 18750/18750 [00:00<00:00, 6244993.25it/s]
100%|██████████| 18750/18750 [00:00<00:00, 6242514.68it/s]
100%|██████████| 18750/18750 [00:00<00:00, 6244001.59it/

Accuracy:  0.71712


Confusion Matrix: 
 [[2548  552]
 [1216 1934]]
Classification Report: 
               precision    recall  f1-score   support

    negative       0.68      0.82      0.74      3100
    positive       0.78      0.61      0.69      3150

    accuracy                           0.72      6250
   macro avg       0.73      0.72      0.71      6250
weighted avg       0.73      0.72      0.71      6250



In [11]:
train_short_range_grid_search()

100%|██████████| 18750/18750 [00:00<00:00, 4682815.29it/s]
100%|██████████| 18750/18750 [00:00<00:00, 4683094.15it/s]
100%|██████████| 18750/18750 [00:00<00:00, 6245985.23it/s]
100%|██████████| 18750/18750 [00:00<00:00, 6244497.38it/s]
100%|██████████| 18750/18750 [00:00<00:00, 4683373.03it/s]
100%|██████████| 18750/18750 [00:00<00:00, 6244001.59it/s]
100%|██████████| 18750/18750 [00:00<00:00, 6244497.38it/s]
100%|██████████| 18750/18750 [00:00<00:00, 6243010.24it/s]
100%|██████████| 18750/18750 [00:00<00:00, 6246481.33it/s]
100%|██████████| 18750/18750 [00:00<00:00, 4684209.90it/s]
100%|██████████| 18750/18750 [00:00<00:00, 6243505.87it/s]
100%|██████████| 18750/18750 [00:00<00:00, 6244001.59it/s]
100%|██████████| 18750/18750 [00:00<00:00, 6244001.59it/s]
100%|██████████| 18750/18750 [00:00<00:00, 6243505.87it/s]
100%|██████████| 18750/18750 [00:00<00:00, 4683373.03it/s]
100%|██████████| 18750/18750 [00:00<00:00, 6244001.59it/s]
100%|██████████| 18750/18750 [00:00<00:00, 6244993.25it/

Best parameter (CV score=0.725):
{'doc2vec__vector_size': 200, 'pca__n_components': 75}
Accuracy:  0.7232


Confusion Matrix: 
 [[2559  587]
 [1143 1961]]
Classification Report: 
               precision    recall  f1-score   support

    negative       0.69      0.81      0.75      3146
    positive       0.77      0.63      0.69      3104

    accuracy                           0.72      6250
   macro avg       0.73      0.72      0.72      6250
weighted avg       0.73      0.72      0.72      6250



In [12]:
# train_long_range_grid_search()