# Sentiment Data Test Model Vector Space Model (MEDIUM)

## Load Data

In [2]:
import numpy as np
import pandas as pd

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
moviereaction = pd.read_json('data/MovieReactionDS_medium.json')


In [5]:
moviereaction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   input   300 non-null    object
 1   output  300 non-null    object
dtypes: object(2)
memory usage: 4.8+ KB


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from gensim.parsing.preprocessing import preprocess_string
from sklearn import utils
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from tqdm import tqdm
import multiprocessing

In [22]:
class Doc2VecTransformer(BaseEstimator): 
    def __init__(self, vector_size=100, learning_rate=0.1, epochs=20):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self._model = None
        self.vector_size = vector_size
        self.workers = multiprocessing.cpu_count() 

    def fit(self, df_x, df_y=None):
        tagged_x = [TaggedDocument(preprocess_string(row['input']), [index]) for index, row in df_x.iterrows()]
        model = Doc2Vec(documents=tagged_x, vector_size=self.vector_size, workers=self.workers) 

        for epoch in range(self.epochs):
            model.train(utils.shuffle([x for x in tqdm(tagged_x)]), 
        total_examples=len(tagged_x), epochs=1)
        model.alpha -= self.learning_rate
        model.min_alpha = model.alpha 

        self._model = model
        return self 
    
    def transform(self, df_x):
        return np.asmatrix(np.array([self._model.infer_vector(preprocess_string(row['input'])) for index, row in df_x.iterrows()]))

In [16]:
def train_and_build_model():
    all_reviews_df = moviereaction
    train_x_df, test_x_df, train_y_df, test_y_df = train_test_split(all_reviews_df[['input']], all_reviews_df[['output']])

    pl = Pipeline(steps=[('doc2vec', Doc2VecTransformer(vector_size=220)),('pca', PCA(n_components=100)),('logistic', LogisticRegression())])
    pl.fit(train_x_df[['input']], train_y_df[['output']].values.ravel())
    predictions_y = pl.predict(test_x_df[['input']])
    print('Accuracy: ', accuracy_score(y_true=test_y_df[['output']], y_pred=predictions_y))
    print('\n')
    print('Confusion Matrix:', '\n',confusion_matrix(y_true=test_y_df[['output']], y_pred=predictions_y))
    print('Classification Report:', '\n',classification_report(y_true=test_y_df[['output']], y_pred=predictions_y)) 

In [23]:
def train_short_range_grid_search():
    all_reviews_df = moviereaction
    train_x_df, test_x_df, train_y_df, test_y_df = train_test_split(all_reviews_df[['input']], all_reviews_df[['output']])

    pl = Pipeline(steps=[('doc2vec', Doc2VecTransformer()), ('pca', PCA()), ('logistic', LogisticRegression())])

    param_grid = {
    'doc2vec__vector_size': [200, 220, 250],
    'pca__n_components': [50, 75, 100]
    }
    gs_cv = GridSearchCV(estimator=pl, param_grid=param_grid, cv=3, n_jobs=-1,
    scoring="accuracy")
    gs_cv.fit(train_x_df[['input']], train_y_df[['output']])

    print("Best parameter (CV score=%0.3f):" % gs_cv.best_score_)
    print(gs_cv.best_params_)
    predictions_y = gs_cv.predict(test_x_df[['input']])
    print('Accuracy: ', accuracy_score(y_true=test_y_df[['output']], y_pred=predictions_y))
    print('\n')
    print('Confusion Matrix:', '\n',confusion_matrix(y_true=test_y_df[['output']], y_pred=predictions_y))
    print('Classification Report:', '\n',classification_report(y_true=test_y_df[['output']], y_pred=predictions_y))

In [10]:
def train_long_range_grid_search():
    all_reviews_df = moviereaction
    train_x_df, test_x_df, train_y_df, test_y_df = train_test_split(all_reviews_df[['input']], all_reviews_df[['output']]) 
    
    pl = Pipeline(steps=[('doc2vec', Doc2VecTransformer()),('pca', PCA()),('logistic', LogisticRegression())]) 
    
    param_grid = {
    'doc2vec__vector_size': [x for x in range(100, 250)],
    'pca__n_components': [x for x in range(1, 50)]
    }
    gs_cv = GridSearchCV(estimator=pl, param_grid=param_grid, cv=5, n_jobs=-1,
    scoring="accuracy")
    gs_cv.fit(train_x_df[['input']], train_y_df[['output']]) 
    
    print("Best parameter (CV score=%0.3f):" % gs_cv.best_score_)
    print(gs_cv.best_params_)
    predictions_y = gs_cv.predict(test_x_df[['input']])
    print('Accuracy: ', accuracy_score(y_true=test_y_df[['output']], y_pred=predictions_y))
    print('\n')
    print('Confusion Matrix:', '\n',confusion_matrix(y_true=test_y_df[['output']], y_pred=predictions_y))
    print('Classification Report:', '\n',classification_report(y_true=test_y_df[['output']], y_pred=predictions_y))

In [19]:
train_and_build_model()

100%|██████████| 225/225 [00:00<00:00, 224855.47it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<00:00, 224855.47it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<00:00, 224641.37it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<00:00, 224694.86it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]


Accuracy:  0.7066666666666667


Confusion Matrix: 
 [[34  4]
 [18 19]]
Classification Report: 
               precision    recall  f1-score   support

    negative       0.65      0.89      0.76        38
    positive       0.83      0.51      0.63        37

    accuracy                           0.71        75
   macro avg       0.74      0.70      0.69        75
weighted avg       0.74      0.71      0.70        75



In [25]:
train_short_range_grid_search()

100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<00:00, 224801.91it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
  return f(*args, **kwargs)


Best parameter (CV score=0.618):
{'doc2vec__vector_size': 200, 'pca__n_components': 100}
Accuracy:  0.6133333333333333


Confusion Matrix: 
 [[25 14]
 [15 21]]
Classification Report: 
               precision    recall  f1-score   support

    negative       0.62      0.64      0.63        39
    positive       0.60      0.58      0.59        36

    accuracy                           0.61        75
   macro avg       0.61      0.61      0.61        75
weighted avg       0.61      0.61      0.61        75



In [14]:
train_long_range_grid_search()

100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<00:00, 224748.37it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]
100%|██████████| 225/225 [00:00<?, ?it/s]


Best parameter (CV score=0.613):
{'doc2vec__vector_size': 186, 'pca__n_components': 25}
Accuracy:  0.5466666666666666


Confusion Matrix: 
 [[27 10]
 [24 14]]
Classification Report: 
               precision    recall  f1-score   support

    negative       0.53      0.73      0.61        37
    positive       0.58      0.37      0.45        38

    accuracy                           0.55        75
   macro avg       0.56      0.55      0.53        75
weighted avg       0.56      0.55      0.53        75



  return f(*args, **kwargs)
