In [1]:
import pandas as pd

filepath_dict = {'yelp':'sentiment labelled sentences/yelp_labelled.txt',
                 'amazon':'sentiment labelled sentences/amazon_cells_labelled.txt',
                 'imdb':'sentiment labelled sentences/imdb_labelled.txt'}

df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source
    df_list.append(df)
    
df = pd.concat(df_list)
df = df[df['source'] == 'yelp']
df

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp
...,...,...,...
995,I think food should have flavor and texture an...,0,yelp
996,Appetite instantly gone.,0,yelp
997,Overall I was not impressed and would not go b...,0,yelp
998,"The whole experience was underwhelming, and I ...",0,yelp


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression

from sklearn.naive_bayes import GaussianNB

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

SVC_clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))

def train(classifier):
    df_yelp = df[df['source'] == 'yelp']
    sentences = df_yelp['sentence'].values
    y = df_yelp['label'].values
    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    vectorizer = CountVectorizer()
    vectorizer.fit(sentences_train)

    X_train = vectorizer.transform(sentences_train)
    X_test = vectorizer.transform(sentences_test)

    classifier.fit(X_train.toarray(), y_train)
    score = classifier.score(X_test.toarray(), y_test)
    print("accuracy: ", score)
    return score

In [3]:
SVM_score = train(SVC_clf)
NB_score = train(GaussianNB())
LR_score = train(LogisticRegression())

accuracy:  0.672
accuracy:  0.664
accuracy:  0.796


In [4]:
import pathlib as pl
import multiprocessing
import numpy as np
import sklearn.metrics as metrics

from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.parsing.preprocessing import preprocess_string
from sklearn import utils
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV

In [5]:
class Doc2VecTransformer(BaseEstimator):
    def __init__(self, vector_size=100, learning_rate=0.02, epochs=20):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self._model = None
        self.vector_size = vector_size
        self.workers = multiprocessing.cpu_count()
        
    def fit(self, df_x, df_y=None):
        tagged_x = [TaggedDocument(preprocess_string(row['sentence']), [index]) for index, row in df_x.iterrows()]
        model = Doc2Vec(documents=tagged_x, vector_size=self.vector_size, workers=self.workers)
        
        for epoch in range(self.epochs):
            model.train(utils.shuffle([x for x in tqdm(tagged_x)]), total_examples=len(tagged_x), epochs=1)
            model.alpha -= self.learning_rate
            model.min_alpha = model.alpha
        
        self._model = model
        return self
    
    def transform(self, df_x):
        return np.asmatrix(np.array([self._model.infer_vector(preprocess_string(row['sentence']))
                                    for index, row in df_x.iterrows()]))
        

In [6]:
def train_and_build_model(classifier):
    all_df = df
    train_x_df, test_x_df, train_y_df, test_y_df = train_test_split(all_df[['sentence']],
                                                                    all_df[['label']])
    
    pl = Pipeline(steps=[('doc2vec', Doc2VecTransformer()),
                         ('pca', PCA()),
                         ('classifier', classifier)
                         ])
    pl.fit(train_x_df[['sentence']], train_y_df[['label']])
    predictions_y = pl.predict(test_x_df[['sentence']])
    print('Accuracy: ', metrics.accuracy_score(y_true=test_y_df[['label']], y_pred=predictions_y))
    return metrics.accuracy_score(y_true=test_y_df[['label']], y_pred=predictions_y)


In [7]:
SVM_score_with_PCA = train_and_build_model(SVC_clf)
NB_score_with_PCA = train_and_build_model(GaussianNB())
LR_score_with_PCA = train_and_build_model(LogisticRegression())

100%|██████████| 750/750 [00:00<00:00, 752386.51it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<00:00, 754914.33it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<00:00, 752206.60it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
  y = column_or_1d(y, warn=True)


Accuracy:  0.532


100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<00:00, 1499393.71it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<00:00, 749696.85it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<00:00, 753287.36it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
  y = column_or_1d(y, warn=True)


Accuracy:  0.48


100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<00:00, 750591.27it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
  y = column_or_1d(y, warn=True)


Accuracy:  0.492




In [8]:
def train_short_range_grid_search(classifier):
    all_df = df
    train_x_df, test_x_df, train_y_df, test_y_df = train_test_split(all_df[['sentence']],
                                                                    all_df[['label']])
    
    pl = Pipeline(steps=[('doc2vec', Doc2VecTransformer()),
                         ('pca', PCA()),
                         ('classifier', classifier)
                         ])
    param_grid = {
        'doc2vec__vector_size': [200, 220, 250],
        'pca__n_components': [50, 75, 100]
    }
    gs_cv = GridSearchCV(estimator=pl, param_grid=param_grid, cv=3, n_jobs=-1,
                         scoring='accuracy')
    gs_cv.fit(train_x_df[['sentence']], train_y_df[['label']])
    
    print('Best parameter (CV score=%0.3f):' % gs_cv.best_score_)
    print(gs_cv.best_params_)
    predictions_y = gs_cv.predict(test_x_df[['sentence']])
    print('Accuracy: ', metrics.accuracy_score(y_true=test_y_df[['label']], y_pred=predictions_y))
    return metrics.accuracy_score(y_true=test_y_df[['label']], y_pred=predictions_y)


In [9]:
SVM_score_with_PCA_tuning = train_short_range_grid_search(SVC_clf)
NB_score_with_PCA_tuning = train_short_range_grid_search(GaussianNB())
LR_score_with_PCA_tuning = train_short_range_grid_search(LogisticRegression())

100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<00:00, 748092.27it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<00:00, 755639.68it/s]
100%|██████████| 750/750 [00:00<00:00, 218362.35it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<00:00, 749696.85it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<00:00, 755095.54it/s]
100%|██████████| 750/750 [00:00<00:00, 752746.59it/s]
100%|██████████| 750/750 [00:00<00:00, 749875.57it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<00:00, 754914.33it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
  y = column_or_1d(y, warn=True)


Best parameter (CV score=0.531):
{'doc2vec__vector_size': 250, 'pca__n_components': 75}
Accuracy:  0.52


100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<00:00, 754914.33it/s]
100%|██████████| 750/750 [00:00<00:00, 752566.51it/s]
100%|██████████| 750/750 [00:00<00:00, 752206.60it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<00:00, 4017532.57it/s]
100%|██████████| 750/750 [00:00<00:00, 752566.51it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<00:00, 751308.34it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<00:00, 750412.21it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
  y = column_or_1d(y, warn=True)


Best parameter (CV score=0.556):
{'doc2vec__vector_size': 220, 'pca__n_components': 75}
Accuracy:  0.536


100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<00:00, 749161.23it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<00:00, 5607358.29it/s]
100%|██████████| 750/750 [00:00<00:00, 756002.88it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<00:00, 753107.01it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<00:00, 751847.04it/s]
100%|██████████| 750/750 [00:00<00:00, 752386.51it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]
100%|██████████| 750/750 [00:00<?, ?it/s]


Best parameter (CV score=0.543):
{'doc2vec__vector_size': 220, 'pca__n_components': 100}
Accuracy:  0.508


  y = column_or_1d(y, warn=True)


In [10]:
print('SVM accuracy: ',SVM_score)
print('SVM + PCA accuracy: ',SVM_score_with_PCA)
print('SVM + PCA + tuning hyperparameters accuracy: ',SVM_score_with_PCA_tuning)
print('')
print('Naive Bayes accuracy: ',NB_score)
print('Naive Bayes + PCA accuracy: ',NB_score_with_PCA)
print('Naive Bayes + PCA + tuning hyperparameters accuracy: ',NB_score_with_PCA_tuning)
print('')
print('Logistic Regression accuracy: ',LR_score)
print('Logistic Regression + PCA accuracy: ',LR_score_with_PCA)
print('Logistic Regression + PCA + tuning hyperparameters accuracy: ',LR_score_with_PCA_tuning)


SVM accuracy:  0.672
SVM + PCA accuracy:  0.532
SVM + PCA + tuning hyperparameters accuracy:  0.52

Naive Bayes accuracy:  0.664
Naive Bayes + PCA accuracy:  0.48
Naive Bayes + PCA + tuning hyperparameters accuracy:  0.536

Logistic Regression accuracy:  0.796
Logistic Regression + PCA accuracy:  0.492
Logistic Regression + PCA + tuning hyperparameters accuracy:  0.508


# Conclusion
I used the yelp dataset. The PCA made results worse. Tuning the hyperparameters using gridsearch made the accuracy of Naive Bayes and Logistic Regression better, but the accuracy of SVM worse.