## Imports

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import os
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import gensim
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm

## Data Processing for TF-IDF

In [2]:
df = pd.read_csv("/kaggle/input/opinrank-dataset-processed/ModelTrain.csv")
X = df['Review'].to_numpy()
y = df['Sentiment'].to_numpy()
df = None
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
X = None
y = None
print(X_train.shape, X_test.shape)

(98697,) (24675,)


In [3]:
vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1,10))
X_train = vectorizer.fit_transform(X_train)
vocab = vectorizer.vocabulary_
vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1,10), vocabulary=vocab)
X_test = vectorizer.fit_transform(X_test)

## Model 1: Logistic Regression (TF-IDF)

In [54]:
clf = SGDClassifier(loss="log_loss")
clf.fit(X_train, y_train)
preds_train = clf.predict(X_train)
preds_test = clf.predict(X_test)

In [55]:
print("Training Accuracy: %.4f"%(accuracy_score(y_train, preds_train)))
print("Testing Accuracy: %.4f"%(accuracy_score(y_test, preds_test)))

Training Accuracy: 0.9067
Testing Accuracy: 0.9033


In [56]:
print(classification_report(y_test, preds_test))

              precision    recall  f1-score   support

    NEGATIVE       0.92      0.89      0.90     12148
    POSITIVE       0.89      0.92      0.91     12527

    accuracy                           0.90     24675
   macro avg       0.90      0.90      0.90     24675
weighted avg       0.90      0.90      0.90     24675



In [57]:
import pickle
with open("preds_LRT", "wb") as f:
    pickle.dump(preds_test, f)

## Model 2: SVM (TF-IDF)

In [58]:
clf = SGDClassifier(loss="hinge", penalty="l2")
clf.fit(X_train, y_train)
preds_train = clf.predict(X_train)
preds_test = clf.predict(X_test)

In [59]:
print("Training Accuracy: %.4f"%(accuracy_score(y_train, preds_train)))
print("Testing Accuracy: %.4f"%(accuracy_score(y_test, preds_test)))

Training Accuracy: 0.9127
Testing Accuracy: 0.9097


In [60]:
print(classification_report(y_test, preds_test))

              precision    recall  f1-score   support

    NEGATIVE       0.92      0.89      0.91     12148
    POSITIVE       0.90      0.93      0.91     12527

    accuracy                           0.91     24675
   macro avg       0.91      0.91      0.91     24675
weighted avg       0.91      0.91      0.91     24675



In [61]:
import pickle
with open("preds_SVMT", "wb") as f:
    pickle.dump(preds_test, f)

## Model 3: Random Forest

In [16]:
clf = RandomForestClassifier(max_depth=20, n_estimators=500, n_jobs=-1)
clf.fit(X_train, y_train)
preds_train = clf.predict(X_train)
preds_test = clf.predict(X_test)

In [17]:
print("Training Accuracy: %.4f"%(accuracy_score(y_train, preds_train)))
print("Testing Accuracy: %.4f"%(accuracy_score(y_test, preds_test)))

Training Accuracy: 0.9252
Testing Accuracy: 0.8702


In [18]:
print(classification_report(y_test, preds_test))

              precision    recall  f1-score   support

    NEGATIVE       0.89      0.84      0.86     12148
    POSITIVE       0.85      0.90      0.88     12527

    accuracy                           0.87     24675
   macro avg       0.87      0.87      0.87     24675
weighted avg       0.87      0.87      0.87     24675



In [19]:
import pickle
with open("preds_RF", "wb") as f:
    pickle.dump(preds_test, f)

## Data Processing for Doc2Vec

In [3]:
df = pd.read_csv("/kaggle/input/opinrank-dataset-processed/ModelTrain.csv")
X = df['Review'].to_numpy()
y = df['Sentiment'].to_numpy()
df = None
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
X = None
y = None
print(X_train.shape, X_test.shape)

(98697,) (24675,)


In [4]:
def read_corpus(f, tokens_only=False):
    for i, line in tqdm(enumerate(f)):
        tokens = word_tokenize(line)
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

In [5]:
Xtrain = list(read_corpus(X_train))
Xtest = list(read_corpus(X_test, tokens_only=True))

98697it [01:21, 1212.99it/s]
24675it [00:20, 1228.41it/s]


In [28]:
print(len(Xtrain), len(Xtest))

98697 24675


In [38]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=20, workers=4, window=10)
model.build_vocab(Xtrain)
model.train(Xtrain, total_examples=model.corpus_count, epochs=model.epochs)
model.save("doc2vec.model")

In [39]:
X_train = []
X_test = []
for i in tqdm(range(len(Xtrain))):
    X_train.append(model.infer_vector(Xtrain[i].words))
    
for i in tqdm(range(len(Xtest))):
    X_test.append(model.infer_vector(Xtest[i]))

100%|██████████| 98697/98697 [06:10<00:00, 266.70it/s]
100%|██████████| 24675/24675 [01:31<00:00, 268.47it/s]


In [40]:
len(X_test)

24675

## Model 4: Logistic Regression (Word2Vec)

In [42]:
clf = SGDClassifier(loss="log_loss")
clf.fit(X_train, y_train)
preds_train = clf.predict(X_train)
preds_test_LRW = clf.predict(X_test)

In [45]:
print("Training Accuracy: %.4f"%(accuracy_score(y_train, preds_train)))
print("Testing Accuracy: %.4f"%(accuracy_score(y_test, preds_test_LRW)))

Training Accuracy: 0.8196
Testing Accuracy: 0.8190


In [46]:
print(classification_report(y_test, preds_test_LRW))

              precision    recall  f1-score   support

    NEGATIVE       0.82      0.80      0.81     12148
    POSITIVE       0.81      0.83      0.82     12527

    accuracy                           0.82     24675
   macro avg       0.82      0.82      0.82     24675
weighted avg       0.82      0.82      0.82     24675



In [47]:
import pickle
with open("preds_LRW", "wb") as f:
    pickle.dump(preds_test, f)

## Model 5: SVM (Word2Vec)

In [48]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss="hinge", penalty="l2")
clf.fit(X_train, y_train)
preds_train = clf.predict(X_train)
preds_test = clf.predict(X_test)

In [49]:
print("Training Accuracy: %.4f"%(accuracy_score(y_train, preds_train)))
print("Testing Accuracy: %.4f"%(accuracy_score(y_test, preds_test)))

Training Accuracy: 0.8253
Testing Accuracy: 0.8244


In [50]:
print(classification_report(y_test, preds_test))

              precision    recall  f1-score   support

    NEGATIVE       0.83      0.80      0.82     12148
    POSITIVE       0.82      0.85      0.83     12527

    accuracy                           0.82     24675
   macro avg       0.82      0.82      0.82     24675
weighted avg       0.82      0.82      0.82     24675



In [51]:
import pickle
with open("preds_SVMW", "wb") as f:
    pickle.dump(preds_test, f)