In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import string
import pickle
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

### Dataset
The dataset which I used in this model is "IMDB Dataset of 50k Movies Reviews"

https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [5]:
dataset = pd.read_csv('/dataset/IMDB Dataset.csv')

In [6]:
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
print(f'Rows: {dataset.shape[1]}\nColumns: {dataset.shape[0]}')

Rows: 2
Columns: 50000


In [8]:
print(f'Columns Names: {list(dataset.columns)}')

Columns Names: ['review', 'sentiment']


### Text Operation 

In [9]:
nlp = English()
stopwords = list(STOP_WORDS)
punctuations = string.punctuation

In [10]:
def tokenizer(sentence):
    mytokens = nlp(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    return mytokens

### Transformation and Vectorization

In [11]:
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

# Basic function to clean the text 
def clean_text(text):     
    return text.strip().lower()

In [12]:
vectorizer = CountVectorizer(tokenizer = tokenizer, ngram_range=(1,1)) 
tfvectorizer = TfidfVectorizer(tokenizer = tokenizer)

### Split the Dataset

In [13]:
X = dataset['review']
y = dataset['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=77)

### Logistic Regression

In [14]:
classifier = LogisticRegression()
LRmodel = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])

# Train the Model
LRmodel.fit(X_train,y_train)   
LRpred = LRmodel.predict(X_test)
print(f'Confusion Matrix:\n{confusion_matrix(y_test,LRpred)}')
print(f'\nClassification Report:\n{classification_report(y_test,LRpred)}')
print(f'Accuracy: {accuracy_score(y_test,LRpred)*100}%')
pickle.dump(LRmodel, open('/saved_model/LinearRegression_model.sav', 'wb'))
print('Logistic Regression trained Model Saved')



Confusion Matrix:
[[4453  598]
 [ 505 4444]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.90      0.88      0.89      5051
    positive       0.88      0.90      0.89      4949

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

Accuracy: 88.97%
Logistic Regression trained Model Saved


In [15]:
# Another random review
pre = LRmodel.predict(["Production has an incredibly important place to shoot a series or film. Sometimes even a very minimalist story can reach an incredibly successful point after the right production stages. The Witcher series is far from minimalist. The Witcher is one of the best Middle-earth works in the world. Production quality is essential if you want to handle such a topic successfully."])
print(f'Prediction: {pre[0]}')

Prediction: positive


### Random Forest

In [16]:
RFclassifier = RandomForestClassifier(n_estimators = 200)
RFmodel = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', RFclassifier)])

# Train the Model
RFmodel.fit(X_train,y_train)   
RFpred = RFmodel.predict(X_test)
print(f'Confusion Matrix:\n{confusion_matrix(y_test,RFpred)}')
print(f'\nClassification Report:\n{classification_report(y_test,RFpred)}')
print(f'Accuracy: {accuracy_score(y_test,RFpred)*100}%')
pickle.dump(RFmodel, open('/saved_model/RandomForest_model.sav', 'wb'))
print('RandomForest trained Model Saved')

Confusion Matrix:
[[4359  692]
 [ 591 4358]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.88      0.86      0.87      5051
    positive       0.86      0.88      0.87      4949

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

Accuracy: 87.17%
RandomForest trained Model Saved


In [17]:
# Another random review
pre = RFmodel.predict(["I think this is my first review. This series is so bad I had to write one. I don't understand the good score. I have tried on 2 separate occasions to watch this show. Haven't even gotten past the 2nd episode because it is SO BORING."])
print(f'Prediction: {pre[0]}')

Prediction: negative


### LinearSVC

In [18]:
SVCclassifier = LinearSVC()
SVCmodel = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', SVCclassifier)])

# Train the Model
SVCmodel.fit(X_train,y_train)   
SVCpred = SVCmodel.predict(X_test)
print(f'Confusion Matrix:\n{confusion_matrix(y_test,SVCpred)}')
print(f'\nClassification Report:\n{classification_report(y_test,SVCpred)}')
print(f'Accuracy: {accuracy_score(y_test,SVCpred)*100}%')
pickle.dump(SVCmodel, open('/saved_model/LinearSVC_model.sav', 'wb'))
print('LinearSVC trained Model Saved')



Confusion Matrix:
[[4371  680]
 [ 589 4360]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.88      0.87      0.87      5051
    positive       0.87      0.88      0.87      4949

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

Accuracy: 87.31%
LinearSVC trained Model Saved


In [21]:
pre = SVCmodel.predict(["Henry cavill nailed the role perfectly. The fight scenes, the music, the cinematography, the whole atmosphere is beyond amazing. Netflix did it again"])
print(f'Prediction: {pre[0]}')

Prediction: positive


### Conclusion
The accuracy of all the algorithm is nearly the same and Logistic Regression accuracy is best among them with an accuracy of 88.97%