In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df['sentiment_num'] = df['sentiment'].map({'positive': 1, 'negative': 0})
df.head()

Unnamed: 0,review,sentiment,sentiment_num
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [5]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en_core_web_sm')

In [10]:
nlp.vocab['not'].is_stop = False
def preprocess_texts(texts, batch_size=128, n_process=4):
    results = []
    #since the dataset is large, using nlp.pipe will be better for faster and parallel processing
    for doc in nlp.pipe(texts, batch_size=batch_size, n_process=n_process):
        tokens = []
        for token in doc:
            if token.is_stop or token.is_punct or token.like_num:
                continue
            tokens.append(token.lemma_.lower())
        results.append(' '.join(tokens))
    return results  

In [11]:
df['preprocessed_review'] = preprocess_texts(df['review'].tolist())
df.head()

Unnamed: 0,review,sentiment,sentiment_num,preprocessed_review
0,One of the other reviewers has mentioned that ...,positive,1,reviewer mention watch oz episode hook right e...
1,A wonderful little production. <br /><br />The...,positive,1,wonderful little production < br /><br />the f...
2,I thought this was a wonderful way to spend ti...,positive,1,think wonderful way spend time hot summer week...
3,Basically there's a family where a little boy ...,negative,0,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,petter mattei love time money visually stunnin...


In [15]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [14]:
x_train, x_test, y_train, y_test = train_test_split(df['preprocessed_review'], df['sentiment_num'], test_size=0.2, random_state=42)

In [20]:
lr_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logistic regression', LogisticRegression(max_iter=1000))
])

lr_clf.fit(x_train, y_train)
y_pred_lr = lr_clf.predict(x_test)
print("Logistic Regression Classification Report:\n",  classification_report(y_test, y_pred_lr))

Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.88      0.89      4961
           1       0.88      0.91      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [21]:
nb_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB())    
])

nb_clf.fit(x_train, y_train)
y_prid_nb = nb_clf.predict(x_test)
print("Naive Bayes Classification Report:\n", classification_report(y_test, y_prid_nb))

Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.87      0.86      4961
           1       0.87      0.85      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



After checking the metrics of both the Logistic Regression and the Multinomial Naive Bayes models, the logistic regression appears to be a slightly better model overall. 
Naive Bayes performed a bit worse because it assumes words act independently and doesn’t fit as well with TF-IDF values. On the other hand, Logistic Regression can handle these patterns more accurately and is better suited with TF-IDF than Naive Bayes.