### Dataset : https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [1]:
import pandas as pd
import numpy as np
import spacy
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('data/reviews.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
nlp = spacy.load("en_core_web_sm")
df['review'] = df['review'].str.replace(r'<[^<>]*>', '', regex=True)
df['review'] = df.review.apply(
    lambda text: 
        " ".join(
            token.lemma_ for token in nlp(str(text))
                if not token.is_punct and token.lemma_.lower()
        )
)

df.sentiment = [1 if s == 'positive' else 0 for s in df.sentiment]

In [115]:
X_train,X_test,y_train,y_test = train_test_split(df.review,df.sentiment,test_size=0.2,random_state=42)

In [127]:
log_pipeline = Pipeline([("tfidf",TfidfVectorizer()),("model",LogisticRegression(solver='liblinear'))])
log_pipeline.fit(X_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('model', LogisticRegression(solver='liblinear'))])

In [135]:
y_preds = log_pipeline.predict(X_test)
print(classification_report(y_test,y_preds))

              precision    recall  f1-score   support

           0       0.91      0.88      0.89      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [136]:
confusion_matrix(y_test,y_preds)

array([[4378,  583],
       [ 455, 4584]], dtype=int64)

In [139]:
pickle.dump(log_pipeline, open("sentiment_analysis_model.pkl", 'wb'))