In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv('data/cleaned_reviews.csv')
df.head()

Unnamed: 0,cleaned_review,sentiment
0,one other reviewers has mentioned after watchi...,1
1,wonderful little production filming technique ...,1
2,i thought wonderful way spend time too hot sum...,1
3,basically theres family where little boy jake ...,0
4,petter matteis love time money visually stunni...,1


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import joblib
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier  # BEGIN:

from sklearn.svm import LinearSVC  # END:

In [4]:
X_text = df["cleaned_review"]
y = df["sentiment"]

In [5]:
vector=joblib.load("models/tfidf_vectorizer.pkl")
X=vector.fit_transform(X_text)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [9]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    "C": [0.01, 0.1, 1, 10],
    "penalty": ["l2"],
    "solver": ["liblinear", "saga"]
}

logreg = LogisticRegression(max_iter=1000)
grid = GridSearchCV(logreg, param_grid, cv=5, scoring="accuracy", verbose=1, n_jobs=-1)
grid.fit(X_train, y_train)

# 6. Best model
best_model = grid.best_estimator_
print("✅ Best Hyperparameters:", grid.best_params_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
✅ Best Hyperparameters: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}


In [11]:
model= best_model
model.score(X_test, y_test)

0.89712

In [12]:
arr=model.predict(X_test)
arr

array([0, 1, 0, ..., 0, 1, 1])

In [14]:
#classification report
print(classification_report(y_test, arr))


              precision    recall  f1-score   support

           0       0.90      0.89      0.89      6157
           1       0.89      0.91      0.90      6343

    accuracy                           0.90     12500
   macro avg       0.90      0.90      0.90     12500
weighted avg       0.90      0.90      0.90     12500



In [18]:
review="This movie is amazing! I love it so much. It has changed my life for the better."
def predict_sentiment(review):
    review_vectorized = vector.transform([review])
    prediction = model.predict(review_vectorized)
    x=prediction[0]
    if x==1:
        print("Positive")
    else:
        print("Negative")
predict_sentiment(review)

Positive


In [19]:
joblib.dump(model, "models/sentiment_model.pkl")
print("Model saved as sentiment_model.pkl")

Model saved as sentiment_model.pkl
