In [24]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC


In [25]:
# Read the review dataset from the project-relative CSV and preview the first rows.
df = pd.read_csv(filepath_or_buffer="data/imdb_reviews.csv")
df.head(5)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [26]:
# Encode the string sentiment labels as integers: positive -> 1, negative -> 0.
# The column is overwritten in place, so the encoding is guarded by a dtype
# check: re-running this cell on an already-encoded column would otherwise
# map the integers 1/0 to NaN and silently destroy the labels.
label_map = {'positive': 1, 'negative': 0}
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    encoded = df['sentiment'].map(label_map)
    # Series.map turns every value missing from the mapping into NaN;
    # surface those rows immediately instead of failing later during training.
    if encoded.isna().any():
        unexpected = sorted(df.loc[encoded.isna(), 'sentiment'].astype(str).unique())
        raise ValueError(f"Unexpected sentiment labels: {unexpected}")
    df['sentiment'] = encoded
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [27]:
# Features are the review texts; targets are the values of the sentiment column.
X, y = df['review'], df['sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [28]:
# TF-IDF text features: English stop words are dropped and the vocabulary
# is capped at 50,000 terms.
vectorizer = TfidfVectorizer(max_features=50000, stop_words='english')

# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Show both matrix shapes as the cell output.
(X_train_tfidf.shape, X_test_tfidf.shape)


((40000, 50000), (10000, 50000))

In [29]:
# Logistic regression on the TF-IDF features; fit() returns the estimator,
# so construction and training are chained. max_iter is raised to 2000.
log_reg = LogisticRegression(max_iter=2000).fit(X_train_tfidf, y_train)

# Mean accuracy on the held-out split, displayed as the cell output.
log_accuracy = log_reg.score(X_test_tfidf, y_test)
log_accuracy


0.8946

In [30]:
# Linear SVM on the same TF-IDF features.
# random_state is pinned (same seed as the train/test split) because the
# solver shuffles the data when it runs in its dual formulation; without a
# seed the reported accuracy may differ slightly between runs.
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train_tfidf, y_train)

# Mean accuracy on the held-out split, displayed as the cell output.
svm_accuracy = svm_model.score(X_test_tfidf, y_test)
svm_accuracy


0.8921

In [31]:
# Multinomial Naive Bayes trained on the same TF-IDF features as the other
# models. MultinomialNB is imported in the notebook's top import cell so a
# fresh "Restart & Run All" exposes every dependency up front.
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Mean accuracy on the held-out split, displayed as the cell output.
nb_accuracy = nb_model.score(X_test_tfidf, y_test)
nb_accuracy


0.8643

In [32]:
# Side-by-side comparison of the held-out accuracy of the three classifiers,
# built from (model name, accuracy) rows.
results = pd.DataFrame(
    [
        ("Naive Bayes", nb_accuracy),
        ("Logistic Regression", log_accuracy),
        ("Linear SVM", svm_accuracy),
    ],
    columns=["Model", "Accuracy"],
)

results


Unnamed: 0,Model,Accuracy
0,Naive Bayes,0.8643
1,Logistic Regression,0.8946
2,Linear SVM,0.8921


In [33]:
# Persist the fitted vectorizer and the two linear models.
# joblib.dump does not create missing directories, so make sure the target
# folder exists first; otherwise the cell fails on a fresh checkout.
Path("model").mkdir(parents=True, exist_ok=True)

joblib.dump(vectorizer, "model/vectorizer.pkl")
joblib.dump(log_reg, "model/log_reg.pkl")
joblib.dump(svm_model, "model/svm_model.pkl")


['model/svm_model.pkl']

In [34]:
def predict_sentiment(text):
    """Classify a single review text as "Positive" or "Negative".

    The text is vectorized with the notebook-level ``vectorizer`` and scored
    by the notebook-level ``log_reg`` model. A predicted label of 1 is
    reported as "Positive"; any other label is reported as "Negative".

    The linear SVM is deliberately not consulted here. The earlier version
    compared both models but returned the logistic-regression label whether
    or not they agreed, so the SVM prediction could never change the result.
    Dropping it removes the dead branch and one redundant inference call
    while returning exactly the same answer.

    Args:
        text: The review text to classify (a single string).

    Returns:
        The string "Positive" or "Negative".
    """
    # The vectorizer expects an iterable of documents, hence the one-item list.
    text_tfidf = vectorizer.transform([text])

    pred_lr = log_reg.predict(text_tfidf)[0]
    return "Positive" if pred_lr == 1 else "Negative"


In [35]:
# Smoke-test the predictor on three hand-written reviews, printing one
# predicted label per line.
for sample_review in (
    "The movie was amazing and emotional.",
    "This was the worst movie I’ve ever seen.",
    "It was okay, not great but not terrible.",
):
    print(predict_sentiment(sample_review))


Positive
Negative
Negative
