In [7]:
import pandas as pd

df = pd.read_csv("data/imdb_reviews.csv")
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

X = df['review_text']
y = df['sentiment']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Convert text to numerical features
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

X_train_tfidf.shape, X_test_tfidf.shape


((6, 24), (2, 24))

In [4]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

accuracy = model.score(X_test_tfidf, y_test)
accuracy


0.0

In [5]:
model.predict(X_test_tfidf), y_test.values


(array([1, 1]), array([0, 0]))

In [6]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

nb_accuracy = nb_model.score(X_test_tfidf, y_test)
nb_accuracy


0.0

In [8]:
import pandas as pd

df = pd.read_csv("data/imdb_reviews.csv")
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [9]:
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

X = df['review']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(stop_words='english', max_features=50000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)

accuracy = nb.score(X_test_tfidf, y_test)
accuracy


0.8643

In [11]:
def predict_sentiment(text):
    text_tfidf = vectorizer.transform([text])
    prediction = nb.predict(text_tfidf)[0]
    return "Positive" if prediction == 1 else "Negative"

print(predict_sentiment("I loved the movie, it was amazing!"))
print(predict_sentiment("The movie was boring and terrible."))
print(predict_sentiment("The story was okay but acting felt weak."))


Positive
Negative
Negative


In [12]:
def predict_sentiment(text):
    text_tfidf = vectorizer.transform([text])
    prediction = nb.predict(text_tfidf)[0]
    return "Positive" if prediction == 1 else "Negative"

print(predict_sentiment("I think we should give credit to Peter was suggesting this movie"))
print(predict_sentiment("Harry took revenge on us by giving us front seats of this movie"))
print(predict_sentiment("i think these were longest 2 hours of my life"))


Positive
Positive
Negative


In [13]:
import pickle

with open("model/sentiment_model.pkl", "wb") as f:
    pickle.dump(nb, f)

with open("model/tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)


In [14]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(max_iter=2000)
log_reg.fit(X_train_tfidf, y_train)

log_accuracy = log_reg.score(X_test_tfidf, y_test)
log_accuracy


0.8946

In [15]:
from sklearn.svm import LinearSVC

svm_model = LinearSVC()
svm_model.fit(X_train_tfidf, y_train)

svm_accuracy = svm_model.score(X_test_tfidf, y_test)
svm_accuracy


0.8921

In [16]:
import joblib

joblib.dump(log_reg, "model/sentiment_model.pkl")
joblib.dump(vectorizer, "model/tfidf_vectorizer.pkl")


['model/tfidf_vectorizer.pkl']

In [17]:
def predict_sentiment(text):
    text_tfidf = vectorizer.transform([text])
    pred = log_reg.predict(text_tfidf)[0]
    return "Positive" if pred == 1 else "Negative"


In [19]:
print(predict_sentiment("The movie was amazing but painfully slow"))
print(predict_sentiment("Actually i hate the actors but i have to appreciate their job."))


Positive
Positive


In [20]:
def predict_sentiment(text):
    text_tfidf = vectorizer.transform([text])

    pred_lr = log_reg.predict(text_tfidf)[0]
    pred_svm = svm_model.predict(text_tfidf)[0]

    # If both agree → return that
    if pred_lr == pred_svm:
        return "Positive" if pred_lr == 1 else "Negative"
    
    # If they disagree → take Logistic Regression (more calibrated)
    return "Positive" if pred_lr == 1 else "Negative"


In [22]:
print(predict_sentiment("The movie was painfully slow"))
print(predict_sentiment(" I have to appreciate their job."))


Negative
Positive


In [23]:
import joblib

joblib.dump(vectorizer, "model/vectorizer.pkl")
joblib.dump(log_reg, "model/log_reg.pkl")
joblib.dump(svm_model, "model/svm_model.pkl")


['model/svm_model.pkl']