In [1]:
# Importing  libraries
import pandas as pd
import bz2
import string
import nltk
import joblib

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

#Download NLTK stopwords (run once)
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sejal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#loading the compressed data
rows=[]
with bz2.open("train.ft.txt.bz2",mode="rt",encoding="utf-8")as file:
    for i,line in enumerate(file):
        if not line.strip():continue  #skip empty lines
        label, text = line.split(" ", 1)  # split label and text
        label = int(label.replace("__label__", ""))  # clean label
        rows.append((text.strip(), label))  # save tuple (text, rating)
        if i >= 200000: break  

#create dataframe
df=pd.DataFrame(rows,columns=["text","labels"])

# Show first 10 labels
print(" Sample ratings extracted:", [r[1] for r in rows[:20]])


👀 Sample ratings extracted: [2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 1]


In [22]:
#convert labels to s sentiment

def label_sentiment(labels):
    if labels == 1:
        return "negative"
    elif labels == 2:
        return "positive"
    

df["sentiment"] = df["labels"].apply(label_sentiment)

print(df["labels"].value_counts())
print(df["sentiment"].value_counts())

2    101167
1     98834
Name: labels, dtype: int64
positive    101167
negative     98834
Name: sentiment, dtype: int64


In [23]:
#clean the text
def clean_text(text):
    text = text.lower()  # lowercase
    text = "".join([char for char in text if char not in string.punctuation])  # remove punctuation
    words = text.split()  # split into words
    words = [word for word in words if word not in stop_words]  # remove stopwords
    return " ".join(words)

df["clean_text"] = df["text"].apply(clean_text)

In [None]:
#splitting dataset into training and testing dataset
X = df["clean_text"]  
y = df["sentiment"]   

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)

print("y_train class counts:\n", y_train.value_counts())
print("y_test class counts:\n", y_test.value_counts())

#  Create naive bayes pipeline
nb_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=5000)),
    ("clf", MultinomialNB())
])

#  create logistic regression pipeline
lr_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=5000)),
    ("clf", LogisticRegression(max_iter=200))
])

#  train both models
nb_pipeline.fit(X_train, y_train)
lr_pipeline.fit(X_train, y_train)


y_train class counts:
 positive    80933
negative    79067
Name: sentiment, dtype: int64
y_test class counts:
 positive    20234
negative    19767
Name: sentiment, dtype: int64


In [25]:
# 📈 Step 9: Evaluate both models
nb_report = classification_report(y_test, nb_pipeline.predict(X_test))
lr_report = classification_report(y_test, lr_pipeline.predict(X_test))

print("📊 Naive Bayes Report:\n", nb_report)
print("📊 Logistic Regression Report:\n", lr_report)


📊 Naive Bayes Report:
               precision    recall  f1-score   support

    negative       0.85      0.85      0.85     19767
    positive       0.85      0.85      0.85     20234

    accuracy                           0.85     40001
   macro avg       0.85      0.85      0.85     40001
weighted avg       0.85      0.85      0.85     40001

📊 Logistic Regression Report:
               precision    recall  f1-score   support

    negative       0.89      0.88      0.89     19767
    positive       0.89      0.89      0.89     20234

    accuracy                           0.89     40001
   macro avg       0.89      0.89      0.89     40001
weighted avg       0.89      0.89      0.89     40001



In [26]:
# 💾 Step 10: Save both trained models for Flask later
joblib.dump(nb_pipeline, "naive_model.pkl")
joblib.dump(lr_pipeline, "logistic_model.pkl")
print(" Models saved successfully!")


 Models saved successfully!
