In [2]:
import nltk  # NLP toolkit that contains the movie_reviews dataset


In [3]:
# Fetch the built-in movie_reviews corpus into the local nltk_data directory.
# Internet access is needed the first time; when the package is already
# present, NLTK reports it as up-to-date instead.
# The call is the cell's last expression, so its return value (True in the
# recorded run) is shown as the cell output.
nltk.download('movie_reviews')


[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/saibit/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [4]:
from nltk.corpus import movie_reviews  # Provides access to review text + labels
import pandas as pd                    # Used for building the DataFrame


In [5]:
# Empty accumulators: one for the raw review strings and one for the
# sentiment labels that go with them.
texts, labels = [], []


In [6]:
# List the corpus's sentiment categories.
# Should show: ['neg', 'pos']  (note that 'neg' is listed first).
movie_reviews.categories()


['neg', 'pos']

In [7]:
# Count the review files in the 'pos' category (1000 in the recorded run).
len(movie_reviews.fileids('pos'))


1000

In [8]:
# Build the review texts and their labels from scratch inside this cell.
# Both lists are re-created here (instead of appending to lists made in an
# earlier cell) so the cell is idempotent: running it a second time no longer
# appends a duplicate copy of the whole corpus.

# Pair every review file with the category ("neg" / "pos") it belongs to,
# walking the categories in the order the corpus reader reports them.
labelled_fileids = [
    (fileid, category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

# movie_reviews.words(fileid) gives the review's tokens; joining them with
# single spaces reconstructs one string per review.
texts = [" ".join(movie_reviews.words(fileid)) for fileid, _ in labelled_fileids]

# Derived from the same list as `texts`, so labels[i] always describes texts[i].
labels = [category for _, category in labelled_fileids]


In [9]:
# Assemble the dataset as a two-column table: one row per review, holding
# the review text and its sentiment label.
review_columns = {"review": texts, "label": labels}
df = pd.DataFrame(review_columns)


In [10]:
# Preview the first rows: each should pair a readable review with a
# pos/neg label.
df.head()


Unnamed: 0,review,label
0,"plot : two teen couples go to a church party ,...",neg
1,the happy bastard ' s quick movie review damn ...,neg
2,it is movies like these that make a jaded movi...,neg
3,""" quest for camelot "" is warner bros . ' first...",neg
4,synopsis : a mentally unstable man undergoing ...,neg


In [11]:
# Check the class balance: expect 1000 'neg' and 1000 'pos' reviews.
df["label"].value_counts()


label
neg    1000
pos    1000
Name: count, dtype: int64

In [12]:
from sklearn.model_selection import train_test_split  # For splitting data
from sklearn.feature_extraction.text import TfidfVectorizer  # To convert text to numeric features
from sklearn.linear_model import LogisticRegression  # Our classifier


In [13]:
# Pull the model inputs out of the DataFrame:
#   X -> raw review text
#   y -> sentiment label (pos/neg)
X, y = (df[column].values for column in ("review", "label"))


In [14]:
# Hold out 20% of the reviews for testing and train on the remaining 80%.
# stratify=y keeps the pos/neg ratio the same in both partitions, and the
# fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)


In [15]:
# Configure the TF-IDF feature extractor that turns raw text into numeric
# feature vectors:
#   - stop_words="english" drops common filler words,
#   - ngram_range=(1, 2) uses both unigrams and bigrams,
#   - max_features caps the vocabulary at 10,000 entries.
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    max_features=10_000,
)


In [16]:
# Fit the vectorizer on the training text only and return the training set
# as TF-IDF features. The test text is deliberately kept out of this step so
# it cannot influence the learned vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)


In [17]:
# Encode the test text with the vocabulary already learned from the training
# set; only transform() is called here, so the vectorizer is not refitted.
X_test_tfidf = vectorizer.transform(X_test)


In [18]:
# Train a logistic-regression classifier on the TF-IDF training features.
# max_iter=1000 sets the solver's iteration limit.
# fit() is the cell's last expression, so the fitted estimator (with its
# parameters) is displayed as the cell output.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [19]:
from sklearn.metrics import accuracy_score, classification_report


In [20]:
# Predict a sentiment label for every review in the held-out test set,
# using the TF-IDF features computed for X_test.
y_pred = model.predict(X_test_tfidf)


In [21]:
# Overall accuracy: the proportion of test reviews whose predicted label
# matches the true label. Ending the cell with `acc` displays the value.
acc = accuracy_score(y_test, y_pred)
acc


0.83

In [22]:
# Per-class precision, recall and f1-score (plus support) for pos and neg.
# The report is a preformatted string, so it is printed to keep its layout.
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

         neg       0.85      0.81      0.83       200
         pos       0.81      0.85      0.83       200

    accuracy                           0.83       400
   macro avg       0.83      0.83      0.83       400
weighted avg       0.83      0.83      0.83       400



In [23]:
import joblib  # Used for saving and loading sklearn models
import os

# Make sure the output folder for the saved files exists; exist_ok=True
# turns this into a no-op when the folder is already there.
model_dir = "models"
os.makedirs(model_dir, exist_ok=True)


In [24]:
# Save the fitted vectorizer so Streamlit can load it later.
# It belongs together with the saved classifier: the model was trained on
# the features this vectorizer produces.
# joblib.dump's return value (the list of written file names) is displayed
# as the cell output.
joblib.dump(vectorizer, "models/tfidf_vectorizer.joblib")


['models/tfidf_vectorizer.joblib']

In [25]:
# Save the trained classifier next to the vectorizer in the models/ folder.
# At prediction time, new text must first go through the saved vectorizer
# before being passed to this model.
joblib.dump(model, "models/sentiment_model.joblib")


['models/sentiment_model.joblib']