In [1]:
import random
from random import shuffle
from statistics import mean

import nltk
from nltk import NaiveBayesClassifier
from nltk import pos_tag, word_tokenize, sent_tokenize
from nltk.classify import accuracy
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords, names
from nltk.sentiment import SentimentIntensityAnalyzer

In [2]:
# NLTK resources this notebook relies on: the review corpus, the tokenizer
# and tagger models, the stop-word / first-name lists, and the VADER lexicon.
required_packages = [
    'movie_reviews',
    'punkt',
    'stopwords',
    'names',
    'averaged_perceptron_tagger',
    'vader_lexicon',
]
nltk.download(required_packages)

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Labib\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Labib\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Labib\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package names to
[nltk_data]     C:\Users\Labib\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Labib\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Labib\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already

True

# Step 2: Split Data First (No Leakage!)
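We split the review IDs into training and test sets before computing any word statistics or features, so nothing derived from the test reviews can leak into training.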

In [3]:
# Seed the global RNG so the shuffles below -- and therefore the train/test
# split and the accuracy reported later -- are reproducible after
# Restart Kernel -> Run All.
SEED = 42
random.seed(SEED)

# Get all review IDs, one list per class
pos_ids = movie_reviews.fileids("pos")
neg_ids = movie_reviews.fileids("neg")

# Shuffle in place so the split does not simply follow corpus file order
shuffle(pos_ids)
shuffle(neg_ids)

# Define split point (80% train), computed per class so both classes keep
# the same train/test proportion
train_ratio = 0.8
train_cutoff_pos = int(len(pos_ids) * train_ratio)
train_cutoff_neg = int(len(neg_ids) * train_ratio)

# Training IDs: first 80% of each shuffled class.
# The NLTK movie_reviews corpus ships 1,000 reviews per class, so this is
# 800 positive + 800 negative reviews.
train_pos_ids = pos_ids[:train_cutoff_pos]
train_neg_ids = neg_ids[:train_cutoff_neg]

# Test IDs: remaining 20% of each class (200 + 200 for the standard corpus)
test_pos_ids = pos_ids[train_cutoff_pos:]
test_neg_ids = neg_ids[train_cutoff_neg:]

# Guard the "no leakage" claim: every review is used exactly once and no
# review ID appears on both sides of the split.
assert len(train_pos_ids) + len(test_pos_ids) == len(pos_ids)
assert len(train_neg_ids) + len(test_neg_ids) == len(neg_ids)
assert not set(train_pos_ids) & set(test_pos_ids)
assert not set(train_neg_ids) & set(test_neg_ids)

# Step 3: Build top_100_positive from Training Data Only
This is the key fix — no leakage.

In [4]:
# Vocabulary to ignore when counting words: English stop words plus every
# first name from the NLTK names corpus (lower-cased).
unwanted = set(stopwords.words("english")) | {
    first_name.lower() for first_name in names.words()
}

def skip_unwanted(pos_tuple):
    """Filter predicate for ``(word, tag)`` pairs.

    Returns True when the pair should be kept: the word is purely
    alphabetic, its lower-cased form is not in ``unwanted``, and its tag
    does not start with "NN" (i.e. it is not tagged as a noun).
    Returns False otherwise.
    """
    token, pos_label = pos_tuple
    is_clean_word = token.isalpha() and token.lower() not in unwanted
    return is_clean_word and not pos_label.startswith("NN")

# Collect the words worth counting from the *training* positive reviews only
# (test reviews are never read here, so nothing leaks into the features).
# Words are lower-cased BEFORE counting so that case variants are tallied as
# one entry; lower-casing after most_common() could split a word's count
# across variants and leave fewer than 100 distinct words in the final set.
train_words_pos = [
    word.lower()
    for word, tag in pos_tag(movie_reviews.words(train_pos_ids))
    if skip_unwanted((word, tag))
]

# Build frequency distribution over the normalised words
positive_fd = nltk.FreqDist(train_words_pos)

# The 100 most frequent remaining words, as a set for O(1) membership tests.
# NOTE(review): words that are equally frequent in negative reviews are not
# removed, so this set may be dominated by generic vocabulary rather than
# positive-specific words -- consider subtracting the training-negative
# vocabulary if the "wordcount" feature proves uninformative.
top_100_positive = {word for word, count in positive_fd.most_common(100)}

# Step 4: Define extract_features()

In [5]:
# Single analyzer instance, created once and reused by every
# extract_features() call instead of being rebuilt per review.
sia = SentimentIntensityAnalyzer()

def extract_features(text):
    """Turn one raw review into the feature dict fed to the classifier.

    Args:
        text: The review as a single string.

    Returns:
        A dict with three entries:
          * "mean_compound": mean per-sentence ``compound`` polarity score,
            shifted by +1.
          * "mean_positive": mean per-sentence ``pos`` polarity score.
          * "wordcount": number of tokens whose lower-cased form is in
            ``top_100_positive``.
        Text that yields no sentences (e.g. an empty string) produces the
        neutral features ``1.0`` / ``0.0`` / ``0`` instead of raising
        ``statistics.StatisticsError``.
    """
    features = {}
    wordcount = 0
    compound_scores = []
    positive_scores = []

    for sentence in sent_tokenize(text):
        wordcount += sum(
            1
            for word in word_tokenize(sentence)
            if word.lower() in top_100_positive
        )
        # Score the sentence once and reuse the result for both features;
        # calling polarity_scores() per key doubled the analysis work.
        scores = sia.polarity_scores(sentence)
        compound_scores.append(scores["compound"])
        positive_scores.append(scores["pos"])

    # mean() raises on an empty sequence, so fall back to a neutral score
    # when the text contained no sentences at all.
    mean_compound = mean(compound_scores) if compound_scores else 0.0
    mean_positive = mean(positive_scores) if positive_scores else 0.0

    features["mean_compound"] = mean_compound + 1
    features["mean_positive"] = mean_positive
    features["wordcount"] = wordcount

    return features

# 🔧 Step 5: Build Features for Training and Test Sets

In [6]:
# Build (feature dict, label) pairs for every training review:
# all positive reviews first, then all negative ones.
train_features = []

for label, review_ids in (("pos", train_pos_ids), ("neg", train_neg_ids)):
    for review_id in review_ids:
        text = movie_reviews.raw(review_id)
        feats = extract_features(text)
        train_features.append((feats, label))

# Randomise the order of the training examples
shuffle(train_features)

In [7]:
# Build (feature dict, label) pairs for the held-out reviews:
# all positive reviews first, then all negative ones.
test_features = []

for label, review_ids in (("pos", test_pos_ids), ("neg", test_neg_ids)):
    for review_id in review_ids:
        text = movie_reviews.raw(review_id)
        feats = extract_features(text)
        test_features.append((feats, label))

# No need to shuffle test set — we’re just evaluating

# 🔧 Step 6: Train and Evaluate

In [8]:
# Fit NLTK's Naive Bayes classifier on the (features, label) pairs built
# from the training reviews only.
classifier = NaiveBayesClassifier.train(train_features)

# Score on the held-out test features; these reviews played no part in
# building top_100_positive or in training.
# NOTE(review): the run recorded in this notebook printed 0.458, which is
# below the 0.5 chance level for two equally sized classes, so the earlier
# "~0.70-0.78" expectation does not hold for this feature set.
# NOTE(review): NLTK's NaiveBayesClassifier appears to treat every distinct
# feature value as its own category, so the float-valued mean_compound /
# mean_positive features may carry little signal -- consider binning them;
# verify against the NLTK documentation before changing the features.
acc = accuracy(classifier, test_features)
print(f"✅ Fair Accuracy: {acc:.3f}")  # fraction of test reviews labelled correctly

✅ Fair Accuracy: 0.458


# 🔧 Step 7: Save the Model (So You Can Use It Later!)

In [9]:
import pickle

# Persist both artefacts needed at prediction time: the trained classifier
# and the word set that extract_features() counts against.
for filename, artifact in (
    ("movie_sentiment_classifier.pkl", classifier),
    ("top_100_positive.pkl", top_100_positive),
):
    with open(filename, "wb") as f:
        pickle.dump(artifact, f)

# 🔧 Step 8: Load and Use Later
Later, in a new script:

In [12]:
import pickle
from statistics import mean
import nltk
# The tokenizers are used by extract_features() below; without this import a
# genuinely fresh script fails with NameError (the notebook only worked
# because the names were still in the kernel from the first cell).
from nltk import sent_tokenize, word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

# Load the saved objects.
# NOTE: pickle.load can execute arbitrary code embedded in the file -- only
# load pickles you created yourself, never files from an untrusted source.
with open("movie_sentiment_classifier.pkl", "rb") as f:
    classifier = pickle.load(f)

with open("top_100_positive.pkl", "rb") as f:
    top_100_positive = pickle.load(f)

# Analyzer instance reused by every extract_features() call
sia = SentimentIntensityAnalyzer()

# Define extract_features again: it must produce the same feature keys and
# values the loaded classifier was trained on.
def extract_features(text):
    """Turn one raw review into the feature dict fed to the classifier.

    Args:
        text: The review as a single string.

    Returns:
        A dict with three entries:
          * "mean_compound": mean per-sentence ``compound`` polarity score,
            shifted by +1.
          * "mean_positive": mean per-sentence ``pos`` polarity score.
          * "wordcount": number of tokens whose lower-cased form is in
            ``top_100_positive``.
        Text that yields no sentences (e.g. an empty string) produces the
        neutral features ``1.0`` / ``0.0`` / ``0`` instead of raising
        ``statistics.StatisticsError``.
    """
    features = {}
    wordcount = 0
    compound_scores = []
    positive_scores = []

    for sentence in sent_tokenize(text):
        wordcount += sum(
            1
            for word in word_tokenize(sentence)
            if word.lower() in top_100_positive
        )
        # Score the sentence once and reuse the result for both features;
        # calling polarity_scores() per key doubled the analysis work.
        scores = sia.polarity_scores(sentence)
        compound_scores.append(scores["compound"])
        positive_scores.append(scores["pos"])

    # mean() raises on an empty sequence, so fall back to a neutral score
    # when the text contained no sentences at all.
    mean_compound = mean(compound_scores) if compound_scores else 0.0
    mean_positive = mean(positive_scores) if positive_scores else 0.0

    features["mean_compound"] = mean_compound + 1
    features["mean_positive"] = mean_positive
    features["wordcount"] = wordcount

    return features

# Run one unseen review through the same feature pipeline and show the
# label the loaded classifier assigns to it.
new_review = "This movie was absolutely bad. I hate it!"
new_features = extract_features(new_review)
prediction = classifier.classify(new_features)
print("Prediction:", prediction)

Prediction: neg
