In [1]:
import nltk
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords
import random
import string

# Download required NLTK resources (run once)
nltk.download('movie_reviews')
nltk.download('stopwords')

# -------------------------------
# 1. Load Dataset
# -------------------------------
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle dataset
random.shuffle(documents)

# -------------------------------
# 2. Preprocessing Function
# -------------------------------

stop_words = set(stopwords.words("english"))
punct = set(string.punctuation)

def preprocess(words):
    return [w.lower() for w in words if w.lower() not in stop_words and w not in punct]

# -------------------------------
# 3. Feature Extraction
# -------------------------------

# Create a frequency distribution of all words
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]    # Using top 2000 words as features

def extract_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[f"contains({w})"] = (w in words)
    return features

# Create feature sets
feature_sets = [(extract_features(preprocess(doc)), category) for (doc, category) in documents]

# Train/Test Split (80/20)
train_size = int(len(feature_sets) * 0.8)
train_set = feature_sets[:train_size]
test_set = feature_sets[train_size:]

# -------------------------------
# 4. Train Naive Bayes Classifier
# -------------------------------
classifier = NaiveBayesClassifier.train(train_set)

# -------------------------------
# 5. Evaluate Model
# -------------------------------
accuracy = nltk.classify.accuracy(classifier, test_set)
print("Accuracy:", accuracy)

# Show informative features
classifier.show_most_informative_features(10)

# -------------------------------
# 6. Predict for a New Sentence
# -------------------------------

def predict_sentiment(text):
    words = preprocess(text.split())
    feats = extract_features(words)
    return classifier.classify(feats)

# Test the model
sample_text = "I really loved the movie, it was fantastic and exciting!"
result = predict_sentiment(sample_text)
print("\nPrediction for sample text:", result)


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Accuracy: 0.8425
Most Informative Features
   contains(outstanding) = True              pos : neg    =      9.5 : 1.0
   contains(wonderfully) = True              pos : neg    =      8.4 : 1.0
        contains(seagal) = True              neg : pos    =      7.4 : 1.0
         contains(mulan) = True              pos : neg    =      6.3 : 1.0
        contains(poorly) = True              neg : pos    =      5.7 : 1.0
         contains(awful) = True              neg : pos    =      5.7 : 1.0
        contains(wasted) = True              neg : pos    =      5.6 : 1.0
          contains(lame) = True              neg : pos    =      5.4 : 1.0
    contains(ridiculous) = True              neg : pos    =      5.3 : 1.0
         contains(damon) = True              pos : neg    =      5.0 : 1.0

Prediction for sample text: neg
