In [3]:
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import movie_reviews, stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tag import pos_tag
from nltk import FreqDist
import string
import random

In [4]:
# Download required NLTK data
nltk.download('movie_reviews')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/ravireddy/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package punkt to /Users/ravireddy/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ravireddy/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ravireddy/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ravireddy/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [6]:
def load_data():
    """Load and prepare the dataset"""
    print("Loading data...")
    
    # Load movie reviews dataset
    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    
    # Shuffle for randomness
    random.shuffle(documents)
    
    print(f"Total documents: {len(documents)}")
    print(f"Categories: {movie_reviews.categories()}")
    
    return documents

In [7]:
def clean_text(tokens):
    """Clean text data"""
    
    # Convert to lowercase
    tokens = [w.lower() for w in tokens]
    
    # Remove punctuation
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    
    # Remove non-alphabetic tokens
    tokens = [word for word in tokens if word.isalpha()]
    
    # Remove short words (length < 3)
    tokens = [word for word in tokens if len(word) >= 3]
    
    return tokens

In [8]:
def preprocess_text(tokens):
    """Apply various preprocessing techniques"""
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    
    # Lemmatization (choose either lemmatization OR stemming)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    
    # Alternative: Stemming (uncomment to use instead of lemmatization)
    # stemmer = PorterStemmer()
    # tokens = [stemmer.stem(w) for w in tokens]
    
    return tokens

In [9]:
def extract_features(document, word_features):
    """Extract features for classification"""
    document_words = set(document)
    features = {}
    
    for word in word_features:
        features[f'contains({word})'] = (word in document_words)
    
    return features

In [10]:
def get_top_words(all_words, n=2000):
    """Get top N most frequent words as features"""
    freq_dist = FreqDist(all_words)
    return list(freq_dist.keys())[:n]

In [11]:
def prepare_data_for_training(documents):
    """Complete preprocessing pipeline"""
    print("\nPreprocessing data...")
    
    # Clean and preprocess all documents
    processed_docs = []
    all_words = []
    
    for tokens, category in documents:
        # Clean
        cleaned = clean_text(tokens)
        # Preprocess
        processed = preprocess_text(cleaned)
        
        processed_docs.append((processed, category))
        all_words.extend(processed)
    
    # Get word features
    word_features = get_top_words(all_words, n=2000)
    
    # Create feature sets
    featuresets = [(extract_features(doc, word_features), cat) 
                   for (doc, cat) in processed_docs]
    
    return featuresets, word_features

In [12]:
def split_data(featuresets, train_ratio=0.8):
    """Split data into training and testing sets"""
    train_size = int(len(featuresets) * train_ratio)
    
    train_set = featuresets[:train_size]
    test_set = featuresets[train_size:]
    
    print(f"\nTraining set size: {len(train_set)}")
    print(f"Testing set size: {len(test_set)}")
    
    return train_set, test_set

In [13]:
def train_models(train_set):
    """Train different classification models"""
    print("\nTraining models...")
    
    # Naive Bayes Classifier
    nb_classifier = nltk.NaiveBayesClassifier.train(train_set)
    
    # Decision Tree Classifier
    dt_classifier = nltk.DecisionTreeClassifier.train(train_set)
    
    # Maximum Entropy Classifier (may take longer)
    # maxent_classifier = nltk.MaxentClassifier.train(train_set, max_iter=3)
    
    return nb_classifier, dt_classifier

In [None]:
def evaluate_models(classifiers, test_set):
    """Evaluate trained models"""
    print("\nModel Performance:")
    print("=" * 50)
    
    names = ['Naive Bayes', 'Decision Tree']
    
    for name, classifier in zip(names, classifiers):
        accuracy = nltk.classify.accuracy(classifier, test_set) * 100
        print(f"{name} Accuracy: {accuracy:.2f}%")
    
    # Show most informative features for Naive Bayes
    print("\nMost Informative Features (Naive Bayes):")
    print("=" * 50)
    classifiers[0].show_most_informative_features(15)

In [15]:
def predict_sentiment(text, classifier, word_features):
    """Predict sentiment of new text"""
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Clean
    cleaned = clean_text(tokens)
    
    # Preprocess
    processed = preprocess_text(cleaned)
    
    # Extract features
    features = extract_features(processed, word_features)
    
    # Predict
    sentiment = classifier.classify(features)
    prob_dist = classifier.prob_classify(features)
    
    return sentiment, prob_dist.prob(sentiment)

In [16]:


# Step 1: Load data
documents = load_data()

# Step 2-6: Prepare data
featuresets, word_features = prepare_data_for_training(documents)

# Step 7: Split data
train_set, test_set = split_data(featuresets)

# Step 8: Train models
nb_classifier, dt_classifier = train_models(train_set)

# Step 9: Evaluate
evaluate_models([nb_classifier, dt_classifier], test_set)

# Step 10: Test predictions
print("\n" + "=" * 50)
print("TESTING PREDICTIONS")
print("=" * 50)

test_reviews = [
    "This movie was absolutely fantastic! Great acting and plot.",
    "Terrible film. Waste of time and money.",
    "An okay movie, nothing special but not bad either."
]

for review in test_reviews:
    sentiment, confidence = predict_sentiment(review, nb_classifier, word_features)
    print(f"\nReview: {review}")
    print(f"Predicted Sentiment: {sentiment.upper()}")
    print(f"Confidence: {confidence:.2%}")

print("\n" + "=" * 50)
print("Pipeline Complete!")
print("=" * 50)

Loading data...
Total documents: 2000
Categories: ['neg', 'pos']

Preprocessing data...

Training set size: 1600
Testing set size: 400

Training models...

Model Performance:
Naive Bayes Accuracy: 74.75%
Decision Tree Accuracy: 62.50%

Most Informative Features (Naive Bayes):
Most Informative Features
     contains(maintains) = True              pos : neg    =     10.1 : 1.0
   contains(outstanding) = True              pos : neg    =      9.6 : 1.0
     contains(marvelous) = True              pos : neg    =      9.4 : 1.0
          contains(chip) = True              pos : neg    =      8.7 : 1.0
      contains(thematic) = True              pos : neg    =      8.0 : 1.0
     contains(balancing) = True              pos : neg    =      7.3 : 1.0
contains(claustrophobic) = True              pos : neg    =      7.3 : 1.0
   contains(fascination) = True              pos : neg    =      7.3 : 1.0
        contains(primal) = True              pos : neg    =      7.3 : 1.0
          contains(lam

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/ravireddy/nltk_data'
    - '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/nltk_data'
    - '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/share/nltk_data'
    - '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
