In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Download required NLTK data
import nltk

# Download punkt_tab for tokenization (newer version)
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

print('\nNLTK data downloaded successfully!')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\thoma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\thoma\AppData\Roaming\nltk_data...



NLTK data downloaded successfully!


[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thoma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\thoma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\thoma\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
# Load the movie reviews dataset
df = pd.read_csv('movie_reviews_dataset.csv')

print(f'Dataset shape: {df.shape}')
print(f'\nFirst few rows:')
print(df.head())
print(f'\nSentiment distribution:')
print(df['sentiment'].value_counts())


Dataset shape: (10, 2)

First few rows:
                                                text sentiment
0  I absolutely loved this movie. The acting was ...  positive
1                      Terrible film. Waste of time.  negative
2  What a fantastic experience, I’ll watch it again!  positive
3            Worst movie ever, I couldn’t finish it.  negative
4   Beautiful direction and story. Highly recommend!  positive

Sentiment distribution:
sentiment
positive    5
negative    5
Name: count, dtype: int64


In [5]:
# Initialize preprocessing tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """
    Preprocess text with tokenization, stemming, lemmatization, and POS tagging
    
    Args:
        text: Input text string
    
    Returns:
        Tuple of (cleaned_text, pos_tags)
    """
    # Convert to lowercase
    text = text.lower()
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove non-alphabetic tokens and stop words
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    
    # Stemming
    stemmed = [stemmer.stem(word) for word in tokens]
    
    # Lemmatization
    lemmatized = [lemmatizer.lemmatize(word) for word in stemmed]
    
    # POS Tagging
    pos_tags = pos_tag(lemmatized)
    
    # Return cleaned text and POS tags
    return ' '.join(lemmatized), pos_tags

print('Preprocessing function defined successfully!')


Preprocessing function defined successfully!


In [6]:
# Apply preprocessing to the dataset
print('Applying preprocessing to dataset...')
df['clean_text'], df['pos_tags'] = zip(*df['text'].apply(preprocess_text))

print('\nPreprocessing complete!')
print('\nSample of original vs. preprocessed text:')
for i in range(3):
    print(f'\nOriginal: {df.iloc[i]["text"]}')
    print(f'Cleaned:  {df.iloc[i]["clean_text"]}')
    print(f'POS Tags: {df.iloc[i]["pos_tags"][:5]}...')  # Show first 5 POS tags


Applying preprocessing to dataset...

Preprocessing complete!

Sample of original vs. preprocessed text:

Original: I absolutely loved this movie. The acting was great!
Cleaned:  absolut love movi act great
POS Tags: [('absolut', 'RB'), ('love', 'VB'), ('movi', 'NN'), ('act', 'NN'), ('great', 'JJ')]...

Original: Terrible film. Waste of time.
Cleaned:  terribl film wast time
POS Tags: [('terribl', 'NN'), ('film', 'NN'), ('wast', 'NN'), ('time', 'NN')]...

Original: What a fantastic experience, I’ll watch it again!
Cleaned:  fantast experi watch
POS Tags: [('fantast', 'JJ'), ('experi', 'NN'), ('watch', 'NN')]...


In [7]:
# Method 1: Count Vectorization
count_vectorizer = CountVectorizer(max_features=100)
X_count = count_vectorizer.fit_transform(df['clean_text'])

print(f'Count Vectorization shape: {X_count.shape}')
print(f'Feature names (first 10): {count_vectorizer.get_feature_names_out()[:10]}')

# Method 2: TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=100)
X_tfidf = tfidf_vectorizer.fit_transform(df['clean_text'])

print(f'\nTF-IDF Vectorization shape: {X_tfidf.shape}')
print(f'Feature names (first 10): {tfidf_vectorizer.get_feature_names_out()[:10]}')


Count Vectorization shape: (10, 37)
Feature names (first 10): ['absolut' 'act' 'aw' 'bad' 'beauti' 'bit' 'bore' 'brilliant' 'cast'
 'cinematographi']

TF-IDF Vectorization shape: (10, 37)
Feature names (first 10): ['absolut' 'act' 'aw' 'bad' 'beauti' 'bit' 'bore' 'brilliant' 'cast'
 'cinematographi']


In [8]:
# Prepare target variable
y = df['sentiment']

# Split data into training and testing sets (80-20 split)
X_train_count, X_test_count, y_train, y_test = train_test_split(
    X_count, y, test_size=0.2, random_state=42, stratify=y
)

X_train_tfidf, X_test_tfidf, _, _ = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42, stratify=y
)

print(f'Training set size: {X_train_count.shape[0]}')
print(f'Testing set size: {X_test_count.shape[0]}')
print(f'\nTraining set sentiment distribution:')
print(y_train.value_counts())


Training set size: 8
Testing set size: 2

Training set sentiment distribution:
sentiment
positive    4
negative    4
Name: count, dtype: int64


In [9]:
# Train Naive Bayes with Count Vectorization
print('Training Naive Bayes with Count Vectorization...')
nb_count = MultinomialNB()
nb_count.fit(X_train_count, y_train)
print('Training complete!')

# Train Naive Bayes with TF-IDF
print('\nTraining Naive Bayes with TF-IDF...')
nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_tfidf, y_train)
print('Training complete!')

# Train Logistic Regression with Count Vectorization
print('\nTraining Logistic Regression with Count Vectorization...')
lr_count = LogisticRegression(max_iter=1000, random_state=42)
lr_count.fit(X_train_count, y_train)
print('Training complete!')

# Train Logistic Regression with TF-IDF
print('\nTraining Logistic Regression with TF-IDF...')
lr_tfidf = LogisticRegression(max_iter=1000, random_state=42)
lr_tfidf.fit(X_train_tfidf, y_train)
print('Training complete!')


Training Naive Bayes with Count Vectorization...
Training complete!

Training Naive Bayes with TF-IDF...
Training complete!

Training Logistic Regression with Count Vectorization...
Training complete!

Training Logistic Regression with TF-IDF...
Training complete!


In [10]:
# Function to evaluate model
def evaluate_model(model, X_test, y_test, model_name):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    print(f'\n{"="*60}')
    print(f'{model_name}')
    print(f'{"="*60}')
    print(f'Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)')
    print(f'\nClassification Report:')
    print(classification_report(y_test, y_pred))
    print(f'\nConfusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    
    return accuracy

# Evaluate all models
results = {}
results['NB_Count'] = evaluate_model(nb_count, X_test_count, y_test, 'Naive Bayes with Count Vectorization')
results['NB_TFIDF'] = evaluate_model(nb_tfidf, X_test_tfidf, y_test, 'Naive Bayes with TF-IDF')
results['LR_Count'] = evaluate_model(lr_count, X_test_count, y_test, 'Logistic Regression with Count Vectorization')
results['LR_TFIDF'] = evaluate_model(lr_tfidf, X_test_tfidf, y_test, 'Logistic Regression with TF-IDF')



Naive Bayes with Count Vectorization
Accuracy: 1.0000 (100.00%)

Classification Report:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         1
    positive       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2


Confusion Matrix:
[[1 0]
 [0 1]]

Naive Bayes with TF-IDF
Accuracy: 1.0000 (100.00%)

Classification Report:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         1
    positive       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2


Confusion Matrix:
[[1 0]
 [0 1]]

Logistic Regression with Count Vectorization
Accuracy: 1.0000 (100.00%)

Classification Report:
              pr

In [11]:
# Compare all models
print('\n' + '='*60)
print('MODEL COMPARISON SUMMARY')
print('='*60)
for model_name, accuracy in sorted(results.items(), key=lambda x: x[1], reverse=True):
    print(f'{model_name:30s}: {accuracy:.4f} ({accuracy*100:.2f}%)')

best_model = max(results, key=results.get)
print(f'\nBest Model: {best_model} with accuracy {results[best_model]:.4f}')



MODEL COMPARISON SUMMARY
NB_Count                      : 1.0000 (100.00%)
NB_TFIDF                      : 1.0000 (100.00%)
LR_Count                      : 1.0000 (100.00%)
LR_TFIDF                      : 0.5000 (50.00%)

Best Model: NB_Count with accuracy 1.0000


In [None]:
# Function to predict sentiment of new text
def predict_sentiment(text, model, vectorizer):
    # Preprocess the text
    cleaned_text, _ = preprocess_text(text)
    
    # Vectorize
    text_vectorized = vectorizer.transform([cleaned_text])
    
    # Predict
    prediction = model.predict(text_vectorized)[0]
    
    return prediction

# Test with example reviews
test_reviews = [
    "This movie was absolutely fantastic! I loved every minute of it.",
    "Terrible movie, complete waste of time and money.",
    "The acting was brilliant and the story was captivating.",
    "Boring and predictable. Would not recommend."
]

print('Testing with custom reviews using best performing model (Logistic Regression + TF-IDF):\n')
for review in test_reviews:
    sentiment = predict_sentiment(review, lr_tfidf, tfidf_vectorizer)
    print(f'Review: "{review}"')
    print(f'Predicted Sentiment: {sentiment.upper()}')
    print()
