In [2]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m54.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from gensim.models import Word2Vec, KeyedVectors
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import time
import pickle
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
nltk.download('punkt.tab')

[nltk_data] Error loading punkt.tab: Package 'punkt.tab' not found in
[nltk_data]     index


False

In [6]:
# Set random seed for reproducibility
np.random.seed(42)

In [7]:
def create_sample_data(n_samples=2000):
    np.random.seed(42)
    positive_phrases = [
        "loved it", "fantastic movie", "excellent acting", "brilliant performance",
        "amazing plot", "great direction", "wonderful cinematography", "highly recommend",
        "touching story", "emotional journey", "masterpiece", "outstanding film",
        "captivating storyline", "superb cast", "visually stunning"
    ]

    negative_phrases = [
        "waste of time", "terrible acting", "awful script", "poorly directed",
        "boring plot", "disappointing ending", "bad cinematography", "wouldn't recommend",
        "predictable story", "wooden performances", "lackluster", "dreadful film",
        "uninspired storyline", "mediocre cast", "visually dull"
    ]

    reviews = []
    sentiments = []

    for _ in range(n_samples // 2):
        # Generate positive review
        n_positive = np.random.randint(1, 4)
        positive_indices = np.random.choice(len(positive_phrases), n_positive, replace=False)
        pos_review = " ".join([positive_phrases[i] for i in positive_indices])
        filler_words = np.random.randint(5, 20)
        pos_review = f"I watched this movie last weekend. {pos_review} It was a {np.random.choice(['good', 'great', 'fantastic'])} experience."
        reviews.append(pos_review)
        sentiments.append("positive")

        # Generate negative review
        n_negative = np.random.randint(1, 4)
        negative_indices = np.random.choice(len(negative_phrases), n_negative, replace=False)
        neg_review = " ".join([negative_phrases[i] for i in negative_indices])
        filler_words = np.random.randint(5, 20)
        neg_review = f"I saw this film recently. {neg_review} It was a {np.random.choice(['bad', 'terrible', 'disappointing'])} experience."
        reviews.append(neg_review)
        sentiments.append("negative")

    # Create DataFrame
    df = pd.DataFrame({
        'review': reviews,
        'sentiment': sentiments
    })

    # Shuffle the data
    return df.sample(frac=1).reset_index(drop=True)

In [8]:
# Create or load data
df = create_sample_data(n_samples=2000)
print(f"Dataset shape: {df.shape}")
print(df.head())

# Check class distribution
print("\nClass distribution:")
print(df['sentiment'].value_counts())

Dataset shape: (2000, 2)
                                              review sentiment
0  I saw this film recently. disappointing ending...  negative
1  I watched this movie last weekend. masterpiece...  positive
2  I saw this film recently. waste of time It was...  negative
3  I saw this film recently. wooden performances ...  negative
4  I saw this film recently. boring plot dreadful...  negative

Class distribution:
sentiment
negative    1000
positive    1000
Name: count, dtype: int64


In [9]:
# Text preprocessing
def preprocess_text(text):
    """Clean and preprocess text data"""
    # Convert to lowercase
    text = text.lower()

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Remove short words (length < 3)
    tokens = [word for word in tokens if len(word) >= 3]

    return tokens

In [12]:
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [13]:
# Apply preprocessing
print("\nPreprocessing text data...")
start_time = time.time()
df['processed_text'] = df['review'].apply(preprocess_text)
print(f"Preprocessing completed in {time.time() - start_time:.2f} seconds")

# Check a sample of processed text
print("\nSample of processed text:")
print(df['processed_text'].head())


Preprocessing text data...
Preprocessing completed in 4.99 seconds

Sample of processed text:
0    [saw, film, recently, disappointing, ending, w...
1    [watched, movie, last, weekend, masterpiece, g...
2    [saw, film, recently, waste, time, bad, experi...
3    [saw, film, recently, wooden, performance, ter...
4    [saw, film, recently, boring, plot, dreadful, ...
Name: processed_text, dtype: object


In [14]:
# Train Word2Vec model
print("\nTraining Word2Vec model...")
start_time = time.time()

# Set Word2Vec parameters
vector_size = 100  # Dimensionality of word vectors
window_size = 5    # Context window size
min_count = 2      # Minimum word count threshold
sg = 1             # Training algorithm: 1 for skip-gram; 0 for CBOW

# Train model
w2v_model = Word2Vec(
    sentences=df['processed_text'].tolist(),
    vector_size=vector_size,
    window=window_size,
    min_count=min_count,
    sg=sg,
    workers=4
)

print(f"Word2Vec model trained in {time.time() - start_time:.2f} seconds")
print(f"Vocabulary size: {len(w2v_model.wv.key_to_index)}")


Training Word2Vec model...
Word2Vec model trained in 0.09 seconds
Vocabulary size: 54


In [15]:
# Save the model
w2v_model.save("movie_reviews_word2vec.model")
print("Word2Vec model saved to disk")

# Alternatively, we could load pre-trained vectors:
# w2v_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)


Word2Vec model saved to disk


In [16]:
# Feature extraction: Create document vectors
def document_vector(doc, model):
    """Create document vectors by averaging word vectors"""
    # Filter words in vocabulary
    doc_words = [word for word in doc if word in model.wv]

    if len(doc_words) == 0:
        # If no words are in vocabulary, return zero vector
        return np.zeros(model.vector_size)

    # Return average of word vectors
    return np.mean([model.wv[word] for word in doc_words], axis=0)


In [17]:
# Convert documents to vectors
print("\nConverting documents to feature vectors...")
start_time = time.time()
df['document_vector'] = df['processed_text'].apply(lambda x: document_vector(x, w2v_model))
print(f"Document vectors created in {time.time() - start_time:.2f} seconds")



Converting documents to feature vectors...
Document vectors created in 0.07 seconds


In [18]:
# Create feature matrix and target variable
X = np.array(df['document_vector'].tolist())
y = df['sentiment'].values

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")


Training set shape: (1600, 100)
Testing set shape: (400, 100)


In [19]:
# Compare different classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'SVM': SVC(kernel='linear', random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

In [20]:
results = {}


In [21]:
print("\nTraining and evaluating classifiers:")
for name, clf in classifiers.items():
    print(f"\nTraining {name}...")
    start_time = time.time()
    clf.fit(X_train, y_train)
    train_time = time.time() - start_time

    # Predict and evaluate
    start_time = time.time()
    y_pred = clf.predict(X_test)
    predict_time = time.time() - start_time

    # Store results
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = {
        'accuracy': accuracy,
        'training_time': train_time,
        'prediction_time': predict_time,
        'classifier': clf,
        'predictions': y_pred
    }

    print(f"{name} accuracy: {accuracy:.4f}")
    print(f"Training time: {train_time:.2f} seconds")
    print(f"Prediction time: {predict_time:.2f} seconds")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))



Training and evaluating classifiers:

Training Logistic Regression...
Logistic Regression accuracy: 1.0000
Training time: 0.02 seconds
Prediction time: 0.00 seconds

Classification Report:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00       200
    positive       1.00      1.00      1.00       200

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400


Training SVM...
SVM accuracy: 1.0000
Training time: 0.01 seconds
Prediction time: 0.00 seconds

Classification Report:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00       200
    positive       1.00      1.00      1.00       200

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400


Training Random Forest...
Random Fo

In [22]:
# Find the best classifier
best_clf_name = max(results, key=lambda x: results[x]['accuracy'])
best_clf = results[best_clf_name]['classifier']
best_accuracy = results[best_clf_name]['accuracy']

print(f"\nBest classifier: {best_clf_name} with accuracy: {best_accuracy:.4f}")



Best classifier: Logistic Regression with accuracy: 1.0000


In [23]:
# Visualize confusion matrix for the best classifier
best_predictions = results[best_clf_name]['predictions']
cm = confusion_matrix(y_test, best_predictions)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Negative', 'Positive'],
            yticklabels=['Negative', 'Positive'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title(f'Confusion Matrix - {best_clf_name}')
plt.tight_layout()
plt.savefig('confusion_matrix.png')
plt.close()

print("\nConfusion matrix saved as 'confusion_matrix.png'")


Confusion matrix saved as 'confusion_matrix.png'


In [24]:
# Save the best model
with open(f"sentiment_classifier_{best_clf_name.replace(' ', '_').lower()}.pkl", "wb") as f:
    pickle.dump(best_clf, f)
print(f"Best model saved as 'sentiment_classifier_{best_clf_name.replace(' ', '_').lower()}.pkl'")


Best model saved as 'sentiment_classifier_logistic_regression.pkl'


In [25]:
# Hyperparameter tuning for the best classifier
print("\nPerforming hyperparameter tuning for the best classifier...")

if best_clf_name == "Logistic Regression":
    param_grid = {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga']
    }
elif best_clf_name == "SVM":
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto', 0.1, 1]
    }
else:  # Random Forest
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    }



Performing hyperparameter tuning for the best classifier...


In [26]:
# Perform grid search
grid_search = GridSearchCV(best_clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")


Best parameters: {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}
Best cross-validation accuracy: 1.0000


In [27]:
# Evaluate tuned model
tuned_clf = grid_search.best_estimator_
y_pred_tuned = tuned_clf.predict(X_test)
tuned_accuracy = accuracy_score(y_test, y_pred_tuned)

print(f"\nTuned model accuracy: {tuned_accuracy:.4f}")
print("\nClassification Report for tuned model:")
print(classification_report(y_test, y_pred_tuned))



Tuned model accuracy: 1.0000

Classification Report for tuned model:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00       200
    positive       1.00      1.00      1.00       200

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400



In [28]:
# Save the tuned model
with open(f"tuned_sentiment_classifier_{best_clf_name.replace(' ', '_').lower()}.pkl", "wb") as f:
    pickle.dump(tuned_clf, f)
print(f"Tuned model saved as 'tuned_sentiment_classifier_{best_clf_name.replace(' ', '_').lower()}.pkl'")


Tuned model saved as 'tuned_sentiment_classifier_logistic_regression.pkl'


In [29]:
# Create a function to predict sentiment for new reviews
def predict_sentiment(review_text, w2v_model=w2v_model, classifier=tuned_clf):
    """Predict sentiment for a new movie review"""
    # Preprocess the review
    processed_review = preprocess_text(review_text)

    # Convert to document vector
    review_vector = document_vector(processed_review, w2v_model)

    # Reshape for prediction
    review_vector = review_vector.reshape(1, -1)

    # Predict sentiment
    prediction = classifier.predict(review_vector)[0]

    # Get probability scores if classifier supports it
    try:
        probability = classifier.predict_proba(review_vector)[0]
        confidence = max(probability)
    except:
        # Some classifiers like SVM don't have predict_proba
        confidence = None

    return {
        'sentiment': prediction,
        'confidence': confidence
    }

In [30]:
# Example usage of the sentiment prediction function
example_reviews = [
    "This movie was absolutely fantastic! The director did an amazing job.",
    "I found the plot boring and the acting was terrible.",
    "The movie was okay, not great but not terrible either.",
    "A masterpiece that will stand the test of time. Incredible performances by all."
]

In [31]:
print("\nPredicting sentiment for example reviews:")
for review in example_reviews:
    result = predict_sentiment(review)
    print(f"\nReview: {review}")
    print(f"Predicted sentiment: {result['sentiment']}")
    if result['confidence'] is not None:
        print(f"Confidence: {result['confidence']:.4f}")




Predicting sentiment for example reviews:

Review: This movie was absolutely fantastic! The director did an amazing job.
Predicted sentiment: positive
Confidence: 0.8148

Review: I found the plot boring and the acting was terrible.
Predicted sentiment: negative
Confidence: 0.5886

Review: The movie was okay, not great but not terrible either.
Predicted sentiment: positive
Confidence: 0.6781

Review: A masterpiece that will stand the test of time. Incredible performances by all.
Predicted sentiment: positive
Confidence: 0.5294


In [32]:
# Create a simple evaluation function to assess the model's performance
# on different types of reviews
def evaluate_model_on_edge_cases():
    """Test the model on specific edge cases and challenging reviews"""
    edge_cases = [
        # Mixed sentiment
        "The movie had great acting but the plot was confusing.",
        "Brilliant special effects couldn't save the terrible storyline.",

        # Negation
        "This movie wasn't as bad as people say.",
        "The film wasn't great but it wasn't terrible either.",

        # Sarcasm
        "Oh sure, just what we needed, another superhero movie.",
        "If you enjoy watching paint dry, you'll love this film.",

        # Short reviews
        "Loved it!",
        "Terrible.",

        # Implicit sentiment
        "I fell asleep halfway through.",
        "I've watched it three times already."
    ]

    print("\nEvaluating model on challenging reviews:")
    for review in edge_cases:
        result = predict_sentiment(review)
        print(f"\nReview: {review}")
        print(f"Predicted sentiment: {result['sentiment']}")
        if result['confidence'] is not None:
            print(f"Confidence: {result['confidence']:.4f}")


In [33]:
# Run the edge case evaluation
evaluate_model_on_edge_cases()

print("\nExploring the Word2Vec model:")
# Find most similar words to positive sentiment words
positive_words = ["excellent", "amazing", "great"]
print("\nMost similar words to positive sentiment words:")
for word in positive_words:
    if word in w2v_model.wv:
        similar_words = w2v_model.wv.most_similar(word, topn=5)
        print(f"\nWords similar to '{word}':")
        for similar_word, similarity in similar_words:
            print(f"  {similar_word}: {similarity:.4f}")
    else:
        print(f"\nWord '{word}' not in vocabulary")



Evaluating model on challenging reviews:

Review: The movie had great acting but the plot was confusing.
Predicted sentiment: positive
Confidence: 0.7197

Review: Brilliant special effects couldn't save the terrible storyline.
Predicted sentiment: positive
Confidence: 0.5316

Review: This movie wasn't as bad as people say.
Predicted sentiment: positive
Confidence: 0.5909

Review: The film wasn't great but it wasn't terrible either.
Predicted sentiment: negative
Confidence: 0.5399

Review: Oh sure, just what we needed, another superhero movie.
Predicted sentiment: positive
Confidence: 0.8465

Review: If you enjoy watching paint dry, you'll love this film.
Predicted sentiment: negative
Confidence: 0.7327

Review: Loved it!
Predicted sentiment: positive
Confidence: 0.7733

Review: Terrible.
Predicted sentiment: negative
Confidence: 0.7377

Review: I fell asleep halfway through.
Predicted sentiment: negative
Confidence: 0.5000

Review: I've watched it three times already.
Predicted sentim

In [34]:
# Find most similar words to negative sentiment words
negative_words = ["terrible", "boring", "awful"]
print("\nMost similar words to negative sentiment words:")
for word in negative_words:
    if word in w2v_model.wv:
        similar_words = w2v_model.wv.most_similar(word, topn=5)
        print(f"\nWords similar to '{word}':")
        for similar_word, similarity in similar_words:
            print(f"  {similar_word}: {similarity:.4f}")
    else:
        print(f"\nWord '{word}' not in vocabulary")



Most similar words to negative sentiment words:

Words similar to 'terrible':
  lackluster: 0.9981
  bad: 0.9979
  disappointing: 0.9975
  boring: 0.9969
  film: 0.9968

Words similar to 'boring':
  lackluster: 0.9982
  bad: 0.9972
  terrible: 0.9969
  wooden: 0.9966
  dreadful: 0.9966

Words similar to 'awful':
  saw: 0.9968
  dreadful: 0.9966
  disappointing: 0.9960
  recently: 0.9957
  ending: 0.9955


In [35]:
# Word analogies
try:
    print("\nWord analogies:")
    # Classic example: king - man + woman = queen
    result = w2v_model.wv.most_similar(positive=['good', 'terrible'], negative=['bad'], topn=1)
    print(f"good - bad + terrible = {result[0][0]} (similarity: {result[0][1]:.4f})")
except:
    print("\nCould not perform word analogies (vocabulary might be too small)")

print("\nModel Pipeline Complete!")


Word analogies:
good - bad + terrible = excellent (similarity: 0.9967)

Model Pipeline Complete!
