In [1]:
from utils import load_movie_features_with_encoded_genres, split_train_test
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)


In [2]:
# Load the data in
from utils import genres, emotions, load_movie_features_with_encoded_genres, prepare_for_naive_bayes

df_original, df_with_genres = load_movie_features_with_encoded_genres('movie_features.csv')

print(f"Genres: {genres}")
print(f"Emotions: {emotions}")

print()
# Display first row of ORIGINAL (has emotion sequences, not counts)
print("=" * 80)
print("First row of ORIGINAL DataFrame:")
print("=" * 80)
first_row = df_original.iloc[0]
print(f"Title: {first_row['title']}")
print(f"Genres: {first_row['genres']}")
print(f"\nEmotion sequence (first 20): {first_row['emotion_sequence'][:20]}")
print(f"Sequence length: {first_row['sequence_length']}")

print()
# Display first row of ENCODED (has genre columns + emotion sequences)
print("=" * 80)
print("First row of ENCODED DataFrame:")
print("=" * 80)
first_row_encoded = df_with_genres.iloc[0]
print(f"Title: {first_row_encoded['title']}")
print(f"Genres: {first_row_encoded['genres']}")
print(f"\nEmotion sequence (first 20): {first_row_encoded['emotion_sequence'][:20]}")

print(f"\nOne-hot encoded genres:")
for genre in genres:
    print(f"  {genre}: {first_row_encoded[genre]}")

print()
# For Naive Bayes - need to convert sequences to counts
df_nb = prepare_for_naive_bayes(df_with_genres)
print("=" * 80)
print("First row prepared for NAIVE BAYES (with emotion counts):")
print("=" * 80)
first_row_nb = df_nb.iloc[0]
print(f"Title: {first_row_nb['title']}")
print(f"\nEmotion counts:")
for emotion in emotions:
    print(f"  {emotion}: {first_row_nb[emotion]}")
print("=" * 80)

Loaded 1643 movies
Parsed emotion sequences to lists
One-hot encoded 23 genres
Genres: ['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'Thriller', 'War', 'Western']
Emotions: ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral']

First row of ORIGINAL DataFrame:
Title: 10 Cloverfield Lane
Genres: Action, Drama, Horror, Mystery, Sci-Fi, Thriller

Emotion sequence (first 20): ['neutral', 'neutral', 'fear', 'neutral', 'neutral', 'neutral', 'sadness', 'neutral', 'neutral', 'disgust', 'disgust', 'neutral', 'disgust', 'sadness', 'surprise', 'surprise', 'neutral', 'disgust', 'neutral', 'neutral']
Sequence length: 2036

First row of ENCODED DataFrame:
Title: 10 Cloverfield Lane
Genres: Action, Drama, Horror, Mystery, Sci-Fi, Thriller

Emotion sequence (first 20): ['neutral', 'neutral', 'fear', 'neutral', 'neutr

In [3]:
# Generate the train test split
df_train, df_test = split_train_test(df_nb, test_size=0.2, random_state=42)

Split dataset:
  Training: 1314 movies (80%)
  Testing:  329 movies (20%)


In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# Prepare features (emotion counts) and labels (genre columns)
emotion_cols = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']
X_train = df_train[emotion_cols].values
y_train = df_train[genres].values  # Binary matrix of all genres

# Create model
model = OneVsRestClassifier(MultinomialNB())

# K-fold cross validation (k=5 is common)
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

print(f"5-Fold Cross Validation Scores: {scores}")
print(f"Mean Accuracy: {scores.mean():.4f}")
print(f"Std Deviation: {scores.std():.4f}")

# NOW actually fit the model on full training set
model.fit(X_train, y_train)
print("\nModel fitted on full training set")

5-Fold Cross Validation Scores: [0.00380228 0.00380228 0.00380228 0.01140684 0.00381679]
Mean Accuracy: 0.0053
Std Deviation: 0.0030

Model fitted on full training set


In [5]:
from sklearn.metrics import hamming_loss, jaccard_score, f1_score
import numpy as np

# Prepare test data
emotion_cols = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']
X_test = df_test[emotion_cols].values
y_test = df_test[genres].values

# Predict
y_pred = model.predict(X_test)

# Per-label accuracy (4/5 correct = 80%)
per_label_accuracy = 1 - hamming_loss(y_test, y_pred)
print(f"Per-Label Accuracy: {per_label_accuracy:.4f}")

# Jaccard score (intersection over union)
jaccard = jaccard_score(y_test, y_pred, average='samples')
print(f"Jaccard Score (avg overlap): {jaccard:.4f}")

# F1 score
f1 = f1_score(y_test, y_pred, average='samples')
print(f"F1 Score: {f1:.4f}")

# Exact match (for comparison)
exact_match = np.mean([np.array_equal(y_test[i], y_pred[i]) for i in range(len(y_test))])
print(f"Exact Match: {exact_match:.4f}")

Per-Label Accuracy: 0.7275
Jaccard Score (avg overlap): 0.2589
F1 Score: 0.3883
Exact Match: 0.0030
