In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import hamming_loss, jaccard_score, f1_score

from utils import load_movie_features_with_encoded_genres, split_train_test, genres, prepare_for_naive_bayes

# Lift pandas' display limits (column count, console width, cell width) by
# setting each one to None, so wide frames are shown without truncation.
for display_option in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [None]:
# Tunable inputs for this cell, named so they are easy to find and change.
FEATURES_CSV = 'movie_features.csv'
TEST_FRACTION = 0.2
SPLIT_SEED = 42

# Load the feature table. The loader returns two objects; only the second
# (presumably the frame with one encoded column per genre - confirm in utils)
# is used below, after being passed through prepare_for_naive_bayes.
df_original, df_with_genres = load_movie_features_with_encoded_genres(FEATURES_CSV)
df_nb = prepare_for_naive_bayes(df_with_genres)

# Hold out a test partition; the fixed seed keeps the split repeatable.
df_train, df_test = split_train_test(
    df_nb,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

# Report partition sizes as a quick sanity check.
print(f"Training set: {len(df_train)} movies")
print(f"Test set: {len(df_test)} movies")

In [None]:
# Random baseline: assign every test movie a random set of genres. The number
# of genres per movie is drawn from a Poisson distribution whose mean is the
# average genre count observed in the training labels, so the baseline roughly
# matches the label density without looking at any features.
#
# Relies on `np` and the scikit-learn metrics being imported in the notebook's
# top import cell.

# Prepare data: emotion scores as features, one column per genre as labels.
# The random baseline itself only needs the label matrices; X_train / X_test
# are still built here because other cells may rely on them.
emotion_cols = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']
X_train = df_train[emotion_cols].values
y_train = df_train[genres].values
X_test = df_test[emotion_cols].values
y_test = df_test[genres].values

# Seed NumPy's global generator so the baseline is reproducible when the cell
# is re-run.
np.random.seed(42)

print("="*80)
print("RANDOM BASELINE MODEL")
print("="*80)

# Mean number of active genre flags per training movie; used as the Poisson mean.
avg_genres_per_movie = y_train.sum(axis=1).mean()
print(f"\nAverage genres per movie in training set: {avg_genres_per_movie:.2f}")

# Start from an all-zero prediction matrix with the same shape/dtype as y_test.
y_pred_random = np.zeros_like(y_test)
n_labels = len(genres)

for i in range(len(y_test)):
    # Draw how many genres to predict for this movie, then clamp the draw to
    # [1, n_labels]: every movie gets at least one genre, and sampling without
    # replacement cannot ask for more labels than exist.
    num_genres = np.random.poisson(avg_genres_per_movie)
    num_genres = max(1, min(num_genres, n_labels))

    # Pick that many distinct genre column indices uniformly at random.
    selected_genres = np.random.choice(n_labels, size=num_genres, replace=False)
    y_pred_random[i, selected_genres] = 1

# The clamp above means this can differ from the training average.
print(f"Average predicted genres per movie: {y_pred_random.sum(axis=1).mean():.2f}")

In [None]:
# Score the random baseline's predictions against the held-out genre labels.

# Exact match: a movie counts only when every genre flag is predicted correctly.
rows_fully_correct = (y_test == y_pred_random).all(axis=1)
exact_match = rows_fully_correct.mean()

# Per-label accuracy is the complement of the Hamming loss, i.e. the fraction
# of individual genre flags that agree.
per_label_accuracy = 1 - hamming_loss(y_test, y_pred_random)

# Sample-averaged set-overlap metrics: computed per movie, then averaged.
f1 = f1_score(y_test, y_pred_random, average='samples', zero_division=0)
jaccard = jaccard_score(y_test, y_pred_random, average='samples')

print("\n" + "="*80)
print("RANDOM BASELINE PERFORMANCE")
print("="*80)
print(f"Per-Label Accuracy: {per_label_accuracy:.4f}")
print(f"Jaccard Score: {jaccard:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Exact Match: {exact_match:.4f}")