In [1]:
!pip install pandas




In [None]:
import ast

import pandas as pd
from sklearn.model_selection import train_test_split


# Lift pandas' display limits (column count, total width, per-cell width) by
# setting each of them to None.
for _display_option in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(_display_option, None)


In [3]:

def split_train_test(df, test_size=0.2, random_state=42):
    """
    Partition a DataFrame into a training part and a held-out test part.

    Thin wrapper around ``train_test_split`` that also prints a short summary.
    The percentages in the summary are the nominal ones derived from
    ``test_size``; they are not recomputed from the resulting row counts.

    Args:
        df: DataFrame to split
        test_size: Proportion of rows assigned to the test set (0.0 to 1.0)
        random_state: Random seed forwarded to ``train_test_split``

    Returns:
        df_train: Training set DataFrame
        df_test: Test set DataFrame
    """
    parts = train_test_split(df, test_size=test_size, random_state=random_state)
    df_train, df_test = parts

    train_pct = (1-test_size)*100
    test_pct = test_size*100
    summary_lines = (
        "Split dataset:",
        f"  Training: {len(df_train)} movies ({train_pct:.0f}%)",
        f"  Testing:  {len(df_test)} movies ({test_pct:.0f}%)",
    )
    for line in summary_lines:
        print(line)

    return df_train, df_test



In [4]:

def load_movie_features_with_encoded_genres(filename):
    """
    Load movie features CSV and one-hot encode genres.

    Genre matching is done on whole genre names: each ``genres`` cell is split
    on commas and the resulting tokens are compared for equality with the
    names in the module-level ``genres`` list. A plain substring test would
    also flag every 'Musical' movie as 'Music'.

    Args:
        filename: Path to movie_features.csv

    Returns:
        df_original: DataFrame as read from disk (string genres); its
            emotion_sequence column, when present, is parsed to lists
        df_with_genres: Copy of df_original with one additional 0/1 column
            per entry of ``genres``

    Raises:
        ValueError / SyntaxError: if an emotion_sequence cell is not a valid
            Python literal (raised by ``ast.literal_eval``)
    """
    df_original = pd.read_csv(filename)
    print(f"Loaded {len(df_original)} movies")

    # Parse emotion_sequence from its string form to an actual list.
    # SECURITY: this used to call eval(), which executes arbitrary code found
    # in the CSV. ast.literal_eval only accepts Python literals.
    if 'emotion_sequence' in df_original.columns:
        df_original['emotion_sequence'] = df_original['emotion_sequence'].apply(ast.literal_eval)
        print("Parsed emotion sequences to lists")

    def _genre_tokens(cell):
        """Return the set of genre names contained in one `genres` cell."""
        if pd.isna(cell):
            return frozenset()
        # Strip whitespace plus brackets/quotes so a stringified list such as
        # "['Action', 'Drama']" tokenizes the same way as "Action, Drama".
        return frozenset(tok.strip(" \t[]'\"") for tok in str(cell).split(','))

    # Tokenize once instead of re-scanning the raw string for every genre
    tokens_per_movie = df_original['genres'].apply(_genre_tokens)

    # Create binary columns for each genre
    df_with_genres = df_original.copy()
    for genre in genres:
        # `name=genre` binds the current genre at definition time
        df_with_genres[genre] = tokens_per_movie.apply(
            lambda toks, name=genre: int(name in toks)
        )

    print(f"One-hot encoded {len(genres)} genres")

    return df_original, df_with_genres



In [5]:
def downsample_long_sequences(df_lstm, max_length=2000):
    """
    Downsample emotion sequences that exceed max_length.

    Sequences longer than ``max_length`` are thinned by keeping every k-th
    element, where k is the smallest stride that brings the length down to at
    most ``max_length`` (ceil(len / max_length)). Shorter sequences are left
    untouched. The input frame is not modified.

    Args:
        df_lstm: DataFrame with emotion_sequence_int column
        max_length: Maximum desired sequence length (must be >= 1)

    Returns:
        Copy of df_lstm with downsampled emotion_sequence_int values and a
        sequence_length column holding the new lengths

    Raises:
        ValueError: if max_length is smaller than 1
    """
    if max_length < 1:
        raise ValueError(f"max_length must be a positive integer, got {max_length!r}")

    def adaptive_downsample(seq):
        if len(seq) <= max_length:
            return seq
        # Ceil division. The previous `len // max_length + 1` used a stride one
        # larger than necessary whenever len was an exact multiple of
        # max_length (e.g. 4000 -> 1334 items instead of 2000).
        stride = -(-len(seq) // max_length)
        return seq[::stride]

    df_downsampled = df_lstm.copy()

    # Apply downsampling
    df_downsampled['emotion_sequence_int'] = df_downsampled['emotion_sequence_int'].apply(adaptive_downsample)

    # Update sequence lengths
    df_downsampled['sequence_length'] = df_downsampled['emotion_sequence_int'].apply(len)

    return df_downsampled

In [6]:

def prepare_for_lstm(df_with_genres):
    """
    Add an integer-encoded version of each movie's emotion sequence.

    Every emotion label is replaced by its position in the module-level
    ``emotions`` list, so the first label gets id 0. A label that is not in
    ``emotions`` raises KeyError. The input frame is left unmodified.

    Args:
        df_with_genres: DataFrame with emotion_sequence column (iterables of
            emotion labels)

    Returns:
        Copy of the input with an added emotion_sequence_int column
        (lists of integer ids)
    """
    # Position in `emotions` doubles as the integer id
    emotion_to_int = dict(zip(emotions, range(len(emotions))))

    def encode(sequence):
        return [emotion_to_int[label] for label in sequence]

    df_lstm = df_with_genres.copy()
    df_lstm['emotion_sequence_int'] = df_lstm['emotion_sequence'].apply(encode)

    print("Converted emotion sequences to integer encoding for LSTM")

    return df_lstm


In [7]:
# Label vocabularies read as globals by the helper functions defined above.
genres = [
    'Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'History',
    'Horror', 'Music', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Short',
    'Sport', 'Thriller', 'War', 'Western',
]
emotions = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral']

# Pipeline: load -> integer-encode -> split (downsampling is left disabled)
df_original, df_with_genres = load_movie_features_with_encoded_genres('movie_features.csv')
df_lstm = prepare_for_lstm(df_with_genres)
#df_lstm = downsample_long_sequences(df_lstm, max_length=2000)
df_train, df_test = split_train_test(df_lstm, test_size=0.2, random_state=42)

# Build the padded feature matrices and the multi-hot label matrices
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train_seqs = df_train['emotion_sequence_int'].tolist()
X_test_seqs = df_test['emotion_sequence_int'].tolist()

# Every sequence is padded to the longest one found in either split
longest_train = max(map(len, X_train_seqs))
longest_test = max(map(len, X_test_seqs))
max_length = max(longest_train, longest_test)

# NOTE(review): pad_sequences pads with 0 by default, and 0 is also the id
# prepare_for_lstm assigns to 'anger' - padding and 'anger' are
# indistinguishable in X_train / X_test.
pad_options = dict(maxlen=max_length, padding='post')
X_train = pad_sequences(X_train_seqs, **pad_options)
X_test = pad_sequences(X_test_seqs, **pad_options)

# One 0/1 column per genre
y_train = df_train[genres].values
y_test = df_test[genres].values

Loaded 1643 movies
Parsed emotion sequences to lists
One-hot encoded 23 genres
Converted emotion sequences to integer encoding for LSTM


NameError: name 'train_test_split' is not defined

In [None]:
from collections import Counter

# Tally every comma-separated genre name over the movies that have a genres value
genre_counts = Counter()
for genres_str in df_lstm['genres'].dropna():
    genre_list = list(map(str.strip, str(genres_str).split(',')))
    genre_counts.update(genre_list)

# Most frequent first
sorted_genres = genre_counts.most_common()

print("Genre frequencies:")
for genre, count in sorted_genres:
    # Share of all movies carrying this genre (a movie can carry several)
    percentage = (count / len(df_lstm)) * 100
    print(f"{genre:15} {count:4} ({percentage:.1f}%)")

print(f"\nTotal movies: {len(df_lstm)}")

In [None]:
# Size of the padded training matrix; nbytes / 1024**3 converts bytes to GiB
print(
    f"Max sequence length: {max_length}",
    f"Training data shape: {X_train.shape}",
    f"Memory estimate: {X_train.nbytes / (1024**3):.2f} GB",
    sep="\n",
)

In [None]:
# Sanity check

import tensorflow as tf

# Smoke test: TensorFlow imports, and report the GPU devices it lists
print("TF OK:", tf.__version__)
gpu_devices = tf.config.list_physical_devices("GPU")
print("GPU:", gpu_devices)


In [None]:
import tensorflow as tf
print("TensorFlow version:", tf.__version__)
print("GPU devices:", tf.config.list_physical_devices('GPU'))
print("CUDA available:", tf.test.is_built_with_cuda())

# Cross-check CUDA visibility through PyTorch. This only reports whether
# PyTorch can see a CUDA device; it says nothing about which device TensorFlow
# runs on. PyTorch is not used elsewhere in this notebook, so its absence must
# not break the cell.
try:
    import torch
except ImportError:
    print("\nPyTorch not installed; skipping PyTorch CUDA check")
else:
    print("\nPyTorch CUDA available:", torch.cuda.is_available())
    if torch.cuda.is_available():
        print("PyTorch GPU:", torch.cuda.get_device_name(0))

In [None]:
import os

# Force CPU. The variable is set before the TensorFlow imports below so it is
# in place if this cell is the first to load TensorFlow.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # Force CPU

import numpy as np
import tensorflow as tf
from tensorflow.keras import models, layers
from tensorflow.keras.metrics import Precision, Recall, AUC
from tensorflow.keras.preprocessing.sequence import pad_sequences

from utils import genres, emotions
from utils import load_movie_features_with_encoded_genres, prepare_for_lstm, split_train_test

# If an earlier cell already imported TensorFlow and listed the GPUs, the
# environment variable above comes too late. Hiding the GPUs through the
# TensorFlow API still works until the runtime has been initialized.
try:
    tf.config.set_visible_devices([], 'GPU')
except RuntimeError as err:
    print(f"Could not hide GPUs (TensorFlow runtime already initialized): {err}")

# Id reserved for padding. Emotion ids are shifted up by one below so that no
# real emotion shares this value.
PAD_TOKEN = 0

# Load and prepare data
df_original, df_with_genres = load_movie_features_with_encoded_genres('movie_features.csv')
df_lstm = prepare_for_lstm(df_with_genres)

# Split (no downsampling)
df_train, df_test = split_train_test(df_lstm, test_size=0.2, random_state=42)

# Get sequences, shifted by +1 so 0 is free for padding.
# Assumes prepare_for_lstm emits ids 0..len(emotions)-1, as the notebook
# version of that function does - TODO confirm for the utils version.
X_train_seqs = [[token + 1 for token in seq] for seq in df_train['emotion_sequence_int']]
X_test_seqs = [[token + 1 for token in seq] for seq in df_test['emotion_sequence_int']]

# Pad to full max length
max_length = max(max(len(seq) for seq in X_train_seqs), max(len(seq) for seq in X_test_seqs))
print(f"Max sequence length: {max_length}")

X_train = pad_sequences(X_train_seqs, maxlen=max_length, padding='post', value=PAD_TOKEN)
X_test = pad_sequences(X_test_seqs, maxlen=max_length, padding='post', value=PAD_TOKEN)

y_train = df_train[genres].values
y_test = df_test[genres].values

print(f"Training data shape: {X_train.shape}")
print(f"Training labels shape: {y_train.shape}")

# Create binary LSTM model
def create_binary_lstm(max_length):
    """
    Build and compile a single-output (one genre vs. rest) LSTM classifier.

    Architecture: Embedding -> LSTM(64) -> Dense(64, relu) -> Dropout(0.5)
    -> Dense(1, sigmoid), trained with binary cross-entropy and reporting
    precision, recall and AUC.

    The embedding has ``len(emotions) + 1`` rows because id 0 is reserved for
    padding, and ``mask_zero=True`` makes the LSTM skip padded timesteps.
    Without the mask, post-padded inputs make the LSTM's final state the
    result of running over every trailing pad step.

    Args:
        max_length: Length every input sequence is padded to

    Returns:
        Compiled Keras Sequential model
    """
    model = models.Sequential([
        layers.Embedding(
            input_dim=len(emotions) + 1,  # +1 for the padding id
            output_dim=32,
            input_length=max_length,
            mask_zero=True,
        ),
        layers.LSTM(64, return_sequences=False),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')  # Binary output
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=[
            Precision(name='precision'),
            Recall(name='recall'),
            AUC(name='auc')
        ]
    )
    return model

# Train one model per genre
# One independent binary classifier is fitted per genre column of y_train.
trained_models = {}
histories = {}

banner = '=' * 80
n_train = len(y_train)

for i, genre in enumerate(genres):
    print(f"\n{banner}")
    print(f"Training model {i+1}/{len(genres)}: {genre}")
    print(banner)

    # Class balance for this genre
    genre_labels = y_train[:, i]
    pos_count = genre_labels.sum()
    neg_count = n_train - pos_count
    print(f"Positive samples: {pos_count} ({pos_count/n_train*100:.1f}%)")
    print(f"Negative samples: {neg_count} ({neg_count/n_train*100:.1f}%)")

    # Fresh model for every genre, fitted on that genre's 0/1 column only
    model = create_binary_lstm(max_length)
    fit_options = dict(epochs=20, batch_size=32, validation_split=0.2, verbose=1)
    history = model.fit(X_train, genre_labels, **fit_options)

    # Keep model and history in memory for the cells below
    trained_models[genre] = model
    histories[genre] = history

    # Persist the model; '-' in genre names is replaced by '_' in the file name
    model_path = f'lstm_binary_{genre.replace("-", "_")}.keras'
    model.save(model_path)
    print(f"Saved model for {genre}")

print("\n" + banner)
print("All models trained")
print(banner)

# Make predictions
# Collect each genre model's probability for every test movie
print("\nGenerating predictions on test set...")
predictions_dict = {}

for genre in genres:
    print(f"Predicting {genre}...")
    model = trained_models[genre]
    pred_proba = model.predict(X_test, verbose=0).flatten()
    predictions_dict[genre] = pred_proba

# Assemble a (movies x genres) matrix, columns in `genres` order, and threshold at 0.5
y_pred_proba = np.stack([predictions_dict[g] for g in genres], axis=1)
y_pred = (y_pred_proba > 0.5).astype(int)

# Multi-label metrics
from sklearn.metrics import hamming_loss, jaccard_score, f1_score

per_label_accuracy = 1 - hamming_loss(y_test, y_pred)
jaccard = jaccard_score(y_test, y_pred, average='samples')
f1 = f1_score(y_test, y_pred, average='samples', zero_division=0)
# Fraction of movies whose complete genre vector is predicted exactly
exact_match = np.all(y_test == y_pred, axis=1).mean()

rule = "=" * 80
print("\n" + rule)
print("FINAL RESULTS")
print(rule)
print(f"Per-Label Accuracy: {per_label_accuracy:.4f}")
print(f"Jaccard Score: {jaccard:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Exact Match: {exact_match:.4f}")
print(f"\nAverage predicted genres per movie: {y_pred.sum(axis=1).mean():.2f}")
print(f"Average true genres per movie: {y_test.sum(axis=1).mean():.2f}")

In [None]:
from sklearn.metrics import hamming_loss, jaccard_score, f1_score
import numpy as np

# Predict (X_test, y_test already created from previous steps).
# The global `model` left behind by the training cell is only the last
# per-genre binary classifier, so its (n, 1) output cannot be scored against
# the (n, n_genres) y_test. Stack the probabilities of all per-genre models.
y_pred_proba = np.column_stack([
    trained_models[genre_name].predict(X_test, verbose=0).ravel()
    for genre_name in genres
])
y_pred = (y_pred_proba > 0.5).astype(int)

# Per-label accuracy
per_label_accuracy = 1 - hamming_loss(y_test, y_pred)
print(f"Per-Label Accuracy: {per_label_accuracy:.4f}")

# Jaccard score
jaccard = jaccard_score(y_test, y_pred, average='samples')
print(f"Jaccard Score (avg overlap): {jaccard:.4f}")

# F1 score
f1 = f1_score(y_test, y_pred, average='samples', zero_division=0)
print(f"F1 Score: {f1:.4f}")

# Exact match: fraction of movies whose whole genre vector is predicted correctly
exact_match = np.mean([np.array_equal(y_test[i], y_pred[i]) for i in range(len(y_test))])
print(f"Exact Match: {exact_match:.4f}")

In [None]:
import numpy as np
from tensorflow.keras import Model
from utils import genres

# Walks one model (`model`) layer by layer on the first test movies to find
# the first stage whose output no longer depends on the input. The conclusion
# at the end is derived from the checks instead of being hard-coded.

print("=" * 80)
print("LSTM DIAGNOSTIC REPORT")
print("=" * 80)

# 1. Check input sequences are different
inputs_identical = np.array_equal(X_test[0], X_test[1])
print("\n1. INPUT SEQUENCES (first 20 emotions):")
print(f"   Test movie 0: {X_test[0][:20]}")
print(f"   Test movie 1: {X_test[1][:20]}")
print(f"   Test movie 2: {X_test[2][:20]}")
print(f"   Sequences identical? {inputs_identical}")

# 2. Check embeddings are different (sub-model ending at layer 0)
embedding_model = Model(inputs=model.input, outputs=model.layers[0].output)
embed_0 = embedding_model.predict(X_test[0:1], verbose=0)
embed_1 = embedding_model.predict(X_test[1:2], verbose=0)
embeddings_identical = np.allclose(embed_0, embed_1)
print(f"\n2. EMBEDDING LAYER:")
print(f"   Output 0 sample: {embed_0[0][0][:5]}")
print(f"   Output 1 sample: {embed_1[0][0][:5]}")
print(f"   Embeddings identical? {embeddings_identical}")

# 3. Check LSTM outputs (sub-model ending at layer 1)
lstm_model = Model(inputs=model.input, outputs=model.layers[1].output)
lstm_0 = lstm_model.predict(X_test[0:1], verbose=0)
lstm_1 = lstm_model.predict(X_test[1:2], verbose=0)
lstm_identical = np.allclose(lstm_0, lstm_1)
print(f"\n3. LSTM LAYER:")
print(f"   Output 0: {lstm_0[0][:10]}")
print(f"   Output 1: {lstm_1[0][:10]}")
print(f"   LSTM outputs identical? {lstm_identical}")

# 4. Check final predictions
pred_0 = model.predict(X_test[0:1], verbose=0)[0]
pred_1 = model.predict(X_test[1:2], verbose=0)[0]
pred_2 = model.predict(X_test[2:3], verbose=0)[0]
predictions_identical = np.allclose(pred_0, pred_1) and np.allclose(pred_1, pred_2)
print(f"\n4. FINAL PREDICTIONS (probabilities):")
print(f"   Movie 0: {pred_0[:10]}")
print(f"   Movie 1: {pred_1[:10]}")
print(f"   Movie 2: {pred_2[:10]}")
print(f"   All predictions identical? {predictions_identical}")

# Name the model's outputs. Genre names only apply when the model has one
# output per genre; a single-genre binary model has just one output, and
# indexing it with every genre position would raise IndexError.
n_outputs = pred_0.shape[-1]
if n_outputs == len(genres):
    output_labels = list(genres)
else:
    output_labels = [f"output_{k}" for k in range(n_outputs)]

# 5. Check prediction behavior
pred_0_binary = (pred_0 > 0.5).astype(int)
pred_1_binary = (pred_1 > 0.5).astype(int)
print(f"\n5. BINARY PREDICTIONS:")
print(f"   Movie 0 genres: {[output_labels[k] for k in np.flatnonzero(pred_0_binary)]}")
print(f"   Movie 1 genres: {[output_labels[k] for k in np.flatnonzero(pred_1_binary)]}")
print(f"   Always predicting same genre? {np.array_equal(pred_0_binary, pred_1_binary)}")

# 6. Overall test predictions
y_pred_all = model.predict(X_test, verbose=0)
y_pred_binary_all = (y_pred_all > 0.5).astype(int)
print(f"\n6. OVERALL TEST SET:")
print(f"   Average predicted genres per movie: {y_pred_binary_all.sum(axis=1).mean():.2f}")
print(f"   Average true genres per movie: {y_test.sum(axis=1).mean():.2f}")
print(f"   Most common prediction: {output_labels[np.argmax(y_pred_all.mean(axis=0))]}")

# Conclusion: flag the first stage (in pipeline order) whose outputs are identical
stages = [
    ("Inputs", inputs_identical),
    ("Embeddings", embeddings_identical),
    ("LSTM outputs", lstm_identical),
    ("Final predictions", predictions_identical),
]
first_broken = next((stage for stage, identical in stages if identical), None)

print("\n" + "=" * 80)
print("CONCLUSION:")
for stage, identical in stages:
    status = "IDENTICAL" if identical else "DIFFERENT"
    marker = " <<< BROKEN HERE" if stage == first_broken else ""
    print(f"  {stage}: {status}{marker}")

if first_broken == "LSTM outputs":
    print("\n  The LSTM layer is not processing inputs - outputting constant vector.")
    print(f"  Likely cause: Vanishing gradients with {X_test.shape[1]} timestep sequences.")
    print("  Fix: Downsample sequences to 1500-2000 and retrain.")
elif first_broken is None:
    print("\n  No stage produced identical outputs for the inspected movies.")
else:
    print(f"\n  First stage with identical outputs: {first_broken}.")
print("=" * 80)

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# The global `model` is only the last per-genre binary classifier, so its
# (n, 1) output cannot be compared with the (n, n_genres) y_test. Build the
# prediction matrix from all per-genre models, columns in `genres` order.
y_pred_lstm = (
    np.column_stack([
        trained_models[genre_name].predict(X_test, verbose=0).ravel()
        for genre_name in genres
    ]) > 0.5
).astype(int)

print("\n" + "=" * 80)
print("LSTM - PER-GENRE PERFORMANCE")
print("=" * 80)
print(classification_report(y_test, y_pred_lstm, target_names=genres, zero_division=0))

In [None]:
# Save model 

#model.save('lstm_model.keras')
#print("Model saved to lstm_model.keras")

In [None]:
# Load model (disabled). Kept as real comments: a bare triple-quoted string is
# an expression, so the notebook would echo it as this cell's output.
#
# from tensorflow.keras.models import load_model
#
# # Load the saved model
# model = load_model('lstm_model.keras')
# print("Model loaded successfully")
#
# # Now you can use it for predictions
# y_pred = model.predict(X_test)

In [None]:
import numpy as np
from utils import genres

# Pick random sample (unseeded: a different movie on every run)
random_idx = np.random.randint(0, len(X_test))

# Get data; the slice keeps the batch dimension expected by predict()
random_test = X_test[random_idx:random_idx+1]
y_true = y_test[random_idx]

# One probability per genre, each from that genre's own binary model. The
# global `model` is just the last of those models and would yield a single
# value to compare against the full genre vector in y_true.
y_pred = np.array([
    trained_models[genre_name].predict(random_test, verbose=0)[0, 0]
    for genre_name in genres
])
y_pred_binary = (y_pred > 0.5).astype(int)

print(f"True {y_true}")
print(f"Pred {y_pred_binary}")
