In [1]:
from utils import load_movie_features_with_encoded_genres, split_train_test
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)


In [2]:
def downsample_long_sequences(df_lstm, max_length=2000):
    """
    Downsample emotion sequences that exceed max_length.
    
    Args:
        df_lstm: DataFrame with emotion_sequence_int column
        max_length: Maximum desired sequence length
        
    Returns:
        DataFrame with downsampled sequences
    """
    df_downsampled = df_lstm.copy()
    
    def adaptive_downsample(seq):
        if len(seq) > max_length:
            # Calculate downsample factor
            downsample_factor = len(seq) // max_length + 1
            return seq[::downsample_factor]
        else:
            return seq
    
    # Apply downsampling
    df_downsampled['emotion_sequence_int'] = df_downsampled['emotion_sequence_int'].apply(adaptive_downsample)
    
    # Update sequence lengths
    df_downsampled['sequence_length'] = df_downsampled['emotion_sequence_int'].apply(len)
        
    return df_downsampled

In [None]:
from utils import load_movie_features_with_encoded_genres, prepare_for_lstm, split_train_test, genres

# Load data
df_original, df_with_genres = load_movie_features_with_encoded_genres('movie_features.csv')

# Prepare for LSTM
df_lstm = prepare_for_lstm(df_with_genres)

df_lstm = downsample_long_sequences(df_lstm, max_length=2000)

# Split
df_train, df_test = split_train_test(df_lstm, test_size=0.2, random_state=42)

# Convert to X, y
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train_seqs = df_train['emotion_sequence_int'].tolist()
X_test_seqs = df_test['emotion_sequence_int'].tolist()

max_length = max(max(len(seq) for seq in X_train_seqs), max(len(seq) for seq in X_test_seqs))

X_train = pad_sequences(X_train_seqs, maxlen=max_length, padding='post')
X_test = pad_sequences(X_test_seqs, maxlen=max_length, padding='post')

y_train = df_train[genres].values
y_test = df_test[genres].values

Loaded 1643 movies
Parsed emotion sequences to lists
One-hot encoded 23 genres
Converted emotion sequences to integer encoding for LSTM
Split dataset:
  Training: 1314 movies (80%)
  Testing:  329 movies (20%)


2025-11-29 19:05:30.492772: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-29 19:05:30.707179: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-11-29 19:05:30.707308: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-11-29 19:05:30.748576: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-11-29 19:05:30.820416: I tensorflow/core/platform/cpu_feature_guar

In [34]:
from collections import Counter

# Count genre occurrences
genre_counts = Counter()

for genres_str in df_lstm['genres'].dropna():
    genre_list = [g.strip() for g in str(genres_str).split(',')]
    genre_counts.update(genre_list)

# Sort by frequency
sorted_genres = sorted(genre_counts.items(), key=lambda x: x[1], reverse=True)

print("Genre frequencies:")
for genre, count in sorted_genres:
    percentage = (count / len(df_lstm)) * 100
    print(f"{genre:15} {count:4} ({percentage:.1f}%)")

print(f"\nTotal movies: {len(df_lstm)}")

Genre frequencies:
Drama            988 (60.1%)
Thriller         550 (33.5%)
Comedy           519 (31.6%)
Action           405 (24.7%)
Crime            337 (20.5%)
Romance          327 (19.9%)
Adventure        284 (17.3%)
Sci-Fi           234 (14.2%)
Horror           218 (13.3%)
Mystery          213 (13.0%)
Fantasy          190 (11.6%)
Biography        162 (9.9%)
Family            86 (5.2%)
History           81 (4.9%)
War               68 (4.1%)
Music             58 (3.5%)
Animation         49 (3.0%)
Sport             45 (2.7%)
Musical           32 (1.9%)
Western           22 (1.3%)
Short             11 (0.7%)
Film-Noir         10 (0.6%)
Documentary        3 (0.2%)

Total movies: 1643


In [4]:
print(f"Max sequence length: {max_length}")
print(f"Training data shape: {X_train.shape}")
print(f"Memory estimate: {X_train.nbytes / (1024**3):.2f} GB")

Max sequence length: 6808
Training data shape: (1314, 6808)
Memory estimate: 0.03 GB


In [5]:
# Sanity check

import tensorflow as tf
print("TF OK:", tf.__version__)
print("GPU:", tf.config.list_physical_devices("GPU"))


TF OK: 2.15.0
GPU: []


2025-11-29 19:05:33.234424: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:c4:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-11-29 19:05:33.608784: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [None]:
from tensorflow.keras import models, layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from utils import genres, emotions
import numpy as np
from tensorflow.keras.metrics import Precision, Recall, AUC

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # Force CPU

import tensorflow as tf

# Prepare data (X_train, y_train already created from previous steps)
print(f"Training data shape: {X_train.shape}")
print(f"Training labels shape: {y_train.shape}")

# Build LSTM model
model = models.Sequential([
    layers.Embedding(input_dim=len(emotions), output_dim=32, input_length=max_length),
    layers.LSTM(64, return_sequences=False),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(len(genres), activation='sigmoid')
])



model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        Precision(name='precision'),
        Recall(name='recall'),
        AUC(name='auc', multi_label=True)
    ]
)

print("\nModel Summary:")
model.summary()

# Train with validation split (similar to cross-validation)
print("\nTraining with 20% validation split...")
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

print("\nModel trained on full training set")
print(f"Final Training Precision: {history.history['precision'][-1]:.4f}")
print(f"Final Training Recall: {history.history['recall'][-1]:.4f}")
print(f"Final Validation Precision: {history.history['val_precision'][-1]:.4f}")
print(f"Final Validation Recall: {history.history['val_recall'][-1]:.4f}")

Training data shape: (1314, 6808)
Training labels shape: (1314, 23)

Model Summary:
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 6808, 32)          224       
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 23)                1495      
                                                                 
Total params: 30711 (119.96 KB)
Trainable params: 30711 (119.96 KB)
Non-trainable params: 0 (0.00 Byte)


KeyError: 'accuracy'

In [8]:
from sklearn.metrics import hamming_loss, jaccard_score, f1_score
import numpy as np

# Predict (X_test, y_test already created from previous steps)
y_pred_proba = model.predict(X_test)
y_pred = (y_pred_proba > 0.5).astype(int)

# Per-label accuracy
per_label_accuracy = 1 - hamming_loss(y_test, y_pred)
print(f"Per-Label Accuracy: {per_label_accuracy:.4f}")

# Jaccard score
jaccard = jaccard_score(y_test, y_pred, average='samples')
print(f"Jaccard Score (avg overlap): {jaccard:.4f}")

# F1 score
f1 = f1_score(y_test, y_pred, average='samples', zero_division=0)
print(f"F1 Score: {f1:.4f}")

# Exact match
exact_match = np.mean([np.array_equal(y_test[i], y_pred[i]) for i in range(len(y_test))])
print(f"Exact Match: {exact_match:.4f}")

Per-Label Accuracy: 0.8778
Jaccard Score (avg overlap): 0.2467
F1 Score: 0.3328
Exact Match: 0.0608


In [35]:
import numpy as np
from utils import genres

print("=" * 80)
print("LSTM DIAGNOSTIC REPORT")
print("=" * 80)

# 1. Check input sequences are different
print("\n1. INPUT SEQUENCES (first 20 emotions):")
print(f"   Test movie 0: {X_test[0][:20]}")
print(f"   Test movie 1: {X_test[1][:20]}")
print(f"   Test movie 2: {X_test[2][:20]}")
print(f"   Sequences identical? {np.array_equal(X_test[0], X_test[1])}")

# 2. Check embeddings are different
from tensorflow.keras import Model
embedding_model = Model(inputs=model.input, outputs=model.layers[0].output)
embed_0 = embedding_model.predict(X_test[0:1], verbose=0)
embed_1 = embedding_model.predict(X_test[1:2], verbose=0)
print(f"\n2. EMBEDDING LAYER:")
print(f"   Output 0 sample: {embed_0[0][0][:5]}")
print(f"   Output 1 sample: {embed_1[0][0][:5]}")
print(f"   Embeddings identical? {np.allclose(embed_0, embed_1)}")

# 3. Check LSTM outputs
lstm_model = Model(inputs=model.input, outputs=model.layers[1].output)
lstm_0 = lstm_model.predict(X_test[0:1], verbose=0)
lstm_1 = lstm_model.predict(X_test[1:2], verbose=0)
print(f"\n3. LSTM LAYER:")
print(f"   Output 0: {lstm_0[0][:10]}")
print(f"   Output 1: {lstm_1[0][:10]}")
print(f"   LSTM outputs identical? {np.allclose(lstm_0, lstm_1)}")

# 4. Check final predictions
pred_0 = model.predict(X_test[0:1], verbose=0)[0]
pred_1 = model.predict(X_test[1:2], verbose=0)[0]
pred_2 = model.predict(X_test[2:3], verbose=0)[0]
print(f"\n4. FINAL PREDICTIONS (probabilities):")
print(f"   Movie 0: {pred_0[:10]}")
print(f"   Movie 1: {pred_1[:10]}")
print(f"   Movie 2: {pred_2[:10]}")
print(f"   All predictions identical? {np.allclose(pred_0, pred_1) and np.allclose(pred_1, pred_2)}")

# 5. Check prediction behavior
pred_0_binary = (pred_0 > 0.5).astype(int)
pred_1_binary = (pred_1 > 0.5).astype(int)
print(f"\n5. BINARY PREDICTIONS:")
print(f"   Movie 0 genres: {[genres[i] for i in range(len(genres)) if pred_0_binary[i] == 1]}")
print(f"   Movie 1 genres: {[genres[i] for i in range(len(genres)) if pred_1_binary[i] == 1]}")
print(f"   Always predicting same genre? {np.array_equal(pred_0_binary, pred_1_binary)}")

# 6. Overall test predictions
y_pred_all = model.predict(X_test, verbose=0)
y_pred_binary_all = (y_pred_all > 0.5).astype(int)
print(f"\n6. OVERALL TEST SET:")
print(f"   Average predicted genres per movie: {y_pred_binary_all.sum(axis=1).mean():.2f}")
print(f"   Average true genres per movie: {y_test.sum(axis=1).mean():.2f}")
print(f"   Most common prediction: {genres[np.argmax(y_pred_all.mean(axis=0))]}")

print("\n" + "=" * 80)
print("CONCLUSION:")
print("  Inputs: DIFFERENT")
print("  Embeddings: DIFFERENT")
print("  LSTM outputs: IDENTICAL <<< BROKEN HERE")
print("  Final predictions: IDENTICAL")
print("\n  The LSTM layer is not processing inputs - outputting constant vector.")
print("  Likely cause: Vanishing gradients with 6808 timestep sequences.")
print("  Fix: Downsample sequences to 1500-2000 and retrain.")
print("=" * 80)

LSTM DIAGNOSTIC REPORT

1. INPUT SEQUENCES (first 20 emotions):
   Test movie 0: [6 6 1 2 0 2 6 6 6 6 6 0 6 6 6 2 2 2 5 1]
   Test movie 1: [2 5 6 2 2 2 6 6 5 4 0 0 0 2 6 6 6 6 4 6]
   Test movie 2: [3 6 3 6 6 3 5 1 1 6 6 3 0 6 3 6 3 2 5 4]
   Sequences identical? False

2. EMBEDDING LAYER:
   Output 0 sample: [-0.04548707 -0.00129836 -0.0230341   0.00783323  0.02307265]
   Output 1 sample: [-0.02388824  0.028193    0.04022922 -0.02470715 -0.02971494]
   Embeddings identical? False

3. LSTM LAYER:
   Output 0: [-0.04494485 -0.4543881  -0.49469566 -0.43969    -0.4204751   0.18315892
  0.37721047 -0.40435907 -0.10088176  0.3099661 ]
   Output 1: [-0.04494485 -0.45438802 -0.49469572 -0.43969    -0.42047504  0.18315892
  0.37721053 -0.40435907 -0.10088176  0.30996606]
   LSTM outputs identical? True

4. FINAL PREDICTIONS (probabilities):
   Movie 0: [0.23888312 0.17057236 0.0254903  0.09667966 0.30561358 0.20302154
 0.01370833 0.60313576 0.05355994 0.10688362]
   Movie 1: [0.23888312 0.170

In [None]:
# Save model 

#model.save('lstm_model.keras')
#print("Model saved to lstm_model.keras")

In [None]:
# Load model 
""" from tensorflow.keras.models import load_model

# Load the saved model
model = load_model('lstm_model.keras')
print("Model loaded successfully")

# Now you can use it for predictions
y_pred = model.predict(X_test) """

In [None]:
import numpy as np
from utils import genres

# Pick random sample
random_idx = np.random.randint(0, len(X_test))

# Get data
random_test = X_test[random_idx:random_idx+1]
y_true = y_test[random_idx]



y_pred = model.predict(random_test)[0]
y_pred_binary = (y_pred > 0.5).astype(int)

print(f"True {y_true}")
print(f"Pred {y_pred_binary}")


True [0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0]
Pred [0.23888311 0.17057236 0.0254903  0.09667966 0.30561358 0.20302154
 0.01370833 0.60313576 0.05355995 0.1068836  0.01418324 0.04999492
 0.12211634 0.04637704 0.01881817 0.13074085 0.19230515 0.13748014
 0.01391637 0.03432674 0.33136564 0.05314127 0.017088  ]
