In [None]:
import pandas as pd
import numpy as np
import json
import ast # Used sometimes if json.loads fails unexpectedly

# --- PHASE 1: Loading, Merging, and Initial Cleaning ---

# Merging the TMDB 5000 Movie Dataset and TMDB 5000 Credits Dataset
movies_df = pd.read_csv('tmdb_5000_movies.csv')
credits_df = pd.read_csv('tmdb_5000_credits.csv')

# 1. Merge the two DataFrames on 'id' (from movies_df) and 'movie_id' (from credits_df)
merged_df = pd.merge(movies_df, credits_df, left_on='id', right_on='movie_id', how='inner')

# 2. Drop redundant and unnecessary columns
merged_df.drop('movie_id', axis=1, inplace=True) # Redundant ID column
merged_df.drop('title_y', axis=1, inplace=True) # Redundant title from credits file
merged_df.drop('original_title', axis=1, inplace=True) # Keeping only English 'title'

# 3. Rename columns for clarity
merged_df = merged_df.rename(columns={'title_x': 'title'})

# 4. Critical: Ensure 'id' remains a column, NOT an index.
if merged_df.index.name == 'id':
    merged_df.reset_index(inplace=True)

# 5. Handle simple missing values
merged_df['tagline'] = merged_df['tagline'].fillna('')
merged_df['overview'] = merged_df['overview'].fillna('')
merged_df['homepage'] = merged_df['homepage'].fillna('')

# 6. Handle release_date and runtime
merged_df['release_date'] = pd.to_datetime(merged_df['release_date'], errors='coerce')
# Drop rows where release_date failed (1 missing row in original data)
merged_df.dropna(subset=['release_date'], inplace=True)

# Fill missing runtime with the median value
merged_df['runtime'] = merged_df['runtime'].fillna(merged_df['runtime'].median())

print("Initial Merging and Cleaning complete.")
print(f"Current TMDB ID column type: {merged_df['id'].dtype}")

# --- PHASE 2: JSON Parsing and Flattening ---

# Define a safe parser function for JSON columns (using ast.literal_eval as a fallback for safety)
def safe_json_parse(json_string):
    try:
        # Tries standard JSON load
        return json.loads(json_string)
    except:
        # Fallback for poorly formatted strings
        try:
            return ast.literal_eval(json_string)
        except:
            return []

# Helper function to extract and join names from JSON list
def extract_names(json_list):
    return ', '.join([item['name'] for item in json_list])

# --- Apply Parsers ---

# 1. Genres
merged_df['genres'] = merged_df['genres'].apply(safe_json_parse).apply(extract_names)

# 2. Keywords
merged_df['keywords'] = merged_df['keywords'].apply(safe_json_parse).apply(extract_names)

# 3. Production Companies
merged_df['production_companies'] = merged_df['production_companies'].apply(safe_json_parse).apply(extract_names)

# 4. Spoken Languages
merged_df['spoken_languages'] = merged_df['spoken_languages'].apply(safe_json_parse).apply(extract_names)

# 5. Production Countries
merged_df['production_countries'] = merged_df['production_countries'].apply(safe_json_parse).apply(extract_names)

# 6. Cast (Top 6 actors/actresses)
def parse_cast_top_n(x, n=6):
    cast = safe_json_parse(x)
    return ', '.join([actor['name'] for actor in cast[:n]])
merged_df['cast'] = merged_df['cast'].apply(parse_cast_top_n)

# 7. Crew (All crew members with job description)
def parse_crew_full(x):
    crew = safe_json_parse(x)
    return ', '.join([f"{member['name']} ({member['job']})" for member in crew])
merged_df['crew'] = merged_df['crew'].apply(parse_crew_full)

# --- PHASE 3: Final Save ---

output_filename = 'clean_parsed_tmdb_5000.csv'

# FINAL CHECK: Ensure 'id' is included in the saved file.
merged_df.to_csv(output_filename, index=False)

print(f"\nSuccessfully created Clean TMDB Data.")
print(f"Saved to: {output_filename}")
print(f"Final DataFrame shape: {merged_df.shape}")
print(f"The 'id' column is successfully preserved as a column.")

Initial Merging and Cleaning complete.
Current TMDB ID column type: int64

Successfully created Clean TMDB Data.
Saved to: clean_parsed_tmdb_5000.csv
Final DataFrame shape: (4802, 21)
The 'id' column is successfully preserved as a column.


In [None]:
import pandas as pd
import numpy as np
import os
from google.colab import files # Used for displaying the files sidebar upload utility (optional, but helpful for context)

# --- Step 1: Load Data ---
LINKS_FILE = 'links.csv'
RATINGS_FILE = 'ratings.csv'
OUTPUT_FILE = 'ml_ratings_with_tmdb_id.csv'

try:
    # Read files directly from the current Colab directory
    links_df = pd.read_csv(LINKS_FILE)
    ratings_df = pd.read_csv(RATINGS_FILE)
    print("MovieLens datasets loaded successfully from the Colab environment.")
except FileNotFoundError:
    print(f"Error: Could not find one or both files ({LINKS_FILE}, {RATINGS_FILE}).")
    print("Please ensure the files are uploaded to the root directory of your Colab session.")
    # Exit execution if files are missing
    exit()

# --- Step 2: Clean and Prepare Links Data ---
# links.csv contains movieId, imdbId, tmdbId.
# We need 'movieId' to link with ratings and 'tmdbId' to link with the TMDB 5000 dataset.

# Drop rows where tmdbId is missing, as we need this ID for joining with TMDB 5000.
links_df.dropna(subset=['tmdbId'], inplace=True)

# Convert tmdbId to integer type (it's often stored as float due to NaNs)
links_df['tmdbId'] = links_df['tmdbId'].astype(int)

# Select only the necessary columns
links_cleaned = links_df[['movieId', 'tmdbId']]
print(f"Links data cleaned. Retained {links_cleaned.shape[0]} records with valid TMDB IDs.")

# --- Step 3: Clean and Prepare Ratings Data ---
# ratings.csv contains userId, movieId, rating, timestamp.
# For collaborative filtering, we primarily need userId, movieId, and rating.
ratings_cleaned = ratings_df[['userId', 'movieId', 'rating']]
print(f"Ratings data cleaned. Retained {ratings_cleaned.shape[0]} rating records.")

# --- Step 4: Merge DataFrames ---
# Merge the ratings data with the TMDB ID based on the common 'movieId'.
# An inner merge ensures we only keep ratings for movies that have a valid tmdbId in the links file.
merged_ratings_tmdb = pd.merge(
    ratings_cleaned,
    links_cleaned,
    on='movieId',
    how='inner'
)

# Rename the tmdbId column to 'id' to match the column name in the 'clean_parsed_tmdb_5000.csv'
merged_ratings_tmdb.rename(columns={'tmdbId': 'id'}, inplace=True)

print("\nMovieLens Ratings successfully merged with TMDB ID.")
print(f"Final merged dataset shape: {merged_ratings_tmdb.shape}")
print("First 5 rows of the merged data:")
print(merged_ratings_tmdb.head())

# --- Step 5: Save the Result ---
# Save the new DataFrame containing userId, movieId, rating, and the TMDB 'id'.
merged_ratings_tmdb.to_csv(OUTPUT_FILE, index=False)
print(f"\nSuccessfully saved the final ratings-to-TMDB-ID mapping file to: {OUTPUT_FILE}")
print("This file can now be used for collaborative filtering or merged with your TMDB 5000 movie features.")

MovieLens datasets loaded successfully from the Colab environment.
Links data cleaned. Retained 9734 records with valid TMDB IDs.
Ratings data cleaned. Retained 100836 rating records.

MovieLens Ratings successfully merged with TMDB ID.
Final merged dataset shape: (100823, 4)
First 5 rows of the merged data:
   userId  movieId  rating     id
0       1        1     4.0    862
1       1        3     4.0  15602
2       1        6     4.0    949
3       1       47     5.0    807
4       1       50     5.0    629

Successfully saved the final ratings-to-TMDB-ID mapping file to: ml_ratings_with_tmdb_id.csv
This file can now be used for collaborative filtering or merged with your TMDB 5000 movie features.


In [15]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dot, Flatten, Add, Concatenate, Lambda
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import train_test_split

# --- Configuration ---
RATINGS_FILE = 'ml_ratings_with_tmdb_id.csv'
LATENT_DIM = 50
EPOCHS = 20
BATCH_SIZE = 64
SEED = 42
REG_L2 = 0.005

# Ensure reproducibility
tf.random.set_seed(SEED)
np.random.seed(SEED)

# --- 1. Load Data ---
try:
    ratings_df = pd.read_csv(RATINGS_FILE)
    # Use MovieLens movieId for encoding, as it's the original identifier for rating records
    ratings = ratings_df[['userId', 'movieId', 'rating']]
except FileNotFoundError:
    print(f"File {RATINGS_FILE} not found. Using a smaller default dataset for demonstration.")
    data_url = "http://files.grouplens.org/datasets/movielens/ml-latest-small/ratings.csv"
    ratings = pd.read_csv(data_url)
    ratings = ratings[['userId', 'movieId', 'rating']]

# --- 2. Prepare Data and Encoding ---
# Convert original IDs to category codes (0 to N-1) for embedding layer indexing
user_ids = ratings['userId'].astype('category').cat.codes
movie_ids = ratings['movieId'].astype('category').cat.codes

# Create mapping dictionaries to get original IDs back later
user_map = dict(enumerate(ratings['userId'].astype('category').cat.categories))
movie_map = dict(enumerate(ratings['movieId'].astype('category').cat.categories))

num_users = len(user_ids.unique())
num_movies = len(movie_ids.unique())
global_mean = ratings['rating'].mean() # Calculate Global Mean (mu)

X = pd.DataFrame({'user_id': user_ids, 'movie_id': movie_ids})
y = ratings['rating'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

print(f"Global Mean Rating: {global_mean:.4f}")
print(f"Using L2 Regularization factor: {REG_L2}")

# --- 3. Build Keras Matrix Factorization Model with Biases and Regularization ---
def build_svd_model_with_biases(num_users, num_movies, latent_dim, reg_l2, global_mean):

    # --- Shared Components: User and Movie Inputs ---
    user_input = Input(shape=(1,), name='user_input')
    movie_input = Input(shape=(1,), name='movie_input')

    # --- 1. User and Movie Embeddings (Latent Factors P and Q) with L2 Regularization ---
    # User Embeddings (P)
    user_embedding = Embedding(input_dim=num_users,
                               output_dim=latent_dim,
                               embeddings_regularizer=l2(reg_l2),
                               name='user_factors')(user_input)
    user_vec = Flatten(name='flatten_user')(user_embedding)

    # Movie Embeddings (Q)
    movie_embedding = Embedding(input_dim=num_movies,
                                 output_dim=latent_dim,
                                 embeddings_regularizer=l2(reg_l2),
                                 name='movie_factors')(movie_input)
    movie_vec = Flatten(name='flatten_movie')(movie_embedding)

    # --- 2. User and Movie Biases (B_u and B_i) with L2 Regularization ---
    # User Bias (b_u) - output_dim=1 for a scalar bias
    user_bias = Embedding(input_dim=num_users,
                              output_dim=1,
                              embeddings_regularizer=l2(reg_l2),
                              name='user_bias')(user_input)
    user_bias_flat = Flatten(name='flatten_user_bias')(user_bias)

    # Movie Bias (b_i) - output_dim=1 for a scalar bias
    movie_bias = Embedding(input_dim=num_movies,
                            output_dim=1,
                            embeddings_regularizer=l2(reg_l2),
                            name='movie_bias')(movie_input)
    movie_bias_flat = Flatten(name='flatten_movie_bias')(movie_bias)

    # --- 3. Prediction Formulation (SVD/Matrix Factorization Formula) ---
    # R_hat = mu + b_u + b_i + P_u * Q_i^T

    # P_u * Q_i^T (Dot product of latent factors)
    latent_dot_product = Dot(axes=1, name='latent_dot_product')([user_vec, movie_vec])

    # A layer representing the constant global mean (mu)
    global_mean_layer = Lambda(lambda x: x + global_mean, name='global_mean_add')(latent_dot_product)

    # Add the biases to the latent dot product and global mean
    output = Add(name='predicted_rating')([global_mean_layer, user_bias_flat, movie_bias_flat])

    # Build the model
    model = Model(inputs=[user_input, movie_input], outputs=output)

    # Compile the model
    model.compile(optimizer=Adam(0.001),
                  loss='mse',
                  metrics=[tf.keras.metrics.RootMeanSquaredError(name='RMSE'), 'mae']) # Added MAE to metrics

    return model

# Create and summarize the enhanced model
model_svd = build_svd_model_with_biases(num_users, num_movies, LATENT_DIM, REG_L2, global_mean)
model_svd.summary()


# --- 4. Train the Model ---
print("\n--- Starting Enhanced Model Training ---")

# Train the model
history = model_svd.fit(
    [X_train['user_id'], X_train['movie_id']],
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    validation_data=([X_test['user_id'], X_test['movie_id']], y_test)
)

print("\n--- Training Complete ---")

# Evaluate the model on the test set
metrics = model_svd.evaluate([X_test['user_id'], X_test['movie_id']], y_test, verbose=0)
loss = metrics[0] # MSE
rmse = metrics[1] # RMSE
mae = metrics[2] # MAE

print(f"Final Test RMSE (Enhanced Model): {rmse:.4f}")
print(f"Final Test MAE (Enhanced Model): {mae:.4f}")
print(f"Final Test MSE (Loss): {loss:.4f}")


# --- 5. Make a Prediction ---
original_user_id = 1
original_movie_id = 302

try:
    encoded_user_id = user_ids[ratings['userId'] == original_user_id].iloc[0]
    encoded_movie_id = movie_ids[ratings['movieId'] == original_movie_id].iloc[0]
except IndexError:
    print(f"\nUser ID {original_user_id} or Movie ID {original_movie_id} not found in the training data.")
    encoded_user_id = X_test['user_id'].iloc[0]
    encoded_movie_id = X_test['movie_id'].iloc[0]
    original_user_id = user_map[encoded_user_id]
    original_movie_id = movie_map[encoded_movie_id]
    print(f"Using test example: User ID {original_user_id}, Movie ID {original_movie_id}")

user_input_array = np.array([encoded_user_id])
movie_input_array = np.array([encoded_movie_id])

predicted_rating = model_svd.predict([user_input_array, movie_input_array])[0][0]

print(f"\n--- Specific Rating Prediction (Enhanced SVD Model) ---")
print(f"Predicted rating for User ID **{original_user_id}** and Movie ID **{original_movie_id}**: **{predicted_rating:.4f}**")

Global Mean Rating: 3.5016
Using L2 Regularization factor: 0.005



--- Starting Enhanced Model Training ---
Epoch 1/20
[1m1261/1261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 10ms/step - RMSE: 1.0197 - loss: 1.1776 - mae: 0.8123 - val_RMSE: 0.9785 - val_loss: 0.9928 - val_mae: 0.7767
Epoch 2/20
[1m1261/1261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 10ms/step - RMSE: 0.9768 - loss: 0.9945 - mae: 0.7764 - val_RMSE: 0.9672 - val_loss: 0.9859 - val_mae: 0.7667
Epoch 3/20
[1m1261/1261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 10ms/step - RMSE: 0.9687 - loss: 0.9904 - mae: 0.7696 - val_RMSE: 0.9646 - val_loss: 0.9853 - val_mae: 0.7647
Epoch 4/20
[1m1261/1261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 11ms/step - RMSE: 0.9669 - loss: 0.9900 - mae: 0.7682 - val_RMSE: 0.9640 - val_loss: 0.9852 - val_mae: 0.7642
Epoch 5/20
[1m1261/1261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 10ms/step - RMSE: 0.9664 - loss: 0.9900 - mae: 0.7679 - val_RMSE: 0.9638 - val_loss: 0.9853 - val_mae: 0.7641
Epoch 6/