In [154]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [155]:
users = pd.read_csv("Dataset/users.csv")
movies = pd.read_csv("Dataset/movies.csv")
watch_history = pd.read_csv("Dataset/watch_history.csv")

In [156]:
# --- Missing Data Summary ---
# Define function to summarize missing values and print missing data for each dataset.

def missing_summary(df, feat):
    missing_count = df.isna().sum()
    missing_percent = (missing_count/len(df))*100
    summary = pd.DataFrame({'Missing Count': missing_count, 'Missing %': missing_percent})
    summary = summary[summary["Missing Count"] > 0]
    print(f"--- Missing Data in {feat} ---")
    print(f"{summary}\n")
    return summary

miss_users = missing_summary(users, "Users")
miss_movies = missing_summary(movies, "Movies")
miss_watch = missing_summary(watch_history, "Watch History")

--- Missing Data in Users ---
                Missing Count  Missing %
age                      1229  11.932039
gender                    824   8.000000
monthly_spend            1017   9.873786
household_size           1545  15.000000

--- Missing Data in Movies ---
                    Missing Count  Missing %
genre_secondary               667  64.134615
imdb_rating                   150  14.423077
production_budget             675  64.903846
box_office_revenue            709  68.173077
number_of_seasons             751  72.211538
number_of_episodes            719  69.134615

--- Missing Data in Watch History ---
                        Missing Count  Missing %
watch_duration_minutes          12332  11.744762
progress_percentage              8514   8.108571
user_rating                     83903  79.907619



In [157]:
# --- Data Cleaning/Preprocessing ---
# Drop irrelevant columns and fill missing numeric/categorical values for users, movies and watch_history.

movies = movies.drop_duplicates(subset=['movie_id', 'title']).reset_index(drop=True)
movies = movies.drop(columns=['production_budget', 'box_office_revenue','number_of_seasons', 'number_of_episodes'])
movies['imdb_rating'] = movies['imdb_rating'].fillna(movies['imdb_rating'].mean())

users['age'] = users['age'].fillna(users['age'].median())
users['gender'] = users['gender'].fillna('Unknown')

watch_history = watch_history.drop(columns=['user_rating'], errors='ignore')
watch_history['watch_duration_minutes'] = watch_history['watch_duration_minutes'].fillna(watch_history['watch_duration_minutes'].median())

In [158]:
# --- User Genre Profiles ---
# Aggregate watch history by user and genre to create normalized user genre preferences and join with user averages.

user_profiles = (
    watch_history.groupby('user_id')
    .agg({'watch_duration_minutes': 'mean',
          'progress_percentage': 'mean'})
    .rename(columns={'watch_duration_minutes': 'avg_watch_duration',
                     'progress_percentage': 'avg_progress'})
)

user_genres = (
    watch_history.merge(movies[['movie_id', 'genre_primary']], on='movie_id', how='left')
    .groupby(['user_id', 'genre_primary']).size().unstack(fill_value=0)
)
user_genres = user_genres.div(user_genres.sum(axis=1), axis=0)
user_profiles = user_profiles.join(user_genres, how='left').fillna(0)

In [159]:
# --- Movie Feature Processing ---
# Select movie features, standardize release year, one-hot encode primary genres, and prepare movie feature matrix.

movie_features = movies[['movie_id', 'imdb_rating', 'release_year', 'genre_primary']].copy()
movie_features['year'] = (movie_features['release_year'] - movie_features['release_year'].mean()) / movie_features['release_year'].std()

genre_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
genre_encoded = genre_encoder.fit_transform(movie_features[['genre_primary']])
genre_df = pd.DataFrame(genre_encoded, columns=genre_encoder.get_feature_names_out(['genre_primary']))

movie_features = pd.concat([movie_features.drop(columns=['genre_primary']).reset_index(drop=True), genre_df], axis=1)

In [160]:
# --- Merge Data for Modeling ---
# Combine watch history with user profiles and movie features; drop rows with missing target values.

data = (
    watch_history
    .merge(user_profiles, on='user_id', how='left')
    .merge(movie_features, on='movie_id', how='left')
)
data = data.dropna(subset=['watch_duration_minutes'])



In [161]:
# --- Feature Preparation ---
# Select numeric features, fill missing values, scale features, and add bias column.

X_user = data[user_profiles.columns].reset_index(drop=True)
X_movie = data[movie_features.drop(columns=['movie_id']).columns].reset_index(drop=True)

interaction_terms = X_user.values[:, :, None] * X_movie.values[:, None, :]
interaction_terms = interaction_terms.reshape(X_user.shape[0], -1)
interaction_cols = [f"{u}*{m}" for u in X_user.columns for m in X_movie.columns]

X_all = np.hstack([X_user.values, X_movie.values, interaction_terms])
X_all_cols = list(X_user.columns) + list(X_movie.columns) + interaction_cols

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_all)
X = np.hstack([np.ones((X_scaled.shape[0], 1)), X_scaled])

y = data['watch_duration_minutes'].values.reshape(-1, 1)

In [162]:
# --- Target and Train/Test Split ---
# Extract watch duration as target and split data into training and testing sets.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [163]:
# --- Gradient Descent Function ---
# Implement basic gradient descent to train linear regression model on scaled features.

def gradient_descent(X, y, lr=0.01, epochs=1000):
    n_samples, n_features = X.shape
    theta = np.zeros((n_features, 1))
    
    for epoch in range(epochs):
        y_pred = X @ theta
        error = y_pred - y
        gradient = (2/n_samples) * X.T @ error
        theta -= lr * gradient
        
        if epoch % 100 == 0:
            loss = np.mean(error ** 2)
            print(f"Epoch {epoch}, MSE: {loss:.3f}")
    
    return theta

In [164]:
# --- Train Model ---
# Run gradient descent on training data to learn feature weights for predicting watch duration.

theta = gradient_descent(X_train, y_train, lr=0.01, epochs=1000)

y_pred = X_test @ theta
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Final RMSE: {rmse:.3f}")
print(f"Final R²: {r2:.3f}")

Epoch 0, MSE: 8289.704
Epoch 100, MSE: 3794.285
Epoch 200, MSE: 3720.077
Epoch 300, MSE: 3717.338
Epoch 400, MSE: 3716.308
Epoch 500, MSE: 3715.568
Epoch 600, MSE: 3715.015
Epoch 700, MSE: 3714.593
Epoch 800, MSE: 3714.269
Epoch 900, MSE: 3714.016
Final RMSE: 60.366
Final R²: 0.092


In [153]:
# --- Recommendation Function ---
# Predict top N movies for a user using the trained gradient descent model and user/movie features.

def recommend(user_id, n=5):
    if user_id not in user_profiles.index:
        print(f"User {user_id} not found.")
        return []

    user_row = user_profiles.loc[[user_id]].reset_index(drop=True)
    movie_matrix = movie_features.drop(columns=['movie_id']).reset_index(drop=True)

    X_user_vals = user_row.values.repeat(len(movie_matrix), axis=0)
    X_movie_vals = movie_matrix.values

    interaction_vals = X_user_vals[:, :, None] * X_movie_vals[:, None, :]
    interaction_vals = interaction_vals.reshape(len(movie_matrix), -1)

    X_input_all = np.hstack([X_user_vals, X_movie_vals, interaction_vals])
    X_input_scaled = scaler.transform(X_input_all)
    X_input = np.hstack([np.ones((X_input_scaled.shape[0], 1)), X_input_scaled])

    pred_duration = X_input @ theta

    watched_movies = watch_history[watch_history['user_id'] == user_id]['movie_id'].values
    top_indices = np.argsort(pred_duration.flatten())[::-1]
    top_indices = [i for i in top_indices if movie_features['movie_id'].iloc[i] not in watched_movies][:n]

    return movies.iloc[top_indices][['title', 'genre_primary', 'release_year', 'imdb_rating']]

recommend('user_00001', 10)


Unnamed: 0,title,genre_primary,release_year,imdb_rating
327,Mystery Day,Horror,2024,9.3
994,Big Day,Horror,2024,7.9
985,Love Day,Horror,2022,8.5
284,Dragon Dragon,Horror,2024,6.281425
986,Phoenix Storm,Horror,2023,6.8
720,Dream Dream,Horror,2022,5.9
950,Big House,Horror,2020,6.7
782,Love Journey,Horror,2015,7.9
991,The Love,Horror,2018,6.2
600,Secret Mission,Horror,2016,6.5
