# Movie recommender system

In this project, I aim to build and train a content-based filtering recommender system, which can give movie recommendations to users based on their ratings

In [35]:
import pandas as pd, numpy as np, tensorflow as tf
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tabulate, numpy.ma as ma
from sklearn.metrics.pairwise import cosine_similarity

## Loading the MovieLens-small dataset

In [36]:
ratings = pd.read_csv("ratings.csv")
movies = pd.read_csv("movies.csv")

## Feature engineering

In [37]:
movie_genres = movies["genres"].str.get_dummies("|")
movie_data = pd.concat([movies[["movieId"]], movie_genres], axis=1)

user_movie = ratings.merge(movie_genres, left_on="movieId", right_index=True)
for g in movie_genres.columns:
    user_movie[g] = user_movie[g] * user_movie["rating"]
user_data = user_movie.groupby("userId")[movie_genres.columns].mean().reset_index()
user_data.head()

Unnamed: 0,userId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0.0,0.685345,0.474138,0.189655,0.336207,1.224138,0.5,0.107759,2.025862,...,0.060345,0.560345,0.0,0.146552,0.288793,0.762931,0.314655,0.909483,0.189655,0.025862
1,2,0.0,1.833333,0.0,0.0,0.0,1.083333,1.333333,0.0,2.666667,...,0.0,0.0,0.0,0.0,0.0,0.75,0.0,1.333333,0.0,0.0
2,3,0.0,0.416667,0.166667,0.138889,0.152778,0.722222,0.583333,0.138889,1.708333,...,0.0,0.125,0.138889,0.0,0.138889,0.194444,0.222222,0.541667,0.111111,0.0
3,4,0.0,0.592593,0.305556,0.050926,0.12037,1.171296,0.421296,0.078704,1.861111,...,0.064815,0.430556,0.0,0.083333,0.226852,0.828704,0.263889,0.5,0.166667,0.101852
4,5,0.0,1.090909,0.477273,0.295455,0.431818,1.136364,0.159091,0.090909,1.295455,...,0.0,0.386364,0.0,0.068182,0.090909,0.545455,0.613636,0.818182,0.0,0.068182


In [38]:
data = ratings.merge(user_data, on="userId").merge(movie_data, on="movieId")
user_features = [g + "_x" for g in movie_genres.columns]
item_features = [g + "_y" for g in movie_genres.columns]

X_user, X_item = data[user_features], data[item_features]
y = data["rating"].values.reshape(-1,1)

scalerUser, scalerItem, scalerTarget = StandardScaler(), StandardScaler(), MinMaxScaler()
X_user_s, X_item_s, y_s = scalerUser.fit_transform(X_user), scalerItem.fit_transform(X_item), scalerTarget.fit_transform(y)
u_train,u_test,i_train,i_test,y_train,y_test = train_test_split(X_user_s,X_item_s,y_s,test_size=0.2,random_state=42)

## Two-tower neural network

In [39]:
num_outputs = 32
input_user = tf.keras.layers.Input(shape=(len(user_features),))
u = tf.keras.layers.Dense(128, activation="relu")(input_user)
u = tf.keras.layers.Dense(num_outputs)(u)
u = tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(u)

input_item = tf.keras.layers.Input(shape=(len(item_features),))
m = tf.keras.layers.Dense(128, activation="relu")(input_item)
m = tf.keras.layers.Dense(num_outputs)(m)
m = tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(m)

output = tf.keras.layers.Dot(axes=1)([u,m])
model = tf.keras.Model([input_user,input_item], output)
model.compile(optimizer="adam", loss="mse")
model.fit([u_train,i_train], y_train, epochs=5, batch_size=256, validation_data=([u_test,i_test], y_test))

Epoch 1/5
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - loss: 0.0503 - val_loss: 0.0432
Epoch 2/5
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.0417 - val_loss: 0.0425
Epoch 3/5
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.0409 - val_loss: 0.0419
Epoch 4/5
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.0404 - val_loss: 0.0417
Epoch 5/5
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.0400 - val_loss: 0.0414


<keras.src.callbacks.history.History at 0x195ad788d10>

## Predictor functions

In [41]:
# prepare unique_items once (movies as used by item scaler / model)
unique_items = data[["movieId"] + item_features].drop_duplicates("movieId").reset_index(drop=True)
# make a lookup DataFrame for fast ordered retrieval
movies_by_id = movies.set_index("movieId")

def recommend_new_user(pref_vector, top_n=10):
    pref_vector = np.array(pref_vector).reshape(1, -1)
    n_movies = len(unique_items)
    top_n = min(top_n, n_movies)

    # replicate user vector to match unique_items
    user_vecs = np.repeat(pref_vector, n_movies, axis=0)

    # scale using DataFrame wrappers (keeps feature names)
    suser = scalerUser.transform(pd.DataFrame(user_vecs, columns=user_features))
    sitem = scalerItem.transform(pd.DataFrame(unique_items[item_features].values, columns=item_features))

    preds = model.predict([suser, sitem], verbose=0).reshape(-1, 1)
    preds_unscaled = scalerTarget.inverse_transform(preds).reshape(-1)

    top_idx = np.argsort(-preds_unscaled)[:top_n]
    top_movie_ids = unique_items.loc[top_idx, "movieId"].values

    # preserve predicted order using .loc with the list of ids
    return movies_by_id.loc[top_movie_ids][["title", "genres"]].reset_index()

In [42]:
def recommend_existing_user(user_id, top_n=10):
    row = user_data[user_data["userId"] == user_id]
    if row.empty:
        raise ValueError(f"User {user_id} not found in user_data")
    pref = row.drop("userId", axis=1).values.reshape(1, -1)

    n_movies = len(unique_items)
    top_n = min(top_n, n_movies)

    user_vecs = np.repeat(pref, n_movies, axis=0)

    suser = scalerUser.transform(pd.DataFrame(user_vecs, columns=user_features))
    sitem = scalerItem.transform(pd.DataFrame(unique_items[item_features].values, columns=item_features))

    preds = model.predict([suser, sitem], verbose=0).reshape(-1, 1)
    preds_unscaled = scalerTarget.inverse_transform(preds).reshape(-1)

    top_idx = np.argsort(-preds_unscaled)[:top_n]
    top_movie_ids = unique_items.loc[top_idx, "movieId"].values

    return movies_by_id.loc[top_movie_ids][["title", "genres"]].reset_index()

In [43]:
print("\n--- New user recommendations ---")
# use the average genre preferences across all users as a dummy new user
new_user_pref = user_data.drop("userId", axis=1).mean().values  
recommend_new_user(new_user_pref, top_n=5).head()


--- New user recommendations ---


Unnamed: 0,movieId,title,genres
0,90746,"Adventures of Tintin, The (2011)",Action|Animation|Mystery|IMAX
1,58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
2,109850,Need for Speed (2014),Action|Crime|Drama|IMAX
3,162968,Kizumonogatari Part 1: Tekketsu (2016),Action|Animation|Mystery
4,115664,The Book of Life (2014),Adventure|Animation|Romance


In [44]:
print("\n--- Existing user recommendations ---")
recommend_existing_user(1, top_n=5).head()


--- Existing user recommendations ---


Unnamed: 0,movieId,title,genres
0,79702,Scott Pilgrim vs. the World (2010),Action|Comedy|Fantasy|Musical|Romance
1,1209,Once Upon a Time in the West (C'era una volta ...,Action|Drama|Western
2,99114,Django Unchained (2012),Action|Drama|Western
3,553,Tombstone (1993),Action|Drama|Western
4,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical
