Recommendation with Autoencoders

In [48]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path
import matplotlib.pyplot as plt

Ratings file from kaggle*


In [49]:
# movielens_data_file_url = (
#     "https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset/download?datasetVersionNumber=7"
# )
# movielens_zipped_file = keras.utils.get_file(
#     "archive", movielens_data_file_url, extract=False
# )
# keras_datasets_path = Path(movielens_zipped_file)
# movielens_dir = Path(movielens_zipped_file)
# print(movielens_dir)
# # Only extract the data the first time the script is run.
# if not movielens_dir.exists():
#     with ZipFile(movielens_zipped_file, "r") as zip:
#         # Extract files
#         print("Extracting all the files now...")
#         zip.extractall(path=keras_datasets_path)
#         print("Done!")
# ratings_file = movielens_dir / "ratings_small.csv"
# df = pd.read_csv(ratings_file)
# print(df.head())

In [50]:
# Download the actual data from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
# Use the ratings.csv file
movielens_data_file_url = (
    "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
)
movielens_zipped_file = keras.utils.get_file(
    "ml-latest-small.zip", movielens_data_file_url, extract=False
)
keras_datasets_path = Path(movielens_zipped_file).parents[0]
movielens_dir = keras_datasets_path / "ml-latest-small"

# Only extract the data the first time the script is run.
if not movielens_dir.exists():
    with ZipFile(movielens_zipped_file, "r") as zip:
        # Extract files
        print("Extracting all the files now...")
        zip.extractall(path=keras_datasets_path)
        print("Done!")

ratings_file = movielens_dir / "ratings.csv"
df = pd.read_csv(ratings_file)

Next, we do some preprocessing to encode users and movies as integer indices. We take the df columns userId and movieId, encode them, and save them into two new columns user and movie. min_rating and max_rating will be used to normalize the ratings later.

In [51]:
user_ids = df["userId"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}
movie_ids = df["movieId"].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}
df["user"] = df["userId"].map(user2user_encoded)
df["movie"] = df["movieId"].map(movie2movie_encoded)

num_users = len(user2user_encoded)
num_movies = len(movie_encoded2movie)
df["rating"] = df["rating"].values.astype(np.float32)
# min and max ratings will be used to normalize the ratings later
min_rating = min(df["rating"])
max_rating = max(df["rating"])

print(
    "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_movies, min_rating, max_rating
    )
)

Number of users: 610, Number of Movies: 9724, Min rating: 0.5, Max rating: 5.0


We then prepare the training and validation data by using "user", "movie", min_rating and max_rating. The data is split so that 90% of the data is used for training and 10% is used for validation.

In [52]:
df = df.sample(frac=1, random_state=42)
x = df[["user", "movie"]].values
# Normalize the targets between 0 and 1. Makes it easy to train.
y = df["rating"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values
# Assuming training on 90% of the data and validating on 10%.
train_indices = int(0.9 * df.shape[0])
x_train, x_val, y_train, y_val = (
    x[:train_indices],
    x[train_indices:],
    y[:train_indices],
    y[train_indices:],
)

Now, inside of a function showRecommendationsTJ(), we create a feature vector for one member of our team, Timothy Jan. We pick some movies, assign a rating to each of them, and create a new Dataframe. It is then used to generate 10 movie recommendations using the recommender model passed by the input model.

In [53]:
def showRecommendationsTJ(model):
  tj_movie_ID = [121231, 122900, 122902, 122904, 122906, 122912, 122916, 122918, 122920, 122922, 125916, 128360, 131739, 134130, 134853, 140715, 140956, 159858, 161354, 161594, 162082, 163645, 164367, 166024, 166528, 167370, 168248, 168252, 168366, 174053, 174055, 176101, 177285, 177593, 177763, 177765, 179401, 179819, 180031, 180985, 182715, 183635, 184471, 185029, 185135, 185585, 187031, 187593, 189713, 188189, ]
  tj_movie_ratings = [2.5, 3.5, 2.5, 3.75, 4.25, 4.25, 4.25, 3.5, 3.75, 3.25, 1.5, 4, 3.75, 4, 3.5, 3.75, 4, 3.75, 3.75, 4.25, 4.5, 4.25, 4.5, 4, 3.75, 0.5, 4.25, 3.5, 2, 4, 2.5, 3, 3.25, 4, 4, 3.5, 3, 3, 4.25, 3, 4, 2, 2.5, 4.5, 4.25, 4, 2.5, 3.5, 3.5, 4.5, ]
  tj_dict = {'movieId': tj_movie_ID, 'rating':tj_movie_ratings}
  movies_watched_by_tj = pd.DataFrame(tj_dict)
  movie_df = pd.read_csv(movielens_dir / "movies.csv")

  movies_not_watched_by_tj = movie_df[
      ~movie_df["movieId"].isin(movies_watched_by_tj.movieId.values)
  ]["movieId"]
  movies_not_watched_by_tj = list(
      set(movies_not_watched_by_tj).intersection(set(movie2movie_encoded.keys()))
  )
  movies_not_watched_by_tj = [[movie2movie_encoded.get(x)] for x in movies_not_watched_by_tj]
  user_encoder = 609 #must be in range, does not affect prediction. tested
  user_movie_array = np.hstack(
      ([[user_encoder]] * len(movies_not_watched_by_tj), movies_not_watched_by_tj)
  )
  ratings = model.predict(user_movie_array).flatten()
  top_ratings_indices = ratings.argsort()[-10:][::-1]
  recommended_movie_ids = [
      movie_encoded2movie.get(movies_not_watched_by_tj[x][0]) for x in top_ratings_indices
  ]
  print("Showing recommendations for user: TJ")
  print("====" * 9)
  print("Movies with high ratings from user")
  print("----" * 8)
  top_movies_user = (
      movies_watched_by_tj.sort_values(by="rating", ascending=False)
      .head(5)
      .movieId.values
  )
  movie_df_rows = movie_df[movie_df["movieId"].isin(top_movies_user)]
  for row in movie_df_rows.itertuples():
      print(row.title, ":", row.genres)
  print("----" * 8)
  print("Top 10 movie recommendations")
  print("----" * 8)
  recommended_movies = movie_df[movie_df["movieId"].isin(recommended_movie_ids)]
  for row in recommended_movies.itertuples():
      print(row.title, ":", row.genres)

In [54]:
EMBEDDING_SIZE = 50

class Sampling(layers.Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding the embedded user-movie inputs."""

    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

class improvedRecommenderWithAutoEncoder(keras.Model):
    def __init__(self, num_users, num_movies, embedding_size, **kwargs):
        super(improvedRecommenderWithAutoEncoder, self).__init__(**kwargs)
        self.num_users = num_users
        self.num_movies = num_movies
        self.embedding_size = embedding_size
        self.user_embedding = layers.Embedding(
            num_users,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.user_bias = layers.Embedding(num_users, 1)
        self.movie_embedding = layers.Embedding(
            num_movies,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.movie_bias = layers.Embedding(num_movies, 1)
        self.user_movie_relationship1 = tf.keras.layers.Dense(50, activation= "relu")
        #self.user_movie_relationship2 = tf.keras.layers.Dense(50, activation= "relu")
        self.output_layer = tf.keras.layers.Dense(1, activation= "sigmoid", input_shape= (50,))

        self.DenseSamplerZmean = tf.keras.layers.Dense(10, name="z_mean")
        self.DenseSamplerZlogvar = tf.keras.layers.Dense(10, name="z_log_var")
        self.Sampling = Sampling()

    def call(self, inputs):
        user_vector = self.user_embedding(inputs[:, 0])
        user_bias = self.user_bias(inputs[:, 0])
        movie_vector = self.movie_embedding(inputs[:, 1])
        movie_bias = self.movie_bias(inputs[:, 1])
        
        concat_inputs = tf.concat([user_vector, movie_vector], 1)
        concat_bias = user_bias + movie_bias
        
        z_mean = self.DenseSamplerZmean(concat_inputs)
        z_log_var = self.DenseSamplerZlogvar(concat_inputs)
        response = self.Sampling([z_mean, z_log_var])

        #response = self.user_movie_relationship1(concat_inputs)
        response = self.user_movie_relationship1(response)
        #response = self.user_movie_relationship2(response)
        response = response + concat_bias
        response = self.output_layer(response)
        return response
    

final_model = improvedRecommenderWithAutoEncoder(num_users, num_movies, EMBEDDING_SIZE)
final_model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(), optimizer=keras.optimizers.Adam(lr=0.001)
)

history = final_model.fit(
    x=x_train,
    y=y_train,
    batch_size=64,
    epochs=5,
    verbose=1,
    validation_data=(x_val, y_val),
)

Epoch 1/5


  super(Adam, self).__init__(name, **kwargs)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [55]:
showRecommendationsTJ(final_model)

Showing recommendations for user: TJ
Movies with high ratings from user
--------------------------------
Kingsglaive: Final Fantasy XV (2016) : Action|Adventure|Animation|Drama|Fantasy|Sci-Fi
Train to Busan (2016) : Action|Thriller
The Girl with All the Gifts (2016) : Drama|Horror|Sci-Fi|Thriller
A Quiet Place (2018) : Drama|Horror|Thriller
Sorry to Bother You (2018) : Comedy|Fantasy|Sci-Fi
--------------------------------
Top 10 movie recommendations
--------------------------------
Streetcar Named Desire, A (1951) : Drama
Once Upon a Time in the West (C'era una volta il West) (1968) : Action|Drama|Western
Amadeus (1984) : Drama
Kolya (Kolja) (1996) : Comedy|Drama
Trial, The (Procès, Le) (1962) : Drama
Last Tango in Paris (Ultimo tango a Parigi) (1972) : Drama|Romance
Jetée, La (1962) : Romance|Sci-Fi
Bad Boy Bubby (1993) : Drama
Memories of Murder (Salinui chueok) (2003) : Crime|Drama|Mystery|Thriller
Band of Brothers (2001) : Action|Drama|War
