In [None]:
from typing import Dict, Text, Tuple

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs

In [None]:
import pandas as pd

# NOTE(review): shape (668, 10325) -- presumably users x rated movies of the
# rating matrix; TODO confirm against the data.

# Movie table, read from the Drive path below.
df_movies = pd.read_csv('/content/drive/MyDrive/ApplyAI/movies.csv')

# Ratings table; the timestamp column is removed right after reading.
df_ratings = (
    pd.read_csv('/content/drive/MyDrive/ApplyAI/ratings.csv')
    .drop(columns=['timestamp'])
)
print(df_ratings.head(10))

   userId  movieId  rating
0       1       16     4.0
1       1       24     1.5
2       1       32     4.0
3       1       47     4.0
4       1       50     4.0
5       1      110     4.0
6       1      150     3.0
7       1      161     4.0
8       1      165     3.0
9       1      204     0.5


In [None]:
# Work on a copy so that the ratings table read from disk stays untouched.
ratings_df = df_ratings.copy(deep=True)

In [None]:
# Attach title/genres to every rating.
# The merge starts from df_ratings rather than from ratings_df itself so the
# cell is idempotent: re-running it no longer merges a second time (which
# would yield title_x/title_y columns and break the cells that read 'title').
ratings_df = (
    df_ratings
    .merge(df_movies[['movieId', 'title', 'genres']], on='movieId', how='left')
    # A left merge never nulls the join key; a rating whose movieId is absent
    # from df_movies shows up with a NaN title instead, so filter on both.
    .dropna(subset=['movieId', 'title'])
    .reset_index(drop=True)
)


In [None]:
# Preview the first 20 rows of the ratings table with title and genres.
ratings_df.head(n=20)

Unnamed: 0,userId,movieId,rating,title,genres
0,1,16,4.0,Casino (1995),Crime|Drama
1,1,24,1.5,Powder (1995),Drama|Sci-Fi
2,1,32,4.0,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,1,47,4.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,4.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
5,1,110,4.0,Braveheart (1995),Action|Drama|War
6,1,150,3.0,Apollo 13 (1995),Adventure|Drama|IMAX
7,1,161,4.0,Crimson Tide (1995),Drama|Thriller|War
8,1,165,3.0,Die Hard: With a Vengeance (1995),Action|Crime|Thriller
9,1,204,0.5,Under Siege 2: Dark Territory (1995),Action


In [None]:
# Separate working copy of the movie table; the title dataset is built from it.
movies_df = df_movies.copy(deep=True)

In [None]:
# Keep only the identifier and the title.
# NOTE(review): after this selection no column is named 'id', so the rename
# looks like a no-op -- confirm whether it is still needed.
movies_df = movies_df[['movieId', 'title']].rename(columns={'id': 'movieId'})
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [None]:
# (rows, columns) of the two working tables.
print(ratings_df.shape)
print(movies_df.shape)

(105339, 5)
(10329, 2)


In [None]:
# User ids are fed to a string lookup vocabulary later on, so the column is
# converted to str before the tf.data pipeline is built.
ratings_df['userId'] = ratings_df['userId'].astype(str)

ratings = tf.data.Dataset.from_tensor_slices(dict(ratings_df[['userId', 'title', 'rating']]))
movies = tf.data.Dataset.from_tensor_slices(dict(movies_df[['title']]))

# Keep only the features the model consumes. The rating is cast with tf.cast
# rather than the Python float() builtin: float() on a symbolic tensor only
# works when AutoGraph rewrites the call, whereas tf.cast states the float32
# target dtype explicitly and does not depend on that rewrite.
ratings = ratings.map(lambda x: {
    "title": x["title"],
    "userId": x["userId"],
    "rating": tf.cast(x["rating"], tf.float32),
})

# The movie dataset only needs the raw title strings.
movies = movies.map(lambda x: x["title"])



In [None]:
# Number of rating examples in the tf.data pipeline.
total_ratings = len(ratings)
print('Total Data: {}'.format(total_ratings))

Total Data: 105339


In [None]:
tf.random.set_seed(2206)

# Split sizes are derived from the dataset instead of being hard-coded.
n_total = len(ratings)
n_test = 10_000
n_train = n_total - n_test  # 95339 for the 105339 ratings reported above

# The buffer covers the whole dataset so the shuffle is uniform, and
# reshuffle_each_iteration=False keeps the order fixed so that take()/skip()
# produce disjoint train and test sets.
shuffled = ratings.shuffle(n_total, seed=2206, reshuffle_each_iteration=False)

# Split the *shuffled* dataset. Previously the split was taken from the
# unshuffled `ratings`, which made `test` simply the last 10,000 rows in file
# order and left `shuffled` unused.
train = shuffled.take(n_train)
test = shuffled.skip(n_train).take(n_test)

In [None]:
# Batch both datasets so their values can be pulled into NumPy in a few steps.
movie_titles = movies.batch(1_000)
user_ids = ratings.batch(1_000).map(lambda batch: batch["userId"])

# Flatten the batches, then de-duplicate to obtain the lookup vocabularies.
all_titles = np.concatenate(list(movie_titles))
all_user_ids = np.concatenate(list(user_ids))
unique_movie_titles = np.unique(all_titles)
unique_user_ids = np.unique(all_user_ids)

print('Unique Movies: {}'.format(len(unique_movie_titles)))
print('Unique users: {}'.format(len(unique_user_ids)))

Unique Movies: 10327
Unique users: 668


In [None]:
import tensorflow_recommenders as tfrs
import tensorflow as tf

In [None]:
class MovieModel(tfrs.models.Model):
  """Multi-task recommender: rating prediction plus retrieval on shared embeddings.

  The class reads three notebook-level names when it is instantiated:
  `unique_movie_titles` and `unique_user_ids` (lookup vocabularies) and
  `movies` (dataset of titles used as retrieval candidates).
  """

  def __init__(
      self,
      rating_weight: float,
      retrieval_weight: float,
      embedding_dimension: int = 64,
  ) -> None:
    """Build the embedding towers, the rating head and the two tasks.

    Args:
      rating_weight: Multiplier applied to the rating (MSE) loss.
      retrieval_weight: Multiplier applied to the retrieval loss.
      embedding_dimension: Width of the user and movie embeddings.
        Defaults to 64, the value that used to be hard-coded.
    """
    # We take the loss weights in the constructor: this allows us to instantiate
    # several model objects with different loss weights.

    super().__init__()

    # User and movie models.
    self.movie_model: tf.keras.layers.Layer = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_movie_titles, mask_token=None),
      # One extra row on top of the vocabulary size.
      # NOTE(review): presumably for the lookup's out-of-vocabulary index -- confirm.
      tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
    ])
    self.user_model: tf.keras.layers.Layer = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
    ])

    # A small model to take in user and movie embeddings and predict ratings.
    # We can make this as complicated as we want as long as we output a scalar
    # as our prediction.
    self.rating_model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, activation="relu"),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(1),
    ])

    # The tasks.
    self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )
    self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=movies.batch(128).map(self.movie_model)
        )
    )

    # The loss weights.
    self.rating_weight = rating_weight
    self.retrieval_weight = retrieval_weight

  def call(
      self, features: Dict[Text, tf.Tensor]
  ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
    """Embed a batch of users and movies and predict their ratings.

    Args:
      features: Mapping that contains at least "userId" and "title".

    Returns:
      A 3-tuple `(user_embeddings, movie_embeddings, rating_predictions)`.
      The predictions come from the rating head, whose last layer is
      `Dense(1)`.
    """
    # We pick out the user features and pass them into the user model.
    user_embeddings = self.user_model(features["userId"])
    # And pick out the movie features and pass them into the movie model.
    movie_embeddings = self.movie_model(features["title"])

    return (
        user_embeddings,
        movie_embeddings,
        # We apply the multi-layered rating model to a concatenation of
        # user and movie embeddings.
        self.rating_model(
            tf.concat([user_embeddings, movie_embeddings], axis=1)
        ),
    )

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Return the weighted sum of the rating loss and the retrieval loss.

    Args:
      features: Mapping with "userId", "title" and the label "rating".
        The mapping passed in is not modified.
      training: When true, the retrieval task skips its top-K metrics.

    Returns:
      `rating_weight * rating_loss + retrieval_weight * retrieval_loss`.
    """
    # Pop the label from a shallow copy so the caller's dict keeps "rating".
    features = dict(features)
    ratings = features.pop("rating")

    user_embeddings, movie_embeddings, rating_predictions = self(features)

    # We compute the loss for each task.
    rating_loss = self.rating_task(
        labels=ratings,
        predictions=rating_predictions,
    )
    # The factorized top-K metric compares the queries with the full
    # candidate set, so it is only computed outside of training steps;
    # evaluation results are unaffected.
    retrieval_loss = self.retrieval_task(
        user_embeddings,
        movie_embeddings,
        compute_metrics=not training,
    )

    # And combine them using the loss weights.
    return (self.rating_weight * rating_loss
            + self.retrieval_weight * retrieval_loss)

In [None]:
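# NOTE(review): 'model1.h5' is written by model.save_weights() further down, so it
# holds weights only; model.load_weights('model1.h5') is the matching loader.
#model = tf.keras.models.load_model('model1.h5')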

In [None]:
# Both losses get the same weight.
model = MovieModel(rating_weight=1.0, retrieval_weight=1.0)
adagrad = tf.keras.optimizers.Adagrad(0.1)
model.compile(optimizer=adagrad)

# Batch and cache both splits; the training split is shuffled before batching.
train_batches = train.shuffle(100_000).batch(1_000)
cached_train = train_batches.cache()
cached_test = test.batch(1_000).cache()

model.fit(cached_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x79896e402e30>

Time to train model: around 23 minutes

In [None]:
# Evaluate on the cached test batches; return_dict=True yields a dict of
# metric values keyed by metric name, which the prints below index into.
metrics = model.evaluate(cached_test, return_dict=True)

print(f"\nRetrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}")
print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}")


Retrieval top-100 accuracy: 0.014
Ranking RMSE: 0.987


In [None]:
# Persist the trained weights only (not the full model) to model1.h5.
model.save_weights(filepath='model1.h5')

In [None]:
def predict_movie(user, top_n=3):
    """Print the ``top_n`` movie titles retrieved for ``user``.

    Relies on the notebook-level ``model`` and ``movies`` objects.

    Args:
        user: User id; converted with ``str()`` before it is passed to the
            user model.
        top_n: Number of recommendations to retrieve and print.
    """
    # Create a model that takes in raw query features, and recommends movies
    # out of the entire movies dataset. k is set to top_n so that a request
    # for more titles than the layer's default number of results is honoured
    # instead of being silently truncated.
    index = tfrs.layers.factorized_top_k.BruteForce(model.user_model, k=top_n)
    index.index_from_dataset(
      tf.data.Dataset.zip((movies.batch(100), movies.batch(100).map(model.movie_model)))
    )

    # Get recommendations.
    _, titles = index(tf.constant([str(user)]))

    print('Top {} recommendations for user {}:\n'.format(top_n, user))
    for i, title in enumerate(titles[0, :top_n].numpy()):
        print('{}. {}'.format(i+1, title.decode("utf-8")))

def predict_rating(user, movie):
    """Print the rating that the notebook-level ``model`` predicts.

    Args:
        user: User id; converted with ``str()`` before it is passed to the model.
        movie: Movie title passed to the model as-is.
    """
    model_inputs = {
        "userId": np.array([str(user)]),
        "title": np.array([movie]),
    }
    # The model returns three values; only the last one (the rating
    # prediction) is needed here.
    _, _, predicted_rating = model(model_inputs)
    score = predicted_rating.numpy()[0][0]
    print("Predicted rating for {}: {}".format(movie, score))

In [None]:
# Five recommendations for user 123.
predict_movie(user=123, top_n=5)

Top 5 recommendations for user 123:

1. Christmas with the Kranks (2004)
2. Four Seasons, The (1981)
3. New York Minute (2004)
4. Shall We Dance? (2004)
5. Crossing Delancey (1988)


In [None]:
# Predicted rating of user 123 for a single title.
predict_rating(user=123, movie='Toy Story (1995)')

Predicted rating for Toy Story (1995): 4.102571487426758


In [None]:
# Ratings recorded for user 123 (userId is stored as a string at this point).
ratings_df.loc[ratings_df['userId'] == '123']

Unnamed: 0,userId,movieId,rating,title,genres
15044,123,5,3.5,Father of the Bride Part II (1995),Comedy
15045,123,7,3.5,Sabrina (1995),Comedy|Romance
15046,123,21,4.0,Get Shorty (1995),Comedy|Crime|Thriller
15047,123,141,3.5,"Birdcage, The (1996)",Comedy
15048,123,296,4.5,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
...,...,...,...,...,...
15204,123,34336,1.5,Must Love Dogs (2005),Comedy|Romance
15205,123,35836,4.0,"40-Year-Old Virgin, The (2005)",Comedy|Romance
15206,123,42734,2.0,Hoodwinked! (2005),Animation|Children|Comedy
15207,123,43836,3.0,"Pink Panther, The (2006)",Adventure|Comedy|Crime
