In [None]:
#!pip install -q tensorflow-recommenders
#!pip install -q --upgrade tensorflow-datasets
#!pip install -q scann

In [None]:
import ast
import os
import pprint
import sys
import tempfile
from typing import Dict, Text

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Load the ratings, movie and user tables: from Google Drive when running on
# Colab, otherwise from the current working directory.
if 'google.colab' in str(get_ipython()):
  ratings = pd.read_csv('/content/drive/MyDrive/585_project/data.csv', sep=',')
  movies = pd.read_csv('/content/drive/MyDrive/585_project/data_movie.csv', sep=',')
  users = pd.read_csv('/content/drive/MyDrive/585_project/user_data.csv', sep=';')
else:
  ratings = pd.read_csv('data.csv', sep=',')
  movies = pd.read_csv('data_movie.csv', sep=',')
  # The preprocessing cells need `users` as well; without this line they fail
  # with NameError outside Colab.
  # NOTE(review): local file name assumed to mirror the Drive one — confirm.
  users = pd.read_csv('user_data.csv', sep=';')

In [None]:
users

Unnamed: 0.1,Unnamed: 0,userID,age,occupation,gender
0,0,1,34,sales/marketing,M
1,1,2,33,college/grad student,M
2,2,3,29,scientist,M
3,3,4,30,other or not specified,M
4,4,5,26,scientist,M
...,...,...,...,...,...
999995,999995,999996,31,scientist,M
999996,999996,999997,27,scientist,M
999997,999997,999998,37,executive/managerial,M
999998,999998,999999,29,scientist,M


# Data Preprocessing

In [None]:
# Encode gender as a boolean flag: 'M' becomes True, 'F' becomes False.
users['gender'] = users['gender'].map(dict(M=True, F=False))

In [None]:
# Replace each occupation label with its integer category code.
users['occupation'] = pd.Categorical(users['occupation']).codes

In [None]:
# Keep only the user features used downstream (drops the "Unnamed" columns).
users = users.loc[:, ['userID', 'age', 'occupation', 'gender']]

In [None]:
users

Unnamed: 0,userID,age,occupation,gender
0,1,34,14,True
1,2,33,4,True
2,3,29,15,True
3,4,30,11,True
4,5,26,15,True
...,...,...,...,...
999995,999996,31,15,True
999996,999997,27,15,True
999997,999998,37,7,True
999998,999999,29,15,True


In [None]:
users['gender'].value_counts()

True     829960
False    170040
Name: gender, dtype: int64

In [None]:
(users["age"] < 18).value_counts()

False    987855
True      12145
Name: age, dtype: int64

In [None]:
(movies["adult"] == True).value_counts()

False    25506
True         1
Name: adult, dtype: int64

In [None]:
# Keep only the columns used downstream. The explicit copy makes `movies` an
# independent frame, so the column assignments in later cells do not trigger
# pandas' SettingWithCopyWarning.
movies = movies[['movieID','genres','length']].copy()
movies

Unnamed: 0,movieID,genres,length
0,++++++1959,"[{'id': 35, 'name': 'Comedy'}]",98
1,++++2013,"[{'id': 9648, 'name': 'Mystery'}, {'id': 80, '...",90
2,+la+mode+1993,"[{'id': 35, 'name': 'Comedy'}]",82
3,+laventure+2008,"[{'id': 18, 'name': 'Drama'}]",104
4,+nos+amours+1983,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",102
...,...,...,...
25502,zulu+1964,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",138
25503,zulu+2013,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",110
25504,zulu+dawn+1979,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '...",117
25505,zus++zo+2001,"[{'id': 35, 'name': 'Comedy'}]",106


In [None]:
movies.isna().sum()

movieID    0
genres     0
length     0
dtype: int64

In [None]:
# Parse the stringified list-of-dicts in `genres` into real Python objects.
# ast.literal_eval only accepts Python literals, so unlike eval() it cannot
# execute arbitrary code embedded in the CSV.
movies['genres'] = movies['genres'].apply(ast.literal_eval)

In [None]:
movies

Unnamed: 0,movieID,genres,length
0,++++++1959,"[{'id': 35, 'name': 'Comedy'}]",98
1,++++2013,"[{'id': 9648, 'name': 'Mystery'}, {'id': 80, '...",90
2,+la+mode+1993,"[{'id': 35, 'name': 'Comedy'}]",82
3,+laventure+2008,"[{'id': 18, 'name': 'Drama'}]",104
4,+nos+amours+1983,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",102
...,...,...,...
25502,zulu+1964,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",138
25503,zulu+2013,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",110
25504,zulu+dawn+1979,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '...",117
25505,zus++zo+2001,"[{'id': 35, 'name': 'Comedy'}]",106


In [None]:
# Pick one parsed row to inspect the structure of a genre list.
l = movies['genres'][1]
l

[{'id': 9648, 'name': 'Mystery'},
 {'id': 80, 'name': 'Crime'},
 {'id': 18, 'name': 'Drama'}]

In [None]:
[d['id'] for d in l if 'id' in d]

[9648, 80, 18]

In [None]:
# Reduce every genre dict to its name. Guard on the key that is actually read,
# so an entry without a 'name' is skipped instead of raising KeyError.
movies['genres'] = movies['genres'].apply(
    lambda genre_dicts: [d['name'] for d in genre_dicts if 'name' in d])

In [None]:
movies

Unnamed: 0,movieID,genres,length
0,++++++1959,[Comedy],98
1,++++2013,"[Mystery, Crime, Drama]",90
2,+la+mode+1993,[Comedy],82
3,+laventure+2008,[Drama],104
4,+nos+amours+1983,"[Drama, Romance]",102
...,...,...,...
25502,zulu+1964,"[Action, Drama, History, War]",138
25503,zulu+2013,"[Crime, Drama, Thriller]",110
25504,zulu+dawn+1979,"[Adventure, Drama, History, War]",117
25505,zus++zo+2001,[Comedy],106


In [None]:
movies['genres'][35]

[]

In [None]:
# Distinct genre names across all movies. The array displayed later in this
# notebook also contains a NaN entry (presumably from movies whose genre list
# is empty); it is filtered out before the genre list is used by the models.
unique_genres = movies["genres"].explode().unique() 

In [None]:
unique_genres[:10]

array(['Comedy', 'Mystery', 'Crime', 'Drama', 'Romance', 'Documentary',
       'Thriller', 'Action', 'Animation', 'Science Fiction'], dtype=object)

In [None]:
len(unique_genres)

21

In [None]:
# One-hot encode the genre lists: one column per genre, 1 where the movie has
# that genre and 0 otherwise. `columns=` replaces the positional axis argument
# to drop(), which newer pandas releases no longer accept.
movies = pd.concat([
        movies.drop(columns="genres"),
        movies.genres.apply(lambda x: pd.Series(1, x)).fillna(0)
    ], axis=1)

  


In [None]:
movies.dtypes

movieID             object
length               int64
Comedy             float64
Mystery            float64
Crime              float64
Drama              float64
Romance            float64
Documentary        float64
Thriller           float64
Action             float64
Animation          float64
Science Fiction    float64
Adventure          float64
War                float64
Horror             float64
Western            float64
Fantasy            float64
Family             float64
History            float64
TV Movie           float64
Music              float64
Foreign            float64
dtype: object

In [None]:
# Store every genre indicator (all columns after movieID and length) as int8.
movies = movies.astype({genre: "int8" for genre in movies.columns[2:]})

In [None]:
movies

Unnamed: 0,movieID,length,Comedy,Mystery,Crime,Drama,Romance,Documentary,Thriller,Action,...,Adventure,War,Horror,Western,Fantasy,Family,History,TV Movie,Music,Foreign
0,++++++1959,98,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,++++2013,90,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,+la+mode+1993,82,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,+laventure+2008,104,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,+nos+amours+1983,102,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25502,zulu+1964,138,0,0,0,1,0,0,0,1,...,0,1,0,0,0,0,1,0,0,0
25503,zulu+2013,110,0,0,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
25504,zulu+dawn+1979,117,0,0,0,1,0,0,0,0,...,1,1,0,0,0,0,1,0,0,0
25505,zus++zo+2001,106,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
movies.isna().sum()

movieID            0
length             0
Comedy             0
Mystery            0
Crime              0
Drama              0
Romance            0
Documentary        0
Thriller           0
Action             0
Animation          0
Science Fiction    0
Adventure          0
War                0
Horror             0
Western            0
Fantasy            0
Family             0
History            0
TV Movie           0
Music              0
Foreign            0
dtype: int64

In [None]:
movies.loc[movies.isnull().any(axis=1)]

Unnamed: 0,movieID,length,Comedy,Mystery,Crime,Drama,Romance,Documentary,Thriller,Action,...,Adventure,War,Horror,Western,Fantasy,Family,History,TV Movie,Music,Foreign


In [None]:
# Attach the user features to every rating row (inner join on userID).
ratings = pd.merge(ratings, users, on='userID')

In [None]:
# The user embedding is built on a StringLookup, so user ids must be strings.
ratings = ratings.astype({'userID': str})
ratings

Unnamed: 0,userID,movieID,ratings,age,occupation,gender
0,340626,acapulco_+prima+spiaggia...+a+sinistra+1983,4,27,11,False
1,26447,otello+1986,3,27,7,True
2,172600,holes+2003,2,23,7,True
3,314533,my+brother+the+terrorist+2014,4,34,4,True
4,249289,sommer+der+gaukler+2011,2,30,15,True
...,...,...,...,...,...,...
199995,130638,whiplash+2014,5,26,7,True
199996,200610,digging+up+the+marrow+2015,4,33,14,True
199997,47231,the+zone+2007,4,27,4,True
199998,375329,shall+we+dance+1937,5,33,1,True


In [None]:
#movies['original_language'].fillna('en', inplace=True)
#movies['overview'].fillna(' ', inplace=True)
#movies['release_date'].fillna('2000-01-01', inplace=True)

In [None]:
#movies['genres'] = movies['genres'].astype(str)

In [None]:
# Attach the movie features to every rating row (inner join on movieID).
ratings = pd.merge(ratings, movies, on='movieID')

In [None]:
ratings

Unnamed: 0,userID,movieID,ratings,age,occupation,gender,length,Comedy,Mystery,Crime,...,Adventure,War,Horror,Western,Fantasy,Family,History,TV Movie,Music,Foreign
0,340626,acapulco_+prima+spiaggia...+a+sinistra+1983,4,27,11,False,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,120833,acapulco_+prima+spiaggia...+a+sinistra+1983,3,53,7,True,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,125301,acapulco_+prima+spiaggia...+a+sinistra+1983,4,29,7,True,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,26447,otello+1986,3,27,7,True,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,294309,otello+1986,4,34,4,True,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199766,119698,lucy+2003,3,31,14,True,128,0,0,0,...,0,0,0,0,0,0,0,1,0,0
199767,389538,the+fraternity+2002,3,28,18,True,100,0,0,0,...,0,0,0,0,0,0,0,0,0,0
199768,419919,the+blood+spattered+bride+1972,3,33,7,True,100,0,0,0,...,0,0,1,0,0,0,0,0,0,0
199769,48011,destroyer+1943,4,30,7,True,99,0,0,0,...,1,1,0,0,0,0,0,0,0,0


In [None]:
ratings.dtypes

userID             object
movieID            object
ratings             int64
age                 int64
occupation           int8
gender               bool
length              int64
Comedy               int8
Mystery              int8
Crime                int8
Drama                int8
Romance              int8
Documentary          int8
Thriller             int8
Action               int8
Animation            int8
Science Fiction      int8
Adventure            int8
War                  int8
Horror               int8
Western              int8
Fantasy              int8
Family               int8
History              int8
TV Movie             int8
Music                int8
Foreign              int8
dtype: object

In [None]:
ratings.isna().sum()

userID             0
movieID            0
ratings            0
age                0
occupation         0
gender             0
length             0
Comedy             0
Mystery            0
Crime              0
Drama              0
Romance            0
Documentary        0
Thriller           0
Action             0
Animation          0
Science Fiction    0
Adventure          0
War                0
Horror             0
Western            0
Fantasy            0
Family             0
History            0
TV Movie           0
Music              0
Foreign            0
dtype: int64

# Create TensorFlow Datasets

In [None]:
# Convert both DataFrames into tf.data datasets of per-row feature dicts.
# Note: `ratings` and `movies` are rebound here, so from this cell on they are
# tf.data Datasets, not pandas DataFrames.
ratings = tf.data.Dataset.from_tensor_slices(dict(ratings))
movies = tf.data.Dataset.from_tensor_slices(dict(movies))

In [None]:
type(ratings)

tensorflow.python.data.ops.dataset_ops.TensorSliceDataset

In [None]:
# = ratings.map(lambda x: {
    #"movie_title": x["movieID"],
    #"user_id": x["userID"],
    #"user_rating": x["ratings"],
    #"genres" : x["Comedy"],
    #"length" : x["length"],
    #"occupation" : x["occupation"],
    #"age" : x["age"],
    #"gender" : x["gender"]
#})

#movies = movies.map(lambda x:{
    #"movieID" : x["movieID"],
    #"length" : x["length"]
#})


In [None]:
# Collect every movie length into one flat NumPy array; the candidate tower
# adapts its Normalization layer on it.
length_batches = movies.map(lambda movie: movie["length"]).batch(100)
movie_length = np.concatenate(list(length_batches.as_numpy_iterator()))
#movie_genre = np.concatenate(list(movies.map(lambda x: x["genres"]).batch(100)))

In [None]:
# Print the first element of the ratings dataset to check its feature keys.
for x in ratings.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'Action': 0,
 'Adventure': 0,
 'Animation': 0,
 'Comedy': 1,
 'Crime': 0,
 'Documentary': 0,
 'Drama': 0,
 'Family': 0,
 'Fantasy': 0,
 'Foreign': 0,
 'History': 0,
 'Horror': 0,
 'Music': 0,
 'Mystery': 0,
 'Romance': 0,
 'Science Fiction': 0,
 'TV Movie': 0,
 'Thriller': 0,
 'War': 0,
 'Western': 0,
 'age': 27,
 'gender': False,
 'length': 0,
 'movieID': b'acapulco_+prima+spiaggia...+a+sinistra+1983',
 'occupation': 11,
 'ratings': 4,
 'userID': b'340626'}


In [None]:
# Print the first element of the movies dataset to check its feature keys.
for x in movies.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'Action': 0,
 'Adventure': 0,
 'Animation': 0,
 'Comedy': 1,
 'Crime': 0,
 'Documentary': 0,
 'Drama': 0,
 'Family': 0,
 'Fantasy': 0,
 'Foreign': 0,
 'History': 0,
 'Horror': 0,
 'Music': 0,
 'Mystery': 0,
 'Romance': 0,
 'Science Fiction': 0,
 'TV Movie': 0,
 'Thriller': 0,
 'War': 0,
 'Western': 0,
 'length': 98,
 'movieID': b'++++++1959'}


In [None]:
movies

<TensorSliceDataset element_spec={'movieID': TensorSpec(shape=(), dtype=tf.string, name=None), 'length': TensorSpec(shape=(), dtype=tf.int64, name=None), 'Comedy': TensorSpec(shape=(), dtype=tf.int8, name=None), 'Mystery': TensorSpec(shape=(), dtype=tf.int8, name=None), 'Crime': TensorSpec(shape=(), dtype=tf.int8, name=None), 'Drama': TensorSpec(shape=(), dtype=tf.int8, name=None), 'Romance': TensorSpec(shape=(), dtype=tf.int8, name=None), 'Documentary': TensorSpec(shape=(), dtype=tf.int8, name=None), 'Thriller': TensorSpec(shape=(), dtype=tf.int8, name=None), 'Action': TensorSpec(shape=(), dtype=tf.int8, name=None), 'Animation': TensorSpec(shape=(), dtype=tf.int8, name=None), 'Science Fiction': TensorSpec(shape=(), dtype=tf.int8, name=None), 'Adventure': TensorSpec(shape=(), dtype=tf.int8, name=None), 'War': TensorSpec(shape=(), dtype=tf.int8, name=None), 'Horror': TensorSpec(shape=(), dtype=tf.int8, name=None), 'Western': TensorSpec(shape=(), dtype=tf.int8, name=None), 'Fantasy': Ten

In [None]:
ratings

<TensorSliceDataset element_spec={'userID': TensorSpec(shape=(), dtype=tf.string, name=None), 'movieID': TensorSpec(shape=(), dtype=tf.string, name=None), 'ratings': TensorSpec(shape=(), dtype=tf.int64, name=None), 'age': TensorSpec(shape=(), dtype=tf.int64, name=None), 'occupation': TensorSpec(shape=(), dtype=tf.int8, name=None), 'gender': TensorSpec(shape=(), dtype=tf.bool, name=None), 'length': TensorSpec(shape=(), dtype=tf.int64, name=None), 'Comedy': TensorSpec(shape=(), dtype=tf.int8, name=None), 'Mystery': TensorSpec(shape=(), dtype=tf.int8, name=None), 'Crime': TensorSpec(shape=(), dtype=tf.int8, name=None), 'Drama': TensorSpec(shape=(), dtype=tf.int8, name=None), 'Romance': TensorSpec(shape=(), dtype=tf.int8, name=None), 'Documentary': TensorSpec(shape=(), dtype=tf.int8, name=None), 'Thriller': TensorSpec(shape=(), dtype=tf.int8, name=None), 'Action': TensorSpec(shape=(), dtype=tf.int8, name=None), 'Animation': TensorSpec(shape=(), dtype=tf.int8, name=None), 'Science Fiction':

# Random split (80% of the ratings in the train set and 20% in the test set). In an industrial recommender system, this would most likely be done by time: the data up to time $T$ would be used to predict interactions after $T$.



In [None]:
# Shuffle once with a fixed seed. reshuffle_each_iteration=False keeps the
# order stable between iterations, so take()/skip() below always see the same
# rows.
tf.random.set_seed(42)
shuffled = ratings.shuffle(200_000, seed=42, reshuffle_each_iteration=False)

# NOTE(review): the merged ratings frame displayed above ends at index 199769,
# so `test` presumably holds fewer than 40_000 rows — confirm.
train = shuffled.take(160_000)
test = shuffled.skip(160_000).take(40_000)

# Batch and cache the datasets that are fed to fit()/evaluate().
# NOTE(review): the second shuffle sits before .cache(), so the cached batch
# order is presumably reused on every epoch — confirm this is intended.
cached_train = train.shuffle(200_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

In [None]:
# Batched views used to collect the vocabularies below. Despite its name,
# `movie_titles` holds batches of whole movie feature dicts (see its
# element_spec), while `user_ids` holds only the userID column.
movie_titles = movies.batch(1_000)
user_ids = ratings.batch(100_000).map(lambda x: x["userID"])#user_id

In [None]:
movie_titles

<BatchDataset element_spec={'movieID': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'length': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'Comedy': TensorSpec(shape=(None,), dtype=tf.int8, name=None), 'Mystery': TensorSpec(shape=(None,), dtype=tf.int8, name=None), 'Crime': TensorSpec(shape=(None,), dtype=tf.int8, name=None), 'Drama': TensorSpec(shape=(None,), dtype=tf.int8, name=None), 'Romance': TensorSpec(shape=(None,), dtype=tf.int8, name=None), 'Documentary': TensorSpec(shape=(None,), dtype=tf.int8, name=None), 'Thriller': TensorSpec(shape=(None,), dtype=tf.int8, name=None), 'Action': TensorSpec(shape=(None,), dtype=tf.int8, name=None), 'Animation': TensorSpec(shape=(None,), dtype=tf.int8, name=None), 'Science Fiction': TensorSpec(shape=(None,), dtype=tf.int8, name=None), 'Adventure': TensorSpec(shape=(None,), dtype=tf.int8, name=None), 'War': TensorSpec(shape=(None,), dtype=tf.int8, name=None), 'Horror': TensorSpec(shape=(None,), dtype=tf.int8, name=None), 'Wes

# Unique user ids and movie titles present in the dataset (to be used in StringLookup)

In [None]:
def _distinct_column(batched_ds, column):
  """Return the sorted distinct values of `column` over a batched dataset of dicts."""
  pieces = [batch[column].numpy() for batch in batched_ds]
  return np.unique(np.concatenate(pieces))

# Vocabularies for the lookup layers of the models below.
unique_movie_titles = _distinct_column(movie_titles, "movieID")
unique_gender = _distinct_column(ratings.batch(1_000), "gender")
unique_age = _distinct_column(ratings.batch(1_000), "age")
unique_user_occupation_label = _distinct_column(ratings.batch(1_000), "occupation")
# `user_ids` already yields the bare userID column, so no key lookup is needed.
unique_user_ids = np.unique(np.concatenate([ids.numpy() for ids in user_ids]))

In [None]:
unique_movie_titles[:10]

array([b'++++++1959', b'++++2013', b'+la+mode+1993', b'+laventure+2008',
       b'+nos+amours+1983', b'+nous+la+libert+1931',
       b'+propos+de+nice+1930', b'...and+god+created+woman+1956',
       b'...and+justice+for+all+1979',
       b'...and+the+pursuit+of+happiness+1986'], dtype=object)

In [None]:
unique_user_ids[:10]

array([b'1', b'10', b'10000', b'100004', b'100006', b'100009', b'10001',
       b'100010', b'100016', b'100020'], dtype=object)

In [None]:
unique_gender[:]

array([False,  True])

In [None]:
unique_age[:10]

array([ 8,  9, 10, 11, 12, 13, 14, 15, 16, 17])

In [None]:
unique_user_occupation_label[:10]

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int8)

# Retrieval Model

In [None]:
#filter(lambda v: v==v, unique_genres)

In [None]:
unique_genres

array(['Comedy', 'Mystery', 'Crime', 'Drama', 'Romance', 'Documentary',
       'Thriller', 'Action', 'Animation', 'Science Fiction', 'Adventure',
       'War', 'Horror', 'Western', nan, 'Fantasy', 'Family', 'History',
       'TV Movie', 'Music', 'Foreign'], dtype=object)

In [None]:
# Drop the NaN placeholder so only real genre names remain (as a plain list).
unique_genres = list(filter(lambda genre: str(genre) != 'nan', unique_genres))

In [None]:
len(unique_genres)

20

In [None]:
# The two values a one-hot genre indicator can take: 0 and 1.
unique_val = np.arange(2)

In [None]:
#unique_genres = np.array(unique_genres)

In [None]:
type(unique_genres)

list

In [None]:
embedding_dimension = 32

class RetievalUserModel(tf.keras.Model):
  """Query tower: concatenates user-id, age, occupation and gender embeddings.

  `call` expects a dict with the keys "user_id", "age", "occupation" and
  "gender" and concatenates the four embeddings along axis 1.
  """
  def __init__(self):
    super().__init__()
    self.user_embedding = tf.keras.Sequential([
        tf.keras.layers.StringLookup(
            vocabulary=unique_user_ids, mask_token=None),
        # We add an additional embedding to account for unknown tokens.
        tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension),
    ])
    self.age_embedding = tf.keras.Sequential([
        tf.keras.layers.IntegerLookup(
            vocabulary=unique_age, mask_token=None),
        tf.keras.layers.Embedding(len(unique_age) + 1, embedding_dimension),
    ])
    # Occupation codes are int8 and gender is bool in the dataset. Both the
    # vocabulary and the inputs (see call) are converted to int64 explicitly,
    # so the lookup does not depend on implicit dtype handling.
    self.occupation_embedding = tf.keras.Sequential([
        tf.keras.layers.IntegerLookup(
            vocabulary=np.asarray(unique_user_occupation_label, dtype="int64"),
            mask_token=None),
        tf.keras.layers.Embedding(len(unique_user_occupation_label) + 1, embedding_dimension),
    ])
    self.gender_embedding = tf.keras.Sequential([
        tf.keras.layers.IntegerLookup(
            vocabulary=np.asarray(unique_gender, dtype="int64"),
            mask_token=None),
        tf.keras.layers.Embedding(len(unique_gender) + 1, embedding_dimension),
    ])

  def call(self, inputs):
    return tf.concat([
        self.user_embedding(inputs["user_id"]),
        self.age_embedding(inputs["age"]),
        self.occupation_embedding(tf.cast(inputs["occupation"], tf.int64)),
        self.gender_embedding(tf.cast(inputs["gender"], tf.int64)),
    ], axis=1)

class RetievalMovieModel(tf.keras.Model):
  """Candidate tower: movie-id embedding plus optional length and genre features.

  Args:
    use_genre: if true, append a dense projection of the multi-hot genre vector.
    use_length: if true, append the normalized movie length as one extra column.

  `call` expects a dict holding "movieID", plus "length" when `use_length` is
  set and one 0/1 indicator per name in `unique_genres` when `use_genre` is set.
  """
  def __init__(self,use_genre,use_length):
    super().__init__()
    self.use_genre = use_genre
    self.use_length = use_length
    self.title_embedding = tf.keras.Sequential([
        tf.keras.layers.StringLookup(
            vocabulary=unique_movie_titles, mask_token=None),
        tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension),
    ])
    if use_length:
      self.movie_length_normalizer = tf.keras.layers.Normalization(axis=None)
      self.length_embedding = tf.keras.Sequential([self.movie_length_normalizer,tf.keras.layers.Reshape([1])])
      self.movie_length_normalizer.adapt(movie_length)
    if use_genre:
      # The genre indicators form a multi-hot vector, not a single token id,
      # so they are projected with a Dense layer instead of an Embedding.
      self.genre_embedding = tf.keras.layers.Dense(embedding_dimension)

  def call(self, inputs):
    parts = [self.title_embedding(inputs['movieID'])]
    # Each optional feature is added only when it was enabled, so the layers
    # referenced here always exist.
    if self.use_length:
      parts.append(self.length_embedding(inputs['length']))
    if self.use_genre:
      # Stack the per-genre indicator columns side by side, one column per genre.
      genre_flags = tf.stack(
          [tf.cast(inputs[k], tf.float32) for k in unique_genres], axis=1)
      parts.append(self.genre_embedding(genre_flags))
    return tf.concat(parts, axis=1)

class RecRetModel(tfrs.Model):
  """Two-tower retrieval model over the user and movie feature towers.

  Args:
    use_genre: forwarded to RetievalMovieModel.
    use_length: forwarded to RetievalMovieModel.
  """
  def __init__(self, use_genre,use_length):
    super().__init__()
    self.query_model = tf.keras.Sequential([
      RetievalUserModel(),
      tf.keras.layers.Dense(32)
    ])
    self.candidate_model = tf.keras.Sequential([
      RetievalMovieModel(use_genre,use_length),
      tf.keras.layers.Dense(32)
    ])
    # `movies` yields feature dicts keyed like the candidate tower's inputs
    # (movieID, length and one column per genre), so it can be mapped directly.
    self.task = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=movies.batch(128).map(self.candidate_model),
        ),
    )

  def compute_loss(self, features, training=False):
    """Return the retrieval loss for one batch of rating feature dicts."""
    # Only the user-side features are passed to the query tower, so its input
    # structure does not depend on the movie columns of the training data.
    query_embeddings = self.query_model({
        "user_id": features["userID"],
        "age" : features["age"],
        "occupation" : features["occupation"],
        "gender" : features["gender"]
    })
    # The candidate tower reads the per-genre indicator columns by name, so
    # they are forwarded under their own keys, exactly as in `movies`.
    movie_features = {
        "movieID": features["movieID"],
        "length" : features["length"]
    }
    movie_features.update({k: features[k] for k in unique_genres})
    movie_embeddings = self.candidate_model(movie_features)

    return self.task(query_embeddings, movie_embeddings)

In [None]:
# Sanity check: print the Python type of every entry in unique_genres.
for k in unique_genres: print(type(k))

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


In [None]:
cached_train

<CacheDataset element_spec={'userID': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'movieID': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'ratings': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'age': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'occupation': TensorSpec(shape=(None,), dtype=tf.int8, name=None), 'gender': TensorSpec(shape=(None,), dtype=tf.bool, name=None), 'length': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'Comedy': TensorSpec(shape=(None,), dtype=tf.int8, name=None), 'Mystery': TensorSpec(shape=(None,), dtype=tf.int8, name=None), 'Crime': TensorSpec(shape=(None,), dtype=tf.int8, name=None), 'Drama': TensorSpec(shape=(None,), dtype=tf.int8, name=None), 'Romance': TensorSpec(shape=(None,), dtype=tf.int8, name=None), 'Documentary': TensorSpec(shape=(None,), dtype=tf.int8, name=None), 'Thriller': TensorSpec(shape=(None,), dtype=tf.int8, name=None), 'Action': TensorSpec(shape=(None,), dtype=tf.int8, name=None), 'Animation

In [None]:
# Build and train the feature-based retrieval model with both the genre and
# the length features enabled.
model_1 = RecRetModel(use_genre = True, use_length = True)
model_1.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
model_1.fit(cached_train, epochs=3)



ValueError: ignored

In [None]:
model_1.evaluate(cached_test, return_dict=True)





{'factorized_top_k/top_1_categorical_accuracy': 7.543184619862586e-05,
 'factorized_top_k/top_5_categorical_accuracy': 0.00032687134807929397,
 'factorized_top_k/top_10_categorical_accuracy': 0.0006537426961585879,
 'factorized_top_k/top_50_categorical_accuracy': 0.003796736244112253,
 'factorized_top_k/top_100_categorical_accuracy': 0.007492896635085344,
 'loss': 26638.298828125,
 'regularization_loss': 0,
 'total_loss': 26638.298828125}

In [None]:
embedding_dimension = 32

# ID-only towers: each one maps a string id to a learned embedding.
user_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_user_ids, mask_token=None),
  # We add an additional embedding to account for unknown tokens.
  tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
])

movie_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_movie_titles, mask_token=None),
  tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
])

# `movies` yields whole feature dicts, while the movie tower takes only the
# string id, so the id column is selected before embedding the candidates.
metrics = tfrs.metrics.FactorizedTopK(
  candidates=movies.batch(128).map(lambda movie: movie_model(movie["movieID"]))
)

task = tfrs.tasks.Retrieval(
  metrics=metrics
)

# NOTE: this definition replaces the feature-based RecRetModel from the
# earlier cell; it uses the module-level `task` defined just above.
class RecRetModel(tfrs.Model):
  """ID-only two-tower retrieval model.

  Args:
    user_model: maps a batch of user ids to embeddings.
    movie_model: maps a batch of movie ids to embeddings.
  """

  def __init__(self, user_model, movie_model):
    super().__init__()
    self.movie_model: tf.keras.Model = movie_model
    self.user_model: tf.keras.Model = user_model
    self.task: tf.keras.layers.Layer = task

  @staticmethod
  def _feature(features, *names):
    """Return the first of `names` present in `features`; KeyError if none is."""
    for name in names:
      if name in features:
        return features[name]
    raise KeyError(f"none of {names} found in features")

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    # The datasets built in this notebook use the raw column names ("userID",
    # "movieID"); the renamed keys are still accepted as a fallback.
    user_embeddings = self.user_model(
        self._feature(features, "userID", "user_id"))
    # And pick out the movie features and pass them into the movie model,
    # getting embeddings back.
    positive_movie_embeddings = self.movie_model(
        self._feature(features, "movieID", "movie_title"))

    # The task computes the loss and the metrics.
    return self.task(user_embeddings, positive_movie_embeddings)

In [None]:
# Build and train the ID-only retrieval model. This rebinds `model_1`,
# replacing the model created in the earlier training cell.
model_1 = RecRetModel(user_model, movie_model)
model_1.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
model_1.fit(cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fba672ac3d0>

In [None]:
model_1.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.001025000005029142,
 'factorized_top_k/top_5_categorical_accuracy': 0.008200000040233135,
 'factorized_top_k/top_10_categorical_accuracy': 0.01614999957382679,
 'factorized_top_k/top_50_categorical_accuracy': 0.0700249969959259,
 'factorized_top_k/top_100_categorical_accuracy': 0.10557500272989273,
 'loss': 25402.27734375,
 'regularization_loss': 0,
 'total_loss': 25402.27734375}

In [None]:
# Getting a list of 100 possible movies from the model.

# Create an index layer that takes in raw query features (a user id) and
# recommends movies out of the entire movies dataset.
index = tfrs.layers.factorized_top_k.BruteForce(model_1.user_model, k=100)
# `movies` yields whole feature dicts; the index needs (identifier, embedding)
# pairs, and the movie tower takes only the string id, so index on movieID.
movie_ids = movies.map(lambda movie: movie["movieID"])
index.index_from_dataset(
  tf.data.Dataset.zip((movie_ids.batch(200), movie_ids.batch(200).map(model_1.movie_model)))
)

In [None]:
# Get recommendations: the index returns (scores, identifiers); only the
# identifiers are kept here.
_, titles = index(tf.constant(["42"]))
print(f"Recommendations for user 42: {titles[0, :]}")

Recommendations for user 42: [b'whisper+of+the+heart+1995' b'batman+begins+2005' b'shrek+2001'
 b'shutter+island+2010' b'ong-bak+the+thai+warrior+2003'
 b'blade+runner+1982' b'harry+potter+and+the+order+of+the+phoenix+2007'
 b'pirates+of+the+caribbean+the+curse+of+the+black+pearl+2003' b'up+2009'
 b'forrest+gump+1994' b'the+lord+of+the+rings+the+return+of+the+king+2003'
 b'aliens+1986' b'harry+potter+and+the+deathly+hallows+part+2+2011'
 b'dial+m+for+murder+1954' b'the+dark+knight+rises+2012'
 b'monsters_+inc.+2001' b'megamind+2010' b'inglourious+basterds+2009'
 b'wreck-it+ralph+2012' b'harry+potter+and+the+philosophers+stone+2001'
 b'iron+man+2+2010' b'penguins+of+madagascar+2014' b'casino+royale+2006'
 b'raiders+of+the+lost+ark+1981' b'the+jungle+book+1967'
 b'dragon+ball+z+broly+-+second+coming+1994' b'seventh+son+2014'
 b'peter+pan+1953' b'the+dark+knight+2008' b'the+fifth+element+1997'
 b'despicable+me+2010' b'monty+python+and+the+holy+grail+1975'
 b'shrek+forever+after+2010' b'mr

In [None]:
# Save the index (model).
# NOTE(review): Drive is mounted at /content/drive/ earlier in this notebook,
# but this path is under /content/gdrive/ — confirm the intended location.
tf.saved_model.save(index, '/content/gdrive/MyDrive/COMP_585_share/modeltf')



# Ranking Model

In [None]:
class RankingModel(tf.keras.Model):
  """Predicts a rating from a (user id, movie id) pair.

  Both ids are embedded, the two embeddings are concatenated, and a small MLP
  maps the result to a single predicted rating per row.
  """

  def __init__(self):
    super().__init__()
    embedding_dimension = 32

    def id_tower(vocabulary):
      # String id -> integer index -> embedding; one extra row for unknown ids.
      return tf.keras.Sequential([
        tf.keras.layers.StringLookup(vocabulary=vocabulary, mask_token=None),
        tf.keras.layers.Embedding(len(vocabulary) + 1, embedding_dimension),
      ])

    # Embedding towers for users and for movies.
    self.user_embeddings = id_tower(unique_user_ids)
    self.movie_embeddings = id_tower(unique_movie_titles)

    # Prediction head: two ReLU layers followed by a single linear output unit.
    hidden_layers = [
      tf.keras.layers.Dense(units, activation="relu") for units in (256, 64)
    ]
    self.ratings = tf.keras.Sequential(hidden_layers + [tf.keras.layers.Dense(1)])

  def call(self, inputs):
    """Return predicted ratings for a `(user_id, movie_title)` tuple of id batches."""
    user_id, movie_title = inputs
    embedded_pair = [
      self.user_embeddings(user_id),
      self.movie_embeddings(movie_title),
    ]
    return self.ratings(tf.concat(embedded_pair, axis=1))


class RecRankModel(tfrs.models.Model):
  """Ranking model: wraps RankingModel with an MSE loss and an RMSE metric."""

  def __init__(self):
    super().__init__()
    self.ranking_model: tf.keras.Model = RankingModel()
    self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
      loss = tf.keras.losses.MeanSquaredError(),
      metrics=[tf.keras.metrics.RootMeanSquaredError()]
    )

  def call(self, features: Dict[str, tf.Tensor]) -> tf.Tensor:
    """Predict ratings for a dict with the keys "user_id" and "movie_title"."""
    return self.ranking_model(
        (features["user_id"], features["movie_title"]))

  @staticmethod
  def _feature(features, *names):
    """Return the first of `names` present in `features`; KeyError if none is."""
    for name in names:
      if name in features:
        return features[name]
    raise KeyError(f"none of {names} found in features")

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Return the ranking loss for one batch of rating feature dicts."""
    # The datasets built in this notebook use the raw column names ("userID",
    # "movieID", "ratings"); the renamed keys are still accepted as a fallback.
    # The values are read without pop(), so the caller's dict is not mutated.
    labels = self._feature(features, "ratings", "user_rating")

    rating_predictions = self({
        "user_id": self._feature(features, "userID", "user_id"),
        "movie_title": self._feature(features, "movieID", "movie_title"),
    })

    # The task computes the loss and the metrics.
    return self.task(labels=labels, predictions=rating_predictions)


In [None]:
# Instantiate the ranking model and compile it with the Adagrad optimizer.
model_2 = RecRankModel()
model_2.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [None]:
# Rebuild the batched, cached train/test datasets with the same pipeline used
# in the train/test split cell earlier in the notebook.
cached_train = train.shuffle(200_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

In [None]:
model_2.fit(cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fba65395d50>

In [None]:
model_2.evaluate(cached_test, return_dict=True)



{'root_mean_squared_error': 0.7398211359977722,
 'loss': 0.5707481503486633,
 'regularization_loss': 0,
 'total_loss': 0.5707481503486633}

In [None]:
# Score a few hand-picked movies for user "42" with the ranking model.
test_movie_titles = ["zulu+dawn+1979", "the+little+mermaid+1989", "shrek+2+2004"]
test_ratings = {
    movie_title: model_2({
        "user_id": np.array(["42"]),
        "movie_title": np.array([movie_title])
    })
    for movie_title in test_movie_titles
}

# Show the movies from highest to lowest predicted rating.
print("Ratings:")
ranked = sorted(test_ratings.items(), key=lambda pair: pair[1], reverse=True)
for title, score in ranked:
  print(f"{title}: {score}")

Ratings:
zulu+dawn+1979: [[3.7354789]]
shrek+2+2004: [[3.716161]]
the+little+mermaid+1989: [[3.71266]]


In [None]:
# Save the ranking model.
# NOTE(review): Drive is mounted at /content/drive/ earlier in this notebook,
# but this path is under /content/gdrive/ — confirm the intended location.
tf.saved_model.save(model_2, '/content/gdrive/MyDrive/COMP_585_share/modeltf_rank')



# Using the 2 steps (Retrieval -> Ranking)

In [None]:
# Load the saved retrieval index back; can also be done in TensorFlow Serving.
loaded_retrieval = tf.saved_model.load('/content/gdrive/MyDrive/COMP_585_share/modeltf')

# Pass a user id in, get top predicted movie titles back.
scores, titles = loaded_retrieval(["42"])

print(f"Recommendations: {titles[0][:]}")

In [None]:
# Load the saved ranking model back; can also be done in TensorFlow Serving.
loaded_rank = tf.saved_model.load('/content/gdrive/MyDrive/COMP_585_share/modeltf_rank')

In [None]:
# Re-rank the retrieved candidates for user "42" with the ranking model.
# NOTE(review): 100 presumably matches k=100 of the retrieval index — confirm.
pred_ratings = np.zeros((100,))

for i in range(100):
  # Pass a user id in, movie id and perform predictions:
  pred_ratings[i] = loaded_rank({"user_id": np.array(["42"]), "movie_title":[titles[0][i].numpy()]}).numpy()[0][0]

# Negate so argsort orders by descending predicted rating; keep the top 20.
order = np.argsort(-pred_ratings)[0:20]


In [None]:
# Movie ids of the 20 best-ranked candidates, best first.
pred_movies = titles[0].numpy()[list(order)]
pred_movies

array([b'interstellar+2014', b'raiders+of+the+lost+ark+1981',
       b'harry+potter+and+the+deathly+hallows+part+2+2011',
       b'the+lord+of+the+rings+the+two+towers+2002',
       b'the+godfather+1972', b'the+jungle+book+1967',
       b'the+dark+knight+2008', b'blade+runner+1982',
       b'the+dark+knight+rises+2012', b'forrest+gump+1994',
       b'the+lord+of+the+rings+the+return+of+the+king+2003',
       b'winchester+73+1950', b'a+grand+day+out+1990', b'up+2009',
       b'the+fifth+element+1997', b'forbidden+planet+1956',
       b'pirates+of+the+caribbean+the+curse+of+the+black+pearl+2003',
       b'the+terminator+1984',
       b'harry+potter+and+the+prisoner+of+azkaban+2004', b'aliens+1986'],
      dtype=object)

array([b'interstellar+2014', b'raiders+of+the+lost+ark+1981',
       b'harry+potter+and+the+deathly+hallows+part+2+2011',
       b'the+lord+of+the+rings+the+two+towers+2002',
       b'the+godfather+1972', b'the+jungle+book+1967',
       b'the+dark+knight+2008', b'blade+runner+1982',
       b'the+dark+knight+rises+2012', b'forrest+gump+1994',
       b'the+lord+of+the+rings+the+return+of+the+king+2003',
       b'winchester+73+1950', b'a+grand+day+out+1990', b'up+2009',
       b'the+fifth+element+1997', b'forbidden+planet+1956',
       b'pirates+of+the+caribbean+the+curse+of+the+black+pearl+2003',
       b'the+terminator+1984',
       b'harry+potter+and+the+prisoner+of+azkaban+2004', b'aliens+1986'],
      dtype=object)