# 5.3 Least Viewed Movies Exploration
In this notebook we'll add another source of movies to our ranking model - a list of all least viewed movies, in addition to the top moves selected by the retrieval model. This will allow these movies to be exposed to more users.

In [1]:
import json
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs

Loading data and metadata:

In [2]:
train_df = pd.read_csv('../local_data/train_data.csv', header=0)
test_df = pd.read_csv('../local_data/test_data.csv', header=0)
with open('../local_data/metadata.json', 'r') as f:
    metadata = json.load(f)

In [3]:
all_users = [str(int(i)) for i in metadata['users']]
all_movies = [str(int(i)) for i in metadata['movies']]
all_cities = metadata['cities']
all_states = metadata['states']
all_ages = [str(int(i)) for i in metadata['ages']]
all_occupations = [str(int(i)) for i in metadata['occupations']]
all_genres = metadata['genres']
title_emb_len = metadata['title_emb_size']
na_value = metadata['string_na']

In [4]:
movies_candidates = tf.data.Dataset.from_tensor_slices({'movie': train_df['movie'].unique()}).map(lambda x: x['movie']).batch(200).map(tf.strings.as_string)

2022-02-26 18:12:16.848098: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Creating a movies Feature Store:

In [5]:
movies_db = pd.concat([train_df, test_df]).drop(['user','city','state','gender','age','occupation','hour','day','month','rating'], axis=1)
movies_db = movies_db.groupby('movie').first()
movies_db['movie'] = movies_db.index

In [6]:
def get_movie_features(movies_list):
    df = movies_db.loc[movies_list,:]
    d = {k:v.to_numpy() for k,v in dict(df).items()}
    d['genres'] = np.transpose(np.array([d[x] for x in all_genres]))
    d['title_emb'] = np.transpose(np.array([d[f'title_emb_{i}'] for i in range(title_emb_len)]))
    for x in all_genres + [f'title_emb_{i}' for i in range(title_emb_len)]:
        d.pop(x)
    d.update({k:np.expand_dims(np.vstack(v), axis=0) for k,v in d.items()})
    return d

Loading models from projects #3.2 and #4.1:

In [7]:
class RatingPredictionModel(tfrs.models.Model):
    def __init__(self):
        super().__init__()
        
        tower_last_layer_size = 50
        large_embedding_size = 25
        medium_embedding_size = 5
        small_embedding_size = 3
        
        # User tower
        
        self.user_input = tf.keras.Input(shape=(1,), dtype=tf.string, name='user_input')
        self.user_sl = tf.keras.layers.StringLookup(vocabulary=all_users, name='user_string_lookup')(self.user_input)
        self.user_emb = tf.squeeze(tf.keras.layers.Embedding(len(all_users)+1, large_embedding_size, name='user_emb')(self.user_sl), axis=1)
        
        self.city_input = tf.keras.Input(shape=(1,), dtype=tf.string, name='city_input')
        self.city_sl = tf.keras.layers.StringLookup(vocabulary=all_cities, mask_token=na_value, name='city_string_lookup')(self.city_input)
        self.city_emb = tf.squeeze(tf.keras.layers.Embedding(len(all_cities)+2, medium_embedding_size, name='city_emb')(self.city_sl), axis=1)
        
        self.state_input = tf.keras.Input(shape=(1,), dtype=tf.string, name='state_input')
        self.state_sl = tf.keras.layers.StringLookup(vocabulary=all_states, mask_token=na_value, name='state_string_lookup')(self.state_input)
        self.state_emb = tf.squeeze(tf.keras.layers.Embedding(len(all_states)+2, small_embedding_size, name='state_emb')(self.state_sl), axis=1)
        
        self.age_input = tf.keras.Input(shape=(1,), dtype=tf.string, name='age_input')
        self.age_sl = tf.keras.layers.StringLookup(vocabulary=all_ages, num_oov_indices=0, name='age_string_lookup')(self.age_input)
        self.age_emb = tf.squeeze(tf.keras.layers.Embedding(len(all_ages), small_embedding_size, name='age_emb')(self.age_sl), axis=1)
        
        self.occupation_input = tf.keras.Input(shape=(1,), dtype=tf.string, name='occupation_input')
        self.occupation_sl = tf.keras.layers.StringLookup(vocabulary=all_occupations, num_oov_indices=0, name='occupation_string_lookup')(self.occupation_input)
        self.occupation_emb = tf.squeeze(tf.keras.layers.Embedding(len(all_occupations), small_embedding_size, name='occupation_emb')(self.occupation_sl), axis=1)
        
        self.gender_input = tf.keras.Input(shape=(1,), name='gender_input')
        self.hour_input = tf.keras.Input(shape=(1,), name='hour_input')
        self.day_input = tf.keras.Input(shape=(1,), name='day_input')
        self.month_input = tf.keras.Input(shape=(1,), name='month_input')
        
        self.user_merged = tf.keras.layers.concatenate([self.user_emb, self.city_emb, self.state_emb, self.age_emb, 
                                                        self.occupation_emb, self.gender_input, self.hour_input,
                                                        self.day_input, self.month_input], 
                                                       axis=-1, name='user_merged')
        self.user_dense = tf.keras.layers.Dense(100, activation='relu', name='user_dense')(self.user_merged)
        self.user_last_layer = tf.keras.layers.Dense(tower_last_layer_size, activation='relu', name='user_last_layer')(self.user_dense)
        
        # Movie tower
        
        self.movie_input = tf.keras.Input(shape=(None,1), dtype=tf.string, name='movie_input ')
        self.movie_sl = tf.keras.layers.StringLookup(vocabulary=all_movies, name='movie_string_lookup')(self.movie_input)
        self.movie_emb = tf.squeeze(tf.keras.layers.Embedding(len(all_movies)+1, large_embedding_size, name='movie_emb')(self.movie_sl), axis=2)
        
        self.title_input = tf.keras.Input(shape=(None,title_emb_len), name='title_input')
        self.title_dense = tf.keras.layers.Dense(title_emb_len, activation='softmax', name='title_softmax')(self.title_input)
        
        self.genres_input = tf.keras.Input(shape=(None,len(all_genres)), name='genres_input')
        self.year_input = tf.keras.Input(shape=(None,1), name='year_input')
        
        self.movie_merged = tf.keras.layers.concatenate([self.movie_emb, self.title_dense, self.genres_input, self.year_input] ,axis=-1, name='movie_merged')
        self.movie_dense = tf.keras.layers.Dense(100, activation='relu', name='movie_dense')(self.movie_merged)
        self.movie_last_layer = tf.keras.layers.Dense(tower_last_layer_size, activation='relu', name='movie_last_layer')(self.movie_dense)
        
        # Combining towers
        
        self.towers_multiplied = tf.keras.layers.Multiply(name='towers_multiplied')([tf.expand_dims(self.user_last_layer, axis=2), 
                                                                                     tf.transpose(self.movie_last_layer, perm=[0,2,1])])
        self.towers_dense1 = tf.keras.layers.Dense(40, activation='relu', name='towers_dense1')(tf.transpose(self.towers_multiplied, perm=[0,2,1]))
        self.towers_dense2 = tf.keras.layers.Dense(20, activation='relu', name='towers_dense2')(self.towers_dense1)
        self.output_node = tf.keras.layers.Dense(1, name='output_node')(self.towers_dense2)
        
        # Model definition
        
        self.model = tf.keras.Model(inputs={'user': self.user_input, 
                                            'city': self.city_input,
                                            'state': self.state_input,
                                            'age': self.age_input,
                                            'occupation': self.occupation_input,
                                            'gender': self.gender_input,
                                            'hour': self.hour_input,
                                            'day': self.day_input,
                                            'month': self.month_input,
                                            'movie': self.movie_input,
                                            'title': self.title_input,
                                            'genres': self.genres_input,
                                            'year': self.year_input
                                            }, 
                                    outputs=self.output_node)
        
        self.task = tfrs.tasks.Ranking(
            loss = tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()]
        )
        
    def call(self, features):
        return self.model({'user': tf.strings.as_string(features["user"]), 
                           'city': features["city"], 
                           'state': features["state"],
                           'age': tf.strings.as_string(features["age"]),
                           'occupation': tf.strings.as_string(features["occupation"]), 
                           'gender': features["gender"],
                           'hour': features["hour"],
                           'day': features["day"],
                           'month': features["month"],
                           'movie': tf.strings.as_string(features["movie"]),
                           'title': features["title_emb"],
                           'genres': features["genres"],
                           'year': features["movie_year"]
                           })
    
    def compute_loss(self, features_dict, training=False):
        labels = features_dict["rating"]
        predictions = self(features_dict)
        return self.task(labels=labels, predictions=predictions)

In [8]:
class MoviesRetrievalModel(tfrs.models.Model):
    def __init__(self):
        super().__init__()
        
        large_embedding_size = 20
        medium_embedding_size = 5
        small_embedding_size = 3
        last_layer_size = 20
        
        # User Model
        
        self.user_input = tf.keras.Input(shape=(1,), dtype=tf.string, name='user_input')
        self.user_sl = tf.keras.layers.StringLookup(vocabulary=all_users, name='user_string_lookup')(self.user_input)
        self.user_emb = tf.squeeze(tf.keras.layers.Embedding(len(all_users)+1, large_embedding_size, name='user_emb')(self.user_sl), axis=1)
        
        self.city_input = tf.keras.Input(shape=(1,), dtype=tf.string, name='city_input')
        self.city_sl = tf.keras.layers.StringLookup(vocabulary=all_cities, mask_token=na_value, name='city_string_lookup')(self.city_input)
        self.city_emb = tf.squeeze(tf.keras.layers.Embedding(len(all_cities)+2, medium_embedding_size, name='city_emb')(self.city_sl), axis=1)
        
        self.state_input = tf.keras.Input(shape=(1,), dtype=tf.string, name='state_input')
        self.state_sl = tf.keras.layers.StringLookup(vocabulary=all_states, mask_token=na_value, name='state_string_lookup')(self.state_input)
        self.state_emb = tf.squeeze(tf.keras.layers.Embedding(len(all_states)+2, small_embedding_size, name='state_emb')(self.state_sl), axis=1)
        
        self.user_merged = tf.keras.layers.concatenate([self.user_emb, self.city_emb, self.state_emb], 
                                                       axis=-1, name='user_merged')
        self.user_dense = tf.keras.layers.Dense(last_layer_size, activation='relu', name='user_dense')(self.user_merged)
        
        self.user_model = tf.keras.Model(inputs={'user': self.user_input,
                                                 'city': self.city_input,
                                                 'state': self.state_input},
                                         outputs=self.user_dense)
        
        
        # Movie Model
        
        self.movie_input = tf.keras.Input(shape=(1,), dtype=tf.string, name='movie_input ')
        self.movie_sl = tf.keras.layers.StringLookup(vocabulary=all_movies, name='movie_string_lookup')(self.movie_input)
        self.movie_emb = tf.squeeze(tf.keras.layers.Embedding(len(all_movies)+1, last_layer_size, name='movie_emb')(self.movie_sl), axis=1)
        
        self.movie_model = tf.keras.Model(inputs={'movie': self.movie_input},
                                          outputs=self.movie_emb)
        
        
        # Task
        
        task_candidates = movies_candidates.map(self.movie_model)  
        top_k_metrics = [tf.keras.metrics.TopKCategoricalAccuracy(k=x, name=f'top_{x}_categorical_accuracy') for x in [10, 100]]
        task_metric = tfrs.metrics.FactorizedTopK(candidates=task_candidates, metrics=top_k_metrics)
        self.task = tfrs.tasks.Retrieval(metrics=task_metric)  # Default loss: tf.keras.losses.CategoricalCrossentropy
        
    
    def compute_loss(self, features, training=False):
        return self.task(
            self.user_model({'user': tf.strings.as_string(features["user"]), 
                             'city': features["city"],
                             'state': features["state"]}),
            self.movie_model(tf.strings.as_string(features["movie"]))
        )

In [9]:
retrieval_model = MoviesRetrievalModel()
ranking_model = RatingPredictionModel()
retrieval_model.compile()
ranking_model.compile()
retrieval_model.load_weights('../project_4/weights_4_1/p41')
ranking_model.load_weights('../project_3/weights_3_2/p32')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x14dd5db50>

We set are retrieval to return only 10 movies:

In [10]:
retrieval = tfrs.layers.factorized_top_k.BruteForce(retrieval_model.user_model, k=10)
retrieval.index_from_dataset(
  tf.data.Dataset.zip((movies_candidates, movies_candidates.map(retrieval_model.movie_model)))
)

2022-02-26 18:12:19.420523: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x14de129d0>

Creating a list of movies only watched once:

In [11]:
movies_count = train_df[['movie']].groupby(by='movie').size().sort_values()
least_viewed = movies_count.where(movies_count == 1).dropna().index.to_numpy()

## Online predictions
Making predicions for a test user. We recieve a list of 10 top movies for this user from the retrieval model, and then before passing it to the ranking model, we concatenate it with the list of the least-viewed movies. In this example, movie IDs which are shown as `b'...'` are the movies recieved from the retrieval model, and the others are the ones from the least-viewed list. We can clearly see the latter movies now appear on the final list, even though the model had 10 better movies at hand.

In [12]:
test_user = dict(test_df[['user','city','state','gender','age','occupation','hour','day','month']].iloc[0])
test_user

{'user': 6040,
 'city': 'Astoria',
 'state': 'NY',
 'gender': 0.0,
 'age': 25,
 'occupation': 6,
 'hour': 2,
 'day': 3,
 'month': 4}

In [13]:
movies_list = np.concatenate((retrieval([tf.constant([str(v)]) for k,v in test_user.items() if k in ['user','city','state']])[1].numpy()[0], 
                              least_viewed))
movies_data = get_movie_features([int(x) for x in movies_list])
ranking_input = {k:np.array([v]) for k,v in test_user.items()}
ranking_input.update(movies_data)
predicted_rating = ranking_model.predict(ranking_input)[0]

print(f'Top 10 predicted movies for user {test_user["user"]}:')
sorted(zip(movies_list, np.squeeze(predicted_rating)), key=lambda x: x[1], reverse=True)[:10]

Top 10 predicted movies for user 6040:


[(b'260', 4.11469),
 (b'1210', 3.783932),
 (b'3753', 3.51542),
 (b'3555', 3.2687955),
 (b'2013', 3.1841974),
 (398, 3.0998983),
 (b'3798', 3.0679595),
 (3237, 3.042822),
 (1852, 3.042057),
 (3277, 3.0390491)]