In [1]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/96.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/96.2 kB[0m [31m985.7 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h

Ref: https://colab.research.google.com/github/tensorflow/recommenders/blob/main/docs/examples/sequential_retrieval.ipynb#scrollTo=GULCYkofR2pP&uniqifier=1

In [2]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
# import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

In [3]:
train_filename = "/content/drive/MyDrive/tensorflow_movie_rec/train_movielens_1m.tfrecord"
train = tf.data.TFRecordDataset(train_filename)

test_filename = "/content/drive/MyDrive/tensorflow_movie_rec/test_movielens_1m.tfrecord"
test = tf.data.TFRecordDataset(test_filename)

feature_description = {
    'synthetic_session_movie_id': tf.io.FixedLenFeature([10], tf.int64, default_value=np.repeat(0, 10)),
    'label_movie_id': tf.io.FixedLenFeature([1], tf.int64, default_value=0),
}

def _parse_function(example_proto):
  return tf.io.parse_single_example(example_proto, feature_description)

train_ds = train.map(_parse_function).map(lambda x: {
    "synthetic_session_movie_id": tf.strings.as_string(x["synthetic_session_movie_id"]),
    "label_movie_id": tf.strings.as_string(x["label_movie_id"])
})

test_ds = test.map(_parse_function).map(lambda x: {
    "synthetic_session_movie_id": tf.strings.as_string(x["synthetic_session_movie_id"]),
    "label_movie_id": tf.strings.as_string(x["label_movie_id"])
})

for x in train_ds.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'label_movie_id': array([b'720'], dtype=object),
 'synthetic_session_movie_id': array([b'3186', b'1270', b'1721', b'1022', b'2340', b'1836', b'3408',
       b'2804', b'1207', b'1193'], dtype=object)}


In [4]:
import pandas as pd

In [5]:
movies = pd.read_csv("/content/drive/MyDrive/movie_data/ml-1m//movies.dat", sep="::", names=["movie_id", "title", "genres"], encoding = 'latin-1')

  movies = pd.read_csv("/content/drive/MyDrive/movie_data/ml-1m//movies.dat", sep="::", names=["movie_id", "title", "genres"], encoding = 'latin-1')


In [6]:
movie_id = movies['movie_id']

print(type(movie_id))

<class 'pandas.core.series.Series'>


In [7]:
dataset = tf.data.Dataset.from_tensor_slices(movie_id.values.astype(np.bytes_))

In [8]:
def map_function(movie_id):
    # Convert the input to byte string if it's not already
    return tf.strings.as_string(tf.strings.to_number(movie_id, out_type=tf.int32))

In [9]:
mapped_dataset = dataset.map(map_function)

In [10]:
movie_ids = mapped_dataset.batch(1_000)
unique_movie_ids = np.unique(np.concatenate(list(movie_ids)))

In [11]:
print(unique_movie_ids)

[b'1' b'10' b'100' ... b'997' b'998' b'999']


In [None]:
# movies = tfds.load("movielens/1m-movies", split='train')
# movies = movies.map(lambda x: x["movie_id"])
# movie_ids = movies.batch(1_000)
# unique_movie_ids = np.unique(np.concatenate(list(movie_ids)))

Downloading and preparing dataset 5.64 MiB (download: 5.64 MiB, generated: 351.12 KiB, total: 5.99 MiB) to /root/tensorflow_datasets/movielens/1m-movies/0.1.1...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/3883 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/movielens/1m-movies/0.1.1.incomplete1RLLGL/movielens-train.tfrecord*...:  …

Dataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/1m-movies/0.1.1. Subsequent calls will reuse this data.


In [12]:
embedding_dimension = 32

query_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(
      vocabulary=unique_movie_ids, mask_token=None),
    tf.keras.layers.Embedding(len(unique_movie_ids) + 1, embedding_dimension),
    tf.keras.layers.GRU(embedding_dimension),
])

candidate_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_movie_ids, mask_token=None),
  tf.keras.layers.Embedding(len(unique_movie_ids) + 1, embedding_dimension)
])

In [13]:
metrics = tfrs.metrics.FactorizedTopK(
  candidates=mapped_dataset.batch(128).map(candidate_model)
)

task = tfrs.tasks.Retrieval(
  metrics=metrics
)

class Model(tfrs.Model):

    def __init__(self, query_model, candidate_model):
        super().__init__()
        self.query_model = query_model
        self.candidate_model = candidate_model
        self.task = task

    def compute_loss(self, features, training=False):
        watch_history = features["synthetic_session_movie_id"]
        watch_next_label = features["label_movie_id"]

        query_embedding = self.query_model(watch_history)
        candidate_embedding = self.candidate_model(watch_next_label)

        return self.task(query_embedding, candidate_embedding, compute_metrics=not training)

In [14]:
model = Model(query_model, candidate_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [15]:
cached_train = train_ds.shuffle(10_000).batch(12800).cache()
cached_test = test_ds.batch(2560).cache()

In [16]:
model.fit(cached_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7ef0c0143070>

In [17]:
eval = model.evaluate(cached_test, return_dict=True)
print(eval)

{'factorized_top_k/top_1_categorical_accuracy': 0.0148540660738945, 'factorized_top_k/top_5_categorical_accuracy': 0.07610049098730087, 'factorized_top_k/top_10_categorical_accuracy': 0.13044126331806183, 'factorized_top_k/top_50_categorical_accuracy': 0.36749982833862305, 'factorized_top_k/top_100_categorical_accuracy': 0.5095497965812683, 'loss': 11002.9736328125, 'regularization_loss': 0, 'total_loss': 11002.9736328125}


In [18]:
for i in cached_test:
  test_array = i
  break
print(test_array)

{'synthetic_session_movie_id': <tf.Tensor: shape=(2560, 10), dtype=string, numpy=
array([[b'2402', b'2404', b'2815', ..., b'1031', b'3142', b'1951'],
       [b'2404', b'2815', b'2565', ..., b'3142', b'1951', b'1416'],
       [b'2815', b'2565', b'73', ..., b'1951', b'1416', b'1760'],
       ...,
       [b'1499', b'3039', b'2918', ..., b'595', b'1223', b'1889'],
       [b'3039', b'2918', b'333', ..., b'1223', b'1889', b'2594'],
       [b'2918', b'333', b'2231', ..., b'1889', b'2594', b'1206']],
      dtype=object)>, 'label_movie_id': <tf.Tensor: shape=(2560, 1), dtype=string, numpy=
array([[b'1416'],
       [b'1760'],
       [b'2917'],
       ...,
       [b'2594'],
       [b'1206'],
       [b'32']], dtype=object)>}


In [None]:
# movies = tfds.load("movielens/100k-movies", split="train")
# movies = (movies
#           # Retain only the fields we need.
#           .map(lambda x: x["movie_title"])
#           # Cache for efficiency.
#           .cache(tempfile.NamedTemporaryFile().name))

Downloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 150.35 KiB, total: 4.84 MiB) to /root/tensorflow_datasets/movielens/100k-movies/0.1.1...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/1682 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/movielens/100k-movies/0.1.1.incompleteFM2JV2/movielens-train.tfrecord*...:…

Dataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/100k-movies/0.1.1. Subsequent calls will reuse this data.


In [19]:
brute_force = tfrs.layers.factorized_top_k.BruteForce(model.candidate_model)
brute_force.index_from_dataset(
    mapped_dataset.batch(128).map(lambda movie_id: (movie_id, model.candidate_model(movie_id)))
)


<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7ef0c0140790>

In [20]:
_, titles = brute_force(test_array, k=3)

  inputs = self._flatten_to_reference_inputs(inputs)


In [21]:
print(titles)

tf.Tensor(
[[b'1416' b'48' b'1487']
 [b'1760' b'829' b'2084']
 [b'2917' b'1227' b'3529']
 ...
 [b'2594' b'1306' b'3503']
 [b'1206' b'2010' b'1214']
 [b'32' b'1584' b'29']], shape=(2560, 3), dtype=string)


In [22]:
model.save_weights('/content/drive/MyDrive/tensorflow_movie_rec/trained_model_weights_without_tfds/content_model_weights', save_format='tf')

In [23]:
loaded_model = Model(query_model,candidate_model)
loaded_model.load_weights('/content/drive/MyDrive/tensorflow_movie_rec/trained_model_weights_without_tfds/content_model_weights')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7ef0c0096590>

In [25]:
brute_force = tfrs.layers.factorized_top_k.BruteForce(loaded_model.candidate_model)
brute_force.index_from_dataset(
    mapped_dataset.batch(128).map(lambda movie_id: (movie_id, loaded_model.candidate_model(movie_id)))
)


<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7ef04a680ee0>

In [26]:
_, titles = brute_force(test_array, k=10)

In [27]:
print(titles)

tf.Tensor(
[[b'1416' b'48' b'1487' ... b'3602' b'829' b'2877']
 [b'1760' b'829' b'2084' ... b'2088' b'2392' b'3600']
 [b'2917' b'1227' b'3529' ... b'2349' b'1674' b'3101']
 ...
 [b'2594' b'1306' b'3503' ... b'1653' b'2232' b'2681']
 [b'1206' b'2010' b'1214' ... b'3703' b'1210' b'1199']
 [b'32' b'1584' b'29' ... b'2916' b'1748' b'1200']], shape=(2560, 10), dtype=string)


In [28]:
# testing new data from users

data = np.array([[b'2402', b'2404', b'2815', b'1031', b'3142', b'1951', b'231', b'265', b'1355', b'1363']])

test_array = tf.constant(data, dtype= tf.string)
_, titles = brute_force(test_array, k=10)

In [29]:
print(titles)

tf.Tensor(
[[[b'2402' b'2815' b'3441' b'2404' b'2476' b'2816' b'2410' b'2260'
   b'3431' b'2756']
  [b'2404' b'2816' b'2815' b'2402' b'3766' b'1445' b'3441' b'3716'
   b'2260' b'3389']
  [b'2815' b'2402' b'3441' b'2816' b'2404' b'2476' b'2411' b'3431'
   b'2756' b'2260']
  [b'1031' b'1030' b'2135' b'2088' b'3672' b'107' b'3345' b'2099'
   b'3600' b'2056']
  [b'3142' b'1856' b'1932' b'1381' b'2659' b'769' b'3465' b'206' b'1289'
   b'3459']
  [b'1951' b'2941' b'3199' b'3549' b'3675' b'3600' b'963' b'1932'
   b'3604' b'2565']
  [b'2694' b'1614' b'1911' b'231' b'2060' b'784' b'3120' b'344' b'3254'
   b'333']
  [b'265' b'2443' b'446' b'17' b'25' b'1094' b'58' b'28' b'2708' b'2291']
  [b'1355' b'1168' b'891' b'742' b'1342' b'2279' b'1717' b'2898' b'382'
   b'532']
  [b'1363' b'2906' b'169' b'455' b'2412' b'510' b'640' b'1417' b'2269'
   b'193']]], shape=(1, 10, 10), dtype=string)
