In [70]:
import os
import pprint
import tempfile
import json

In [71]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
import pandas

# Import user reviews and game data

In [6]:
with open('data/pruned_user_data.json', "r") as infile:
    user_data = json.load(infile)
    
with open('data/final_app_data.json', "r") as infile:
    game_data = json.load(infile)

## In version 1 of the retrieval model, we create a "user_upvoted" tf dataset where each entry is a movie and a user who positively reviewed it. We make dict tensor slices to support more features in the future.

In [11]:
user_upvoted_df = pandas.DataFrame(columns=['user_id', 'game_id'])
for user in user_data:
    for game in user_data[user]:
        if user_data[user][game]['voted_up?'] == True:
            user_upvoted_df.loc[len(user_upvoted_df.index)] = [user, game]

In [12]:
user_upvoted_df.to_csv('data/user_upvoted.csv')

### Change id types to ints

In [45]:
user_upvoted_df[["user_id", "game_id"]] = user_upvoted_df[["user_id", "game_id"]].apply(pandas.to_numeric)

In [52]:
user_upvoted_df.dtypes

user_id    int64
game_id    int64
dtype: object

In [46]:
user_upvoted_ds = tf.data.Dataset.from_tensor_slices(dict(user_upvoted_df))

### Check data structure

In [14]:
for element in user_upvoted_ds: 
    print(element)
    break

{'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'76561198318922553'>, 'game_id': <tf.Tensor: shape=(), dtype=string, numpy=b'394360'>}


In [15]:
type(user_upvoted_ds)

tensorflow.python.data.ops.dataset_ops.TensorSliceDataset

## Now get the list of game_ids and create the game dataset, game_ds

In [28]:
unique_game_ids = [int(x) for x in game_data]
game_ds = tf.data.experimental.from_list(unique_game_ids)

In [29]:
for element in game_ds: 
    print(element)
    break

tf.Tensor(730, shape=(), dtype=int32)


In [24]:
unique_user_ids = [int(x) for x in user_data]

### Create train and test data from user_upvoted data

In [22]:
tf.data.experimental.cardinality(user_upvoted_ds).numpy()

146852

In [53]:
tf.random.set_seed(1989)
shuffled = user_upvoted_ds.shuffle(146_852, seed=1989, reshuffle_each_iteration=False)

train = shuffled.take(100_000)
test = shuffled.skip(100_000).take(25_000)

# User and Game embedding models

For both we map the unique game and user ids to a continuous range using their ids as a vocabulary, then convert to an embedding using the Embedding layer

In [54]:
embedding_dim = 32
user_model = tf.keras.Sequential([
    tf.keras.layers.IntegerLookup(
        vocabulary=unique_user_ids, mask_token=None),
    # We add an additional embedding to account for unknown tokens
    tf.keras.layers.Embedding(len(unique_user_ids)+1, embedding_dim)
])

game_model = tf.keras.Sequential([
    tf.keras.layers.IntegerLookup(
        vocabulary=unique_game_ids, mask_token=None),
    # We add an additional embedding to account for unknown tokens
    tf.keras.layers.Embedding(len(unique_game_ids)+1, embedding_dim)
])

# Metric
We use Factorized Top K metric, since it takes as its only input the candidates from the retrieval model. We need to compare the affinity score that the model calculates for this pair to the scores of all the other possible candidates

Then combine this with built-in loss function

In [55]:
metrics = tfrs.metrics.FactorizedTopK(candidates=game_ds.batch(128).map(game_model))
task = tfrs.tasks.Retrieval(
    metrics=metrics
)

# Retrieval Model
Defining the final model, defining training and test steps with loss functions

If training_metrics is set False, we skip calculating metrics in training as well as evaluation

In [81]:
class SteamRetrievalModel(tf.keras.Model):
    def __init__(self, user_model, game_model, training_metrics=True):
        super().__init__()
        self.user_model: tf.keras.Model = user_model
        self.game_model: tf.keras.Model = game_model
        self.task: tf.keras.layers.Layer = task

    # features / datasets should be of form {str: tensor}
    def train_step(self, features):

        # Record gradients
        with tf.GradientTape() as tape:
            # Loss computation
            user_embeddings = self.user_model(features["user_id"])
            positive_game_embeddings = self.game_model(features["game_id"])
            loss = self.task(user_embeddings, positive_game_embeddings)

            # Regularization losses prevent overfitting by encouraging learning smaller weights
            # Penalties added back into the final loss 
            regularization_loss = sum(self.losses)

            total_loss = loss + regularization_loss

        gradients = tape.gradient(total_loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

    #if training_metrics:
        metrics = {metric.name: metric.result() for metric in self.metrics}
        metrics["loss"] = loss
        metrics["regularization_loss"] = regularization_loss
        metrics["total_loss"] = total_loss
    
        return metrics

    # features / datasets should be of form {str: tensor}
    def test_step(self, features) -> tf.Tensor:

        # Loss computation
        user_embeddings = self.user_model(features["user_id"])
        positive_game_embeddings = self.game_model(features["game_id"])
        loss = self.task(user_embeddings, positive_game_embeddings)

        regularization_loss = sum(self.losses)

        total_loss = loss + regularization_loss

        metrics = {metric.name: metric.result() for metric in self.metrics}
        metrics["loss"] = loss
        metrics["regularization_loss"] = regularization_loss
        metrics["total_loss"] = total_loss

        return metrics

IndentationError: unexpected indent (417085925.py, line 33)

### Simple model

In [74]:
def SteamRetrievalModelSimple(tfrs.Model):
    
    def __init__(self, usermodel, movie_model):
        super().__init__()
        self.game_model: tf.keras.Model = game_model
        self.user_model: tf.keras.Model = user_model
        self.task: tf.keras.layers.Layer = task

    def compute_loss(self, features):
        user_embed = self.user_model(features["user_id"])
        positive_game_embed = self.game_model(features["game_id"])
        return self.task(user_embed, positive_game_embed)

SyntaxError: invalid syntax (1478502560.py, line 1)

# Keras fitting and evaluation
### Train in three epochs using gradient model AdaGrad

In [78]:
model = SteamRetrievalModel(user_model, game_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

### Shuffle, batch, and cache training and test data

In [79]:
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

### Train the model

In [80]:
model.fit(cached_train, epochs=3)

Epoch 1/3


NameError: in user code:

    File "C:\Users\Bluecat\miniconda3\envs\tf_conda\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\Bluecat\miniconda3\envs\tf_conda\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\Bluecat\miniconda3\envs\tf_conda\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\Bluecat\AppData\Local\Temp\ipykernel_29056\3440614214.py", line 27, in train_step
        if training_metrics:

    NameError: name 'training_metrics' is not defined
