In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets
!pip install numpy
!pip install pandas
!pip install sklearn



In [3]:
import os
import pprint
import tempfile
from typing import Dict, Text
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

In [4]:
# data pre-processing


# import dataset and the dataset is from https://www.kaggle.com/tamber/steam-video-games/
df = pd.read_csv('./steam-200k.csv', header=None)

df_play = df[(df[2] == 'play')]

df_play_input = df_play[[0, 1, 3]]
df_play_input.columns = ["user_id", "name", "playing_hours"]

# get the max time of a game and min time of a game

df_max = df_play_input.groupby("name").max().reset_index()[["name", "playing_hours"]]
df_max.columns = ["name", "max_playing_hours"]
df_min = df_play_input.groupby("name").min().reset_index()[["name", "playing_hours"]]
df_min.columns = ["name", "min_playing_hours"]
df_max_and_min = pd.merge(df_max, df_min, on="name")

# get index of game

df_max_and_min['game_index'] = df_max_and_min.index

# calutate the percentage

df_full_table = pd.merge(df_play_input, df_max_and_min, on="name")
df_full_table['percentage'] = df_full_table['playing_hours'] / (
            df_full_table['max_playing_hours'] + df_full_table['min_playing_hours'])


# convert to rating

def convert_to_rating(i):
    if (0 <= i < 0.2):
        rating = 1
    elif (0.2 <= i < 0.4):
        rating = 2
    elif (0.4 <= i < 0.6):
        rating = 3
    elif (0.6 <= i < 0.8):
        rating = 4
    elif (0.8 <= i <= 1.0):
        rating = 5
    else:
        rating = 0
    return rating


df_full_table['rating'] = df_full_table['percentage'].apply(lambda x: convert_to_rating(x))

df_game_info = pd.read_csv('./steam_games.csv')
df_full_table_with_game_info = pd.merge(df_full_table, df_game_info, on="name")

df_full_table_array = df_full_table_with_game_info[['name','user_id','rating','game_description','genre']].to_numpy().astype('U')
df_full_table_array = pd.DataFrame(data=df_full_table_array, columns=["name",'user_id','rating','game_description','genre'])

In [5]:
# generate inputs for tensorflow 

ratings = tf.data.Dataset.from_tensor_slices(dict(df_full_table_array)).map(lambda x: {
    "user_id": x["user_id"],
    "game_title": x["name"],
    "user_rating": float(x["rating"]),
    "game_desc": x["game_description"],
    "game_genre": x["genre"]  
})

In [6]:
# dataset splitting
tf.random.set_seed(40)
shuffled = ratings.shuffle(36257, seed=40, reshuffle_each_iteration=False)

train = shuffled.take(29005)
test = shuffled.skip(29005).take(7252)

In [7]:
# get unique items from features
games = tf.data.Dataset.from_tensor_slices(dict(df_full_table_array)).map(lambda x: 
    x["name"])

game_desc = tf.data.Dataset.from_tensor_slices(dict(df_full_table_array)).map(lambda x: 
    x["game_description"])

game_genre = tf.data.Dataset.from_tensor_slices(dict(df_full_table_array)).map(lambda x: 
    x["genre"])

game_titles = ratings.batch(1_000).map(lambda x: x["game_title"])
unique_game_titles = np.unique(np.concatenate(list(game_titles)))

user_ids = ratings.batch(1_000).map(lambda x: x["user_id"])
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

game_descs = ratings.batch(1_000).map(lambda x: x["game_desc"])
unique_game_descs = np.unique(np.concatenate(list(game_descs)))

game_genres = ratings.batch(1_000).map(lambda x: x["game_genre"])
unique_game_genres = np.unique(np.concatenate(list(game_genres)))


In [8]:
class RankingModel(tf.keras.Model):

  def __init__(self, use_game_titles, use_game_descs, use_game_genres):
    super().__init__()
    embedding_dimension = 32
    max_tokens = 10_000_000

    self.use_game_titles = use_game_titles
    self.use_game_descs = use_game_descs
    self.use_game_genres = use_game_genres

    # embeddings for users
    self.user_embeddings = tf.keras.Sequential([
      tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=unique_user_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
    ])

    # embeddings for games
    self.game_embeddings = tf.keras.Sequential([
      tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=unique_game_titles, mask_token=None),
      tf.keras.layers.Embedding(len(unique_game_titles) + 1, embedding_dimension)
    ])

    self.ratings = tf.keras.Sequential([
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      tf.keras.layers.Dense(1)
  ])
    
    # embeddings for game descriptions
    if self.use_game_titles:    
      self.title_vectorizer = tf.keras.layers.experimental.preprocessing.TextVectorization(
          max_tokens=max_tokens)
    
      self.title_text_embedding = tf.keras.Sequential([
        self.title_vectorizer,
        tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
        tf.keras.layers.GlobalAveragePooling1D(),
      ])

      self.title_vectorizer.adapt(games)

    # embeddings for game descriptions
    if self.use_game_descs: 
      self.game_desc_embeddings = tf.keras.Sequential([
        tf.keras.layers.experimental.preprocessing.StringLookup(
          vocabulary=unique_game_descs, mask_token=None),
        tf.keras.layers.Embedding(len(unique_game_descs) + 1, embedding_dimension)
      ])    
      
      self.game_desc_vectorizer = tf.keras.layers.experimental.preprocessing.TextVectorization(
          max_tokens=max_tokens)

      self.game_desc_text_embedding = tf.keras.Sequential([
        self.game_desc_vectorizer,
        tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
        tf.keras.layers.GlobalAveragePooling1D(),
      ])

      self.game_desc_vectorizer.adapt(game_desc)


    # embeddings for game genres
    if self.use_game_genres:
      self.game_genre_embeddings = tf.keras.Sequential([
        tf.keras.layers.experimental.preprocessing.StringLookup(
          vocabulary=unique_game_genres, mask_token=None),
        tf.keras.layers.Embedding(len(unique_game_genres) + 1, embedding_dimension)
      ])    
      
      self.game_genre_vectorizer = tf.keras.layers.experimental.preprocessing.TextVectorization(
          max_tokens=max_tokens)

      self.game_genre_text_embedding = tf.keras.Sequential([
        self.game_genre_vectorizer,
        tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
        tf.keras.layers.GlobalAveragePooling1D(),
      ])

      self.game_genre_vectorizer.adapt(game_genre)

  
    
  def call(self, inputs):

    user_id = inputs[0]
    game_title = inputs[1]
    if self.use_game_descs and self.use_game_genres:
      game_descs = inputs[2]
      game_genres = inputs[3]
    else:
      if self.use_game_descs:
        game_descs = inputs[2]
      if self.use_game_genres:
        game_genres = inputs[2]

    embedddings = []

    user_embedding = self.user_embeddings(user_id)
    embedddings.append(user_embedding)
    game_embedding = self.game_embeddings(game_title)
    embedddings.append(game_embedding)

    if self.use_game_titles: 
      title_text_embedding = self.title_text_embedding(game_title)
      embedddings.append(title_text_embedding)

    if self.use_game_descs: 
      game_desc_embeddings = self.game_desc_embeddings(game_descs)
      game_desc_text_embedding = self.game_desc_text_embedding(game_descs)
      embedddings.append(game_desc_embeddings)
      embedddings.append(game_desc_text_embedding)

    if self.use_game_genres:
      game_genre_embeddings = self.game_genre_embeddings(game_genres)
      game_genre_text_embedding = self.game_genre_text_embedding(game_genres)

    return self.ratings(tf.concat(embedddings, axis=1))


In [9]:
# define loss function
task = tfrs.tasks.Ranking(
  loss = tf.keras.losses.MeanSquaredError(),
  metrics=[tf.keras.metrics.RootMeanSquaredError(),tf.keras.metrics.MeanAbsoluteError()]
)

In [10]:
class SteamModel(tfrs.models.Model):

  def __init__(self, use_game_titles, use_game_descs, use_game_genres):
    super().__init__()
    self.ranking_model: tf.keras.Model = RankingModel(use_game_titles, use_game_descs, use_game_genres)
    self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
      loss = tf.keras.losses.MeanSquaredError(),
      metrics=[tf.keras.metrics.RootMeanSquaredError(),tf.keras.metrics.MeanAbsoluteError()]
    )
    self.use_game_titles = use_game_titles
    self.use_game_descs = use_game_descs
    self.use_game_genres = use_game_genres


  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    feature_lst = [features["user_id"], features["game_title"]]
    if self.use_game_descs:
      feature_lst.append(features["game_desc"])
    if self.use_game_genres:
      feature_lst.append(features["game_genre"])  
    rating_predictions = self.ranking_model(tuple(feature_lst))

    return self.task(labels=features["user_rating"], predictions=rating_predictions)

In [11]:
# compile model
model = SteamModel(use_game_titles = True, use_game_descs = False, use_game_genres = True)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [12]:
cached_train = train.shuffle(29005).batch(512)
cached_test = test.batch(1024).cache()

In [13]:
# fit the model
model.fit(cached_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f3b39584490>

In [14]:
# model evaluation
model.evaluate(cached_test, return_dict=True)



{'loss': 0.3453771770000458,
 'mean_absolute_error': 0.42796310782432556,
 'regularization_loss': 0,
 'root_mean_squared_error': 0.9483715891838074,
 'total_loss': 0.3453771770000458}

In [15]:
# Reference:
# [1] TensorFlow. (2021, March 19). TensorFlow Recommenders. https://www.tensorflow.org/recommenders