In [None]:
#data handling
import pandas as pd
import numpy as np
import pprint

#plotting
import matplotlib.pyplot as plt

#model building
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
from tensorflow import keras
import keras
import pickle

#misc
import os
import datetime

In [None]:
# Load the cleaned game details table and print its column dtypes / non-null counts.
game_details_path = '../data/game_details_cleaned.csv'
df_complete = pd.read_csv(game_details_path)
df_complete.info()

In [None]:
# Show all column labels of the loaded frame.
df_complete.columns

In [None]:
# Columns excluded from the feature matrix; every remaining column of
# df_complete becomes a model input.
non_feature_columns = [
    'name', 'yearpublished', 'rank', 'bayesaverage', 'average',
    'usersrated', 'is_expansion', 'abstracts_rank', 'cgs_rank',
    'childrensgames_rank', 'familygames_rank', 'partygames_rank',
    'strategygames_rank', 'thematic_rank', 'wargames_rank',
    'averageweight', 'boardgameartists', 'boardgamecategories',
    'boardgamedesigners', 'boardgamefamilies', 'boardgamemechanics',
    'community_best_with', 'community_recommended_with', 'description',
    'game_id', 'maxplayers', 'maxplaytime', 'median', 'minage',
    'minplayers', 'minplaytime', 'numcomments', 'numweights', 'owned',
    'stddev', 'trading', 'wanting', 'wishing',
]
df_features = df_complete.drop(columns=non_feature_columns)


In [None]:
# Names of the columns that make up the feature matrix.
feature_list = list(df_features.columns)

# Game ids taken from the same frame as the features (hence the same row
# order); used later to label candidates in the retrieval index.
df_game_ids = df_complete.loc[:, 'game_id']

In [None]:
# Print dtypes and non-null counts of the feature columns.
df_features.info()

In [None]:
# Create the candidate dictionary that backs the retrieval dataset:
#   "features" -> one float32 row of model inputs per game
#   "game_id"  -> the id of the game on the same row
# Both arrays are extracted in a single vectorised step, so rows are paired
# purely by position. The earlier row-by-row loop fed the index *label* from
# iterrows() into .iloc (which is positional); that only lines up when the
# frame has a default 0..n-1 index, and it was slow on large frames.
candidate_data_dict = {
    "features": df_features.to_numpy().astype("float32"),
    "game_id": df_game_ids.to_numpy().astype("int32"),
}


In [None]:
# Slice the candidate dictionary into per-game elements and group them into
# batches of 32, each a {"features", "game_id"} dictionary.
candidate_dataset = (
    tf.data.Dataset
    .from_tensor_slices(candidate_data_dict)
    .batch(32)
)

In [None]:
# Hold out 20% of the rows for testing, then split 10% of the remainder off
# for validation. Both splits use the same fixed seed.
train_val_data, test_data = train_test_split(df_features, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_val_data, test_size=0.1, random_state=42)

In [None]:
# Fit a Keras Normalization layer on the training rows only.
# NOTE(review): no later cell visible here references `normalizer` - confirm
# whether it was meant to be part of the encoder.
normalizer = tf.keras.layers.Normalization()
train_feature_matrix = train_data.to_numpy()
normalizer.adapt(train_feature_matrix)

In [None]:
# Helper that builds the train / validation / test tensorflow datasets
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Wrap a feature DataFrame in a batched ``tf.data.Dataset``.

    Args:
        dataframe: Frame whose values form the feature rows.
        shuffle: When true, shuffle with a buffer as large as the frame
            before batching.
        batch_size: Number of rows per emitted batch.

    Returns:
        A dataset whose elements are ``{"features": <batch of rows>}``.
    """
    feature_rows = dataframe.copy().to_numpy()
    dataset = tf.data.Dataset.from_tensor_slices({"features": feature_rows})
    if not shuffle:
        return dataset.batch(batch_size)
    return dataset.shuffle(buffer_size=len(dataframe)).batch(batch_size)

# One dataset per split; only the training split is shuffled.
batch_size = 32
train_ds = df_to_dataset(train_data, shuffle=True, batch_size=batch_size)
val_ds = df_to_dataset(val_data, batch_size=batch_size, shuffle=False)
test_ds = df_to_dataset(test_data, batch_size=batch_size, shuffle=False)

In [None]:
# Define the BoardgameContentModel class
class BoardgameContentModel(tfrs.Model):
    """Single-tower content model that embeds board games from their features.

    The encoder maps a feature vector to an L2-normalised embedding. Training
    uses a ``tfrs.tasks.Retrieval`` task in which each game's embedding serves
    as both the query and the candidate.
    """

    def __init__(self, embedding_dim, candidate_dataset, normalizer=None):
        """Build the encoder and the retrieval task.

        Args:
            embedding_dim: Size of the output embedding.
            candidate_dataset: Batched dataset of dictionaries with a
                ``"features"`` key; embedded to supply the candidates for the
                FactorizedTopK metrics.
            normalizer: Optional, already-adapted preprocessing layer (for
                example ``tf.keras.layers.Normalization``) placed in front of
                the dense layers. ``None`` (the default) feeds the raw
                features straight into the first dense layer.
        """
        # Initialize the parent tfrs.Model class.
        super().__init__()

        # Take the input width from the candidate dataset itself so the class
        # does not silently depend on a notebook-level variable. Fall back to
        # the notebook's df_features only when the dataset cannot report a
        # static feature dimension.
        feature_dim = candidate_dataset.element_spec["features"].shape[-1]
        if feature_dim is None:
            feature_dim = df_features.shape[1]

        # Build the boardgame feature encoder (tower)
        # - Input: feature vector of length `feature_dim`
        # - Optional preprocessing layer supplied by the caller
        # - Hidden layer: 64 neurons with ReLU activation
        # - Output layer: projects to the embedding space of dimension `embedding_dim`
        # - Normalization: L2 normalization, so dot products are cosine similarities
        encoder_layers = [tf.keras.layers.InputLayer(input_shape=(feature_dim,))]
        if normalizer is not None:
            encoder_layers.append(normalizer)
        encoder_layers += [
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(embedding_dim),
            tf.keras.layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1)),
        ]
        self.boardgame_model = tf.keras.Sequential(encoder_layers)

        # Candidate embeddings for the metrics: each candidate batch is a
        # dictionary with a "features" key that is run through the encoder.
        candidate_embeddings = candidate_dataset.map(
            lambda x: self.boardgame_model(x["features"])
        )

        # Configure the retrieval task with FactorizedTopK metrics using the candidate embeddings
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)
        )

    def call(self, inputs, training=False):
        """Forward pass: return the embedding of ``inputs["features"]``."""
        return self.boardgame_model(inputs["features"])

    def compute_loss(self, features, training=False):
        """Return the retrieval loss for one batch.

        Args:
            features: Dictionary with a ``"features"`` entry holding the batch
                of feature rows.
            training: Accepted for the Keras/TFRS interface; not used here.
        """
        # Compute boardgame embeddings from the input features.
        boardgame_embeddings = self.boardgame_model(features["features"])

        # The same embeddings are passed as both query and candidate to the
        # retrieval task.
        return self.task(boardgame_embeddings, boardgame_embeddings)




In [None]:
# Write TensorBoard logs to a fresh, timestamped run directory under logs/fit/.
run_timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/fit/" + run_timestamp
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)

In [None]:
# Build the model, handing it the candidate dataset used for the top-K metrics
embedding_dim = 32
model = BoardgameContentModel(embedding_dim, candidate_dataset)

# Compile with the Adam optimizer (learning rate 0.001)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))

# Train for 2 epochs, validating after each epoch and logging to TensorBoard
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=2,
    callbacks=[tensorboard_callback],
)

# Plot training and validation loss per epoch
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(history.history['loss'], label='Training Loss')
ax.plot(history.history['val_loss'], label='Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss')
ax.legend()
plt.show()

In [None]:
# # Create visual representation of the model
# # Load model with sample input
# sample_input = tf.random.uniform((1, df_features.shape[1]))
# _ = model({"features": sample_input})  # Run a forward pass

# # Plot model
# tf.keras.utils.plot_model(model, to_file="boardgame_model.png", show_shapes=True, show_layer_names=True)


In [None]:
tensorboard --logdir notebooks/logs


In [None]:
# Plot top-100 retrieval accuracy per epoch. The second curve comes from the
# "val_" history key, i.e. the validation data passed to fit(), so it is
# labelled "validation" rather than "test".
plt.plot(history.history["factorized_top_k/top_100_categorical_accuracy"], label="train")
plt.plot(history.history["val_factorized_top_k/top_100_categorical_accuracy"], label="validation")
plt.title("Model accuracies during training")
plt.xlabel("epoch")
plt.ylabel("accuracy")
plt.legend(loc="upper right")
plt.show()

In [None]:
# Evaluate the trained model on the held-out test batches;
# return_dict=True requests the metrics as a {name: value} dictionary.
model.evaluate(test_ds, return_dict=True)

In [None]:
# # Custom callback to evaluate on the test set after each epoch.
# class TestSetEvaluator(tf.keras.callbacks.Callback):
#     def __init__(self, test_dataset):
#         super().__init__()
#         self.test_dataset = test_dataset
#         self.test_metrics = []  # To store metrics from each epoch

#     def on_epoch_end(self, epoch, logs=None):
#         # Evaluate the model on the test set silently
#         test_result = self.model.evaluate(self.test_dataset, verbose=0)
#         # Create a dictionary mapping metric names to their values
#         metrics_dict = dict(zip(self.model.metrics_names, test_result))
#         self.test_metrics.append(metrics_dict)
#         print(f"Test metrics after epoch {epoch+1}: {metrics_dict}")

# # Assume you have a test dataset called `test_ds`
# test_evaluator = TestSetEvaluator(test_ds)

# # Train the model while using the custom callback
# history = model.fit(train_ds, validation_data=val_ds, epochs=3, callbacks=[test_evaluator])


In [None]:
# # Extract training top-1 accuracy from history.
# # Adjust the key based on your metric name (e.g., "factorized_top_k/top_1_categorical_accuracy")
# train_top1 = history.history['factorized_top_k/top_1_categorical_accuracy']

# # Extract test top-1 accuracy from the custom callback
# test_top1 = [metrics['factorized_top_k/top_1_categorical_accuracy'] for metrics in test_evaluator.test_metrics]

# # Define the number of epochs (assuming both lists have the same length)
# epochs = range(1, len(train_top1) + 1)

# # Plot training and test metrics
# plt.figure(figsize=(8, 5))
# plt.plot(epochs, train_top1, marker='o', label='Train Top-1 Accuracy')
# plt.plot(epochs, test_top1, marker='o', label='Test Top-1 Accuracy')
# plt.xlabel('Epoch')
# plt.ylabel('Top-1 Accuracy')
# plt.title('Train vs Test Top-1 Accuracy')
# plt.legend()
# plt.grid(True)
# plt.show()


In [None]:
# Set up the brute-force retrieval index over all candidate games
def _id_and_embedding(batch):
    """Pair each game's id with its embedding for indexing."""
    return batch["game_id"], model.boardgame_model(batch["features"])


index = tfrs.layers.factorized_top_k.BruteForce(model.boardgame_model)
index.index_from_dataset(candidate_dataset.map(_id_and_embedding))


In [None]:
# Add the 'name' column to the selection so games can be looked up by title.
# Guarded so that re-running this cell does not append 'name' a second time.
if 'name' not in feature_list:
    feature_list.append('name')


In [None]:
# Display the column selection after 'name' was added.
feature_list

In [None]:

# Select the columns named in feature_list and preview the first rows.
df_input = df_complete.loc[:, feature_list]
df_input.head()

In [None]:
# Game to recommend from, and the id(s) of every row carrying that name.
user_input = 'Cascadia'
is_requested_game = df_complete['name'] == user_input
user_input_game_id = df_complete.loc[is_requested_game, 'game_id'].to_numpy()
user_input_game_id

In [None]:

# Feature row(s) of the requested game, as a float32 matrix for the encoder.
# Rows are selected from df_features with a name mask built on df_complete
# (df_features was derived from df_complete, so the two share an index). This
# avoids relying on 'name' being the last column of df_input.
query_features = df_features.loc[df_complete['name'] == user_input].to_numpy(dtype="float32")
if query_features.shape[0] == 0:
    raise ValueError(f"No game named {user_input!r} found in df_complete")

In [None]:
# Ask the index for the 10 best candidates; the call returns (scores, ids).
scores, recommended_game_ids = index(query_features, k=10)
recommended_game_ids.numpy()

In [None]:
# Scores returned by the index lookup above.
scores

In [None]:
# Keep only the recommended ids that differ from the queried game's own id.
filtered_recommendations_ids = recommended_game_ids[recommended_game_ids != user_input_game_id]

In [None]:
# Look up the names of the recommended games. This notebook only defines
# df_complete (with 'game_id' and 'name' columns); the previous `df_merged` /
# "game_name" references are not defined anywhere in it.
# Rows come back in df_complete's order, not in ranking order.
game_names = df_complete.loc[
    df_complete["game_id"].isin(filtered_recommendations_ids.numpy()),
    "name"
]
game_names

In [None]:
# Print the ids that remain after removing the queried game.
print("Recommended Boardgames:", filtered_recommendations_ids.numpy())

In [None]:
# Score every candidate against the query by hand (without the retrieval
# index) so the full similarity distribution can be inspected.
# df_features and df_game_ids hold the complete candidate features and ids.
# Create a raw candidate dataset (unbatched).
raw_candidate_ds = tf.data.Dataset.from_tensor_slices({
    "features": df_features.to_numpy().astype("float32"),
    "game_id": df_game_ids.to_numpy().astype("int32")
}).map(lambda x: (x["features"], x["game_id"]))

# Embed the candidates in batches of 1024. The final, partial batch is kept:
# with drop_remainder=True up to 1023 trailing games would be silently left
# out of the ranking, and the concat below does not need equal-sized batches.
candidate_embeddings_ds = raw_candidate_ds.batch(1024).map(
    lambda features_batch, id_batch: (model.boardgame_model(features_batch), id_batch)
)

# Collect all candidate embeddings into one tensor.
all_candidate_embeddings = []
all_candidate_ids = []
for embeddings, ids in candidate_embeddings_ds:
    all_candidate_embeddings.append(embeddings)
    all_candidate_ids.append(ids)

combined_candidate_embeddings = tf.concat(all_candidate_embeddings, axis=0)
combined_candidate_ids = tf.concat(all_candidate_ids, axis=0)

# Compute the query embedding (one row per query game).
query_embedding = model.boardgame_model(query_features)  # shape: (num_queries, embedding_dim)

# Compute similarity scores (dot product) between the query and all candidate embeddings.
# For a single query row this produces a (1, num_candidates) tensor.
scores = tf.matmul(query_embedding, combined_candidate_embeddings, transpose_b=True)
scores = tf.squeeze(scores)  # shape: (num_candidates,) for a single query

# Take the 10 best candidates, then drop the queried game itself. The same
# mask is applied to ids and scores so the two printed arrays stay aligned.
top_k = tf.math.top_k(scores, k=10)
top_ids_including_query = tf.gather(combined_candidate_ids, top_k.indices)
is_other_game = top_ids_including_query != user_input_game_id
top_candidate_ids = top_ids_including_query[is_other_game]
top_candidate_scores = top_k.values[is_other_game]

print("Top candidate IDs:", top_candidate_ids.numpy())
print("Top scores:", top_candidate_scores.numpy())

# Plot the distribution of similarity scores.
plt.figure(figsize=(8, 6))
plt.hist(scores.numpy(), bins=50, edgecolor="k")
plt.xlabel("Similarity Score")
plt.ylabel("Frequency")
plt.title("Distribution of Similarity Scores for Query")
plt.show()


In [None]:
# Manually create query features for 5 different boardgames (one row each).
# NOTE(review): each row has 4 values, presumably ordered as
# average, usersrated, playingtime, averageweight - TODO confirm this still
# matches the columns of df_features.

query_features = np.array([[8.58304e+00, 5.02260e+04, 1.20000e+02, 3.87170e+00],[8.52011e+00, 5.49990e+04, 6.00000e+01, 2.83080e+00],
                           [8.53508e+00, 4.98110e+04, 1.50000e+02, 3.77710e+00],[8.5711e+00, 6.4138e+04, 1.2000e+02, 3.9127e+00],
                           [8.58275e+00, 2.54200e+04, 4.80000e+02, 4.32960e+00]])

# Fail here with a readable message instead of a shape error inside the model
# when the hand-written rows do not match the encoder's input width.
assert query_features.shape[1] == df_features.shape[1], (
    f"query_features has {query_features.shape[1]} columns, "
    f"but the model expects {df_features.shape[1]} feature columns"
)
query_features

In [None]:
# Compute individual embeddings for the manually built multi-game query
query_embeddings = model.boardgame_model(query_features)

# Aggregate the individual embeddings by averaging them into one query vector
aggregated_query_embedding = tf.reduce_mean(query_embeddings, axis=0, keepdims=True)

# Compute similarity scores using the aggregated query embedding.
scores = tf.matmul(aggregated_query_embedding, combined_candidate_embeddings, transpose_b=True)
scores = tf.squeeze(scores)

# Get top-k recommendations (indices with highest similarity).
top_k = tf.math.top_k(scores, k=10)
print("Top scores:", top_k.values.numpy())
print("Indices of top candidates:", top_k.indices.numpy())

# top_k.indices are positions in the candidate tensor, not game ids; map them
# to the actual game ids.
recommended_game_ids = tf.gather(combined_candidate_ids, top_k.indices)
print("Recommended game IDs:", recommended_game_ids.numpy())

# Build a dictionary mapping from game_id to name. This notebook only defines
# df_complete (columns 'game_id' and 'name'); the previous `df_merged` /
# "game_name" references are not defined anywhere in it.
game_id_to_name = dict(zip(df_complete["game_id"], df_complete["name"]))

# Retrieve names in the order of recommended_game_ids (best match first).
ordered_game_names = [game_id_to_name[game_id] for game_id in recommended_game_ids.numpy()]
print("Ordered recommended game names:", ordered_game_names)
