# Preprocessing and Modeling

## Import required modules

In [1]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
import tensorflow_datasets as tfds
from keras.api.layers import Dense, Embedding, GRU, LeakyReLU, Concatenate, Masking, Layer, StringLookup, Normalization, BatchNormalization
from keras.api import Input
from keras.api.models import Model
from keras.api.losses import SparseCategoricalCrossentropy
from keras.api.metrics import SparseCategoricalAccuracy, Mean, TopKCategoricalAccuracy
# from transformers.models.bert import TFBertTokenizer, TFBertEmbeddings  # embedding and tokenizer for description/nlp related stufff
from keras.api.optimizers import Adam
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import ast

## Preprocessing

### Load CSV

In [2]:
# For Running in Google Colab
# "https://raw.githubusercontent.com/{user}/{repo}/main/{src_dir}/{file}"
url = "https://raw.githubusercontent.com/zeev-haydar/ML-TuneHive/main/model-dev/data/session-data.csv"
!wget --no-cache --backups=1 {url}

--2024-12-04 11:36:57--  https://raw.githubusercontent.com/zeev-haydar/ML-TuneHive/main/model-dev/data/session-data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 21952709 (21M) [text/plain]
Failed to rename session-data.csv to session-data.csv.1: (2) No such file or directory
Saving to: ‘session-data.csv’


2024-12-04 11:36:58 (198 MB/s) - ‘session-data.csv’ saved [21952709/21952709]



In [2]:
import keras
import os
print(keras.__version__)

# root_path = "data"
root_path = "" # if using colab
df = pd.read_csv(os.path.join(root_path, "session-data.csv"))
df

3.5.0


Unnamed: 0,index_x,SongID,TimeStamp_Central,Performer_x,Album,Song_x,TimeStamp_UTC,index_y,Performer_y,Song_y,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,session_3_hour,session_id
0,0,Twenty Five MilesEdwin Starr,5/25/2021 5:18:00 PM,Edwin Starr,25 Miles,Twenty Five Miles,2021-05-25 23:18:00,9761,Edwin Starr,Twenty Five Miles,...,1.0,0.0607,0.0595,0.000015,0.2240,0.964,124.567,4.0,2021-05-25 21:00:00,4332
1,1,Devil's EyesGreyhounds,5/25/2021 5:15:00 PM,Greyhounds,Change of Pace,Devil's Eyes,2021-05-25 23:15:00,206,Greyhounds,Devil's Eyes,...,0.0,0.0456,0.3540,0.000414,0.0974,0.858,113.236,4.0,2021-05-25 21:00:00,4332
2,2,Pussy and PizzaMurs,5/25/2021 5:12:00 PM,Murs,Have a Nice Life,Pussy and Pizza,2021-05-25 23:12:00,6404,Murs,Pussy and Pizza,...,1.0,0.0659,0.0708,0.000004,0.0780,0.381,93.991,4.0,2021-05-25 21:00:00,4332
3,8,Our Special PlaceThe Heavy,5/25/2021 4:46:00 PM,The Heavy,Great Vengeance and Furious Fire,Our Special Place,2021-05-25 22:46:00,6205,The Heavy,Our Special Place,...,1.0,0.0386,0.2720,0.003610,0.0991,0.939,193.996,4.0,2021-05-25 21:00:00,4332
4,10,Make Peace and be FreePerfect Confusion,5/25/2021 4:39:00 PM,Perfect Confusion,Perfect Confusion,Make Peace and be Free,2021-05-25 22:39:00,6051,Perfect Confusion,Make Peace and be Free,...,1.0,0.0315,0.0138,0.000017,0.0649,0.431,78.037,4.0,2021-05-25 21:00:00,4332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50013,62902,From Me To You - Remastered 2009The Beatles,1/1/2017 10:04:00 AM,The Beatles,Past Masters (Vols. 1 & 2 / Remastered),From Me To You - Remastered 2009,2017-01-01 16:04:00,5693,The Beatles,From Me To You - Remastered 2009,...,1.0,0.0309,0.6130,0.000000,0.2690,0.966,136.125,4.0,2017-01-01 15:00:00,0
50014,62903,And I Love Her - Remastered 2009The Beatles,1/1/2017 10:01:00 AM,The Beatles,A Hard Day's Night (Remastered),And I Love Her - Remastered 2009,2017-01-01 16:01:00,360,The Beatles,And I Love Her - Remastered 2009,...,0.0,0.0337,0.6400,0.000000,0.0681,0.636,113.312,4.0,2017-01-01 15:00:00,0
50015,62904,Ticket To Ride - Remastered 2009The Beatles,1/1/2017 9:58:00 AM,The Beatles,Help! (Remastered),Ticket To Ride - Remastered 2009,2017-01-01 15:58:00,9715,The Beatles,Ticket To Ride - Remastered 2009,...,1.0,0.0678,0.0457,0.000000,0.2330,0.749,123.419,4.0,2017-01-01 15:00:00,0
50016,62905,Come Together - Remastered 2009The Beatles,1/1/2017 9:54:00 AM,The Beatles,Abbey Road (Remastered),Come Together - Remastered 2009,2017-01-01 15:54:00,7425,The Beatles,Come Together - Remastered 2009,...,0.0,0.0393,0.0302,0.248000,0.0926,0.187,165.007,4.0,2017-01-01 15:00:00,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50018 entries, 0 to 50017
Data columns (total 30 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   index_x                    50018 non-null  int64  
 1   SongID                     50018 non-null  object 
 2   TimeStamp_Central          50018 non-null  object 
 3   Performer_x                50018 non-null  object 
 4   Album                      47890 non-null  object 
 5   Song_x                     50018 non-null  object 
 6   TimeStamp_UTC              50018 non-null  object 
 7   index_y                    50018 non-null  int64  
 8   Performer_y                50018 non-null  object 
 9   Song_y                     50018 non-null  object 
 10  spotify_genre              50018 non-null  object 
 11  spotify_track_id           50018 non-null  object 
 12  spotify_track_preview_url  36001 non-null  object 
 13  spotify_track_duration_ms  50018 non-null  flo

In [None]:
test_col_name = 'mode'
df.loc[:, test_col_name]

0        1.0
1        0.0
2        1.0
3        1.0
4        1.0
        ... 
50013    1.0
50014    0.0
50015    1.0
50016    0.0
50017    1.0
Name: mode, Length: 50018, dtype: float64

## Remove N.A.N data

In [None]:
# df_filtered = df[~df['danceability'].isna()]
# df_filtered.info()

### Prepare Tensorflow Datasets

In [5]:
import time
from sklearn.preprocessing import LabelEncoder


from sklearn.model_selection import train_test_split

# Feature columns (as provided)
feature_columns = [
    'spotify_genre',
    'spotify_track_popularity',
    'danceability',
    'loudness',
    'acousticness',
    'instrumentalness',
    'tempo',
]

# Define the DataPreprocessor class
class DataPreprocessor:
    def __init__(self, df, feature_columns, batch_size=16, fixed_genre_size=10, train_size=0.8):
        """
        Initializes the data preprocessor with necessary parameters and preprocessing layers.

        Args:
            df (DataFrame): The input DataFrame containing session data.
            feature_columns (list): List of feature column names.
            batch_size (int): The batch size for dataset creation.
            fixed_genre_size (int): The fixed size for genre vectorization.
            train_size (float): Proportion of the data to use for training (between 0 and 1).
        """
        self.df = df
        self.feature_columns = feature_columns
        self.batch_size = batch_size
        self.fixed_genre_size = fixed_genre_size
        self.train_size = train_size

        # Split the dataset into training and testing datasets
        self.train_df, self.test_df = train_test_split(self.df, train_size=self.train_size, random_state=42)

        # Numeric feature preprocessing
        self.numeric_data = self.df[feature_columns[1:]].apply(pd.to_numeric, errors='coerce')
        self.mean_values = self.numeric_data.mean()
        self.std_values = self.numeric_data.std()

        # Initialize LabelEncoder for SongID and spotify_genre
        self.song_id_encoder = LabelEncoder()
        self.genre_encoder = LabelEncoder()

        # Extract unique SongIDs and genres
        unique_song_ids = self.df['SongID'].unique()
        all_genres = []
        for genre_str in self.df['spotify_genre']:
            try:
                genre_list = ast.literal_eval(genre_str)  # Safely parse the string into a list
                if isinstance(genre_list, list):
                    all_genres.extend(genre_list)
            except Exception as e:
                print(f"Error parsing genre: {e}")

        unique_genres = list(set(all_genres))

        # Fit the LabelEncoders on the data
        self.song_id_encoder.fit(unique_song_ids)
        self.genre_encoder.fit(unique_genres)

        self.items_size = len(self.song_id_encoder.classes_)  # Number of unique SongIDs
        self.genres_size = len(self.genre_encoder.classes_)

        self.dataset = None

    def preprocess_song_id(self, song_id):
        """
        Encode the SongID using LabelEncoder.
        """
        return self.song_id_encoder.transform([song_id])[0]

    def clean_genre(self, value, default_value=0, dtype=tf.int32):
        """
        Clean and process the 'spotify_genre' feature.
        """
        if value is None or (isinstance(value, str) and not value.strip()):
            return np.full((self.fixed_genre_size,), default_value, dtype=dtype.as_numpy_dtype)

        try:
            genre_list = eval(value) if isinstance(value, str) else value
            if isinstance(genre_list, list):
                genre_encoded = self.genre_encoder.transform(genre_list)
            else:
                genre_encoded = self.genre_encoder.transform([value])
        except Exception:
            genre_encoded = self.genre_encoder.transform([value])

        # Pad or truncate to fixed size
        return np.pad(genre_encoded, (0, max(0, self.fixed_genre_size - len(genre_encoded))),
                      mode='constant')[:self.fixed_genre_size].astype(dtype.as_numpy_dtype)

    def clean_numeric_feature(self, value, default_value=0.0, feature_name="feature", mean=None, std=None):
        """
        Clean, process, and normalize numerical features using Z-score normalization.
        """
        if value is None or (isinstance(value, float) and np.isnan(value)):
            return default_value

        try:
            value = float(value)
            # Apply Z-score normalization if mean and std are provided
            if mean is not None and std is not None and std != 0:
                z_score_value = (value - mean) / std
                return z_score_value
            return value  # Return raw value if no normalization
        except ValueError:
            return default_value

    def create_session_dataset(self, session_df):
        """
        Create session dataset as a list of dictionaries for each session.
        """
        session_df = session_df.sort_values(by=['session_id', 'TimeStamp_UTC'])
        grouped = session_df.groupby('session_id')
        sessions_data = []
        for session_id, group in grouped:
            session_data = group.to_dict(orient='records')
            sessions_data.append(session_data)
        return sessions_data

    def preprocess_data(self, sessions, k=1):
        """
        Preprocess session data into TensorFlow dataset with split genre and features,
        filtering out sequences where the next item sequence length is not greater than 10.
        """
        item_sequences = []
        next_item_sequences = []
        genre_sequences = []
        feature_sequences = []
        processed_item_count = 0

        for idx, session in enumerate(sessions):
            session_item_sequences = []
            session_next_item_sequences = []
            session_genre_sequences = []
            session_feature_sequences = []

            for i in range(len(session) - 1):
                # Process items
                session_item_encoded = self.preprocess_song_id(session[i]['SongID'])
                next_session_item_encoded = self.preprocess_song_id(session[i + 1]['SongID'])
                session_item_sequences.append(session_item_encoded)
                session_next_item_sequences.append(next_session_item_encoded)

                # Process genre
                genre_cleaned = self.clean_genre(session[i].get('spotify_genre', None))
                session_genre_sequences.append(genre_cleaned)

                # Process numerical features
                numeric_features = []
                for col in self.feature_columns:
                    if col != 'spotify_genre':
                        mean = self.mean_values.get(col, None)
                        std = self.std_values.get(col, None)
                        cleaned_feature = self.clean_numeric_feature(session[i].get(col, None), mean=mean, std=std)
                        numeric_features.append(cleaned_feature)

                session_feature_sequences.append(numeric_features)

            # Filter out sessions where the next item sequence length is not greater than 10
            if len(session_next_item_sequences) > k:
                # Extend sequences only if the next item sequence length is greater than 10
                item_sequences.extend(session_item_sequences)
                next_item_sequences.extend(session_next_item_sequences)
                genre_sequences.extend(session_genre_sequences)
                feature_sequences.extend(session_feature_sequences)
                processed_item_count += len(session_item_sequences)

                print(f"Session {idx + 1} processed with {len(session_item_sequences)} items.")
            else:
                print(f"Session {idx + 1} skipped because next item sequence length is {len(session_next_item_sequences)}.")

        print(f"Total processed items: {processed_item_count}")

        # Convert to tensors
        item_sequences = tf.stack(item_sequences, axis=-1)
        next_item_sequences = tf.stack(next_item_sequences, axis=-1)
        genre_sequences_tensor = tf.constant(genre_sequences, dtype=tf.int32)
        feature_sequences_tensor = tf.constant(feature_sequences, dtype=tf.float32)

        # Create TensorFlow dataset
        dataset = tf.data.Dataset.from_tensor_slices({
            'item': item_sequences,
            'genre': genre_sequences_tensor,
            'features': feature_sequences_tensor,
            'next_item': next_item_sequences
        })

        return dataset

    def create_session_dataset_tensor(self, k=1):
        """
        Main function to create session dataset as tensors and return the dataset.
        """
        if self.dataset is not None:
            print("Dataset already created")
            return

        print("Creating session dataset")
        sessions_data = self.create_session_dataset(self.train_df)  # Use train data for training
        print("Creating tensor dataset")
        dataset = self.preprocess_data(sessions_data, k=k)

        # Shuffle and batch the training data
        dataset = (
            dataset.shuffle(buffer_size=1024)
                   .batch(self.batch_size, drop_remainder=True)
                   .prefetch(buffer_size=tf.data.AUTOTUNE)
        )

        self.dataset = dataset
        return dataset

    def get_test_data(self, k):
        """
        Return preprocessed test dataset without shuffling.
        """
        sessions_data = self.create_session_dataset(self.test_df)
        dataset = self.preprocess_data(sessions_data, k)

        # Batch the test data without shuffling
        dataset = (
            dataset.batch(self.batch_size, drop_remainder=True)
                   .prefetch(buffer_size=tf.data.AUTOTUNE)
        )

        return dataset

    def batch_timer(self, dataset):
        """
        Timer function to track the time taken for batch processing.
        """
        for batch in dataset:
            start_time = time.time()

            # Simulate processing (e.g., model training or data transformation)
            end_time = time.time()
            batch_time = end_time - start_time
            print(f"Batch processing time: {batch_time:.4f} seconds")



In [6]:
preprocessor = DataPreprocessor(df[:10000], feature_columns)

# Create the session dataset tensor
train_dataset = preprocessor.create_session_dataset_tensor()


Creating session dataset
Creating tensor dataset
Session 1 skipped because next item sequence length is 0.
Session 2 processed with 3 items.
Session 3 skipped because next item sequence length is 0.
Session 4 processed with 3 items.
Session 5 processed with 2 items.
Session 6 processed with 4 items.
Session 7 skipped because next item sequence length is 0.
Session 8 skipped because next item sequence length is 0.
Session 9 skipped because next item sequence length is 0.
Session 10 processed with 4 items.
Session 11 processed with 5 items.
Session 12 processed with 7 items.
Session 13 processed with 7 items.
Session 14 processed with 4 items.
Session 15 processed with 10 items.
Session 16 processed with 3 items.
Session 17 skipped because next item sequence length is 1.
Session 18 processed with 5 items.
Session 19 processed with 7 items.
Session 20 processed with 8 items.
Session 21 processed with 6 items.
Session 22 skipped because next item sequence length is 0.
Session 23 processed 

In [7]:
test_dataset = preprocessor.get_test_data(k=1)

Session 1 skipped because next item sequence length is 0.
Session 2 skipped because next item sequence length is 0.
Session 3 skipped because next item sequence length is 0.
Session 4 skipped because next item sequence length is 0.
Session 5 skipped because next item sequence length is 0.
Session 6 skipped because next item sequence length is 0.
Session 7 processed with 2 items.
Session 8 skipped because next item sequence length is 1.
Session 9 processed with 2 items.
Session 10 skipped because next item sequence length is 0.
Session 11 processed with 2 items.
Session 12 skipped because next item sequence length is 1.
Session 13 skipped because next item sequence length is 0.
Session 14 skipped because next item sequence length is 0.
Session 15 skipped because next item sequence length is 1.
Session 16 skipped because next item sequence length is 0.
Session 17 skipped because next item sequence length is 1.
Session 18 processed with 2 items.
Session 19 processed with 3 items.
Session 

In [8]:
for batch in train_dataset.take(1):
    print("Items (SongID):", batch['item'].numpy())
    print("Genre:", batch['genre'].numpy())
    print("Features:", batch['features'].numpy())
    print("Next Items (Next SongID):", batch['next_item'].numpy())

for batch in test_dataset.take(1):
    print("Items (SongID):", batch['item'].numpy())
    print("Genre:", batch['genre'].numpy())
    print("Features:", batch['features'].numpy())
    print("Next Items (Next SongID):", batch['next_item'].numpy())

Items (SongID): [ 769 2339 1303 1666 1060 2250  297 2722 2081 1684 1223 1970 2152  696
 1214 1233]
Genre: [[232 361 497 571   0   0   0   0   0   0]
 [ 13 148 395 496   0   0   0   0   0   0]
 [ 61 264 670   0   0   0   0   0   0   0]
 [ 14  17 268 497 501 545   0   0   0   0]
 [ 21 321 322 323 327   0   0   0   0   0]
 [536   0   0   0   0   0   0   0   0   0]
 [  1   0   0   0   0   0   0   0   0   0]
 [496   0   0   0   0   0   0   0   0   0]
 [536   0   0   0   0   0   0   0   0   0]
 [232 361 443 497   0   0   0   0   0   0]
 [535   0   0   0   0   0   0   0   0   0]
 [220   0   0   0   0   0   0   0   0   0]
 [ 14  17  30 459 501 503 514 516 545   0]
 [  8 277 375 545 581   0   0   0   0   0]
 [ 14  17  30 459 501 503 514 516 545   0]
 [148 248 250 275 279 530 665   0   0   0]]
Features: [[ 1.25556636e+00  7.19815493e-01 -2.51732826e+00  4.67997104e-01
  -3.52666885e-01 -5.66331863e-01]
 [ 1.09284341e-01  1.80212295e+00  5.60936928e-01  6.55097544e-01
  -3.52666885e-01 -8.0153536

## Define Model class

In [9]:
class ItemEmbedding(Layer):
    def __init__(self, num_items, item_embed_dim):
        super(ItemEmbedding, self).__init__()

        self.item_embedding = Embedding(input_dim=num_items, output_dim=item_embed_dim, mask_zero=True)

    def call(self, items):
        # Embed items
        items_embedded = self.item_embedding(items)
        return items_embedded

class GRU4REC(Model):
    def __init__(self, rnn_params, genre_embed_dim, item_embed_dim, ffn1_units, feature_dense_units,  preprocessed_data:DataPreprocessor):
        super(GRU4REC, self).__init__()
        print(f"items size: {preprocessed_data.items_size}")
        print(f"genres size: {preprocessed_data.genres_size}")
        self.embedding = ItemEmbedding(preprocessed_data.items_size, item_embed_dim)

        # Genre embedding (only for genre, which is categorical and a string)
        self.genre_embedding = Embedding(input_dim=preprocessed_data.genres_size, output_dim=genre_embed_dim, mask_zero=True, name='genre_embedding')

        # RNN layers
        self.rnn_layers = []
        self.rnn_layers.append(GRU(**rnn_params[0], return_sequences=True))
        for i in range(1, len(rnn_params) - 1):
            self.rnn_layers.append(GRU(**rnn_params[i], return_sequences=True))
        self.rnn_layers.append(GRU(**rnn_params[-1], return_sequences=False))

        self.concat = Concatenate(axis=-1, name='concat_1')
        self.batch_norm = BatchNormalization(name='batchnorm')

        # Feed-forward layers
        self.feature_dense = Dense(feature_dense_units, activation='relu', name='feature_dense')  # Dense layer for features (if required)
        self.ffn1 = Dense(ffn1_units, name='ffn_1')
        self.activation1 = LeakyReLU(alpha=0.2, name='freaky_relu')
        self.out = Dense(preprocessed_data.items_size, activation='softmax', name='output_layer')

    def call(self, inputs, training=False):
        """
        Forward pass for the GRU4REC model.
        :param inputs: Tuple (item_sequences, item_features, item_genres)
        :param training: Boolean indicating if the model is in training mode
        """

        item_sequences, item_features, item_genres = inputs

        # Embed items
        item_embedded = self.embedding(item_sequences)
        item_embedded = tf.expand_dims(item_embedded, axis=1)
        # Genre embedding
        genre_embedded = self.genre_embedding(item_genres)
        genre_embedded = tf.reduce_mean(genre_embedded, axis=1)
        genre_embedded = tf.expand_dims(genre_embedded, axis=1)

        # Feature transformation (features are passed directly as floats, so no embedding is needed)
        feature_transformed = self.feature_dense(item_features)
        feature_transformed = tf.expand_dims(feature_transformed, axis=1)

        combined_input = tf.concat([item_embedded, feature_transformed, genre_embedded], axis=-1)
        combined_input = self.batch_norm(combined_input)
        # Pass through RNN layers
        x = combined_input
        x = self.rnn_layers[0](x)
        for i in range(1, len(self.rnn_layers)):
            x = self.concat([combined_input, x])  # Concatenate item embeddings with RNN outputs
            x = self.rnn_layers[i](x)

        # Feed-forward layers
        x = self.ffn1(x)
        x = self.activation1(x)
        logits = self.out(x)  # Shape: (batch_size, num_items)

        # Generate the sequence of items (choose the item with the highest probability using argmax)
        predicted_items = tf.argmax(logits, axis=-1)  # (batch_size, sequence_length)

        return predicted_items, logits  # Return both predicted item indices and logits (probabilities)

## Training Loop

In [10]:
from keras.api.metrics import Recall

class RecallAtK(tf.keras.metrics.Metric):
    def __init__(self, k=10, name="recall_at_k", **kwargs):
        super(RecallAtK, self).__init__(name=name, **kwargs)
        self.k = k
        self.recall_at_k = Recall(top_k=self.k)

    def update_state(self, y_true, y_pred, sample_weight=None):
        """
        Update the state of the metric.
        """
        # Since y_true is a list of true items and y_pred are the predicted scores,
        # we need to calculate recall for top-k predicted items
        y_true = tf.cast(y_true, tf.int32)

        # Calculate the top-k predicted items
        top_k_preds = tf.argsort(y_pred, axis=-1, direction='DESCENDING')[:, :self.k]

        # Calculate recall by comparing true labels with the top-k predictions
        recall = tf.reduce_mean(tf.cast(tf.equal(y_true, top_k_preds), tf.float32), axis=-1)
        return recall

    def result(self):
        return self.recall_at_k.result()

    def reset_state(self):
        self.recall_at_k.reset_state()

@tf.function
def train_step(batch):
    with tf.GradientTape() as tape:
        predicted_items, logits = model((batch['item'], batch['features'], batch['genre']), training=True)
        loss = loss_fn(batch['next_item'], logits)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

def train_gru4rec(model, train_dataset, optimizer, loss_fn, epochs, k, val_dataset=None):
    # metric = tf.keras.metrics.TopKCategoricalAccuracy(k=k)

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        epoch_loss = 0.0

        for step, batch in enumerate(train_dataset):
            loss = train_step(batch)
            if loss is None:
              print(f"Warning: train_step returned None at step {step}")
              continue
            epoch_loss += loss.numpy()

            # Update metric
            # metric.update_state(batch['next_item'], logits)

        print(f"Training Loss: {epoch_loss / (step + 1):.4f}")
        # metric.reset_state()

        if val_dataset:
            val_loss = 0.0
            for step, batch in enumerate(val_dataset):
                _, logits = model((batch['item'], batch['features'], batch['genre']), training=False)
                val_loss += loss_fn(batch['next_item'], logits).numpy()
            print(f"Validation Loss: {val_loss / (step + 1):.4f}")

def plot_training_history(loss_history, metric_history, metric_name, top_k):
    """Plot the training loss and accuracy."""
    epochs = range(1, len(loss_history) + 1)

    # Create subplots for loss and accuracy
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8))

    # Plot the training loss
    ax1.plot(epochs, loss_history, label='Loss', color='blue', linestyle='-', marker='o')
    ax1.set_title('Training Loss')
    ax1.set_xlabel('Epochs')
    ax1.set_ylabel('Loss')
    ax1.legend()

    # Plot the top-k accuracy
    ax2.plot(epochs, metric_history, label=metric_name, color='green', linestyle='-', marker='o')
    ax2.set_title(f'Training {metric_name}')
    ax2.set_xlabel('Epochs')
    ax2.set_ylabel(metric_name)
    ax2.legend()

    plt.tight_layout()
    plt.show()

## Run the training process

In [11]:
# Define the model
# num_items = len()
# feature_vocab_size = len(feature_columns)

model = GRU4REC(
    rnn_params=[
        {"units": 128},
        {"units": 128},
        {"units": 64}
    ],
    item_embed_dim=32,
    genre_embed_dim=16,
    ffn1_units=128,
    feature_dense_units=64,
    preprocessed_data=preprocessor
)

print(model.trainable_variables)


items size: 2864
genres size: 675
[]




In [12]:
# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()

# Train the model
train_gru4rec(model=model, train_dataset=train_dataset,optimizer=optimizer, loss_fn=loss_fn, epochs=500, k=8)

Epoch 1/500
Training Loss: 7.6787
Epoch 2/500
Training Loss: 6.7786
Epoch 3/500
Training Loss: 6.1443
Epoch 4/500
Training Loss: 5.5022
Epoch 5/500
Training Loss: 4.8591
Epoch 6/500
Training Loss: 4.2198
Epoch 7/500
Training Loss: 3.6909
Epoch 8/500
Training Loss: 3.2873
Epoch 9/500
Training Loss: 2.9938
Epoch 10/500
Training Loss: 2.7720
Epoch 11/500
Training Loss: 2.6321
Epoch 12/500
Training Loss: 2.5475
Epoch 13/500
Training Loss: 2.4924
Epoch 14/500
Training Loss: 2.4171
Epoch 15/500
Training Loss: 2.3653
Epoch 16/500
Training Loss: 2.3196
Epoch 17/500
Training Loss: 2.2901
Epoch 18/500
Training Loss: 2.2661
Epoch 19/500
Training Loss: 2.2358
Epoch 20/500
Training Loss: 2.2116
Epoch 21/500
Training Loss: 2.1863
Epoch 22/500
Training Loss: 2.1669
Epoch 23/500
Training Loss: 2.1488
Epoch 24/500
Training Loss: 2.1380
Epoch 25/500
Training Loss: 2.1269
Epoch 26/500
Training Loss: 2.1123
Epoch 27/500
Training Loss: 2.0977
Epoch 28/500
Training Loss: 2.0838
Epoch 29/500
Training Loss: 2

In [13]:
def predict(model, item_sequences, item_features, item_genres):
    """
    Predict the item with the highest probability for the given input sequences using argmax of softmax.

    Args:
    - model: The trained model.
    - item_sequences: Input item sequences (batch_size, seq_length).
    - item_features: Input item features (batch_size, feature_length).
    - item_genres: Input item genres (batch_size, genre_length).

    Returns:
    - predicted_items: A list of predicted items with the highest probability for each input sequence.
    """
    # Run the model in inference mode (not training)
    predicted_items, logits = model((item_sequences, item_features, item_genres), training=False)

    # Apply softmax to the logits to get probabilities
    probabilities = tf.nn.softmax(logits, axis=-1)

    # Get the item with the highest probability by finding the index of the maximum probability
    predicted_items = tf.argmax(probabilities, axis=-1, output_type=tf.int32)

    # Convert to numpy array for easier handling
    predicted_items = predicted_items.numpy()

    return predicted_items

def compute_recall(predicted_items, targets):
    """
    Compute the recall for the given predictions and targets.

    Args:
    - predicted_items: The predicted items (batch_size,).
    - targets: The actual next items (batch_size,).

    Returns:
    - recall: The recall metric.
    """
    # True positives: Predicted item matches the target
    true_positives = np.sum(predicted_items == targets)

    # Total relevant items (in this case, it is the number of items in the batch)
    total_items = len(targets)

    # Recall calculation
    recall = true_positives / total_items if total_items > 0 else 0
    return recall

# Initialize variables to calculate overall recall
total_true_positives = 0
total_items = 0

# Loop through training dataset and predict the most probable item
for step, batch in enumerate(train_dataset):
    item_sequences = batch['item']
    item_genres = batch['genre']
    item_features = batch['features']
    targets = batch['next_item']

    # Get the predicted item with the highest probability for each sequence in the batch
    predicted_items = predict(model, item_sequences, item_features, item_genres)

    # Compute recall for the current batch
    batch_recall = compute_recall(predicted_items, targets)
    print(f"Batch {step + 1} recall: {batch_recall:.4f}")

    # Accumulate for overall recall
    total_true_positives += np.sum(predicted_items == targets)
    total_items += len(targets)

# Calculate overall recall
overall_recall = total_true_positives / total_items if total_items > 0 else 0
print(f"Overall recall: {overall_recall:.4f}")


Batch 1 recall: 0.4375
Batch 2 recall: 0.3750
Batch 3 recall: 0.4375
Batch 4 recall: 0.3125
Batch 5 recall: 0.2500
Batch 6 recall: 0.2500
Batch 7 recall: 0.3750
Batch 8 recall: 0.3125
Batch 9 recall: 0.3750
Batch 10 recall: 0.4375
Batch 11 recall: 0.1875
Batch 12 recall: 0.3125
Batch 13 recall: 0.3750
Batch 14 recall: 0.1875
Batch 15 recall: 0.2500
Batch 16 recall: 0.3125
Batch 17 recall: 0.0625
Batch 18 recall: 0.3750
Batch 19 recall: 0.3125
Batch 20 recall: 0.2500
Batch 21 recall: 0.5000
Batch 22 recall: 0.2500
Batch 23 recall: 0.2500
Batch 24 recall: 0.2500
Batch 25 recall: 0.1875
Batch 26 recall: 0.1875
Batch 27 recall: 0.2500
Batch 28 recall: 0.2500
Batch 29 recall: 0.1250
Batch 30 recall: 0.3750
Batch 31 recall: 0.3750
Batch 32 recall: 0.4375
Batch 33 recall: 0.3125
Batch 34 recall: 0.3750
Batch 35 recall: 0.3750
Batch 36 recall: 0.3125
Batch 37 recall: 0.2500
Batch 38 recall: 0.2500
Batch 39 recall: 0.3125
Batch 40 recall: 0.2500
Batch 41 recall: 0.3125
Batch 42 recall: 0.1875
B

In [19]:
def preprocess_data_single_session(
    session,
    feature_columns,
    k=1
):
    """
    Preprocess a single session into TensorFlow dataset with split genre and features.

    Args:
    - session (list): A list of dictionaries containing session data.
    - feature_columns (list): List of numerical feature column names.
    - mean_values (dict): Mean values for numerical features for normalization.
    - std_values (dict): Std values for numerical features for normalization.
    - k (int): Minimum length of `next_item_sequences`.

    Returns:
    - tf.data.Dataset: TensorFlow dataset containing preprocessed data.
    """
    item_sequences = []
    next_item_sequences = []
    genre_sequences = []
    feature_sequences = []

    for i in range(len(session) - 1):
        # Process items
        session_item_encoded = preprocessor.preprocess_song_id(session[i]['SongID'])
        next_session_item_encoded = preprocessor.preprocess_song_id(session[i + 1]['SongID'])
        item_sequences.append(session_item_encoded)
        next_item_sequences.append(next_session_item_encoded)

        # Process genre
        genre_cleaned = preprocessor.clean_genre(session[i].get('spotify_genre', None))
        genre_sequences.append(genre_cleaned)

        # Process numerical features
        numeric_features = []
        for col in feature_columns:
            if col != 'spotify_genre':
                mean = preprocessor.mean_values.get(col, None)
                std = preprocessor.std_values.get(col, None)
                cleaned_feature = preprocessor.clean_numeric_feature(session[i].get(col, None), mean=mean, std=std)
                numeric_features.append(cleaned_feature)

        feature_sequences.append(numeric_features)

    # Filter session if next_item_sequences length is not greater than k
    if len(next_item_sequences) <= k:
        print(f"Session skipped because next item sequence length is {len(next_item_sequences)}.")
        return

    print(f"Processed session with {len(item_sequences)} items.")

    # Convert to tensors
    item_sequences = tf.stack(item_sequences, axis=0)
    next_item_sequences = tf.stack(next_item_sequences, axis=0)
    genre_sequences_tensor = tf.constant(genre_sequences, dtype=tf.int32)
    feature_sequences_tensor = tf.constant(feature_sequences, dtype=tf.float32)

    # Create TensorFlow dataset
    dataset = tf.data.Dataset.from_tensor_slices({
        'item': item_sequences,
        'genre': genre_sequences_tensor,
        'features': feature_sequences_tensor,
        'next_item': next_item_sequences
    })

    return dataset

In [26]:
def predict_next(model, item_sequences, item_features, item_genres, k=5):
    """
    Predict the next item for a given input sequence.

    Args:
    - model: The trained model.
    - item_sequences: Input item sequences (batch_size, seq_length).
    - item_features: Input item features (batch_size, feature_length).
    - item_genres: Input item genres (batch_size, genre_length).
    - k: Number of top predictions to consider.

    Returns:
    - predicted_items: Top k predicted next items (batch_size, k).
    """
    # Run inference
    _, logits = model((item_sequences, item_features, item_genres), training=False)

    # Apply softmax to logits to get probabilities
    probabilities = tf.nn.softmax(logits, axis=-1)

    # Get the top k predictions
    top_k_values, top_k_indices = tf.nn.top_k(probabilities, k=k, sorted=True)

    return top_k_indices.numpy()  # Return top k item indices

def compute_recall_at_k(predicted_sequence, target_sequence, k):
    """
    Compute Recall@k for a given session.

    Args:
    - predicted_sequence: The predicted sequence of items.
    - target_sequence: The actual target sequence of items.
    - k: The number of top predictions to consider.

    Returns:
    - recall_at_k: Recall@k value.
    """
    # Extract 'SongID' from dictionaries if needed
    predicted_ids = [item['SongID'] if isinstance(item, dict) else item for item in predicted_sequence]
    target_ids = [item['SongID'] if isinstance(item, dict) else item for item in target_sequence]

    # Select the top k items from predicted sequence
    top_k_predicted = predicted_ids[:k]

    # Count how many of the target items appear in the top k predictions
    hits = len(set(top_k_predicted) & set(target_ids[:k]))
    return hits / len(target_ids[:k]) if target_ids[:k] else 0.0


# Initialize overall metrics
total_recall = 0
session_count = 0
k = 5  # Set the value of k

print("Creating session dataset")
sessions_data = preprocessor.create_session_dataset(preprocessor.train_df)  # Use train data for training
print("Creating tensor dataset")

# Process each session in the dataset
for session in sessions_data:  # Assume sessions_data is your preprocessed session data
    if len(session) <= k:
        continue
    print(session)
    context_length = len(session) - k
    context = session[:context_length]
    target = session[context_length:]
    dataset = preprocess_data_single_session(session, feature_columns, k=k)
    if dataset is None:
        continue
    dataset = dataset.batch(1)
    predicted_sequence = []
    current_sequence = context

    # Generate k predictions iteratively
    for batch in dataset:
        # Prepare input features for the current sequence
        item_sequences = batch['item']  # Batch of size 1
        item_features = batch['features']  # Extract corresponding features
        item_genres = batch['genre']  # Extract corresponding genres

        # Predict the top k items
        top_k_predictions = predict_next(model, item_sequences, item_features, item_genres, k=k)

        # Append the top k predictions to the sequence (only append first item from the top k)
        predicted_sequence.extend(top_k_predictions[0])  # Assuming batch size is 1

        # Update the current sequence
        current_sequence = current_sequence[1:] + [top_k_predictions[0][0]]  # Only use first predicted item for the next context

        if len(predicted_sequence) >= k:
            break

    # Calculate Recall@k for the current session
    session_recall = compute_recall_at_k(predicted_sequence, target, k)
    total_recall += session_recall
    session_count += 1

    print(f"Session recall: {session_recall:.4f}")

# Calculate overall Recall@k
overall_recall_at_k = total_recall / session_count if session_count > 0 else 0
print(f"Overall Recall@{k}: {overall_recall_at_k:.4f}")


Creating session dataset
Creating tensor dataset
[{'index_x': 17197, 'SongID': 'Ruby CumulousRX Bandits', 'TimeStamp_Central': '5/14/2020 12:13:00 PM', 'Performer_x': 'RX Bandits', 'Album': 'Gemini Her Majesty', 'Song_x': 'Ruby Cumulous', 'TimeStamp_UTC': '2020-05-14 18:13:00', 'index_y': 3285, 'Performer_y': 'RX Bandits', 'Song_y': 'Ruby Cumulous', 'spotify_genre': "['dreamo', 'modern ska punk', 'ska', 'ska punk']", 'spotify_track_id': '5MOKQJ6HMqpEoD04eqVFyB', 'spotify_track_preview_url': 'https://p.scdn.co/mp3-preview/6e8d6b4987d93e158eda63250891e1b39546dcab?cid=b8d3901151d34489a160e3cf0ab1fa94', 'spotify_track_duration_ms': 291147.0, 'spotify_track_popularity': 41.0, 'spotify_track_explicit': False, 'danceability': 0.475, 'energy': 0.862, 'key': 8.0, 'loudness': -6.955, 'mode': 0.0, 'speechiness': 0.163, 'acousticness': 0.00323, 'instrumentalness': 0.000795, 'liveness': 0.112, 'valence': 0.704, 'tempo': 180.257, 'time_signature': 4.0, 'session_3_hour': '2020-05-14 18:00:00', 'sessi