# Preprocessing and Modeling

## Import required modules

In [30]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
import tensorflow_datasets as tfds
from keras.api.layers import Dense, Embedding, GRU, LeakyReLU, Concatenate, Masking, Layer, StringLookup, Normalization
from keras.api import Input
from keras.api.models import Model
from keras.api.losses import SparseCategoricalCrossentropy
from keras.api.metrics import SparseCategoricalAccuracy, Mean, TopKCategoricalAccuracy
# from transformers.models.bert import TFBertTokenizer, TFBertEmbeddings  # embedding and tokenizer for description/nlp related stuff
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

## Preprocessing

### Load CSV

In [2]:
import keras
print(keras.__version__)

df = pd.read_csv("data/session-data.csv")
df

3.6.0


Unnamed: 0,index_x,SongID,TimeStamp_Central,Performer_x,Album,Song_x,TimeStamp_UTC,index_y,Performer_y,Song_y,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,session_6_hour,session_id
0,0,Twenty Five MilesEdwin Starr,5/25/2021 5:18:00 PM,Edwin Starr,25 Miles,Twenty Five Miles,2021-05-25 23:18:00,9761,Edwin Starr,Twenty Five Miles,...,1.0,0.0607,0.0595,0.000015,0.2240,0.964,124.567,4.0,2021-05-25 20:00:00,3729
1,1,Devil's EyesGreyhounds,5/25/2021 5:15:00 PM,Greyhounds,Change of Pace,Devil's Eyes,2021-05-25 23:15:00,206,Greyhounds,Devil's Eyes,...,0.0,0.0456,0.3540,0.000414,0.0974,0.858,113.236,4.0,2021-05-25 20:00:00,3729
2,2,Pussy and PizzaMurs,5/25/2021 5:12:00 PM,Murs,Have a Nice Life,Pussy and Pizza,2021-05-25 23:12:00,6404,Murs,Pussy and Pizza,...,1.0,0.0659,0.0708,0.000004,0.0780,0.381,93.991,4.0,2021-05-25 20:00:00,3729
3,8,Our Special PlaceThe Heavy,5/25/2021 4:46:00 PM,The Heavy,Great Vengeance and Furious Fire,Our Special Place,2021-05-25 22:46:00,6205,The Heavy,Our Special Place,...,1.0,0.0386,0.2720,0.003610,0.0991,0.939,193.996,4.0,2021-05-25 20:00:00,3729
4,10,Make Peace and be FreePerfect Confusion,5/25/2021 4:39:00 PM,Perfect Confusion,Perfect Confusion,Make Peace and be Free,2021-05-25 22:39:00,6051,Perfect Confusion,Make Peace and be Free,...,1.0,0.0315,0.0138,0.000017,0.0649,0.431,78.037,4.0,2021-05-25 20:00:00,3729
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54496,62902,From Me To You - Remastered 2009The Beatles,1/1/2017 10:04:00 AM,The Beatles,Past Masters (Vols. 1 & 2 / Remastered),From Me To You - Remastered 2009,2017-01-01 16:04:00,5693,The Beatles,From Me To You - Remastered 2009,...,1.0,0.0309,0.6130,0.000000,0.2690,0.966,136.125,4.0,2017-01-01 16:00:00,1
54497,62903,And I Love Her - Remastered 2009The Beatles,1/1/2017 10:01:00 AM,The Beatles,A Hard Day's Night (Remastered),And I Love Her - Remastered 2009,2017-01-01 16:01:00,360,The Beatles,And I Love Her - Remastered 2009,...,0.0,0.0337,0.6400,0.000000,0.0681,0.636,113.312,4.0,2017-01-01 16:00:00,1
54498,62904,Ticket To Ride - Remastered 2009The Beatles,1/1/2017 9:58:00 AM,The Beatles,Help! (Remastered),Ticket To Ride - Remastered 2009,2017-01-01 15:58:00,9715,The Beatles,Ticket To Ride - Remastered 2009,...,1.0,0.0678,0.0457,0.000000,0.2330,0.749,123.419,4.0,2017-01-01 12:00:00,0
54499,62905,Come Together - Remastered 2009The Beatles,1/1/2017 9:54:00 AM,The Beatles,Abbey Road (Remastered),Come Together - Remastered 2009,2017-01-01 15:54:00,7425,The Beatles,Come Together - Remastered 2009,...,0.0,0.0393,0.0302,0.248000,0.0926,0.187,165.007,4.0,2017-01-01 12:00:00,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54501 entries, 0 to 54500
Data columns (total 30 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   index_x                    54501 non-null  int64  
 1   SongID                     54501 non-null  object 
 2   TimeStamp_Central          54501 non-null  object 
 3   Performer_x                54501 non-null  object 
 4   Album                      52032 non-null  object 
 5   Song_x                     54501 non-null  object 
 6   TimeStamp_UTC              54501 non-null  object 
 7   index_y                    54501 non-null  int64  
 8   Performer_y                54501 non-null  object 
 9   Song_y                     54501 non-null  object 
 10  spotify_genre              53131 non-null  object 
 11  spotify_track_id           50077 non-null  object 
 12  spotify_track_preview_url  36056 non-null  object 
 13  spotify_track_duration_ms  50077 non-null  flo

In [None]:
test_col_name = 'mode'
df.loc[:, test_col_name]

array([ 1.,  0., nan])

### Prepare Tensorflow Datasets

In [47]:
import tensorflow as tf
import numpy as np
import time

# Feature columns (as provided)
feature_columns = [
    'spotify_genre',
    'spotify_track_duration_ms',
    'spotify_track_popularity',
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]

# Define TextVectorization for the string features (e.g., 'spotify_genre')
genre_vectorizer = tf.keras.layers.TextVectorization(output_mode='int', output_sequence_length=1)

# Function to clean the features and ensure they are float32
def clean_feature(value, default_value, dtype, feature_name):
    """
    Ensure feature values are cleaned and cast to a consistent type (float32).
    String features will be encoded using TextVectorization.
    """
    if value is None or (isinstance(value, float) and np.isnan(value)):
        return default_value
    
    # Handle spotify_genre, which is a list of strings
    if feature_name == 'spotify_genre' and isinstance(value, str):
        # Check if 'value' is a string representing a list (e.g., "[...]")
        try:
            genre_list = eval(value)  # Convert string representation of list to a list
            if isinstance(genre_list, list):  # Ensure it's actually a list
                return genre_vectorizer(genre_list)
        except Exception as e:
            return genre_vectorizer([value])  # If it's a single genre, use it as a list

    # Convert other feature values to float (for numerical features)
    if dtype == tf.float32 or dtype == tf.float64:
        try:
            return float(value)
        except ValueError:
            return default_value  # In case conversion fails, use the default value
    
    return value

# Create sessions dataset
def create_session_dataset(session_df):
    # Reorder the session by the TimeStamp_UTC ascending for each session
    session_df = session_df.sort_values(by=['session_id', 'TimeStamp_UTC'])

    # Group the DataFrame by session_id
    grouped = session_df.groupby('session_id')

    # Convert each session's data into a list of dictionaries
    sessions_data = []
    for session_id, group in grouped:
        session_data = group.to_dict(orient='records')
        sessions_data.append(session_data)

    return sessions_data

# Preprocess the data and prepare tensors
def preprocess_data(sessions, batch_size):
    item_sequences = []
    next_item_sequences = []
    feature_sequences = []
    
    processed_item_count = 0

    # Collect all items and their next items across sessions
    for session in sessions:
        for i in range(len(session) - 1):
            # Current item (SongID)
            item_sequences.append(session[i]['SongID'])
            # Next item (target item)
            next_item_sequences.append(session[i + 1]['SongID'])
            
            # Feature sequence for the current item
            feature_sequence = [
                clean_feature(session[i].get(col, None), default_value=0.0, dtype=tf.float32, feature_name=col)
                for col in feature_columns
            ]
            feature_sequences.append(feature_sequence)
            
            processed_item_count += 1
            print(f"process item {processed_item_count}")

    # Convert to tensors
    item_sequences = tf.constant(item_sequences, dtype=tf.string)
    next_item_sequences = tf.constant(next_item_sequences, dtype=tf.string)

    # Stack features into a single tensor (all features will be converted to float32)
    feature_sequences_tensor = tf.constant(feature_sequences, dtype=tf.float32)

    # Now, construct the final dataset where:
    # First index: SongID
    # Middle: Feature columns (one for each feature in feature_columns)
    # Last index: Next item (next SongID)
    dataset_tensor = tf.stack([item_sequences[:, None], feature_sequences_tensor, next_item_sequences[:, None]], axis=-1)

    # Create a TensorFlow dataset
    dataset = tf.data.Dataset.from_tensor_slices(dataset_tensor)
    
    dataset = dataset.map(lambda x: x, num_parallel_calls=tf.data.AUTOTUNE)

    # Batch the dataset
    dataset = dataset.shuffle(buffer_size=1024).batch(batch_size, drop_remainder=True)

    return dataset


def batch_timer(dataset):
    for batch in dataset:
        start_time = time.time()  # Start the timer
        
        # Process the batch (here we assume you're performing some operation on the batch)
        # In practice, this could be model training or data transformation
        
        end_time = time.time()  # End the timer
        batch_time = end_time - start_time
        
        print(f"Batch processing time: {batch_time:.4f} seconds")


# Main function to create session dataset as tensor
def create_session_dataset_tensor(session_df, batch_size=16):
    # Ensure the 'spotify_genre' column is a list of strings
    print("Adapt the genre")
    genre_vectorizer.adapt(session_df['spotify_genre'].astype(str).unique())  # Convert genres to strings before adapting
    print("Create session dataset")
    sessions_data = create_session_dataset(session_df)
    print("create tensor dataset")
    dataset = preprocess_data(sessions_data, batch_size)
    
    # Call the batch_timer function to track processing time for each batch
    batch_timer(dataset)
    
    return dataset

In [48]:
dataset = create_session_dataset_tensor(df)
dataset

Adapt the genre
Create session dataset
create tensor dataset
process item 1
process item 2
process item 3
process item 4
process item 5
process item 6
process item 7
process item 8
process item 9
process item 10
process item 11
process item 12
process item 13
process item 14
process item 15
process item 16
process item 17
process item 18
process item 19
process item 20
process item 21
process item 22
process item 23
process item 24
process item 25
process item 26
process item 27
process item 28
process item 29
process item 30
process item 31
process item 32
process item 33
process item 34
process item 35
process item 36
process item 37
process item 38
process item 39
process item 40
process item 41
process item 42
process item 43
process item 44
process item 45
process item 46
process item 47
process item 48
process item 49
process item 50
process item 51
process item 52
process item 53


  pad_end_extra = (block_shape - full_input_shape % block_shape) % block_shape


process item 54
process item 55
process item 56
process item 57
process item 58
process item 59
process item 60
process item 61
process item 62
process item 63
process item 64
process item 65
process item 66
process item 67
process item 68
process item 69
process item 70
process item 71
process item 72
process item 73
process item 74
process item 75
process item 76
process item 77
process item 78
process item 79
process item 80
process item 81
process item 82
process item 83
process item 84
process item 85
process item 86
process item 87
process item 88
process item 89
process item 90
process item 91
process item 92
process item 93
process item 94
process item 95
process item 96
process item 97
process item 98
process item 99
process item 100
process item 101
process item 102
process item 103
process item 104
process item 105
process item 106
process item 107
process item 108
process item 109
process item 110
process item 111
process item 112
process item 113
process item 114
process i

ValueError: TypeError: Scalar tensor has no `len()`
Traceback (most recent call last):

  File "e:\Haidar\bangkit\capstone\ML-TuneHive\model-dev\.venv\Lib\site-packages\tensorflow\python\framework\ops.py", line 357, in __len__
    raise TypeError("Scalar tensor has no `len()`")

TypeError: Scalar tensor has no `len()`



In [11]:
for song in sessions_data[2]:
    print(song)

{'index_x': 62875, 'SongID': 'Powerful (feat. Ellie Goulding & Tarrus Riley)Major Lazer', 'TimeStamp_Central': '1/2/2017 6:25:00 PM', 'Performer_x': 'Major Lazer', 'Album': 'Peace Is the Mission', 'Song_x': 'Powerful (feat. Ellie Goulding & Tarrus Riley)', 'TimeStamp_UTC': '2017-01-03 00:25:00', 'index_y': 12119, 'Performer_y': 'Major Lazer', 'Song_y': 'Powerful (feat. Ellie Goulding & Tarrus Riley)', 'spotify_genre': "['dance pop', 'edm', 'electro house', 'moombahton', 'pop', 'tropical house']", 'spotify_track_id': nan, 'spotify_track_preview_url': nan, 'spotify_track_duration_ms': nan, 'spotify_track_popularity': nan, 'spotify_track_explicit': nan, 'danceability': nan, 'energy': nan, 'key': nan, 'loudness': nan, 'mode': nan, 'speechiness': nan, 'acousticness': nan, 'instrumentalness': nan, 'liveness': nan, 'valence': nan, 'tempo': nan, 'time_signature': nan, 'session_6_hour': '2017-01-03 00:00:00', 'session_id': 2}
{'index_x': 62874, 'SongID': 'CloserChic Gamine', 'TimeStamp_Central'

## Define Model class

In [None]:
class ItemEmbedding(Layer):
    def __init__(self, num_items, item_embed_dim, vocab_items):
        super(ItemEmbedding, self).__init__()
        
        self.item_lookup = StringLookup(vocabulary=vocab_items, mask_token=None, num_oov_indices=1)
        self.item_embedding = Embedding(input_dim=num_items, output_dim=item_embed_dim, mask_zero=True)

    def call(self, items):
        # Embed items and features
        items_indices = self.item_lookup(items)
        items_embedded = self.item_embedding(items_indices)
        return items_embedded

class MusicFeatureEmbedding(Layer):
    def __init__(self, genre_vocab, genre_embed_dim):
        super(MusicFeatureEmbedding, self).__init__()
        # String lookup for genre
        self.genre_lookup = StringLookup(vocabulary=genre_vocab, mask_token=None, num_oov_indices=1)
        # Embedding layer for genre
        self.genre_embedding = Embedding(input_dim=len(genre_vocab) + 1, output_dim=genre_embed_dim)

        # Normalization layers for numerical features
        self.track_duration_norm = Normalization(axis=None)
        self.track_popularity_norm = Normalization(axis=None)
        self.danceability_norm = Normalization(axis=None)
        self.energy_norm = Normalization(axis=None)
        self.key_norm = Normalization(axis=None)
        self.loudness_norm = Normalization(axis=None)
        self.mode_norm = Normalization(axis=None)
        self.speechiness_norm = Normalization(axis=None)
        self.acousticness_norm = Normalization(axis=None)
        self.instrumentalness_norm = Normalization(axis=None)
        self.liveness_norm = Normalization(axis=None)
        self.valence_norm = Normalization(axis=None)
        self.tempo_norm = Normalization(axis=None)

    def adapt(self, dataset):
        """Adapt normalization layers using the dataset."""
        for feature, layer in [
            ("spotify_track_duration_ms", self.track_duration_norm),
            ("spotify_track_popularity", self.track_popularity_norm),
            ("danceability", self.danceability_norm),
            ("energy", self.energy_norm),
            ("key", self.key_norm),
            ("loudness", self.loudness_norm),
            ("mode", self.mode_norm),
            ("speechiness", self.speechiness_norm),
            ("acousticness", self.acousticness_norm),
            ("instrumentalness", self.instrumentalness_norm),
            ("liveness", self.liveness_norm),
            ("valence", self.valence_norm),
            ("tempo", self.tempo_norm),
        ]:
            values = dataset.map(lambda x, _: x[feature])
            layer.adapt(values)

    def call(self, inputs):
        # Extract categorical and numerical features
        genre = inputs['spotify_genre']
        duration = inputs['spotify_track_duration_ms']
        popularity = inputs['spotify_track_popularity']
        danceability = inputs['danceability']
        energy = inputs['energy']
        key = inputs['key']
        loudness = inputs['loudness']
        mode = inputs['mode']
        speechiness = inputs['speechiness']
        acousticness = inputs['acousticness']
        instrumentalness = inputs['instrumentalness']
        liveness = inputs['liveness']
        valence = inputs['valence']
        tempo = inputs['tempo']

        # Process categorical features
        genre_indices = self.genre_lookup(genre)
        genre_embedded = self.genre_embedding(genre_indices)

        # Normalize numerical features
        numerical_features = [
            self.track_duration_norm(duration),
            self.track_popularity_norm(popularity),
            self.danceability_norm(danceability),
            self.energy_norm(energy),
            self.key_norm(key),
            self.loudness_norm(loudness),
            self.mode_norm(mode),
            self.speechiness_norm(speechiness),
            self.acousticness_norm(acousticness),
            self.instrumentalness_norm(instrumentalness),
            self.liveness_norm(liveness),
            self.valence_norm(valence),
            self.tempo_norm(tempo),
        ]
        normalized_features = Concatenate()(numerical_features)

        # Combine embeddings and numerical features
        combined_features = Concatenate()([genre_embedded, normalized_features])
        return combined_features
    
class GRU4REC(Model):
    def __init__(self, k, num_unique_items, rnn_params, item_embed_dim, ffn1_units, feature_dense_units, vocab_items, vocab_genres, genre_embed_dim):
        super(GRU4REC, self).__init__()
        self.k = k
        self.embedding = ItemEmbedding(num_unique_items, item_embed_dim, vocab_items)
        self.feature_embedding = MusicFeatureEmbedding(vocab_genres, genre_embed_dim)

        # RNN layers
        self.rnn_layers = []
        self.rnn_layers.append(GRU(**rnn_params[0], return_sequences=True))
        for i in range(1, len(rnn_params) - 1):
            self.rnn_layers.append(GRU(**rnn_params[i], return_sequences=True))
        self.rnn_layers.append(GRU(**rnn_params[-1], return_sequences=False))

        self.concat = Concatenate(axis=-1)

        # Feed-forward layers
        self.feature_dense = Dense(feature_dense_units, activation='relu')
        self.ffn1 = Dense(ffn1_units)
        self.activation1 = LeakyReLU(alpha=0.2)
        self.out = Dense(k, activation='softmax')

    def call(self, inputs, training=False):
        """
        Forward pass for the GRU4REC model.
        :param inputs: Tuple (item_sequences, item_features)
        :param training: Boolean indicating if the model is in training mode
        """
        item_sequences, item_features= inputs

        # Embed items and features
        item_embedded = self.embedding(item_sequences)

        # Transform feature embeddings
        feature_embedded = self.feature_embedding(item_features)
        feature_transformed = self.feature_dense(feature_embedded)

        # Pass through RNN layers
        x = self.rnn_layers[0](item_embedded)
        for i in range(1, len(self.rnn_layers)):
            x = self.concat([item_embedded, x])  # Concatenate item embeddings with RNN outputs
            x = self.rnn_layers[i](x)

        # Concatenate RNN output with feature embeddings
        x = self.concat([x, feature_transformed])

        # Feed-forward layers
        x = self.ffn1(x)
        x = self.activation1(x)
        output = self.out(x)

        return output

## Training Loop

In [26]:
def train_gru4rec(model, dataset, optimizer, loss_fn, num_epochs, top_k=5):
    """Custom training loop for GRU4REC."""
    # Metrics to track loss and top-k precision
    train_loss = Mean(name='train_loss')
    train_top_k_precision = TopKCategoricalAccuracy(k=top_k, name='train_top_k_precision')
    
    # Histories for loss and metrics
    loss_history = []
    metrics_history = []

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")

        # Reset metrics at the start of each epoch
        train_loss.reset_state()
        train_top_k_precision.reset_state()

        # Iterate over the dataset
        for batch, (item_sequences, item_features, labels) in enumerate(dataset):
            with tf.GradientTape() as tape:
                # Forward pass
                predictions = model((item_sequences, item_features), training=True)
                loss = loss_fn(labels, predictions)

            # Backward pass and optimization
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))

            # Update metrics
            train_loss.update_state(loss)
            train_top_k_precision.update_state(labels, predictions)

        # Log per epoch
        epoch_loss = train_loss.result().numpy()
        epoch_top_k_precision = train_top_k_precision.result().numpy()
        loss_history.append(epoch_loss)
        metrics_history.append(epoch_top_k_precision)

        print(f"Epoch {epoch + 1}: Loss: {epoch_loss:.4f}, Top-{top_k} Precision: {epoch_top_k_precision:.4f}")

    # Plot training history
    plot_training_history(loss_history, metrics_history, f'Precision@{top_k}', top_k)

def plot_training_history(loss_history, metric_history, metric_name, top_k):
    """Plot the training loss and accuracy."""
    epochs = range(1, len(loss_history) + 1)

    # Create subplots for loss and accuracy
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8))

    # Plot the training loss
    ax1.plot(epochs, loss_history, label='Loss', color='blue', linestyle='-', marker='o')
    ax1.set_title('Training Loss')
    ax1.set_xlabel('Epochs')
    ax1.set_ylabel('Loss')
    ax1.legend()

    # Plot the top-k accuracy
    ax2.plot(epochs, metric_history, label=metric_name, color='green', linestyle='-', marker='o')
    ax2.set_title(f'Training {metric_name}')
    ax2.set_xlabel('Epochs')
    ax2.set_ylabel(metric_name)
    ax2.legend()

    plt.tight_layout()
    plt.show()

## Run the training process

In [None]:
# Define the model
num_items = len()
feature_vocab_size = len(feature_columns)

model = GRU4REC(
    k=num_items,  # Number of items
    num_items=num_items,  # Number of unique items
    rnn_params=[
        {"units": 128},
        {"units": 128},
        {"units": 64}
    ],
    item_embed_dim=64,
    feature_embed_dim=32,
    ffn1_units=128,
    feature_dense_units=64
)

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()

# Train the model
train_gru4rec(model, dataset, optimizer, loss_fn, num_epochs=10, top_k=5)



Epoch 1/10


1. The `call()` method of your layer may be crashing. Try to `__call__()` the layer eagerly on some test input first to see if it works. E.g. `x = np.random.random((3, 4)); y = layer(x)`
2. If the `call()` method is correct, then you may need to implement the `def build(self, input_shape)` method on your layer. It should create all variables used by the layer (e.g. by calling `layer.build()` on all its children layers).
Exception encountered: ''dtype='string' is not a valid dtype for Keras type promotion.''
1. The `call()` method of your layer may be crashing. Try to `__call__()` the layer eagerly on some test input first to see if it works. E.g. `x = np.random.random((3, 4)); y = layer(x)`
2. If the `call()` method is correct, then you may need to implement the `def build(self, input_shape)` method on your layer. It should create all variables used by the layer (e.g. by calling `layer.build()` on all its children layers).
Exception encountered: ''Exception encountered when calling Ite

UnimplementedError: Exception encountered when calling Embedding.call().

[1m{{function_node __wrapped__Cast_device_/job:localhost/replica:0/task:0/device:CPU:0}} Cast string to int32 is not supported [Op:Cast] name: [0m

Arguments received by Embedding.call():
  • inputs=tf.Tensor(shape=(16,), dtype=string)