## import the libraries and setup

In [None]:
# import the libraries
import pandas as pd
import os
from tqdm.notebook import tqdm
import gc
import pickle
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from tensorflow.keras.models import Model

from tensorflow.keras.layers import Input, Embedding, Dense, Flatten, Activation, RepeatVector, \
                                    Permute, Multiply, Lambda, Concatenate, LSTM, TimeDistributed
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from collections import Counter
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import backend as K
import random
from sklearn.preprocessing import OneHotEncoder

In [None]:
from joblib import Parallel, delayed

In [None]:
# gpu config
gpus = tf.config.list_physical_devices('GPU')
if len(gpus) > 0:
    try:
        # memory growth has to be configured the same way on every GPU
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # raised when memory growth is set after the GPUs have been initialized
        print(e)

In [None]:
# show all columns
pd.set_option('display.max_columns', None)

## Data Prep for train, validation, and test

In [None]:
# current directory
os.getcwd()

In [None]:
# set the main path
# raw string: "\R" is not a valid escape sequence and triggers a warning in a normal string
os.chdir(r"D:\RecSys")

In [None]:
# list the files in one of the ten training chunks
# os.listdir returns entries in arbitrary order; sort so that files[i] always
# refers to the same chunk (the train / validation split is chosen by index)
files = sorted(os.listdir("all_train/training_set"))

In [None]:
len(files)

In [None]:
# read the session logs
# how many do you want to read
empty = []
for i in tqdm(range(0,1)):
    sample_df_1 = pd.read_csv("all_train/training_set/" + files[i])
    empty.append(sample_df_1)
    print(files[i])

In [None]:
# del the read sample df from the loop to save memory
del(sample_df_1)

In [None]:
# sampled data
# concatenate into dataframe
train = pd.concat(empty, ignore_index = True)

In [None]:
del(empty)

In [None]:
# check shape
train.shape

In [None]:
# how many unique sessions
train.session_id.nunique()

In [None]:
train.track_id_clean.nunique()

In [None]:
## this function helps with the padding and the sequences to same length

In [None]:
def pad_sequences(sequence, pad_to_this_length, pre):
    """Pad a sequence with the string token '0' up to a fixed length.

    NOTE: this definition shadows the Keras ``pad_sequences`` imported at the
    top of the notebook.

    Parameters
    ----------
    sequence : sequence supporting ``len`` and iteration
    pad_to_this_length : int
        Target length. Sequences that are already at least this long are
        returned unchanged (same object, no truncation).
    pre : bool
        ``True`` puts the padding in front of the sequence, anything else
        appends it at the end.

    Returns
    -------
    list
        The padded items, or ``sequence`` itself when no padding is needed.
    """
    missing = pad_to_this_length - len(sequence)
    if missing <= 0:
        return sequence

    filler = ["0"] * missing
    if pre == True:  # noqa: E712 - keep the exact equality semantics of the flag
        return filler + list(sequence)
    return list(sequence) + filler

In [None]:
files[1]

In [None]:
files[4]

In [None]:
# read the session logs
# how many do you want to read
empty = []
for i in tqdm(range(4,5)):
    sample_df_1 = pd.read_csv("all_train/training_set/" + files[i])
    print(files[i])
    empty.append(sample_df_1)

In [None]:
# del the read sample df from the loop to save memory
del(sample_df_1)

In [None]:
# sampled data
# concatenate into dataframe
valid = pd.concat(empty, ignore_index = True)

In [None]:
del(empty)

In [None]:
# check shape
valid.shape

In [None]:
# how many unique sessions
valid.session_id.nunique()

In [None]:
valid.track_id_clean.nunique()

In [None]:
# sort by first session id and then session position
valid = valid.sort_values(["session_id", "session_position"])

In [None]:
# take a look at the data
valid.head(5)

In [None]:
# frequencies for session length
valid.session_length.value_counts()

In [None]:
def _session_order_match(left, right):
    """Return the share of positions at which two frames list the same session_id."""
    left_ids = left.session_id.values
    right_ids = right.session_id.values
    if left_ids.shape != right_ids.shape:
        # different number of sessions: the frames cannot be row-aligned
        return 0.0
    return np.mean(left_ids == right_ids)


def get_all_sequences(data):
    """Split every session into a first and a second half and collect track lists.

    The first half of a session consists of its first ``session_length // 2``
    rows (ordered by ``session_position``); all remaining rows form the second
    half. The input frame is not modified (``sort_values`` returns a copy).

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``session_id`` (string), ``session_position``,
        ``session_length`` and ``track_id_clean``.

    Returns
    -------
    tuple
        ``(first_half_seqs, second_half_seqs, all_seqs, a, b, c, first_half,
        second_half)`` where

        * ``*_seqs`` are frames with one row per session (``session_id`` and
          the list of ``track_id_clean`` values),
        * ``a``, ``b``, ``c`` are the shares of matching session ids between
          all/first, all/second and first/second (1.0 means the three frames
          are row-aligned),
        * ``first_half`` / ``second_half`` are the row-level frames, both
          carrying the helper columns ``row_number`` and
          ``session_row_number``; ``first_half`` has a fresh RangeIndex.
    """
    # sort by first session id and then session position
    data = data.sort_values(["session_id", "session_position"])

    # 0-based position of every row inside its session
    data["row_number"] = data.groupby(['session_id']).cumcount()

    # string key (session_id + row_number); later cells match rows on it
    data["session_row_number"] = data["session_id"] + data["row_number"].astype(str)

    # size of the first half, broadcast to every row of the session
    first_half_size = data.groupby("session_id")["session_length"].transform("first") // 2
    in_first_half = data["row_number"] < first_half_size

    train_first_half = data[in_first_half].reset_index(drop=True)
    train_second_half = data[~in_first_half]

    train_first_half_seqs = pd.DataFrame(
        train_first_half.groupby("session_id")["track_id_clean"].apply(list)).reset_index()
    train_second_half_seqs = pd.DataFrame(
        train_second_half.groupby("session_id")["track_id_clean"].apply(list)).reset_index()
    train_all_seqs = pd.DataFrame(
        data.groupby("session_id")["track_id_clean"].apply(list)).reset_index()

    return train_first_half_seqs, train_second_half_seqs, train_all_seqs, \
           _session_order_match(train_all_seqs, train_first_half_seqs), \
           _session_order_match(train_all_seqs, train_second_half_seqs), \
           _session_order_match(train_first_half_seqs, train_second_half_seqs), \
           train_first_half, \
           train_second_half

In [None]:
train_first_half_seqs, train_second_half_seqs, train_all_seqs, a,b,c, train_first_half, train_second_half = get_all_sequences(train)

In [None]:
a,b,c

In [None]:
valid_first_half_seqs, valid_second_half_seqs, valid_all_seqs, a,b,c, valid_first_half, \
valid_second_half= get_all_sequences(valid)

a,b,c

In [None]:
train_first_half

In [None]:
train_targets = train_second_half.groupby("session_id")["skip_2"].apply(list)

In [None]:
valid_targets = valid_second_half.groupby("session_id")["skip_2"].apply(list)

In [None]:
def pad_sequences_target(sequence, pad_to_this_length, pre):
    """Convert a target sequence to ``np.float32`` and pad it with ``-1``.

    Parameters
    ----------
    sequence : iterable
        Target values of one session (anything ``np.float32`` accepts).
    pad_to_this_length : int
        Target length. Longer sequences are converted but not truncated.
    pre : bool
        Truthy puts the ``-1`` padding in front, falsy appends it at the end.
        (Previously any value other than ``True``/``False`` raised
        ``UnboundLocalError``.)

    Returns
    -------
    list
        ``np.float32`` values, plus integer ``-1`` padding entries if needed.
    """
    values = [np.float32(i) for i in sequence]
    missing = pad_to_this_length - len(values)
    if missing <= 0:
        return values

    filler = [-1] * missing
    return filler + values if pre else values + filler

In [None]:
# pad at the end (pre=False): the second-half track sequences are post-padded in
# text_to_padded_seq and the validation targets are post-padded too, so the
# train targets must use the same side to stay aligned with their tracks
train_targets = Parallel(n_jobs=6, verbose = 3)(delayed(pad_sequences_target)(i, 10, False) for i in train_targets)

In [None]:
valid_targets = Parallel(n_jobs=6, verbose = 3)(delayed(pad_sequences_target)(i, 10, False) for i in valid_targets)

In [None]:
valid_targets[:10]

In [None]:
train_targets[:10]

## Read the acoustic features now

In [None]:
# track features
# read the track level acoustic features
track_features_0 = pd.read_csv(r"track_features/tf_000000000000.csv")
track_features_1 = pd.read_csv(r"track_features/tf_000000000001.csv")
# combine into one unified dataframe
track_features_all = pd.concat([track_features_0, track_features_1], ignore_index = True)

In [None]:
track_features_all.shape

In [None]:
## keep only those features which are there in train

In [None]:
track_features_train = track_features_all[track_features_all["track_id"].isin(train.track_id_clean)].reset_index(drop = True)

In [None]:
track_features_train.shape

In [None]:
track_features_all.head()

## Continue Data Prep

In [None]:
def text_to_padded_seq(data_train_first_half, data_train_second_half, data_valid_first_half, data_valid_second_half, 
                       max_len): 
    """Pad the track-id sequences and map them to integer ids with two lookups.

    First-half sequences are padded in front, second-half sequences at the
    end, both with the token '0' up to ``max_len``. Two ``StringLookup``
    layers are then built:

    * "learned": adapted on the track ids of the global ``train`` frame, with
      one out-of-vocabulary index;
    * "static": adapted on the track ids of the global ``track_features_all``
      frame, without an out-of-vocabulary index.

    Both use '0' as mask token. Note that the vocabularies come from the
    module-level ``train`` and ``track_features_all`` frames, not from the
    arguments.

    Parameters
    ----------
    data_train_first_half, data_train_second_half,
    data_valid_first_half, data_valid_second_half : pandas.DataFrame
        Frames with a ``track_id_clean`` column holding one list per session.
    max_len : int
        Length every sequence is padded to.

    Returns
    -------
    tuple
        The four sequences encoded with the learned lookup (train first,
        train second, valid first, valid second), the same four encoded with
        the static lookup, then ``string_lookup_static`` and
        ``string_lookup_learned``.
    """
    def _pad_column(frame, pre):
        # pad every session's track list in parallel
        return Parallel(n_jobs=6, verbose = 3)(
            delayed(pad_sequences)(track_list, max_len, pre) for track_list in frame["track_id_clean"]
        )

    padded_splits = [
        _pad_column(data_train_first_half, True),
        _pad_column(data_train_second_half, False),
        _pad_column(data_valid_first_half, True),
        _pad_column(data_valid_second_half, False),
    ]

    # lookup for the learnable embeddings: mask token + one OOV bucket + train tracks
    string_lookup_learned = tf.keras.layers.StringLookup(
        max_tokens=train.track_id_clean.nunique()+2,
        num_oov_indices=1,
        output_mode='int',
        mask_token = '0',
    )
    string_lookup_learned.adapt(train.track_id_clean.values, batch_size = 1000000)
    learned_ids = [string_lookup_learned(padded) for padded in padded_splits]

    # lookup for the static feature embeddings: mask token + every track with features
    string_lookup_static = tf.keras.layers.StringLookup(
        max_tokens=track_features_all.track_id.nunique()+1,
        num_oov_indices=0,
        output_mode='int',
        mask_token = '0',
    )
    string_lookup_static.adapt(track_features_all.track_id, batch_size = 1000000)
    static_ids = [string_lookup_static(padded) for padded in padded_splits]

    return (*learned_ids, *static_ids, string_lookup_static, string_lookup_learned)

In [None]:
text_to_seq_train_first_half, text_to_seq_train_second_half, text_to_seq_valid_first_half, text_to_seq_valid_second_half, \
text_to_seq_train_first_half_static, text_to_seq_train_second_half_static, \
text_to_seq_valid_first_half_static, text_to_seq_valid_second_half_static, string_lookup_static, string_lookup_learned = \
text_to_padded_seq(train_first_half_seqs, train_second_half_seqs, valid_first_half_seqs, valid_second_half_seqs, 10)

In [None]:
train_all_seqs = tf.keras.layers.Concatenate()([text_to_seq_train_first_half, text_to_seq_train_second_half])

In [None]:
valid_all_seqs = tf.keras.layers.Concatenate()([text_to_seq_valid_first_half, text_to_seq_valid_second_half])

In [None]:
train_all_seqs_static = tf.keras.layers.Concatenate()([text_to_seq_train_first_half_static, text_to_seq_train_second_half_static])

In [None]:
valid_all_seqs_static = tf.keras.layers.Concatenate()([text_to_seq_valid_first_half_static, text_to_seq_valid_second_half_static])

In [None]:
# list the files in testing
# sorted: os.listdir order is arbitrary, and files[i] is used to pick the chunk
files = sorted(os.listdir("test_set"))

In [None]:
# read the session logs
# how many do you want to read
empty = []
for i in tqdm(range(0,1)):
    sample_df_1 = pd.read_csv("test_set/" + files[i])
    empty.append(sample_df_1)

In [None]:
# del the read sample df from the loop to save memory
del(sample_df_1)

In [None]:
# sampled data
# concatenate into dataframe
test = pd.concat(empty, ignore_index = True)

In [None]:
# check shape
test.shape

In [None]:
# how many unique sessions
test.session_id.nunique()

In [None]:
test.track_id_clean.nunique()

In [None]:
# sort by first session id and then session position
test = test.sort_values(["session_id", "session_position"])

In [None]:
# take a look at the data
test.head(15)

In [None]:
# frequencies for session length
test.session_length.value_counts()

## Data Sanity Check

In [None]:
## intersection of tracks between train, test, valid

In [None]:
## percentage common between train and valid

In [None]:
len(set(train.track_id_clean.values).intersection(valid.track_id_clean.values))/valid.track_id_clean.nunique()

In [None]:
## percentage common between test and train

len(set(train.track_id_clean.values).intersection(test.track_id_clean.values))/test.track_id_clean.nunique()

In [None]:
# how about sessions?

In [None]:
len(set(train.session_id.values).intersection(valid.session_id.values))/valid.session_id.nunique()

In [None]:
len(set(train.session_id.values).intersection(test.session_id.values))/test.session_id.nunique()

## Start the data preparation

### creating embedding layer for static features

In [None]:
## Preparing the static track embeddings
## has to be normalized and stuff

In [None]:
## We basically need three types of data
## 1. Static song features - so need to one hot encode here
## 2. Learnable embeddings - so need to know the population of training songs
## 3. Session level features - these are the static user features
# train.head()

In [None]:
## let's first tackle the track embedding network

In [None]:
## for track embedding network, we need to know all the tracks in the
## track features
## and all the tracks in the training data

In [None]:
track_features_all.dtypes

In [None]:
not_float_columns = track_features_all.columns[track_features_all.dtypes != "float64"]

In [None]:
for column in not_float_columns: 
    print(column)
    print(track_features_all[column].nunique())

In [None]:
# release year seems to have many values
# let's treat as numeric

In [None]:
# category columns
# basically need to read spotify documentation
cat_columns = ["key", "mode"]

In [None]:
# one hot encode these
transformer = OneHotEncoder()

In [None]:
# fit the transformer
transformer.fit(track_features_all[cat_columns])

In [None]:
# transform for one hot encode
ohe_features = transformer.transform(track_features_all[cat_columns])

In [None]:
# all columns but the categorical treat as float
float_columns = track_features_all.columns[~track_features_all.columns.isin(cat_columns)].drop("track_id")

In [None]:
# convert to array
track_features_all_array = np.array(track_features_all[float_columns])

In [None]:
track_features_all_array.shape

In [None]:
# convert sparse one hot encoded stuff to array
ohe_features = ohe_features.toarray()

In [None]:
# stack the float features and one hot encoded stuff side by side
track_features_all_array = np.hstack((track_features_all_array, ohe_features))

In [None]:
# the paper mentions that they convert everything to zero mean
# and unit variance, so confirm that
std = StandardScaler()

In [None]:
# fit the scaler
std.fit(track_features_all_array)

In [None]:
# transform
track_features_all_array = std.transform(track_features_all_array)

In [None]:
# all means should be zero
np.mean(track_features_all_array,0)

In [None]:
# all std should be 1
np.std(track_features_all_array,0)

In [None]:
string_lookup_static.get_vocabulary()

In [None]:
track_features_all.track_id.values

In [None]:
track_features_all.track_id.values.shape

In [None]:
track_features_all_array.shape

In [None]:
# axis must be passed by keyword: the positional form was removed in pandas 2.0
track_features_all = pd.concat([track_features_all.track_id, pd.DataFrame(track_features_all_array)], axis=1)

In [None]:
track_features_all_weights = track_features_all.set_index("track_id").reindex(string_lookup_static.get_vocabulary()[1:])

In [None]:
# add a row of zeros for padding

In [None]:
# the zero row (index 0 = padding) takes its width from the feature matrix instead of a hard-coded 41
track_features_all_weights = np.vstack((np.zeros((1, track_features_all_weights.shape[1])),
                                        track_features_all_weights.values))

In [None]:
track_features_all_weights.shape

In [None]:
## now we can instantiate an embedding matrix

In [None]:
static_embedding_layer = tf.keras.layers.Embedding(len(track_features_all_weights), track_features_all_weights.shape[1], 
                         weights = [track_features_all_weights], trainable = False, mask_zero = True)

In [None]:
# can we store it?

In [None]:
input_static_embeddings = tf.keras.layers.Input(shape = (10,))

In [None]:
static_embeddings = static_embedding_layer(input_static_embeddings)

In [None]:
model_static_embeddings = tf.keras.models.Model(input_static_embeddings, static_embeddings)

In [None]:
model_static_embeddings.summary()

In [None]:
model_static_embeddings.save("static_embedding_model")

### creating the playback and meta features

In [None]:
def meta_playback_features_train(data):
    """Fit the encoders/scalers on ``data`` and build meta and playback features.

    Side effect: adds a ``day_of_week`` column to ``data`` in place.

    Meta features: one-hot of ``premium`` and ``day_of_week`` stacked with
    ``session_length``, then standardized.
    Playback features: one-hot of every remaining column that is not an
    identifier, the date or one of the numeric columns, stacked with the
    standardized numeric columns ``session_position``,
    ``hist_user_behavior_n_seekfwd``, ``hist_user_behavior_n_seekback`` and
    ``hour_of_day``.

    Parameters
    ----------
    data : pandas.DataFrame
        Row-level session log containing the columns referenced above.

    Returns
    -------
    tuple
        ``(meta_features_train, playback_features_train, transformer_meta,
        std_meta_features_train, transformer_playback_cat,
        std_playback_numerical_train)`` - two arrays with one row per input
        row, followed by the four fitted transformers.
    """
    data["day_of_week"] = pd.to_datetime(data["date"]).dt.day_name()

    meta_columns = ["premium", "day_of_week"]

    # --- meta features: one hot encode, append session length, standardize ---
    transformer_meta = OneHotEncoder()
    ohe_features_meta = transformer_meta.fit_transform(data[meta_columns])

    meta_features_train = np.hstack((ohe_features_meta.toarray(), data["session_length"].values.reshape(-1,1)))

    std_meta_features_train = StandardScaler()
    meta_features_train = std_meta_features_train.fit_transform(meta_features_train)

    # --- playback features ---
    numerical_features = ["session_position", 'hist_user_behavior_n_seekfwd','hist_user_behavior_n_seekback', 'hour_of_day']

    # everything that is not meta, not an identifier/date and not numeric is one hot encoded
    not_categorical = set(meta_columns) | set(numerical_features) | {'session_id', 'track_id_clean', 'date'}
    categorical_features = [feature for feature in data.columns.tolist() if feature not in not_categorical]

    transformer_playback_cat = OneHotEncoder()
    playback_ohe_features_train = transformer_playback_cat.fit_transform(data[categorical_features])

    std_playback_numerical_train = StandardScaler()
    playback_numerical_train = std_playback_numerical_train.fit_transform(data[numerical_features])

    playback_features_train = np.hstack((playback_ohe_features_train.toarray(), playback_numerical_train))

    return meta_features_train, playback_features_train, transformer_meta, std_meta_features_train, \
           transformer_playback_cat, std_playback_numerical_train

In [None]:
meta_features_train, playback_features_train, transformer_meta, std_meta_features_train, \
           transformer_playback_cat, std_playback_numerical_train = meta_playback_features_train(train)

In [None]:
# axis by keyword (positional form removed in pandas 2.0); the index is reset so that
# the id columns and the feature array are joined row by row, not by index label
meta_features_train = pd.concat([train[["session_id", "session_position"]].reset_index(drop = True),
                                 pd.DataFrame(meta_features_train)], axis=1)


In [None]:
# one row per session; `columns=` replaces the positional axis removed in pandas 2.0
meta_features_train = meta_features_train.drop_duplicates(["session_id"]).reset_index(drop = True).drop(columns="session_position")

In [None]:
meta_features_train.head()

In [None]:
# axis by keyword (positional form removed in pandas 2.0); the index is reset so that
# the id columns and the feature array are joined row by row, not by index label
playback_features_train = pd.concat([train[["session_id", "session_position"]].reset_index(drop = True),
                                     pd.DataFrame(playback_features_train)], axis=1)


In [None]:
playback_features_train["row_number"] = playback_features_train.groupby("session_id").cumcount()

In [None]:
playback_features_train["session_row_number"] = playback_features_train["session_id"] + \
                                                playback_features_train["row_number"].astype(str)

In [None]:
playback_features_train.head()

In [None]:
train_first_half.head()

In [None]:
## this is an important step: keep only those playbacks which are in the first half of train and valid

In [None]:
playback_features_train = playback_features_train[playback_features_train.session_row_number.isin(train_first_half.session_row_number)]

In [None]:
playback_features_train.shape

In [None]:
playback_features_train

In [None]:
import gc

In [None]:
gc.collect()

In [None]:
list_of_playback_tracks = pd.DataFrame(playback_features_train.groupby(["session_id"])["session_row_number"].apply(list))

In [None]:
check_train_first_half = Parallel(n_jobs=6, verbose = 3)(delayed(pad_sequences)(i,
                         10, True) for i in list_of_playback_tracks["session_row_number"])

In [None]:
check_train_first_half

In [None]:
playback_features_train.head()

In [None]:
string_lookup_playback = tf.keras.layers.StringLookup(
max_tokens=playback_features_train.session_row_number.nunique()+1, num_oov_indices=0, 
output_mode='int', mask_token = '0')
    
string_lookup_playback.adapt(playback_features_train.session_row_number, batch_size = 100000)

In [None]:
# `columns=` replaces the positional axis argument removed in pandas 2.0
playback_features_train = playback_features_train.drop(columns=["session_id","session_position", "row_number"])

In [None]:
playback_features_train = playback_features_train.set_index("session_row_number").reindex(string_lookup_playback.get_vocabulary()[1:])

In [None]:
playback_features_train.head()

In [None]:
playback_weights = np.vstack((np.zeros((1,playback_features_train.shape[1])), 
                              playback_features_train.values))

In [None]:
playback_embedding = tf.keras.layers.Embedding(playback_weights.shape[0], playback_weights.shape[1], mask_zero = True, 
                         weights = [playback_weights], trainable = False)

In [None]:
np.mean(meta_features_train.session_id == train_first_half_seqs.session_id)

In [None]:
train_playback_first_half_seq = string_lookup_playback(check_train_first_half)

In [None]:
with tf.device('/cpu:0'):
    playback_embedding_first_half = playback_embedding(train_playback_first_half_seq)

In [None]:
def meta_playback_features(data, transformer_meta, std_meta_features_train, transformer_playback_cat, 
                          std_playback_numerical_train):
    """Build meta and playback features for ``data`` with already fitted transformers.

    The encoders and scalers are only applied (``transform``), never re-fitted,
    so the result uses the same columns and scaling as the data they were
    fitted on, and the fitted objects stay untouched. A category that the
    one-hot encoders have not seen raises an error unless the encoder was
    created with ``handle_unknown="ignore"``.

    Side effect: adds a ``day_of_week`` column to ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Row-level session log with the same columns as the frame used for fitting.
    transformer_meta : fitted OneHotEncoder for ``premium`` and ``day_of_week``.
    std_meta_features_train : fitted StandardScaler for the meta features.
    transformer_playback_cat : fitted OneHotEncoder for the categorical playback columns.
    std_playback_numerical_train : fitted StandardScaler for the numeric playback columns.

    Returns
    -------
    tuple
        ``(meta_features, playback_features)`` - two arrays with one row per
        input row.
    """
    data["day_of_week"] = pd.to_datetime(data["date"]).dt.day_name()

    meta_columns = ["premium", "day_of_week"]

    # --- meta features: one hot encode, append session length, standardize ---
    ohe_features_meta = transformer_meta.transform(data[meta_columns])

    meta_features = np.hstack((ohe_features_meta.toarray(), data["session_length"].values.reshape(-1,1)))
    meta_features = std_meta_features_train.transform(meta_features)

    # --- playback features ---
    numerical_features = ["session_position", 'hist_user_behavior_n_seekfwd','hist_user_behavior_n_seekback', 'hour_of_day']

    # everything that is not meta, not an identifier/date and not numeric is one hot encoded
    not_categorical = set(meta_columns) | set(numerical_features) | {'session_id', 'track_id_clean', 'date'}
    categorical_features = [feature for feature in data.columns.tolist() if feature not in not_categorical]

    playback_ohe_features = transformer_playback_cat.transform(data[categorical_features])
    playback_numerical = std_playback_numerical_train.transform(data[numerical_features])

    playback_features = np.hstack((playback_ohe_features.toarray(), playback_numerical))

    return meta_features, playback_features

In [None]:
valid_features = meta_playback_features(valid, transformer_meta, std_meta_features_train, transformer_playback_cat, 
                          std_playback_numerical_train)

In [None]:
meta_features_valid, playback_features_valid = valid_features[0], valid_features[1]

In [None]:
# `valid` was re-ordered with sort_values and still carries its old index labels, while
# the feature array is in positional order; reset the index so that the concat joins
# row by row instead of by label (axis by keyword: positional form removed in pandas 2.0)
meta_features_valid = pd.concat([valid[["session_id", "session_position"]].reset_index(drop = True),
                                 pd.DataFrame(meta_features_valid)], axis=1)


In [None]:
# one row per session; `columns=` replaces the positional axis removed in pandas 2.0
meta_features_valid = meta_features_valid.drop_duplicates(["session_id"]).reset_index(drop = True).drop(columns="session_position")

In [None]:
meta_features_valid.head()

In [None]:
np.mean(meta_features_valid.session_id == valid_first_half_seqs.session_id)

In [None]:
# `valid` was re-ordered with sort_values and still carries its old index labels, while
# the feature array is in positional order; reset the index so that the concat joins
# row by row instead of by label (axis by keyword: positional form removed in pandas 2.0)
playback_features_valid = pd.concat([valid[["session_id", "session_position"]].reset_index(drop = True),
                                     pd.DataFrame(playback_features_valid)], axis=1)


In [None]:
playback_features_valid["row_number"] = playback_features_valid.groupby("session_id").cumcount()

In [None]:
playback_features_valid["session_row_number"] = playback_features_valid["session_id"] + \
                                                playback_features_valid["row_number"].astype(str)

In [None]:
playback_features_valid = playback_features_valid[playback_features_valid.session_row_number.isin(valid_first_half.session_row_number)]

In [None]:
playback_features_valid

In [None]:
valid_first_half.head()

In [None]:
valid_first_half_seqs

In [None]:
list_of_playback_tracks_valid = pd.DataFrame(playback_features_valid.groupby(["session_id"])["session_row_number"].apply(list))

In [None]:
# [len(l[0]) for l in list_of_playback_tracks_valid[["session_row_number"]].values]

In [None]:
check_valid_first_half = Parallel(n_jobs=6, verbose = 3)(delayed(pad_sequences)(i,
                         10, True) for i in list_of_playback_tracks_valid["session_row_number"])

In [None]:
playback_features_valid.head()

In [None]:
string_lookup_playback_valid = tf.keras.layers.StringLookup(
max_tokens=playback_features_valid.session_row_number.nunique()+1, num_oov_indices=0, 
output_mode='int', mask_token = '0')
    
string_lookup_playback_valid.adapt(playback_features_valid.session_row_number, batch_size = 100000)

In [None]:
# `columns=` replaces the positional axis argument removed in pandas 2.0
playback_features_valid = playback_features_valid.drop(columns=["session_id","session_position", "row_number"])

In [None]:
playback_features_valid = playback_features_valid.set_index("session_row_number").reindex(string_lookup_playback_valid.get_vocabulary()[1:])

In [None]:
playback_features_valid.head()

In [None]:
playback_weights_valid = np.vstack((np.zeros((1,playback_features_valid.shape[1])), 
                              playback_features_valid.values))

In [None]:
playback_embedding_valid = tf.keras.layers.Embedding(playback_weights_valid.shape[0], playback_weights_valid.shape[1], mask_zero = True, 
                         weights = [playback_weights_valid], trainable = False)

In [None]:
np.array(check_valid_first_half).shape

In [None]:
valid_playback_first_half_seq = string_lookup_playback_valid(check_valid_first_half)

In [None]:
valid_playback_first_half_seq.shape

In [None]:
with tf.device('/cpu:0'):
    playback_embedding_first_half_valid = playback_embedding_valid(valid_playback_first_half_seq)

In [None]:
playback_embedding_first_half_valid.shape

In [None]:
playback_embedding_first_half.shape

In [None]:
## keep only those playbacks which exist in first half of train and valid

In [None]:
## need to pad the playback train and valid features as well

In [None]:
## and check if the ordering by session_ids is consistent

In [None]:
text_to_seq_train_first_half, text_to_seq_train_second_half, text_to_seq_valid_first_half, text_to_seq_valid_second_half, \
text_to_seq_train_first_half_static, text_to_seq_train_second_half_static, \
text_to_seq_valid_first_half_static, text_to_seq_valid_second_half_static, string_lookup_static, 
train_all_seqs, valid_all_seqs, train_all_seqs_static, valid_all_seqs_static, train_targets, valid_targets, meta_features_train, 
meta_features_valid

In [None]:
meta_features_train.iloc[:,1:]

In [None]:
# a single conversion is enough: reshaping a 2-D array to its own shape is a no-op
train_targets_array = np.array(train_targets)

In [None]:
# a single conversion is enough: reshaping a 2-D array to its own shape is a no-op
valid_targets_array = np.array(valid_targets)

In [None]:
valid_targets_array.shape

## Make the model

In [None]:
class SimpleAttention(tf.keras.layers.Layer):
    """Attention pooling over the time axis of a (batch, time, features) tensor.

    Each timestep is scored by a two-layer projection (Dense(units) -> Dense(1)),
    the scores are normalised with a masked softmax over time, and the output is
    the attention-weighted sum of the input timesteps.

    The sub-layers are created once in ``__init__`` so that their weights are
    built a single time and tracked by this layer. Creating them inside ``call``
    would instantiate fresh, untracked layers on every invocation.

    Args:
        units: width of the hidden scoring projection (default 32, the value
            that was previously hard-coded).
        **kwargs: forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, units=32, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.score_hidden = tf.keras.layers.Dense(units)
        self.score_output = tf.keras.layers.Dense(1)
        self.softmax = tf.keras.layers.Softmax()

    def call(self, input_tensor, mask=None):
        """Return ``(attention_weights, pooled)``.

        Args:
            input_tensor: tensor of shape (batch, time, features).
            mask: optional (batch, time) mask; non-zero / True marks timesteps
                to keep. It is cast to bool before being handed to the softmax.

        Returns:
            a: attention weights of shape (batch, time).
            x: weighted sum over time, shape (batch, features).
        """
        scores = self.score_output(self.score_hidden(input_tensor))
        # (batch, time, 1) -> (batch, time)
        scores = tf.squeeze(scores, axis=-1)
        if mask is not None:
            # Softmax expects a boolean mask; anything else is used arithmetically.
            mask = tf.cast(mask, tf.bool)
        a = self.softmax(scores, mask=mask)
        # Broadcasting the weights over the feature axis works for any feature
        # width, unlike RepeatVector(32) + Permute which required exactly 32.
        x = tf.math.reduce_sum(input_tensor * tf.expand_dims(a, axis=-1), axis=1)
        return a, x

    def get_config(self):
        """Include ``units`` so the layer can be re-created from its config."""
        config = super().get_config()
        config.update({"units": self.units})
        return config

In [None]:
# ---------------------------------------------------------------------------
# First half of the session: two id sequences of length 10, one per embedding
# table (trainable and static).
# ---------------------------------------------------------------------------
learnable_first_half_sequence = Input(shape=(10,), name="first_half_learnable_tracks")
static_first_half_sequence = Input(shape=(10,), name="first_half_static_tracks")

# Trainable 50-d embedding table, shared by both halves; id 0 is padding.
learnable_embs_layer = Embedding(
    len(string_lookup_learned.get_vocabulary()),
    50,
    mask_zero=True,
    name="learnable_embeddings",
)
learnable_first_half_embs_output = learnable_embs_layer(learnable_first_half_sequence)

# Output of a Masking layer applied to the raw id tensor. It is cast to bool at
# every point below where it is used as a mask.
first_half_mask = tf.keras.layers.Masking(name="mask_for_first_half")(
    learnable_first_half_sequence
)

# Static embeddings come from a layer defined earlier in the notebook.
static_first_half_embs_output = static_embedding_layer(static_first_half_sequence)

# Put the learnable and static vectors side by side on the feature axis.
concatenated_embeddings_first_half = Concatenate(
    name="concatenate_learnable_with_static_first_half"
)([learnable_first_half_embs_output, static_first_half_embs_output])

# One Dense(50) applied at every timestep mixes the two embeddings; the same
# layer instance is reused for the second half further down.
combining_learnable_static_embs_layer = TimeDistributed(
    Dense(50), name="time_distributed_for_embs_transforms"
)
transformed_embeddings_first_half = combining_learnable_static_embs_layer(
    concatenated_embeddings_first_half,
    mask=tf.cast(first_half_mask, tf.bool),
)

# ---------------------------------------------------------------------------
# Second half of the session: same treatment, same shared layers.
# ---------------------------------------------------------------------------
learnable_second_half_sequence = Input(shape=(10,), name="input_second_half_learnable_tracks")

second_half_mask = tf.keras.layers.Masking(name="second_half_mask")(
    learnable_second_half_sequence
)

static_second_half_sequence = Input(shape=(10,), name="input_second_half_static_tracks")

learnable_second_half_embs_output = learnable_embs_layer(learnable_second_half_sequence)
static_second_half_embs_output = static_embedding_layer(static_second_half_sequence)

concatenated_embeddings_second_half = Concatenate(
    name="concatenate_learnable_with_static_second_half"
)([learnable_second_half_embs_output, static_second_half_embs_output])

transformed_embeddings_second_half = combining_learnable_static_embs_layer(
    concatenated_embeddings_second_half,
    mask=tf.cast(second_half_mask, tf.bool),
)

# ---------------------------------------------------------------------------
# Whole session: stack the two halves along the time axis (axis -2) and join
# the two mask tensors the same way.
# ---------------------------------------------------------------------------
transformed_embeddings_session = Concatenate(
    axis=-2, name="concatenate_embs_for_session"
)([transformed_embeddings_first_half, transformed_embeddings_second_half])

combined_mask_session = Concatenate(name="concatenate_first_second_half_mask")(
    [first_half_mask, second_half_mask]
)

# Session encoder: an LSTM over the full session that returns one 32-d state
# per timestep.
lstm_for_session_encoder_layer = LSTM(
    32,
    return_sequences=True,
    name="LSTM_for_first_half",
    recurrent_dropout=0.75,
)
lstm_processed_embs_for_session = lstm_for_session_encoder_layer(
    transformed_embeddings_session,
    mask=tf.cast(combined_mask_session, tf.bool),
)


## putting the attention here
# Attention pooling over the per-timestep LSTM outputs:
#   1. score each timestep with Dense(features) -> Dense(1),
#   2. softmax the scores over time, ignoring padded timesteps,
#   3. weight the LSTM outputs by those scores and sum over time.
x = tf.keras.layers.Dense(lstm_processed_embs_for_session.get_shape()[-1])(lstm_processed_embs_for_session)
x = tf.keras.layers.Dense(1)(x)
# (batch, time, 1) -> (batch, time)
x = tf.keras.layers.Flatten()(x)
# Softmax documents `mask` as a boolean tensor. combined_mask_session is not
# boolean (every other consumer casts it first), and a non-boolean mask is used
# arithmetically by the softmax, so it must be cast here as well.
a =  tf.keras.layers.Softmax()(x, mask = tf.cast(combined_mask_session, tf.bool))
# Repeat the weights across the feature axis so they can be multiplied
# element-wise with the LSTM outputs: (batch, time) -> (batch, time, features).
x = tf.keras.layers.RepeatVector(lstm_processed_embs_for_session.get_shape()[-1])(a)
x = tf.keras.layers.Permute((2,1))(x)
x = tf.keras.layers.Multiply()((lstm_processed_embs_for_session, x))
# Weighted sum over time -> one vector per session.
x = tf.keras.layers.Lambda(lambda x: tf.math.reduce_sum(x, axis = 1))(x)

# put the attention on top of the session encodings
# _, session_embedding = SimpleAttention(name = "Attention_for_session_encodings")(lstm_processed_embs_for_session, combined_mask_session)

## get the hidden states ready for the lstm to consume

# Session-level meta features (10 values per session). `shape` is written as a
# 1-tuple, matching the other Input layers of this model; the previous `(10)`
# was a plain int.
meta_features = tf.keras.layers.Input(shape = (10,), name = "input_for_meta_features")

# concatenate the meta features with the attention-pooled session encoding `x`
meta_with_session_encoding = tf.keras.layers.Concatenate(name = "concatenate_meta_with_session")([meta_features, x])

# Two separate projections of the same vector, one per LSTM state tensor.
# Width 32 matches the LSTM that is initialised with these states.
# # Pass through one Dense for state
dense_for_state = Dense(32, name = "transform_for_lstm_hidden_state",activation = "relu")

# # Pass through one Dense for carry
dense_for_carry = Dense(32, name = "transform_for_lstm_carry_state",activation = "relu")

## hidden state
first_half_lstm_hidden_state = dense_for_state(meta_with_session_encoding)

## carry state
first_half_lstm_carry = dense_for_carry(meta_with_session_encoding)

# combine the states in the [hidden, carry] order used for `initial_state`
encoder_states = [first_half_lstm_hidden_state, first_half_lstm_carry]

# Playback features for the first half: 10 timesteps with 60 features each.
playback_first_half_input = Input(shape=(10, 60), name="playback_input")

# Encoder input = transformed first-half embeddings joined with the playback
# features on the feature axis (both cast to float32 first).
encoder_seq_input = Concatenate(
    name="concatenate_playback_with_first_half_encodings"
)(
    [
        tf.cast(transformed_embeddings_first_half, tf.float32),
        tf.cast(playback_first_half_input, tf.float32),
    ]
)

# Encoder LSTM: only its final hidden and carry states are used afterwards.
lstm_for_first_half = LSTM(
    32,
    return_state=True,
    name="lstm_for_first_half",
    recurrent_dropout=0.75,
)

# Start the encoder from the states derived from meta features + session encoding.
_, lstm_first_half_hidden, lstm_first_half_carry = lstm_for_first_half(
    encoder_seq_input,
    mask=tf.cast(first_half_mask, tf.bool),
    initial_state=encoder_states,
)

# The encoder's final states seed the decoder.
decoder_states = [lstm_first_half_hidden, lstm_first_half_carry]

# Decoder LSTM over the second half, emitting one 32-d vector per timestep.
lstm_for_second_half = LSTM(
    32,
    return_sequences=True,
    name="lstm_for_second_half",
    recurrent_dropout=0.75,
)

features_to_predict = lstm_for_second_half(
    transformed_embeddings_second_half,
    mask=tf.cast(second_half_mask, tf.bool),
    initial_state=decoder_states,
)

# Per-timestep binary classifier. The sigmoid means the outputs are
# probabilities, despite the "logits" in the variable name below.
time_distibuted_dense_for_classification = TimeDistributed(
    Dense(1, activation="sigmoid"),
    name="time_distributed_for_classification",
)

final_preds_for_logits = time_distibuted_dense_for_classification(
    features_to_predict,
    mask=tf.cast(second_half_mask, tf.bool),
)

# (batch, 10, 1) -> (batch, 10), one probability per second-half position.
final_preds_for_logits = Flatten()(final_preds_for_logits)

In [None]:
# Assemble the functional model. Arrays passed to fit / predict must follow the
# order of this input list.
model = Model(
    inputs=[
        learnable_first_half_sequence,
        static_first_half_sequence,
        learnable_second_half_sequence,
        static_second_half_sequence,
        meta_features,
        playback_first_half_input,
    ],
    outputs=final_preds_for_logits,
)

In [None]:
model.summary()

In [None]:
def custom_binary_cross_entropy(y_true, y_pred):
    """Position-weighted binary cross-entropy that ignores padded targets.

    Args:
        y_true: labels with one column per timestep (at most 10 columns).
            Entries equal to -1 mark padding and contribute nothing to the loss.
        y_pred: predicted probabilities, same shape as ``y_true``.

    Returns:
        One loss value per row: the weighted sum of the per-timestep losses
        divided by the sum of the position weights for the columns present.
        Padded positions are zeroed in the numerator but are NOT removed from
        that denominator.
    """
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(y_pred, tf.float32)

    # 1.0 where a real label is present, 0.0 on padding.
    mask_current = tf.cast(tf.not_equal(y_true, -1.0), tf.float32)
    # Turn the -1 placeholders into 0 so the formula below stays well defined.
    y_true = tf.math.multiply(mask_current, y_true)

    # Keep the probabilities strictly inside (0, 1). Without this a saturated
    # prediction gives log(0) = -inf, and 0 * -inf = NaN survives the masking.
    eps = tf.keras.backend.epsilon()
    y_pred = tf.clip_by_value(y_pred, eps, 1.0 - eps)

    bce = -(y_true * tf.math.log(y_pred) + (1.0 - y_true) * tf.math.log(1.0 - y_pred))
    bce = tf.math.multiply(bce, mask_current)

    # Column 0 counts 20x, every other column 1x.
    # NOTE(review): this weights column 0 regardless of where the first real
    # label sits - confirm the targets are padded at the end, not the start.
    weights_overall = tf.constant([20, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=tf.float32)
    weights = weights_overall[:tf.shape(bce)[1]]

    num = tf.math.reduce_sum(bce * weights, axis=1)
    return num / tf.math.reduce_sum(weights)

In [None]:
def average_accuracy(actual, preds):
    """Per-row average accuracy of thresholded predictions (no padding handling).

    A position scores (correct predictions up to and including it) / (1-based
    position index) when its own prediction is correct and 0 otherwise; the
    scores are averaged over all positions of the row.

    NOTE: a Metric class with the same name is defined further down in this file
    and replaces this function once that cell has run.

    Args:
        actual: labels, shape (batch, steps).
        preds: predicted probabilities, same shape; thresholded at 0.5.

    Returns:
        Tensor of shape (batch,) with one score per row.
    """
    hard_preds = tf.cast(preds > 0.5, tf.float32)
    is_hit = tf.cast(tf.math.equal(actual, hard_preds), tf.float32)

    # Running count of correct predictions, kept only at correct positions.
    hits_so_far = tf.math.cumsum(is_hit, axis=1)
    hits_at_hit = tf.math.multiply(is_hit, hits_so_far)

    # 1, 2, 3, ... along the time axis for every row.
    batch_size = tf.shape(actual)[0]
    n_steps = tf.shape(actual)[1]
    position = tf.math.cumsum(tf.ones((batch_size, n_steps)), axis=1)

    precision_at_hit = tf.math.divide(hits_at_hit, position)
    return K.mean(precision_at_hit, axis=1)

In [None]:
class average_accuracy(tf.keras.metrics.Metric):
    """Streaming mean of the per-session average accuracy, ignoring padding.

    Targets equal to -1 mark padded positions. Within a session, a position
    scores (correct predictions up to and including it) / (non-padded positions
    up to and including it) when its own prediction is correct, and 0 otherwise.
    The session score is the sum of those values divided by the number of
    non-padded positions. The metric is the mean session score over all
    sessions that have at least one non-padded position.
    """

    def __init__(self, name="average_accuracy", **kwargs):
        super().__init__(name=name, **kwargs)
        # Running sum of session scores and running count of scored sessions.
        self.average_accuracy = self.add_weight(name="aa", initializer="zeros")
        self.total_samples = self.add_weight(name="ts", initializer="zeros", dtype = "int32")

    def update_state(self, actual, preds, sample_weight=None):
        """Accumulate one batch.

        Args:
            actual: labels, shape (batch, steps); -1 marks padding.
            preds: predicted probabilities, same shape; thresholded at 0.5.
            sample_weight: accepted for API compatibility, not used.
        """
        actual = tf.cast(actual, tf.float32)
        valid = tf.cast(tf.not_equal(actual, -1), tf.float32)
        valid_so_far = tf.math.cumsum(valid, axis=1)

        hard_preds = tf.cast(preds > 0.5, tf.float32)
        # A padded target (-1) never equals a 0/1 prediction, so padding can
        # never count as a hit.
        hits = tf.cast(tf.math.equal(actual, hard_preds), tf.float32)
        hits_so_far = tf.math.cumsum(hits, axis=1)

        # divide_no_nan returns 0 for the 0/0 cases at leading padded positions
        # (replaces the former NaN scrubbing, which was also applied twice).
        per_position = tf.math.divide_no_nan(
            tf.math.multiply(hits, hits_so_far), valid_so_far
        )

        valid_per_session = tf.reduce_sum(valid, axis=1)
        # A session without any valid target used to produce 0/0 = NaN here and
        # turn the whole running sum into NaN; it now contributes 0 ...
        per_session = tf.math.divide_no_nan(
            tf.reduce_sum(per_position, axis=1), valid_per_session
        )
        # ... and is left out of the session count so it cannot bias the mean.
        scored_sessions = tf.reduce_sum(tf.cast(valid_per_session > 0, tf.int32))

        self.average_accuracy.assign_add(tf.reduce_sum(per_session))
        self.total_samples.assign_add(scored_sessions)

    def result(self):
        """Mean session score so far (0 when nothing has been scored yet)."""
        return tf.math.divide_no_nan(
            self.average_accuracy, tf.cast(self.total_samples, tf.float32)
        )

    def reset_state(self):
        # The state of the metric will be reset at the start of each epoch.
        self.average_accuracy.assign(0.0)
        self.total_samples.assign(0)

In [None]:
class average_accuracy(tf.keras.metrics.Metric):
    """Streaming mean of the per-session average accuracy, ignoring padding.

    NOTE: this cell repeats the class definition of the preceding cell; being
    the later one, it is the definition in effect when the model is compiled.

    Targets equal to -1 mark padded positions. Within a session, a position
    scores (correct predictions up to and including it) / (non-padded positions
    up to and including it) when its own prediction is correct, and 0 otherwise.
    The session score is the sum of those values divided by the number of
    non-padded positions. The metric is the mean session score over all
    sessions that have at least one non-padded position.
    """

    def __init__(self, name="average_accuracy", **kwargs):
        super().__init__(name=name, **kwargs)
        # Running sum of session scores and running count of scored sessions.
        self.average_accuracy = self.add_weight(name="aa", initializer="zeros")
        self.total_samples = self.add_weight(name="ts", initializer="zeros", dtype = "int32")

    def update_state(self, actual, preds, sample_weight=None):
        """Accumulate one batch.

        Args:
            actual: labels, shape (batch, steps); -1 marks padding.
            preds: predicted probabilities, same shape; thresholded at 0.5.
            sample_weight: accepted for API compatibility, not used.
        """
        actual = tf.cast(actual, tf.float32)
        valid = tf.cast(tf.not_equal(actual, -1), tf.float32)
        valid_so_far = tf.math.cumsum(valid, axis=1)

        hard_preds = tf.cast(preds > 0.5, tf.float32)
        # A padded target (-1) never equals a 0/1 prediction, so padding can
        # never count as a hit.
        hits = tf.cast(tf.math.equal(actual, hard_preds), tf.float32)
        hits_so_far = tf.math.cumsum(hits, axis=1)

        # divide_no_nan returns 0 for the 0/0 cases at leading padded positions
        # (replaces the former NaN scrubbing, which was also applied twice).
        per_position = tf.math.divide_no_nan(
            tf.math.multiply(hits, hits_so_far), valid_so_far
        )

        valid_per_session = tf.reduce_sum(valid, axis=1)
        # A session without any valid target used to produce 0/0 = NaN here and
        # turn the whole running sum into NaN; it now contributes 0 ...
        per_session = tf.math.divide_no_nan(
            tf.reduce_sum(per_position, axis=1), valid_per_session
        )
        # ... and is left out of the session count so it cannot bias the mean.
        scored_sessions = tf.reduce_sum(tf.cast(valid_per_session > 0, tf.int32))

        self.average_accuracy.assign_add(tf.reduce_sum(per_session))
        self.total_samples.assign_add(scored_sessions)

    def result(self):
        """Mean session score so far (0 when nothing has been scored yet)."""
        return tf.math.divide_no_nan(
            self.average_accuracy, tf.cast(self.total_samples, tf.float32)
        )

    def reset_state(self):
        # The state of the metric will be reset at the start of each epoch.
        self.average_accuracy.assign(0.0)
        self.total_samples.assign(0)

In [None]:
# Adam with a fixed learning rate of 5e-4, the padding-aware loss and the
# streaming average-accuracy metric defined above.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
    loss=custom_binary_cross_entropy,
    metrics=[average_accuracy()],
)

In [None]:
# Train for up to 10 epochs. The six inputs are passed in the same order as the
# model's input list; the first column of the meta feature frames is dropped.
model.fit(
    [
        text_to_seq_train_first_half, text_to_seq_train_first_half_static,
        text_to_seq_train_second_half, text_to_seq_train_second_half_static,
        np.array(meta_features_train.iloc[:,1:]), playback_embedding_first_half,
    ],
    train_targets_array,
    verbose = 1,
    epochs = 10,
    validation_data = (
        [
            text_to_seq_valid_first_half, text_to_seq_valid_first_half_static,
            text_to_seq_valid_second_half, text_to_seq_valid_second_half_static,
            np.array(meta_features_valid.iloc[:,1:]), playback_embedding_first_half_valid,
        ],
        valid_targets_array,
    ),
    # `callbacks` is documented as a list of callbacks, so the single callback
    # is wrapped in one. Stop once the validation metric has not improved for
    # 3 epochs and restore the best weights.
    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor = "val_average_accuracy",
                                         patience = 3, restore_best_weights=True,
                                         mode = "max"),
    ],
    batch_size = 2048,
    shuffle = True,
)

In [None]:
# Sanity check: predictions for the first five validation rows, with the inputs
# in the same order as for fit.
model.predict([text_to_seq_valid_first_half[:5,:], text_to_seq_valid_first_half_static[:5,:],
           text_to_seq_valid_second_half[:5,:], text_to_seq_valid_second_half_static[:5,:], 
           np.array(meta_features_valid.iloc[:,1:])[:5,:], playback_embedding_first_half_valid[:5,:]
          ])

In [None]:
# Targets of the same five rows, for a side-by-side comparison with the output above.
valid_targets_array[:5,:]

In [None]:
# Write a diagram of the model graph, including tensor shapes, to functional_model.png.
tf.keras.utils.plot_model(model, show_shapes = True, to_file = "functional_model.png")

## Miscellaneous: verify how to mask the binary cross-entropy loss

In [None]:
# Toy experiment, one sample: compare the loss Keras reports for a padded sequence
# with the losses of the same tokens evaluated one at a time.
input_layer = tf.keras.layers.Input(shape = (5,))

In [None]:
# Vocabulary of 10 ids, 8-d vectors; id 0 is treated as padding (mask_zero = True).
emb_layer = tf.keras.layers.Embedding(10,8, mask_zero = True)

In [None]:
emb_processed = emb_layer(input_layer)

In [None]:
emb_processed

In [None]:
# One output unit without activation, matching from_logits = True in the loss below.
dense_layer = tf.keras.layers.Dense(1)

In [None]:
dense_output = dense_layer(emb_processed)

In [None]:
dense_output

In [None]:
# NOTE(review): this rebinds the name `model`, which until here referred to the
# session model built and trained above - confirm that this is intended.
model = tf.keras.models.Model(input_layer, dense_output)

In [None]:
model.compile(loss = tf.keras.losses.BinaryCrossentropy(from_logits = True))

In [None]:
# One sequence of length 5 whose first two ids are 0 (padding).
input_seq = np.array([0,0,2,3,4]).reshape(1,-1)

In [None]:
input_seq.shape

In [None]:
# Matching targets, shape (1, 5, 1); -1 is used at the two padded positions.
target = np.array([-1, -1, 1, 0, 1]).reshape(1,-1).reshape(1,5,1)

In [None]:
target.shape

In [None]:
# Loss over the whole padded sequence.
model.evaluate(input_seq, target)

In [None]:
model.predict(input_seq)

In [None]:
# Second model that reuses the same embedding and dense layers (shared weights)
# but takes a single id per sample instead of a sequence.
new_input = tf.keras.layers.Input(shape = ())

In [None]:
emb_new_processed = emb_layer(new_input)

In [None]:
dense_output_new = dense_layer(emb_new_processed)

In [None]:
dense_output_new

In [None]:
new_model = tf.keras.models.Model(new_input, dense_output_new)

In [None]:
new_model.compile(loss = tf.keras.losses.BinaryCrossentropy(from_logits = True))

In [None]:
# Sum of the three single-token losses divided by 5, the padded sequence length,
# for comparison with model.evaluate above. Presumably this checks that masked
# steps add zero loss but are still counted in the mean - TODO confirm.
(new_model.evaluate([2],[1]) + new_model.evaluate([3],[0]) + new_model.evaluate([4],[1]))/5

## with more than 1 sample

In [None]:
# Same experiment with a batch of two sequences; all layers are created afresh, so
# the weights differ from the single-sample run.
input_layer = tf.keras.layers.Input(shape = (5,))

In [None]:
# Vocabulary of 10 ids, 8-d vectors; id 0 is treated as padding (mask_zero = True).
emb_layer = tf.keras.layers.Embedding(10,8, mask_zero = True)

In [None]:
emb_processed = emb_layer(input_layer)

In [None]:
emb_processed

In [None]:
# One output unit without activation, matching from_logits = True in the loss below.
dense_layer = tf.keras.layers.Dense(1)

In [None]:
dense_output = dense_layer(emb_processed)

In [None]:
dense_output

In [None]:
# NOTE(review): rebinds the name `model` again - see the remark on the
# single-sample experiment.
model = tf.keras.models.Model(input_layer, dense_output)

In [None]:
model.compile(loss = tf.keras.losses.BinaryCrossentropy(from_logits = True))

In [None]:
np.array([[0,0,2,3,4], [0,1,2,3,4]])

In [None]:
# Two sequences of length 5: the first has two padded positions, the second one.
input_seq = np.array([[0,0,2,3,4], [0,1,2,3,4]])

In [None]:
input_seq.shape

In [None]:
# Matching targets, shape (2, 5, 1); -1 is used at the padded positions.
target = np.array([[-1, -1, 1, 0, 1], 
                 [-1,1,0,0,1]]).reshape(2,5,1)

In [None]:
target.shape

In [None]:
# Loss over the whole padded batch.
model.evaluate(input_seq, target)

In [None]:
model.predict(input_seq)

In [None]:
# Single-id model that shares the embedding and dense layers created above.
new_input = tf.keras.layers.Input(shape = ())

In [None]:
emb_new_processed = emb_layer(new_input)

In [None]:
dense_output_new = dense_layer(emb_new_processed)

In [None]:
dense_output_new

In [None]:
new_model = tf.keras.models.Model(new_input, dense_output_new)

In [None]:
new_model.compile(loss = tf.keras.losses.BinaryCrossentropy(from_logits = True))

In [None]:
# First sequence: its three real tokens, summed and divided by the length 5.
(new_model.evaluate([2],[1]) + new_model.evaluate([3],[0]) + new_model.evaluate([4],[1]))/5

In [None]:
# Second sequence: its four real tokens, summed and divided by the length 5.
(new_model.evaluate([1],[1]) + new_model.evaluate([2],[0]) + new_model.evaluate([3],[0]) + new_model.evaluate([4],[1]))/5

In [None]:
# Mean of two hard-coded numbers - presumably the outputs of the two cells above
# from an earlier run, to be compared with model.evaluate on the batch (TODO confirm).
(0.5487704753875733 + 0.4106751441955566)/2