In [1]:
#import necessary libraries
import os 
import ast
import json 
import time
import math
import psutil
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
import scipy.sparse as sp
import pyarrow.parquet as pq
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
#check working directory
current_dir = os.getcwd() 
print("Current directory:", current_dir)

Current directory: /home/jupyter


#### Load data

In [3]:
#directory
direc = "gs://ncf446-201929129/rdd_data_metadata/"

In [4]:
#read parquet files, every one of whcih is made of 10 raw json files which is equivalent to 1000 playlists
fpath = []

for i in range(1, 6, 1):
    
    path = direc + "batch_" + str(i)+'/' 
    fpath.append(path)

In [5]:
#combine to become a big dataframe
def load(): 
    
    for index, value in tqdm(enumerate(fpath)):
        
        file = pd.read_parquet(value)
        
        if index == 0: #first file
            df = file 
            
        else:
            df = pd.concat([df, file], axis = 0) #combine files together into one big dataframe
            
    return df

In [6]:
df = load()

5it [00:15,  3.09s/it]


In [7]:
print(f"Shape of dataframe: {df.shape}")

Shape of dataframe: (50000, 6)


In [8]:
df.head(5)

Unnamed: 0,pid,name,num_tracks,num_albums,num_followers,tracks
0,0,Throwbacks,52,47,1,"[{'pos': '0', 'track_name': 'Lose Control (fea..."
1,1,Awesome Playlist,39,23,1,"[{'pos': '0', 'track_name': 'Eye of the Tiger'..."
2,2,korean,64,51,1,"[{'pos': '0', 'track_name': 'Like You', 'track..."
3,3,mat,126,107,1,"[{'pos': '0', 'track_name': 'Danse macabre', '..."
4,4,90s,17,16,2,"[{'pos': '0', 'track_name': 'Tonight, Tonight'..."


In [9]:
def load_pidNTrack():

    trackList = []
    
    for index, row in tqdm(df.iterrows()): #iterate through each row in dataframe
        for track in row['tracks']:
            trackList.append([ row['pid'],track['track_uri']]) 

    songPlaylist = pd.DataFrame(trackList, columns=[ 'pid', 'track_uri']) #create dataframe
    
    print(songPlaylist.shape)
    
    return songPlaylist

In [10]:
new_df = load_pidNTrack() 

50000it [00:15, 3137.60it/s]


(3348258, 2)


In [11]:
new_df.head(5)

Unnamed: 0,pid,track_uri
0,0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI
1,0,spotify:track:6I9VzXrHxO9rA9A5euc8Ak
2,0,spotify:track:0WqIKmW4BTrj3eJFmnCKMv
3,0,spotify:track:1AWQoqb9bSvzTjaLralEkT
4,0,spotify:track:1lzr43nnXAijIGYnCT8M8H


#### Encode the data

In [12]:
def encode(): #encode track_uri to integer so that model can analyse and pattern mine

    label_encoder = LabelEncoder() #sklearn module to encode
    encode_df = new_df.copy() #dont change the original dataframe
    encode_df["encoded_track_uri"] = label_encoder.fit_transform(encode_df["track_uri"]) #attach the original info for convienience

    return encode_df

In [13]:
encode_df = encode()
print(f'dimension of the encode_df dataframe: {encode_df.shape}')
encode_df.head()

dimension of the encode_df dataframe: (3348258, 3)


Unnamed: 0,pid,track_uri,encoded_track_uri
0,0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,29183
1,0,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,369182
2,0,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,31359
3,0,spotify:track:1AWQoqb9bSvzTjaLralEkT,68783
4,0,spotify:track:1lzr43nnXAijIGYnCT8M8H,104309


In [14]:
# group the tracks back into a list
groupedDF = encode_df.groupby('pid').agg(list).reset_index() #group by pid and aggregate the track_uri into a list
print(f"dimension of groupedDF dataframe: {groupedDF.shape}") #check shape make sure no mistake
groupedDF.head()
#groupedDF['pid'] #print this to check if the pid values are unqiue 

dimension of groupedDF dataframe: (50000, 3)


Unnamed: 0,pid,track_uri,encoded_track_uri
0,0,"[spotify:track:0UaMYEvWZi0ZqiDOoHU3YI, spotify...","[29183, 369182, 31359, 68783, 104309, 31987, 3..."
1,1,"[spotify:track:2HHtWyy5CgaQbC7XSoOb0e, spotify...","[134057, 80210, 232086, 83300, 81170, 201792, ..."
2,2,"[spotify:track:74tqql9zP6JjF5hjkHHUXp, spotify...","[415068, 273492, 275926, 97422, 30879, 122366,..."
3,3,"[spotify:track:4WJ7UMD4i6DOPzyXU5pZSz, spotify...","[265305, 78749, 396965, 109420, 320661, 292327..."
4,4,"[spotify:track:4iCGSi1RonREsPtfEKYj5b, spotify...","[276674, 343402, 36247, 446621, 401973, 208252..."


#### We only want to consider playlist that has more than 20 tracks.   
41302 out of 50000 playlists have more than 20 tracks. 

In [15]:
groupedDF = groupedDF[groupedDF['encoded_track_uri'].apply(lambda x: len(x) > 20)] 
print(f"dimension: {groupedDF.shape}")
# groupedDF2 = groupedDF[groupedDF['track_uri'].apply(lambda x: len(x) > 20)] 
# groupedDF1.equals(groupedDF2)  #check if the two dataframes are the same

dimension: (41302, 3)


#### train test split

In [16]:
train_df, test_df = train_test_split(groupedDF, test_size=0.2, random_state=123) # we will use train_df to train the model and test_df to evaluate the model
print(f"shape of train dataset: {train_df.shape}")
print(f"shape of test dataset: {test_df.shape}")

shape of train dataset: (33041, 3)
shape of test dataset: (8261, 3)


In [17]:
train_df.head()
# train_df.to_csv("C:/Users/Tiam Tee/Documents/Spotify100M_data_project/Models/Data/trainWithoutNegative.csv", sep='\t', encoding='utf-8') #save to local disk for future convenience

Unnamed: 0,pid,track_uri,encoded_track_uri
47929,140929,"[spotify:track:7tTElNyvXsfFcxDXIiH5cm, spotify...","[450680, 54561, 149995, 448139, 232214, 110882..."
30040,123040,"[spotify:track:2QilECqmzYBoI6yS5D8ftS, spotify...","[142911, 11889, 202230, 153176, 74998, 407598,..."
8515,101515,"[spotify:track:7d23MhPFE9eB3U8DPRirnL, spotify...","[435114, 102071, 300781, 421948, 261913, 11343..."
7915,100915,"[spotify:track:1wHZx0LgzFHyeIZkUydNXq, spotify...","[114165, 19037, 244393, 416496, 186082, 58233,..."
24877,117877,"[spotify:track:3ZzrBfE1RChGRCraiB9G2P, spotify...","[210045, 108028, 434986, 239284, 10705, 123744..."


In [18]:
test_df.head()
# test_df.to_csv("C:/Users/Tiam Tee/Documents/Spotify100M_data_project/Models/Data/testWithoutNegative.csv", sep='\t', encoding='utf-8')

Unnamed: 0,pid,track_uri,encoded_track_uri
9489,102489,"[spotify:track:3fqwjXwUGN6vbzIwvyFMhx, spotify...","[215623, 393454, 69228, 4854, 439226, 165886, ..."
2787,10787,"[spotify:track:2dOTkLZFbpNXrhc24CnTFd, spotify...","[154957, 453219, 104309, 389316, 37543, 158183..."
42699,135699,"[spotify:track:4UXpJDvdsfneLvT09oJocg, spotify...","[263624, 78766, 68030, 263570, 266054, 190141,..."
12545,105545,"[spotify:track:5VbBBiLl5Y9NGikFrSTdO6, spotify...","[323172, 426350, 386201, 343344, 294639, 31349..."
7888,100888,"[spotify:track:2BstRQGodshjGpeDGQiNgo, spotify...","[128887, 75980, 51200, 368949, 233305, 438419,..."


From the following output, we can see that the average number of tracks in all playlists in the train test is 78 and median is 61 while in the test set, the mean is 79 and median is 63. 
This is a very important piece of information because in both train and test dataframe, we only have positive data and zero negetive data. Hence, we have to manually create negative data and form a new balance dataset. positive data here refers to tracks with label as 1 and negative means track with label as zero. The label 0 and 1 justify if a track is in a playlist. 

In [19]:
Copy_train_df = train_df.copy() #make a copy and assign the content to a new variable jsut in case we might mutate original data in the following computations
Copy_test_df = test_df.copy()

Copy_train_df['count_track'] = train_df['encoded_track_uri'].apply(lambda x: len(x)) #calcuate amount of songs in every playlist and attach the orignal dataset with a enw column
Copy_test_df['count_track'] = test_df['encoded_track_uri'].apply(lambda x: len(x))

print(Copy_train_df['count_track'].describe())
print(Copy_test_df['count_track'].describe()) 

count    33041.000000
mean        77.999395
std         52.692228
min         21.000000
25%         37.000000
50%         61.000000
75%        104.000000
max        250.000000
Name: count_track, dtype: float64
count    8261.000000
mean       78.879797
std        53.314190
min        21.000000
25%        38.000000
50%        63.000000
75%       104.000000
max       250.000000
Name: count_track, dtype: float64


#### Preparing negative data

In [20]:
# make a list of all unique track items.
distinct_trackList = encode_df['encoded_track_uri'].unique().tolist() 

print(f'the number of unique tracks in all playlists: {len(distinct_trackList)}')

# len(encode_df['encoded_track_uri'].unique().tolist()) == len(encode_df['track_uri'].unique().tolist()) #double make sure 

the number of unique tracks in all playlists: 457016


The idea here is that for every playlist, we generate another column of negative list by random sample n items from a set of items that is not in encoded_track_uri list but in distinct list. 

In [21]:
def createNegative(distinct_trackList, df, negatives):
    
    '''
    arguments:
    distinct_trackList: list of all distinct track_uri  #in our code: encode_df['track_uri'].unique().tolist()
    df: dataframe of pid and list of encoded_track_uri
    negatives: number of negative samples to be created for each pid
    '''
    
    df_ARR = np.array(df)
    negative_list = []
    
    for pidNList in tqdm(df_ARR):
        NpidNList = list()
        pid = pidNList[0]
        trackList = pidNList[1] 
        NpidNList.append(pid)
        sampling = random.sample(set(distinct_trackList) - set(trackList), negatives) #set only keeps unique items
        NpidNList.append(sampling)     
        negative_list.append(NpidNList)
        
    negative_list = pd.DataFrame(negative_list, columns = ['pid', 'tracks_with_label0'])
    
    return negative_list

In [22]:
# trainWnegative = createNegative(distinct_trackList, train_df.loc[:, ['pid', 'encoded_track_uri']], 61) 
# 61 is the median hence we use 61

In [23]:
# trainWnegative.to_csv("trainWnegative61.csv")
trainWnegative = pd.read_csv("trainWnegative61.csv")

In [24]:
trainWnegative['tracks_with_label0'] = trainWnegative['tracks_with_label0'].apply(ast.literal_eval) 
# pandas might have converted the list into string in the saving process for memory efficiency hence we need to convert it back .
trainWnegative = trainWnegative.explode('tracks_with_label0') #explode the list 
trainWnegative['label'] = 0 #assign label to it 
trainWnegative = trainWnegative.loc[:,['pid', 'tracks_with_label0', 'label']]
trainWnegative = trainWnegative.reset_index(drop=True).rename(columns={'tracks_with_label0': 'track_id'}) #reset index and rename column

In [25]:
trainWnegative.head()

Unnamed: 0,pid,track_id,label
0,140929,305528,0
1,140929,132913,0
2,140929,259685,0
3,140929,190244,0
4,140929,278977,0


In [26]:
train_df = train_df.loc[:,['pid', 'encoded_track_uri']]
train_label_1 = train_df.reset_index(drop=True).rename(columns={'encoded_track_uri': 'track_id'}) #make sure the column name is the same as the negative dataframe so we can merge them together 
train_label_1 = train_label_1.explode('track_id')
train_label_1['label'] = 1

In [27]:
train_label_1.head(5)

Unnamed: 0,pid,track_id,label
0,140929,450680,1
0,140929,54561,1
0,140929,149995,1
0,140929,448139,1
0,140929,232214,1


In [28]:
train_DF = pd.concat([trainWnegative, train_label_1], axis=0)
train_DF = train_DF.sample(frac=1, random_state=1)

In [29]:
train_DF.head(5)

Unnamed: 0,pid,track_id,label
284133,114005,36279,0
16284,123250,325426,1
14884,134397,456934,1
26716,1457,446730,1
21282,104337,385110,1


In [30]:
test_df = test_df.loc[:,['pid', 'encoded_track_uri']]
test_df = test_df.reset_index(drop=True).rename(columns={'encoded_track_uri': 'track_id'})
test_DF = test_df.explode('track_id')
test_DF = test_DF.sample(frac=1, random_state=1)

In [31]:
test_DF.head(5)

Unnamed: 0,pid,track_id
6293,118607,254495
926,111806,349783
2693,103285,410110
743,105762,454956
7333,109010,362276


convert all the data into tensor format taht is compatible with tensorflow functions

In [32]:
user_train = np.array(train_DF['pid'], dtype=int)
item_train = np.array(train_DF['track_id'], dtype=int)
label_train = np.array(train_DF['label'], dtype=int)

user_test = np.array(test_DF['pid'], dtype=int)
item_test = np.array(test_DF['track_id'], dtype=int)

In [33]:
train_dataset = tf.data.Dataset.from_tensor_slices(({"user_input": user_train, "item_input": item_train}, label_train))
# train_dataset = tf.random.shuffle(train_dataset)

2023-04-24 13:23:10.032207: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-24 13:23:10.042117: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-24 13:23:10.043790: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-24 13:23:10.046334: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [34]:
user_test = tf.convert_to_tensor(user_test, dtype=tf.int32)
item_test = tf.convert_to_tensor(item_test, dtype=tf.int32)
test_dataset = {"user_input": user_test, "item_input": item_test}

In [35]:
batch_size = 1024 #split into batches for computational parallerization 
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE) #convert the data to the format that is suitable for tensorflow GPU computation

In [36]:
num_users = train_df.shape[0]
num_items = len(distinct_trackList)

For the model architecture, please refer to figure 4 in our report. 

In [37]:
def get_model(num_users, num_items, mf_dim=8, layers=[64, 32, 16, 8], reg_layers=[0, 0, 0, 0], reg_mf = 0): #by the default parameters are the same as official implementation
    
    assert len(layers) == len(reg_layers) #the dimension of layers and reg_layers must be the same 
    num_layer = len(layers) 

    user_input = tf.keras.Input(shape=(1,), dtype='int32', name = 'user_input')
    item_input = tf.keras.Input(shape=(1,), dtype='int32', name = 'item_input')
    

    MF_Embedding_User = tf.keras.layers.Embedding(input_dim = num_users, output_dim = mf_dim, name = 'mf_embedding_user',
                                  embeddings_initializer='normal', embeddings_regularizer = tf.keras.regularizers.l2(reg_mf), input_length=1)
    MF_Embedding_Item = tf.keras.layers.Embedding(input_dim = num_items, output_dim = mf_dim, name = 'mf_embedding_item',
                                  embeddings_initializer='normal', embeddings_regularizer = tf.keras.regularizers.l2(reg_mf), input_length=1)   

    MLP_Embedding_User = tf.keras.layers.Embedding(input_dim = num_users, output_dim = int(layers[0]/2), name = "mlp_embedding_user",
                                  embeddings_initializer='normal', embeddings_regularizer = tf.keras.regularizers.l2(reg_layers[0]), input_length=1)
    MLP_Embedding_Item = tf.keras.layers.Embedding(input_dim = num_items, output_dim = int(layers[0]/2), name = 'mlp_embedding_item',
                                  embeddings_initializer='normal', embeddings_regularizer = tf.keras.regularizers.l2(reg_layers[0]), input_length=1)   
    
    #matrix factorisation block
    mf_user_latent = tf.keras.layers.Flatten()(MF_Embedding_User(user_input))
    mf_item_latent = tf.keras.layers.Flatten()(MF_Embedding_Item(item_input))
    mf_vector = tf.keras.layers.Multiply()([mf_user_latent, mf_item_latent]) # element-wise multiply 

    #multi layer perceptron block 
    mlp_user_latent = tf.keras.layers.Flatten()(MLP_Embedding_User(user_input))
    mlp_item_latent = tf.keras.layers.Flatten()(MLP_Embedding_Item(item_input))
    mlp_vector = tf.keras.layers.Concatenate()([mlp_user_latent, mlp_item_latent])
    for idx in range(1, num_layer): #number of layers to be adjusted according to input of this function
        layer = tf.keras.layers.Dense(layers[idx], kernel_regularizer= tf.keras.regularizers.l2(reg_layers[idx]), activation='relu', name="layer%d" %idx)
        mlp_vector = layer(mlp_vector)
        
    #concatenation layer
    predict_vector = tf.keras.layers.Concatenate()([mf_vector, mlp_vector])
    
    # output layer
    prediction = tf.keras.layers.Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform', name = "prediction")(predict_vector)
    
    #wrap model
    model = tf.keras.Model(inputs=[user_input, item_input], 
                  outputs=prediction)
    
    return model

In [38]:
model = get_model(num_users, num_items)
model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.0001), loss='binary_crossentropy', metrics=['accuracy']) 
#classification task hence binary cross entropy loss

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [39]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
item_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
mlp_embedding_user (Embedding)  (None, 1, 32)        1057312     user_input[0][0]                 
__________________________________________________________________________________________________
mlp_embedding_item (Embedding)  (None, 1, 32)        14624512    item_input[0][0]                 
______________________________________________________________________________________________

In [40]:
%%time
history = model.fit(train_dataset, epochs=10, batch_size=1024)

Epoch 1/10


2023-04-24 13:23:12.286881: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 8min 21s, sys: 2min 5s, total: 10min 26s
Wall time: 7min 31s


test set is made of pairs of user and item. let the trained model predicts the probaility of each pair. 

In [42]:
score = model.predict(test_dataset)

In [43]:
score

array([[0.94685906],
       [0.99082804],
       [0.24004045],
       ...,
       [0.85735804],
       [0.95772326],
       [0.9794993 ]], dtype=float32)