# Recommander logic

## Get the data into a dataframe

First we need to import both tracks_tag.json which is our dataset and userdatabase.json which is the place where we store users preference.

In [1]:
import pandas as pd

In [2]:
songs = pd.read_json('./tracks_tag.json')

songs.head()

Unnamed: 0,track_name,artist_name,tags
0,Blinding Lights,The Weeknd,"[synthpop, synthwave, pop, 2010s]"
1,telepatía,Kali Uchis,"[kali uchis, pop, bop, latin, synthpop]"
2,drivers license,Olivia Rodrigo,"[pop, 2021, sad, debut, indie]"
3,Save Your Tears,The Weeknd,"[synthpop, pop, r&b, rnb]"
4,Leave The Door Open,Bruno Mars,"[Anderson paak, soul, Smooth Soul, pop soul]"


In [3]:
import json
with open('../src/userdatabase.json') as f:
    data = json.load(f)

header = ['user_id', 'track_name', 'artist_name', 'score', 'tags']
users = []
#If key is known don't iterate over the key argument
for key in data:
    for song in data[key]:
        users.append([key, song['track_name'], song['artist_name'], song['score'], song['tags']])

users = pd.DataFrame(users, columns=header)
users

Unnamed: 0,user_id,track_name,artist_name,score,tags
0,3682350511826442,C'est pas grave,Columbine,9,[]
1,3682350511826442,Rasputin,Boney M.,10,"[Disco, 70s, pop, 80s, dance]"


We now have to get a list of all of our tags.

In [4]:
all_genre = set()

for elem in songs['tags']:
    for tag in elem:
        all_genre.add(tag)

for elem in users['tags']:
    for tag in elem:
        all_genre.add(tag)

all_genre = list(all_genre)
print(len(all_genre))


736


Let's now encode our data to one hot encoding

In [7]:
import math
songs[all_genre] = 0
i = 0
for tags in songs['tags']:
    for tag in tags:
        songs.loc[i, tag] = 1/math.sqrt(len(tags))
    #Remove song with empty tag
    if(tags == []):
        songs = songs.drop([i])
    i+=1
songs = songs.reset_index(drop=True)

print(songs.loc[0, 'synthpop'])

user = users[users['user_id'] == '3682350511826442']

user[all_genre] = 0
i = 0
for tags in user['tags']:
    for tag in tags:
        user.loc[i, tag] = 1/math.sqrt(len(tags))
    #Remove song with empty tag
    if(tags == []):
        user = user.drop([i])
    i+=1
user = user.reset_index(drop=True)

print(user.loc[0, 'Disco'])
user


0.5
0.4472135954999579


Unnamed: 0,user_id,track_name,artist_name,score,tags,seen live,Megan Thee Stallion,summer,Dreamy,minimal,...,lush,def jam,james bond,kanye,blackpink,melancholy,little mix,incel,music choice: alternative,american
0,3682350511826442,Rasputin,Boney M.,10,"[Disco, 70s, pop, 80s, dance]",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


0.5 Is good because the first song have 4 tags so 1/√(4) = 0.5

0.4472 is also good because the song have 5 tags so 1/√(5) = 0.44721

Let's now calculate the user preference

In [8]:
def userpreference(user_df):
    user_pref = [0 for i in range(len(all_genre))]
    for j in range(len(user_df)):
        for i in range(len(all_genre)):
            user_pref[i] += user_df.loc[j, 'score'] * user_df.iloc[j, i + 5]
    return user_pref

user_profile = userpreference(user)
    

Let's now build our cosine operator

In [12]:
#Give the cosine score based on user_vector and song_vector
def cosine(user_vector, user_vector_squared, song_vector):
    #We can remove the square(song_vector) to the denominator because it is always = 1 because of the normalization
    res = production(user_vector, song_vector)/(user_vector_squared)
    return res

#Calculate numerator value
def production(user_vector, song_vector):
    res = 0.0
    for i in range(len(user_vector)):
        res += user_vector[i] * song_vector[i]
    return res

#Calculate denominator value
def square(vector):
    res = 0.0
    for i in range(len(vector)):
        res += vector[i] * vector[i]
    return math.sqrt(res)

We finnally have everything to compute a recommandation !

In [13]:
# Return a dictionnary of n key/value ranked by score - Representing the best movie an user represented by userId should watch
def topSongUser(user_df, songs, n):

    # Remove songs that are already in library
    song_alredy_in_library = user_df['track_name']
    song_unlistened = songs[~songs['track_name'].isin(song_alredy_in_library)]

    # We store song name and artist name paired with their score
    song_name = list(song_unlistened['track_name'])
    song_artist = list(song_unlistened['artist_name'])
    #Get score using the cosine distance

    user_vector_squared = square(user_profile)
    scores = songs.apply(lambda x: cosine(user_profile, user_vector_squared, x[3:]), axis = 1)
    
    #Store it in dict
    songsScores = dict()
    for i in range(len(scores)):
        songsScores[str(song_name[i]) + '|' + str(song_artist[i])] = scores[i]
        
    #Sort dictionnary
    songsScores = {k: v for k, v in sorted(songsScores.items(), key=lambda item: item[1], reverse=True)}
    
    #Return n first
    return {k: songsScores[k] for k in list(songsScores)[:n]}

In [14]:
print(topSongUser(user, songs, 10))

{'Billie Jean|Michael Jackson': 0.8, 'Dancing Queen|ABBA': 0.8, 'Hung Up|Madonna': 0.6708203932499369, "doN'T StArT nOw|Dua Lipa": 0.6, 'Break My Heart|Dua Lipa': 0.6, 'Physical|Dua Lipa': 0.6, 'Hallucinate|Dua Lipa': 0.6, 'September|Earth, Wind & Fire': 0.6, 'I Wanna Dance with Somebody (Who Loves Me)|Whitney Houston': 0.6, 'Gimme! Gimme! Gimme! (A Man After Midnight)|ABBA': 0.6}
