In [None]:
import pandas as pd
from pathlib import Path
import numpy as np

# Hyperparameters and other parameters
questionNum = 50    # number of questions user will get asks, corresponds to the dimension of the vectors we will conduct k-NN on 
TopArtistNum = 1000 # the top `TopArtistNum` most popular artists will be observed as possible recommendations
ageWeight = 1
genderWeight = 1

Unnamed: 0,user_id,gender,age,country,date
0,00000c289a1829a808ac09c00daf10bc3c4e223b,f,22.0,Germany,"Feb 1, 2007"
1,00001411dc427966b17297bf4d69e7e193135d89,f,,Canada,"Dec 4, 2007"
2,00004d2ac9316e22dc007ab2243d6fcb239e707d,,,Germany,"Sep 1, 2006"
3,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,m,19.0,Mexico,"Apr 28, 2008"
4,00007a47085b9aab8af55f52ec8846ac479ac4fe,m,28.0,United States,"Jan 27, 2006"


In [None]:
user_plays_path = Path.cwd() / 'lastfm-dataset-360K' / 'user-artist-plays-1mil.tsv'
user_profile_path = Path.cwd() / 'lastfm-dataset-360K' / 'usersha1-profile.tsv'

df = pd.read_csv(user_plays_path, sep='\t', encoding='utf-8-sig')
df.head()

profiles = pd.read_csv(user_profile_path, sep='\t', encoding='utf-8-sig')
profiles.head()

In [83]:
all_considered = df.groupby('artist_name')['plays'].sum().nlargest(TopArtistNum).reset_index(name='plays')

lookup = df[['artist_name', 'artist_id']].drop_duplicates('artist_name')
all_considered = all_considered.merge(right=lookup, on='artist_name', how='left', validate='m:1')
all_considered.to_csv('all_artists_considered.csv', index=True)
all_considered.tail()

Unnamed: 0,artist_name,plays,artist_id
995,bolt thrower,38881,68bd8072-0412-44b1-81dd-807aa6c1918c
996,cute is what we aim for,38811,40a5a225-3279-4d48-bca2-6059ed11b4fc
997,the hush sound,38657,2b7a7c7f-8c18-4e0c-aaf2-8c6f4e93cdbb
998,soda stereo,38654,3f8a5e5b-c24b-4068-9f1c-afad8829e06b
999,bill evans,38638,8247a3f2-3a8e-4256-b322-6c57b03a4e36


In [84]:
topQ = all_considered[:questionNum]
topQ.tail()

Unnamed: 0,artist_name,plays,artist_id
45,queens of the stone age,367053,7dc8f5bd-9d0b-4087-9f73-dc164950bbd8
46,tom waits,365040,c3aeb863-7b26-4388-94e8-5a240f2be21b
47,modest mouse,364123,a96ac800-bfcb-412a-8a63-0a98df600700
48,the rolling stones,363021,b071f9fa-14b0-4217-8e97-eb41da73f598
49,the offspring,351342,23a03e33-a603-404e-bcbf-2c00159d7067


In [85]:
df_only_top_artists = df[df['artist_name'].isin(all_considered['artist_name'])]
norms = df_only_top_artists.groupby('user_id')['plays'].transform(lambda x : x / x.sum())
df_only_top_artists['plays'] = norms
df_only_top_artists.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_only_top_artists['plays'] = norms


Unnamed: 0,user_id,artist_id,artist_name,plays
999994,0e95d8066868c8e189cfe272f690ba11ab47e874,2b6514cd-d424-45fe-ab85-bd8719f4492d,jacks mannequin,0.015234
999995,0e95d8066868c8e189cfe272f690ba11ab47e874,f2eef649-a6d5-4114-afba-e50ab26254d2,sum 41,0.014791
999997,0e95d8066868c8e189cfe272f690ba11ab47e874,90cda7b7-9112-4ec7-9d6f-3f4675ef5130,scary kids scaring kids,0.013755
999998,0e95d8066868c8e189cfe272f690ba11ab47e874,d8354b38-e942-4c89-ba93-29323432abc3,30 seconds to mars,0.013607
999999,0e95d8066868c8e189cfe272f690ba11ab47e874,bc1c8f3f-6f36-4086-a470-87ed813dd429,alexisonfire,0.01346


In [86]:
artist_list = all_considered['artist_name'].drop_duplicates().tolist()

assert(len(artist_list) == 1000)

ret = (
    df_only_top_artists[df_only_top_artists['artist_name'].isin(artist_list)]
    .pivot_table(index='user_id', columns='artist_name', values='plays', fill_value=0)
    .reindex(columns=artist_list, fill_value=0)
)

In [None]:
users_unique = df['user_id'].drop_duplicates().reset_index(drop=True).to_frame('user_id')

demographics_users = profiles[profiles['user_id'].isin(users_unique['user_id'])].copy()
demographics_users.drop(columns='date', inplace=True)
demographics_users.drop(columns='country', inplace=True)
demographics_users['gender'] = demographics_users['gender'].apply(lambda x: 0 if x == 'm' else 1 * genderWeight)

# we apply min-max scaling for the age
x_min, x_max = demographics_users['age'].min(), demographics_users['age'].max()
demographics_users['age'] = demographics_users['age'].apply(lambda x: 0 if pd.isna(x) else ((x - x_min) / (x_max - x_min)) * ageWeight)

demographics_users.head()

Unnamed: 0,user_id,gender,age
0,00000c289a1829a808ac09c00daf10bc3c4e223b,1,0.939834
1,00001411dc427966b17297bf4d69e7e193135d89,1,0.0
2,00004d2ac9316e22dc007ab2243d6fcb239e707d,1,0.0
3,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,0,0.937759
4,00007a47085b9aab8af55f52ec8846ac479ac4fe,0,0.943983


In [88]:
ret = pd.merge(demographics_users, ret, left_on='user_id', right_on='user_id')
ret.to_csv('data_vectors_whole.csv', index=False)

ret_q_dim = ret.iloc[:, :(questionNum + 2 + 1)] # + 2 for gender, age and + 1 for user_id, which is there for debug purposes
ret_q_dim.to_csv('data_vectors_topQ.csv', index=False)
ret.head()

Unnamed: 0,user_id,gender,age,the beatles,radiohead,coldplay,muse,metallica,pink floyd,linkin park,...,télépopmusik,suicide silence,corinne bailey rae,alter bridge,final fantasy,bolt thrower,cute is what we aim for,the hush sound,soda stereo,bill evans
0,00000c289a1829a808ac09c00daf10bc3c4e223b,1,0.939834,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00001411dc427966b17297bf4d69e7e193135d89,1,0.0,0.0,0.0,0.046249,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,00004d2ac9316e22dc007ab2243d6fcb239e707d,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,0,0.937759,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00007a47085b9aab8af55f52ec8846ac479ac4fe,0,0.943983,0.0,0.018029,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
