In [23]:
import sys
from pathlib import Path

proj_root = Path.cwd().parent if Path.cwd().name == "preprocessing" else Path.cwd()
sys.path.insert(0, str(proj_root))

output_dir = proj_root / "public"
output_dir.mkdir(parents=True, exist_ok=True)

import pandas as pd
import numpy as np

from app.api.modules import questionNum, TopArtistNum

In [24]:
user_plays_path = Path.cwd() / 'lastfm-dataset-360K' / 'user-artist-plays-200k.tsv'
user_profile_path = Path.cwd() / 'lastfm-dataset-360K' / 'usersha1-profile.tsv'

df = pd.read_csv(user_plays_path, sep='\t', encoding='utf-8-sig')
df.head()

profiles = pd.read_csv(user_profile_path, sep='\t', encoding='utf-8-sig')
profiles.head()

Unnamed: 0,user_id,gender,age,country,date
0,00000c289a1829a808ac09c00daf10bc3c4e223b,f,22.0,Germany,"Feb 1, 2007"
1,00001411dc427966b17297bf4d69e7e193135d89,f,,Canada,"Dec 4, 2007"
2,00004d2ac9316e22dc007ab2243d6fcb239e707d,,,Germany,"Sep 1, 2006"
3,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,m,19.0,Mexico,"Apr 28, 2008"
4,00007a47085b9aab8af55f52ec8846ac479ac4fe,m,28.0,United States,"Jan 27, 2006"


In [25]:
all_considered = df.groupby('artist_name')['plays'].sum().nlargest(TopArtistNum).reset_index(name='plays')

lookup = df[['artist_name', 'artist_id']].drop_duplicates('artist_name')
all_considered = all_considered.merge(right=lookup, on='artist_name', how='left', validate='m:1')
all_considered.to_csv(output_dir / "all_artists_considered.csv", index=True)
all_considered.tail()

Unnamed: 0,artist_name,plays,artist_id
1995,the allman brothers band,3879,72359492-22be-4ed9-aaa0-efa434fb2b01
1996,東京事変,3879,b3d0f168-cb34-47c6-8529-fc05d1fce3ee
1997,12 stones,3877,6f81a7dc-be31-4498-ae95-6d994ffec614
1998,black lips,3876,e940d7a3-01d0-468c-86ea-5dc4d89dcf80
1999,ingrid michaelson,3876,1fc494a1-9109-4081-a455-2d05bea9d2bf


In [26]:
topQ = all_considered[:questionNum]
topQ.tail()

Unnamed: 0,artist_name,plays,artist_id
45,madonna,74305,79239441-bfd5-4981-a70c-55c3f15c1287
46,the prodigy,74079,4a4ee089-93b1-4470-af9a-6ff575d32704
47,oasis,73498,39ab1aed-75e0-4140-bd47-540276886b60
48,queens of the stone age,72775,7dc8f5bd-9d0b-4087-9f73-dc164950bbd8
49,boards of canada,70625,69158f97-4c07-4c4e-baf8-4e4ab1ed666e


In [27]:
df_only_top_artists = df[df['artist_name'].isin(all_considered['artist_name'])]
norms = df_only_top_artists.groupby('user_id')['plays'].transform(lambda x : x / x.sum())
df_only_top_artists['plays'] = norms
df_only_top_artists.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_only_top_artists['plays'] = norms


Unnamed: 0,user_id,artist_id,artist_name,plays
199995,02ede2ea5dbba24eb42882c0ecd51e6be310ee83,92a42e82-b36f-4308-82c1-68bad2e03c89,third eye blind,0.08637
199996,02ede2ea5dbba24eb42882c0ecd51e6be310ee83,04591295-6d94-45cb-b6c7-1ae17c6f380e,lifehouse,0.08502
199997,02ede2ea5dbba24eb42882c0ecd51e6be310ee83,2ddd167f-5b8a-4372-b350-6ad50493bac0,the fray,0.083671
199998,02ede2ea5dbba24eb42882c0ecd51e6be310ee83,906cbb69-b793-463a-832d-b5bf850f01a2,athlete,0.072874
199999,02ede2ea5dbba24eb42882c0ecd51e6be310ee83,cbdbfa40-a839-4fa2-a5e1-dbd6e84fe1fd,tahiti 80,0.071525


In [28]:
artist_list = all_considered['artist_name'].drop_duplicates().tolist()

assert(len(artist_list) == TopArtistNum)

ret = (
    df_only_top_artists[df_only_top_artists['artist_name'].isin(artist_list)]
    .pivot_table(index='user_id', columns='artist_name', values='plays', fill_value=0)
    .reindex(columns=artist_list, fill_value=0)
)

In [29]:
users_unique = df['user_id'].drop_duplicates().reset_index(drop=True).to_frame('user_id')

demographics_users = profiles[profiles['user_id'].isin(users_unique['user_id'])].copy()
demographics_users.drop(columns='date', inplace=True)
demographics_users.drop(columns='country', inplace=True)
demographics_users['gender'] = demographics_users['gender'].apply(lambda x: 0 if x == 'm' else 1)

# we apply min-max scaling for the age
x_min, x_max = demographics_users['age'].min(), demographics_users['age'].max()
demographics_users['age'] = demographics_users['age'].apply(lambda x: 0 if pd.isna(x) else ((x - x_min) / (x_max - x_min)))

demographics_users.head()

Unnamed: 0,user_id,gender,age
0,00000c289a1829a808ac09c00daf10bc3c4e223b,1,0.939834
1,00001411dc427966b17297bf4d69e7e193135d89,1,0.0
2,00004d2ac9316e22dc007ab2243d6fcb239e707d,1,0.0
3,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,0,0.937759
4,00007a47085b9aab8af55f52ec8846ac479ac4fe,0,0.943983


In [30]:
ret = pd.merge(demographics_users, ret, left_on='user_id', right_on='user_id')
ret.to_csv( output_dir / 'data_vectors_whole.csv', index=False)

ret_q_dim = ret.iloc[:, :(questionNum + 2 + 1)] # + 2 for gender, age and + 1 for user_id, which is there for debug purposes
ret_q_dim.to_csv( output_dir / 'data_vectors_topQ.csv', index=False)
ret.head()

Unnamed: 0,user_id,gender,age,the beatles,radiohead,linkin park,coldplay,muse,pink floyd,metallica,...,jonathan coulton,acid house kings,townes van zandt,garth brooks,michael giacchino,the allman brothers band,東京事変,12 stones,black lips,ingrid michaelson
0,00000c289a1829a808ac09c00daf10bc3c4e223b,1,0.939834,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00001411dc427966b17297bf4d69e7e193135d89,1,0.0,0.0,0.0,0.0,0.02429,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,00004d2ac9316e22dc007ab2243d6fcb239e707d,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,0,0.937759,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00007a47085b9aab8af55f52ec8846ac479ac4fe,0,0.943983,0.0,0.016641,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
