In [2]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import time

# Demographic data preprocessing

In [17]:
demo_raw = pd.read_csv('data.nosync/lastfm-dataset-360K/usersha1-profile.tsv', sep='\t', header=None,
                             names=['user_email', 'gender', 'age', 'country', 'signup'])
demo_raw["signup"] = pd.to_datetime(demo_raw.signup)

In [18]:
# Data cleaning of demo

In [19]:
demo_raw.isna().sum(0)

user_email        0
gender        32775
age           74900
country           0
signup            0
dtype: int64

In [22]:
demo_raw.shape

(359347, 5)

In [23]:
ages = demo_raw['age'].unique()
demo_raw['age'].replace(ages[ages > 99], np.nan, inplace=True)
demo_raw['age'].replace(ages[ages < 8], np.nan, inplace=True)

In [24]:
demo_raw.isna().sum(0)

user_email        0
gender        32775
age           77218
country           0
signup            0
dtype: int64

In [26]:
demo_raw = demo_raw.drop('signup', axis=1)

In [29]:
demo_raw = demo_raw.set_index('user_email')

In [30]:
demo_raw.to_csv('data.nosync/lastfm-dataset-360K/demo-360k-processed-neighborhood.csv')

# User-user neighborhood model

In [31]:
demo = pd.read_csv('data.nosync/lastfm-dataset-360K/demo-360k-processed-neighborhood.csv', index_col='user_email')
behav = pd.read_csv('data.nosync/lastfm-dataset-360K/behav-360k-processed.csv', index_col='None')

## 1. User demographic filtering 

For the user neighborhood model, we are going to consider the users using the following. 

- Per country, then per 5 years age chunk, then per gender. 


1. Filter users using demographic data
First step of our user-user neighborhood model is to select of set of users with similar demographic data. 
This helps reduce the dimention of the search space for the users, but also for the items. Especially the country parameters

# We are applying the filters until we have 10000 profiles. 

**Nan values**: For user with nan values, we will take all the user base for similarities. 

In [56]:
demo.describe(include='all')

Unnamed: 0,gender,age,country
count,326572,282129.0,359347
unique,2,,239
top,m,,United States
freq,241642,,67044
mean,,25.097413,
std,,7.937884,
min,,8.0,
25%,,20.0,
50%,,23.0,
75%,,28.0,


In [89]:
default_gender = demo['gender'].mode()[0]
default_age = float(int(demo['age'].mean()))

In [90]:
demo['age'].astype(int, errors='ignore')

user_email
00000c289a1829a808ac09c00daf10bc3c4e223b    22.0
00001411dc427966b17297bf4d69e7e193135d89     NaN
00004d2ac9316e22dc007ab2243d6fcb239e707d     NaN
000063d3fe1cf2ba248b9e3c3f0334845a27a6bf    19.0
00007a47085b9aab8af55f52ec8846ac479ac4fe    28.0
                                            ... 
fffe7823f67b433b45f22056467db921c1d3d7d0    25.0
fffe8637bd8234309e871409c7ebef99a720afc1    25.0
fffe8c7f952d9b960a56ed4dcb40a415d924b224    20.0
ffff9af9ae04d263dae91cb838b1f3a6725f5ffb    20.0
ffff9ef87a7d9494ada2f9ade4b9ff637c0759ac    21.0
Name: age, Length: 359347, dtype: float64

In [91]:
example_user_email = '00024b5b85c40f990c28644d53257819980bf6bb'
example_email_nan = '00004d2ac9316e22dc007ab2243d6fcb239e707d'

In [84]:
gender, age, country = demo.loc[example_user_email].values

In [126]:
def get_user_demo(user_email:str):
    gender, age, country = demo.loc[user_email].values
    if pd.isna(gender):
        gender = default_gender
    if pd.isna(age):
        age = default_age
    return gender, age, country

In [365]:
SAMPLE_SIZE = 10000
def get_demo_similar_users(user_email:str):
    gender, age, country = get_user_demo(user_email)
    print(f"Gender: {gender}, Age: {age}, Country: {country}")
    
    country_users = demo[demo['country'] == country]
    
    if (len(country_users) <= SAMPLE_SIZE):
        print(f"Country filter applied. Length: {len(country_users)}")
        return country_users.index.values
    
    age_users = country_users[(country_users['age'] >= (age - 2)) 
                              & (country_users['age'] <= (age + 2))]
    
    if (len(age_users) <= SAMPLE_SIZE):
        print(f"Country and age filters. Length: {len(age_users)}")
        return age_users.index.values
    
    gender_users = age_users[age_users['gender'] == gender]
    print(f"Country, age, and gender filters applied. Length: {len(gender_users)}")
    return gender_users.index.values


In [129]:
similar_users = get_demo_similar_users(example_email_nan)

Gender: m, Age: 25.0, Country: Germany
Country and age filters. Length: 5909


In [209]:
# Get the rows for selected users
similar_behav = behav[(behav['user_email'].isin(similar_users)) | (behav['user_email'] == example_email_nan)]

In [211]:
len(similar_behav)

288869

In [212]:
len(similar_behav) / len(behav) 

0.016688725224130407

In [213]:
similar_behav

Unnamed: 0.1,Unnamed: 0,user_email,artist_id,artist_name,plays,log_plays,std_plays
100,100,00004d2ac9316e22dc007ab2243d6fcb239e707d,100,current 93,853,6.748760,0.521301
101,101,00004d2ac9316e22dc007ab2243d6fcb239e707d,101,coil,567,6.340359,0.489754
102,102,00004d2ac9316e22dc007ab2243d6fcb239e707d,102,andrew liles,248,5.513429,0.425879
103,103,00004d2ac9316e22dc007ab2243d6fcb239e707d,103,six organs of admittance,242,5.488938,0.423987
104,104,00004d2ac9316e22dc007ab2243d6fcb239e707d,104,16 horsepower,225,5.416100,0.418361
...,...,...,...,...,...,...,...
17309047,17535421,fffe7823f67b433b45f22056467db921c1d3d7d0,62,coldplay,42,3.737670,0.288712
17309048,17535422,fffe7823f67b433b45f22056467db921c1d3d7d0,2746,david guetta,41,3.713572,0.286851
17309049,17535423,fffe7823f67b433b45f22056467db921c1d3d7d0,2071,silbermond,41,3.713572,0.286851
17309050,17535424,fffe7823f67b433b45f22056467db921c1d3d7d0,885,elvis presley,40,3.688879,0.284944


In [214]:
# Compute the total plays per user
user_total_play = similar_behav.groupby('user_email')['plays'].sum()

In [215]:
user_total_play.loc['000bce5b008caef9cce3f2b981ec71ef20a5926e']

2139

In [242]:
# Compute normalized plays per user
similar_behav.loc[:, 'norm_plays'] = (similar_behav['plays'] / user_total_play.loc[similar_behav['user_email']].values)

In [243]:
similar_behav

Unnamed: 0.1,Unnamed: 0,user_email,artist_id,artist_name,plays,log_plays,std_plays,norm_plays,norm_plays2
100,100,00004d2ac9316e22dc007ab2243d6fcb239e707d,100,current 93,853,6.748760,0.521301,0.147834,0.147834
101,101,00004d2ac9316e22dc007ab2243d6fcb239e707d,101,coil,567,6.340359,0.489754,0.098267,0.098267
102,102,00004d2ac9316e22dc007ab2243d6fcb239e707d,102,andrew liles,248,5.513429,0.425879,0.042981,0.042981
103,103,00004d2ac9316e22dc007ab2243d6fcb239e707d,103,six organs of admittance,242,5.488938,0.423987,0.041941,0.041941
104,104,00004d2ac9316e22dc007ab2243d6fcb239e707d,104,16 horsepower,225,5.416100,0.418361,0.038995,0.038995
...,...,...,...,...,...,...,...,...,...
17309047,17535421,fffe7823f67b433b45f22056467db921c1d3d7d0,62,coldplay,42,3.737670,0.288712,0.013241,0.013241
17309048,17535422,fffe7823f67b433b45f22056467db921c1d3d7d0,2746,david guetta,41,3.713572,0.286851,0.012926,0.012926
17309049,17535423,fffe7823f67b433b45f22056467db921c1d3d7d0,2071,silbermond,41,3.713572,0.286851,0.012926,0.012926
17309050,17535424,fffe7823f67b433b45f22056467db921c1d3d7d0,885,elvis presley,40,3.688879,0.284944,0.012610,0.012610


In [244]:
similar_group_artists = similar_behav['artist_id'].value_counts()
len(similar_group_artists)

30202

We will be filtering the songs such that at least 1% of our dataset have listened to it. 

In [245]:
ARTIST_THRESHOLD = len(similar_users) / 100

In [246]:
selected_artist_ids = similar_group_artists[similar_group_artists > ARTIST_THRESHOLD].index.values

In [247]:
similar_behav = similar_behav.groupby(['user_email', 'artist_id'], as_index=False).sum()

In [271]:
similar_user_scores = similar_behav.pivot(index='user_email', columns='artist_id', values='norm_plays')
similar_user_scores.shape

(5904, 30202)

In [250]:
sim_selected_scores = similar_user_scores[selected_artist_ids]
sim_selected_scores.shape

(5904, 931)

In [258]:
sim_selected_scores[sim_selected_scores.index == example_email_nan]

artist_id,1,62,5,192,216,281,350,485,28,757,...,3927,321,276,5050,185,3347,79,878,3506,7679
user_email,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00004d2ac9316e22dc007ab2243d6fcb239e707d,,,,,,,,,,,...,,,,,,,,,,


In [336]:
filled_scores2 = sim_selected_scores.fillna(0)

In [312]:
filled_scores = similar_user_scores.fillna(0)

In [315]:
correlated_rows = filled_scores.corrwith(filled_scores.loc[example_email_nan], axis=1)

In [334]:
correlated_rows[correlated_rows > 0.5]

user_email
00004d2ac9316e22dc007ab2243d6fcb239e707d    1.000000
e828e22870f59bfa139cd50c5ae6d8c953714541    0.611279
dtype: float64

In [326]:
u1_artists = behav[behav['user_email'] == '00004d2ac9316e22dc007ab2243d6fcb239e707d']['artist_name'].values

In [328]:
u2_artists = behav[behav['user_email'] == 'e828e22870f59bfa139cd50c5ae6d8c953714541']['artist_name'].values

In [335]:
shared_artists = set(u1_artists).intersection(set(u2_artists))
print(f"U1 artists: {len(u1_artists)}, U2 artists: {len(u2_artists)}, Shared: {shared_artists}")

U1 artists: 46, U2 artists: 50, Shared: {'antony and the johnsons', 'current 93', 'marissa nadler'}


In [340]:
u1_artists

array(['current 93', 'coil', 'andrew liles', 'six organs of admittance',
       '16 horsepower', 'angels of light', 'the legendary pink dots',
       'charalambides', 'festival', 'fern knight',
       'nick cave & the bad seeds', 'fovea hex', 'amanda rogers',
       'lux interna', 'nalle', 'antony and the johnsons',
       'marissa nadler', 'a silver mt. zion', 'einstürzende neubauten',
       'marc almond', 'mariee sioux', 'björk', 'bill fay',
       'nurse with wound', 'baby dee', 'ghq', 'daniel higgs', 'flairck',
       'jack rose', 'castanets', 'john jacob niles',
       'the revolutionary army of the infant jesus', 'carter tutti',
       'the one ensemble', 'orion rigel dommisse', 'jocelyn pook',
       'matmos', 'ulver', 'ane brun', 'soisong', 'a hawk and a hacksaw',
       'frank london', 'michael gira', 'larkin grimm', 'pelt',
       'jean parlette'], dtype=object)

In [341]:
u2_artists

array(['current 93', 'mia doi todd', 'hans-joachim roedelius',
       'art bears', 'red krayola', 'max richter', 'meredith monk',
       'scott walker', 'magma', 'laurie anderson',
       "bonnie 'prince' billy", 'swans', 'low', 'cocteau twins',
       'the west coast pop art experimental band', 'beach house',
       'deutsch amerikanische freundschaft',
       'miasma & the carousel of headless horses', 'monte cazazza',
       'animal collective', 'mazzy star', 'the chameleons',
       'casiotone for the painfully alone', 'gonzales', 'talk talk',
       'the iditarod', 'the velvet underground', 'xiu xiu', 'analogy',
       'oriental sunshine', 'slowdive', 'tuxedomoon',
       'angelite & moscow art trio & huun-huur-tu',
       'antony and the johnsons', 'donovan', 'steve reich',
       'ana da silva', 'fujiya & miyagi', 'joy division',
       'marissa nadler', 'mission of burma', 'portishead', 'suede',
       'broadcast', 'fennesz + sakamoto', 'interpol', 'james',
       'the incredib

In [338]:
corr2_rows = filled_scores2.corrwith(filled_scores2.loc[example_email_nan], axis=1)

In [400]:
corr2_rows = corr2_rows[corr2_rows > 0.2]
corr2_rows

user_email
00004d2ac9316e22dc007ab2243d6fcb239e707d    1.000000
074854442dbee22c6cd38cc9367c6a18ae9f8100    0.258952
0dcd2d1478aed2c46bb24ca10290f22cff526cc7    0.249107
11d3cc425bf38258f8c1aaacc6d44dd74a3d47e1    0.343626
1451b5daec73ae37eb6326068fca2f8c96772331    0.488541
                                              ...   
eb8a09eb60ca29eba9751def84bb8097ec751e9b    0.285807
edff96396805711a5941f821e46ea890266dbf13    0.232787
f051d06889be35344e86fcb02325945071f23009    0.254665
f82aa7903797d261566f090f3fd65a8876a8ce94    0.223182
fcc2fa52b2368cbd248410b5d447b3fd2f88c704    0.256162
Length: 63, dtype: float64

In [342]:
u1_artists2 = behav[behav['user_email'] == '00004d2ac9316e22dc007ab2243d6fcb239e707d']['artist_name'].values
u2_artists2 = behav[behav['user_email'] == '53252fbef5eb81512498b2eb2f89157a3f91917c']['artist_name'].values
u3_artists2 = behav[behav['user_email'] == '65f84f90d310fc68d78a1b589adf5bb126f511c4']['artist_name'].values

In [345]:
set(u1_artists2).intersection(u2_artists2)

{'16 horsepower', 'einstürzende neubauten', 'nick cave & the bad seeds'}

In [347]:
set(u1_artists2).intersection(u3_artists2)

{'einstürzende neubauten', 'nick cave & the bad seeds'}

In [404]:
avg_plays = behav[behav['user_email'].isin(corr2_rows.index)].groupby('user_email')['plays'].mean()
avg_plays

user_email
00004d2ac9316e22dc007ab2243d6fcb239e707d    125.434783
074854442dbee22c6cd38cc9367c6a18ae9f8100     41.907692
0dcd2d1478aed2c46bb24ca10290f22cff526cc7    339.320000
11d3cc425bf38258f8c1aaacc6d44dd74a3d47e1    767.133333
1451b5daec73ae37eb6326068fca2f8c96772331    183.210526
                                               ...    
eb8a09eb60ca29eba9751def84bb8097ec751e9b    510.580000
edff96396805711a5941f821e46ea890266dbf13    232.877551
f051d06889be35344e86fcb02325945071f23009    147.755556
f82aa7903797d261566f090f3fd65a8876a8ce94    244.489796
fcc2fa52b2368cbd248410b5d447b3fd2f88c704     85.854167
Name: plays, Length: 63, dtype: float64

In [406]:
# Compute the normalized average number of play
avg_norm_plays = similar_behav[similar_behav['user_email'].isin(corr2_rows.index)].groupby('user_email')['norm_plays'].mean()
avg_norm_plays

user_email
00004d2ac9316e22dc007ab2243d6fcb239e707d    0.021739
074854442dbee22c6cd38cc9367c6a18ae9f8100    0.015385
0dcd2d1478aed2c46bb24ca10290f22cff526cc7    0.020000
11d3cc425bf38258f8c1aaacc6d44dd74a3d47e1    0.016667
1451b5daec73ae37eb6326068fca2f8c96772331    0.017544
                                              ...   
eb8a09eb60ca29eba9751def84bb8097ec751e9b    0.020000
edff96396805711a5941f821e46ea890266dbf13    0.020408
f051d06889be35344e86fcb02325945071f23009    0.022222
f82aa7903797d261566f090f3fd65a8876a8ce94    0.020408
fcc2fa52b2368cbd248410b5d447b3fd2f88c704    0.020833
Name: norm_plays, Length: 63, dtype: float64

In [420]:
# Get the normalized play for given artist id
example_artist = 1
artist_norm_plays = similar_behav[(similar_behav['user_email'].isin(corr2_rows.index)) 
              & (similar_behav['artist_id'] == example_artist)]\
                [['user_email', 'norm_plays']].set_index('user_email')['norm_plays']
artist_norm_plays

user_email
46212907b8f0449e5c83b93b19e85e61847e9d5a    0.017929
c82eb243debb3ca31cd9205ea23daf9aba14ba86    0.008681
d18d70fc1dd22c905f1099a60d447174b69dbb1c    0.007232
e15151eace11324fd7a766be6a47ec36a496a540    0.052034
Name: norm_plays, dtype: float64

In [422]:
avg_norm_plays.loc[example_email_nan]

0.021739130434782605

In [426]:
remaining_users = artist_norm_plays.index.values

In [429]:
avg_norm_plays[remaining_users]

user_email
46212907b8f0449e5c83b93b19e85e61847e9d5a    0.015625
c82eb243debb3ca31cd9205ea23daf9aba14ba86    0.018868
d18d70fc1dd22c905f1099a60d447174b69dbb1c    0.021739
e15151eace11324fd7a766be6a47ec36a496a540    0.016667
Name: norm_plays, dtype: float64

In [433]:
pred_shift = (corr2_rows[remaining_users] * (artist_norm_plays - avg_norm_plays[remaining_users])).sum() / corr2_rows[remaining_users].sum()

In [436]:
predicted_plays = int((avg_norm_plays.loc[example_email_nan] + pred_shift) * user_total_play.loc[example_email_nan])
predicted_plays

144

# Prediction

prediction = user_average_listen + sum_over_x(coor_x * (#play_x - avg_#play_x))/sum_over_x(corr_x)

In order to predict the value for a specific artist, we need to filter on the users that have already listened to this artist. 

In [353]:
behav[behav['artist_id'] == 1]['user_email'].values

array(['00000c289a1829a808ac09c00daf10bc3c4e223b',
       '000a1585c5f65532a9c9187a882892982d345a5c',
       '000cb6427411006fe9a6193d3c4f59efed53fbef', ...,
       'ffe42950d65973be91082d9270a8ebcd15d2c20a',
       'ffe7359143a9fe15b3be2eaac57385e237f82e2c',
       'fffe356b9dab2fae1f887fabb1f08ab4976c91bb'], dtype=object)

In [354]:
def get_users_for_artist(artist_id:int):
    return behav[behav['artist_id'] == artist_id]['user_email'].values

In [453]:
def get_correlation_list(user_email: str, selected_users:list, filter_artists=False):
    
    # Filter the behavioral data for the listed users
    selected_behav = behav[(behav['user_email'].isin(similar_users)) | (behav['user_email'] == user_email)]
    
    # Make sure there is no duplicated rows for given user and artist pair
    selected_behav = selected_behav.groupby(['user_email', 'artist_id'], as_index=False).sum()
    
    # Compute the total number of plays per user
    total_plays = selected_behav.groupby('user_email')['plays'].sum()
    
    # Compute normalized plays per user
    selected_behav.loc[:, 'norm_plays'] = (selected_behav['plays'] / total_plays.loc[selected_behav['user_email']].values)
    
    # Compute the number of artists listened in the selected group
    selected_artists = selected_behav['artist_id'].value_counts().index.values
    print(f"Total artists: {len(selected_artists)}")
    
    # Filter the artist to reduce the dimention
    if filter_artists:
        # Remove all artist that have been listened by less than 1% of the user list. 
        artist_threshold = len(selected_users) / 100
        selected_artists = selected_artists[selected_artists > ARTIST_THRESHOLD].index.values
        selected_behav = selected_behav[selected_behav['artist_id'].isin(selected_artists)]
        
    # Build the user raking dataset (user_email index and one column per artist)
    user_scores = selected_behav.pivot(index='user_email', columns='artist_id', values='norm_plays').fillna(0)
    
    # Compute correlation between users
    correlation_list = user_scores.corrwith(user_scores.loc[user_email], axis=1)
    
    return correlation_list, selected_behav

In [454]:
def compute_prediction(user_email: str, artist_id: int, selected_corr: np.array, selected_behav: pd.DataFrame):
    # Get the behav data for the remaining users
    final_behav = selected_behav[selected_behav['user_email'].isin(selected_corr.index)]

    # Compute the normalized average number of play
    avg_norm_plays = final_behav.groupby('user_email')['norm_plays'].mean()
    
    # Remove user_email from the selected_corr data
    
    
    # Get the normalized number of play for a given artist
    artist_norm_plays = final_behav[(similar_behav['artist_id'] == artist_id)]\
                                    [['user_email', 'norm_plays']].set_index('user_email')['norm_plays']
    
    # Compute the plays variance from the similar users
    var_pred = (selected_corr * (artist_norm_plays - avg_norm_plays)).sum() / selected_corr.sum()
    
    # Compute user total number of play and average number of play
    user_plays = behav[behav['user_email'] == example_user_email]['plays']
    user_avg_plays = user_plays.mean()
    
    # Compute the estimated number of play for new artist (average + normalized var * number of plays)
    predicted_plays = int(user_plays.mean() + var_pred * user_plays.sum())
    
    return predicted_plays

In [457]:
def get_artist_prediction(user_email:str, artist_id:int, corr_threshold=0.3, filter_demo=False, filter_artist=False):
    start = time.time()
    
    # Select users that have already listened to the given artist 
    selected_users = get_users_for_artist(artist_id)
    print(f"Selected users: {len(selected_users)}")
    
    # Filter on users demographic features 
    if filter_demo:
        demo_users = get_users_demo(user_email)
        selected_users = np.array(list(set(demo_users).intersection(set(selected_users))))
        print(f"Selected users (after demo filter): {len(selected_users)}")
    
    # Compute correlation with selected users
    start_corr = time.time()
    correlation_list, selected_behav = get_correlation_list(user_email, selected_users, filter_artist)
    end_corr = time.time()
    print(f"Correlation list computation time: {end_corr - start_corr}")
    
    # Compute prediction
    selected_corr = correlation_list[correlation_list > corr_threshold]
    
    print(f"Selected users (after correlation): {len(selected_corr)}")
    
    
    return compute_prediction(user_email, artist_id, selected_corr, selected_behav)
    

In [459]:
get_artist_prediction(example_user_email, 10)

Selected users: 2301
Total artists: 30190
Correlation list computation time: 24.068799018859863
Selected users (after correlation): 5


  if sys.path[0] == '':


413