In [1]:
import requests
import json
import numpy as np
import pandas as pd

## BACKGROUND

For streaming companies, having a recommendation system can help them provide a better experience for a customer by making personalized recommendations.

In this project, we leverage streaming data from the music streaming platform last.fm to build a recommendation system that can provide recommendations using user history and/or information such as age, country, gender, etc.

The dataset we are using comes from here: https://github.com/nachiketmparanjape/Music-Recommender-last.fm
All credit goes to the original creator.

In [2]:
rec_data = pd.read_csv('merged-subset.csv', encoding='cp1252')

In [3]:
rec_data.shape

(10000, 9)

In [4]:
rec_data.head()

Unnamed: 0.1,Unnamed: 0,userId,artistId,artist,plays,gender,age,country,signupDate
0,249130,03b9c645bc4f578ea1dcb6a975e7ea71fab79da2,a16371b9-7d36-497a-a9d4-42b0a0440c5e,slowdive,108,m,28.0,Germany,"Sep 16, 2006"
1,16103327,ee48ac7de691d31176a2843d0f2f8f49cab78e6d,5a24bc1a-c093-4a82-84ed-8d7f2da0570d,skinny puppy,407,m,,Canada,"Dec 14, 2006"
2,12079167,b2e3d27a00fc3035edb439b14e6d30bcb6dddf69,68f1175b-592b-4f28-ab1c-85c7a438c636,turbonegro,91,m,29.0,United Kingdom,"Sep 23, 2005"
3,12210583,b4d4909255dc5bb404ff64866cac60164b465e9a,cfd3727e-7162-4e6a-a3f7-a59343ee6b8d,honey is cool,50,,,Denmark,"Dec 22, 2008"
4,7937614,758de8850b5a07875da1e23296e4021574be8596,3bb24e11-821a-4e4e-bd89-e0a2452474cf,useless id,108,m,25.0,Serbia,"Mar 15, 2009"


# DATA ANALYSIS

In [5]:
print('Min. number of plays: ', rec_data['plays'].min())
print('Max. number of plays: ', rec_data['plays'].max())
avg_plays = rec_data['plays'].mean()
print('Avg. number of plays: ', avg_plays)

Min. number of plays:  1
Max. number of plays:  12558
Avg. number of plays:  220.7543


In [6]:
from datetime import datetime

# Parse the date string into a datetime object
rec_data['signupDate'] = rec_data['signupDate'].apply(lambda x: datetime.strptime(x, '%b %d, %Y'))

# Convert the datetime object to the desired output format "YYYYMMDD"
rec_data['signupDate'] = rec_data['signupDate'].apply(lambda x: x.strftime('%Y%m%d'))
rec_data['signupDate'] = rec_data['signupDate'].apply(int)

print(rec_data['signupDate'].head())  # Output: 20000101

0    20060916
1    20061214
2    20050923
3    20081222
4    20090315
Name: signupDate, dtype: int64


In [7]:
print('First signup date', datetime.strptime(str(rec_data['signupDate'].min()), '%Y%m%d'))
print('Last signup date', datetime.strptime(str(rec_data['signupDate'].max()), '%Y%m%d'))
last_date = rec_data['signupDate'].max()

First signup date 2002-10-29 00:00:00
Last signup date 2009-11-02 00:00:00


In [8]:
rec_data['userId'].value_counts()

userId
ee93d79f9b97cc03067dd3f4d90f6137cb301229    3
7de1e03e8a9b649bcf42ecde96ab301346601c53    3
32ff4f218c44472229c5d16de21a35e294fd5d10    2
a0549f97f461dc75b195aaa8324c05f0fa766851    2
45d0687e694fcef22de4660ef6e51e950cfb34b5    2
                                           ..
6371191a295fb7babeb98d813159bf2f4fb89c93    1
0b4c3386859abb3db2ca5f3c246156081baa6638    1
210c99699f2db1dfceb200e687c82da357e477a5    1
a2d2881ad3df0a75f450276c6b3287c04cfb0b39    1
57719e71d1265ba42aeca1e3bec5324f3d9a3714    1
Name: count, Length: 9871, dtype: int64

# Suliman-Keshavarz Algorithm

$$\text{rating(track, user)} = \text{max}(1, \text{min}(5, (\lfloor \frac{\text{num. plays(track)}}{\text{avg. plays}} \cdot 5) - k \cdot (\frac{\text{last signup date} - \text{signup date(user)}}{\text{last signup date}}) \rfloor))$$

We establish an algorithm to convert number of plays to a rating between 0 and 5 based on associating greater number of plays with greater song enjoyment, with the rating being proportional to the average number of plays recorded in the dataset. We use a weighting factor $k$ to minimize the influence of signing up earlier to the platform on a given user's ratings, as this may have resulted in some users having more opportunity to have their plays recorded in the dataset. We round down to the nearest whole number and take the max between this score and $1$ to negate the possibility of a negative or zero rating.

In [9]:
def suliman_keshavarz(data, k=0.05):
    return round(max(1, min(5, ((data['plays'] / avg_plays) * 5)) - k * ((last_date - data['signupDate']) / last_date)))

In [10]:
rec_data['rating'] = rec_data.apply(suliman_keshavarz, axis = 1)

In [11]:
rec_data.head()

Unnamed: 0.1,Unnamed: 0,userId,artistId,artist,plays,gender,age,country,signupDate,rating
0,249130,03b9c645bc4f578ea1dcb6a975e7ea71fab79da2,a16371b9-7d36-497a-a9d4-42b0a0440c5e,slowdive,108,m,28.0,Germany,20060916,2
1,16103327,ee48ac7de691d31176a2843d0f2f8f49cab78e6d,5a24bc1a-c093-4a82-84ed-8d7f2da0570d,skinny puppy,407,m,,Canada,20061214,5
2,12079167,b2e3d27a00fc3035edb439b14e6d30bcb6dddf69,68f1175b-592b-4f28-ab1c-85c7a438c636,turbonegro,91,m,29.0,United Kingdom,20050923,2
3,12210583,b4d4909255dc5bb404ff64866cac60164b465e9a,cfd3727e-7162-4e6a-a3f7-a59343ee6b8d,honey is cool,50,,,Denmark,20081222,1
4,7937614,758de8850b5a07875da1e23296e4021574be8596,3bb24e11-821a-4e4e-bd89-e0a2452474cf,useless id,108,m,25.0,Serbia,20090315,2


In [12]:
rec_data['rating'].value_counts()

rating
1    4015
5    2850
2    1400
3    1006
4     729
Name: count, dtype: int64

# DATA CLEANING

Let's reshape our dataset into a more suitable form for designing a recommender system. First we want to assign every user to every artist in our dataset.

In [13]:
rec_data = rec_data.sort_values(by='artist')

In [14]:
ratings = rec_data[['userId', 'rating', 'artist']]

In [15]:
unique_users = rec_data['userId'].unique()
unique_artists = rec_data['artist'].unique()

In [16]:
user_artist_combinations = pd.DataFrame([(artist, user) for artist in unique_artists for user in unique_users],
                                        columns=['artist', 'userId'])

In [17]:
user_artist_combinations.head()

Unnamed: 0,artist,userId
0,!!!,4fb83e93ec1e10ef97e67fc6d912fb7e778339be
1,!!!,9901740c9c7ba5ac45bfa4e044d7aff496b1fbab
2,!!!,0b749f78e378d5212a95919d13835335e2c7c55d
3,!!!,36049eca9ae1a1deea28977ec092217059cd0cfe
4,!!!,c52649f81e88755be839d6aed0b549e2432326b2


Now let's create a dataset of corresponding information for each unique user in our dataset.

In [18]:
gender_list = []
age_list = []
country_list = []
for user_id in unique_users:
    gender_list.append(rec_data[rec_data['userId'] == user_id]['gender'].values[0])
    age_list.append(rec_data[rec_data['userId'] == user_id]['age'].values[0])
    country_list.append(rec_data[rec_data['userId'] == user_id]['country'].values[0])

In [19]:
unique_users = pd.DataFrame(unique_users)
unique_users['Gender'] = gender_list
unique_users['Age'] = age_list
unique_users['Country'] = country_list

unique_users = unique_users.rename(columns={0: 'userId'})

In [20]:
unique_users.head()

Unnamed: 0,userId,Gender,Age,Country
0,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,m,26.0,United Kingdom
1,9901740c9c7ba5ac45bfa4e044d7aff496b1fbab,m,24.0,Russian Federation
2,0b749f78e378d5212a95919d13835335e2c7c55d,,20.0,Russian Federation
3,36049eca9ae1a1deea28977ec092217059cd0cfe,f,20.0,Netherlands
4,c52649f81e88755be839d6aed0b549e2432326b2,m,19.0,Australia


And now we merge the two dataframes together on the user ID column, creating a merged dataframe of every artist associated with every user, and the corresponding user information.

In [21]:
merged_data = pd.merge(user_artist_combinations, unique_users, on='userId')

In [22]:
merged_data.head()

Unnamed: 0,artist,userId,Gender,Age,Country
0,!!!,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,m,26.0,United Kingdom
1,#####,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,m,26.0,United Kingdom
2,*nsync,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,m,26.0,United Kingdom
3,+44,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,m,26.0,United Kingdom
4,...and oceans,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,m,26.0,United Kingdom


Now we merge our ratings dataframe with the full dataframe of user-artist combinations. Then we drop the user ratings for the artists that user did not actually listen to and replace them with NA values.

In [23]:
training_df = pd.merge(merged_data, ratings, on='userId', how='inner')

In [24]:
training_df.head()

Unnamed: 0,artist_x,userId,Gender,Age,Country,rating,artist_y
0,!!!,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,m,26.0,United Kingdom,1,!!!
1,#####,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,m,26.0,United Kingdom,1,!!!
2,*nsync,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,m,26.0,United Kingdom,1,!!!
3,+44,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,m,26.0,United Kingdom,1,!!!
4,...and oceans,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,m,26.0,United Kingdom,1,!!!


In [25]:
unique_user_artist_plays = ratings.groupby(['userId', 'artist'])['rating'].sum().reset_index()

# Merge merged_data with unique_user_artist_plays
training_df = pd.merge(merged_data, unique_user_artist_plays, on=['userId', 'artist'], how='left')

# Replace NaN values (where user hasn't listened to a specific artist) with 0
#training_df['rating'].fillna(0, inplace=True)

In [26]:
training_df.columns

Index(['artist', 'userId', 'Gender', 'Age', 'Country', 'rating'], dtype='object')

In [27]:
ratings_df = training_df.drop(columns=['Gender', 'Age', 'Country'])

In [28]:
ratings_df.head()

Unnamed: 0,artist,userId,rating
0,!!!,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,1.0
1,#####,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,
2,*nsync,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,
3,+44,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,
4,...and oceans,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,


Set multiindex on artist, followed by user ID columns.

In [29]:
ratings_df.set_index('artist', inplace=True)
ratings_df.set_index('userId', append=True, inplace=True)

In [30]:
R_all = ratings_df.unstack(['userId'])
R_all

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
userId,001a92a006061452ea53cadf1f1e1db985f3a51f,001e2ade35f2476b47c15cce7bcb39dafa89b97a,00277ccecc376837e57b6d6b58330d1bafc90c73,002d07853e3d855dc359da5bd23f10ff11444b36,0039f6a10a8afc639e621ec4a6601306bafd9adf,005a5746da2730d0b54578d629a2a48f785b4acf,00643c8f3e0931150a8d37d63ffbfb9620fd9ba3,006cc2d3a76f75a399098eee512b2f645a049fc1,0073e9df704415e9d72f75380ba88059fca4230a,007fcb24cab2dcbaa6dbf5ae4c7084c6538251b1,...,ffaffe99a3617739877b980a031a6c376236d2de,ffb274a3d8bcdfaa68355919cefd184b9ccb7c4a,ffccc2d709e6b0d19227412647fed266e67d1b08,ffd2a97221e5001ff3de05659f7d4a303d2c6d73,ffd8d4a17bb5013bcadeb0da06ad086b98d90cba,ffe19d4560159e0170f1be5bcda651fc022fc7bb,ffe27d9f69d660f9e7f17972c9c4a84fabf32188,ffe503ccc641e9050f17be8ba3805575bdaa9559,fff694cf332ed701dccbf17f1d9595ba8ad69f22,fff9cc8f46f8bcd31402bb9f82e020b5dc387f4b
artist,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
!!!,,,,,,,,,,,...,,,,,,,,,,
#####,,,,,,,,,,,...,,,,,,,,,,
*nsync,,,,,,,,,,,...,,,,,,,,,,
+44,,,,,,,,,,,...,,,,,,,,,,
...and oceans,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ángeles del infierno,,,,,,,,,,,...,,,,,,,,,,
Çileke?,,,,,,,,,,,...,,,,,,,,,,
Édith piaf,,,,,,,,,,,...,,,,,,,,,,
Ólafur arnalds,,,,,,,,,,,...,,,,,,,,,,


In [36]:
I = 15
M = 15

# retrieve movies/users combination that is not *too* sparse
top_artists = R_all.agg('sum', axis=1).nlargest(10).tail(M).index
top_10_artists_list = R_all.agg('sum', axis=1).nlargest(10)
top_users = R_all.loc[list((top_10_artists_list).index)].notna().any()

R = R_all.loc[top_artists, top_users]
R

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
userId,00ab72edac8ffa3cba5536a89cc4b976d86d091c,021398a1d70346f3a6aa016ca77b6f2ee5c09576,0433aaec28a8b6411d62a559fe256f3c05803225,0463812123b9eee34d9f5781a7afd87443f20197,04d218fa7614e58cec3894560769ff972f26cf37,06797aad44b5361ff45643463e215453daf60717,06d1eca6c5d36d268e866f54e85776e84def08d5,08e69dc29b28e895943386fb3e91ee007b932787,094f5c470ec2ddff6309543c5077b0d62090397d,0d2c2fc3c9bbaf0fc952e099a52f0f532b643ac9,...,f820ea4032d61119ad1ad2586311ef34dcd8b3c4,f8c5260d7772ffffb1b4a017392f53eec0605586,f8e24831f26574589f6e955a4f64fe88fbaed64e,fadd2320ed3170f4bf50bcd8aa7d0a3df59993b0,fb0bf9b10250dd73daded625b64bbe9cbe899335,fd7340d18ca5c6ba66fb6e847605a74c343ea368,fdc3208ecf3d6830e8b2554ba9b4269b040e2586,fecb2b3723f108d3473d77c35a0552a6aa1efa33,ff08526af6e344bd3350412f22885e29a8eaa54b,ffd8d4a17bb5013bcadeb0da06ad086b98d90cba
artist,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
radiohead,,,,,2.0,1.0,,,,,...,5.0,,,,,5.0,,5.0,5.0,
the beatles,,,,,,,,,,,...,,5.0,,,3.0,,1.0,,,
coldplay,,1.0,,,,,,,1.0,,...,,,,,,,,,,
system of a down,3.0,,,,,,,,,,...,,,5.0,,,,,,,
the cure,,,,5.0,,,,4.0,,,...,,,,,,,,,,
foo fighters,,,,,,,5.0,,,,...,,,,,,,,,,2.0
pink floyd,,,,,,,,,,,...,,,,1.0,,,,,,
rammstein,,,,,,,,,,5.0,...,,,,,,,,,,
nine inch nails,,,,,,,,,,,...,,,,,,,,,,
????,,,5.0,,,,,,,,...,,,,,,,,,,


In [37]:
# R_all.loc["????", :].sum()
# R_all2 = R_all.drop(index='????')  # drop row of missing artists

In [38]:
R_all.shape

(5197, 9871)

In [34]:
(np.isnan(R_all)).mean().mean()

0.999805086150109

We have a very sparse dataset.

In [35]:
R_all_filled = R_all.fillna(0)
(R_all_filled != 0).sum(axis = 0).value_counts()

1    9745
2     124
3       2
Name: count, dtype: int64

It seems that the overwhelming majority of users have only one artist that they've listened to, making it difficult to form a recommendation system.

## User and Movies Matrices

Model rating $r_{mi}$ of movie $m$ by user $i$:
$$ \hat r_{mi} = \sum_{k=1}^K v_{mk} u_{ik} = v_{m} u_{i}^T $$
* $K$ unobserved characteristics (latent factors)
* $v_m=(v_{m1},\dots,v_{mK})$: movie $m$ having characteristic $k=1,\dots,K$
* $u_i=(u_{i1},\dots,u_{iK})$: user $i$'s affinity to characteristic $k=1,\dots,K$
* Rating $r_{mi}$ is high if $v_m$ and $u_i$ are well-aligned

We want to find the optimal number of latent factors $k$ on which to apply this.

## Outline

We will trial-and-error several different ways to optimize $k$.

1. SVD after imputing mean for missing values, threshhold on amount of variance 
2. Use cross-validation across several values for $k$, then choose the one that reduces the RMSE the most, post-gradient descent

## SVD

Since our dataframe is so sparse, we want to use a more clever method for imputing values. We use the following formula to impute missing values per row:
$$\text{impute value(row)} = \text{mean} +/- \frac{\text{mean(row)}}{\text{bias} \cdot \text{mean}}$$
We add or subtract the biased term depending on whether the row mean is greater or lower than the overall mean. This is because artists who receive high/low ratings should be inputted values from existing users should be imputted values that reflect this. The bias term determines how much we want to weigh the effect of existing ratings. Since our dataset is so sparse, and the ratings were only implicitly created from the existing data using the Suliman-Keshavarz algorithm, we want a high bias to lessen the effect of our ratings on the imputation. The minimum possible value is bias $= 1$, since otherwise we can recieve imputations greater than the maximum rating of $5$. 

In [36]:
def imputer(data, bias=3):
    data2 = data.copy()
    mean_value = np.nanmean(R_all.values)
    for i, row in data.iterrows():
        if row.mean() > mean_value:
            imput_val = mean_value + (row.mean() / (bias * mean_value))
        elif row.mean() < mean_value:
            imput_val = mean_value - (row.mean() / (bias * mean_value))
        data2.loc[i] = data2.loc[i].fillna(imput_val)
    return data2

In [None]:
# Fill NaN values using imputer function
R_all3 = imputer(R_all)

In [37]:
R_all3

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
userId,001a92a006061452ea53cadf1f1e1db985f3a51f,001e2ade35f2476b47c15cce7bcb39dafa89b97a,00277ccecc376837e57b6d6b58330d1bafc90c73,002d07853e3d855dc359da5bd23f10ff11444b36,0039f6a10a8afc639e621ec4a6601306bafd9adf,005a5746da2730d0b54578d629a2a48f785b4acf,00643c8f3e0931150a8d37d63ffbfb9620fd9ba3,006cc2d3a76f75a399098eee512b2f645a049fc1,0073e9df704415e9d72f75380ba88059fca4230a,007fcb24cab2dcbaa6dbf5ae4c7084c6538251b1,...,ffaffe99a3617739877b980a031a6c376236d2de,ffb274a3d8bcdfaa68355919cefd184b9ccb7c4a,ffccc2d709e6b0d19227412647fed266e67d1b08,ffd2a97221e5001ff3de05659f7d4a303d2c6d73,ffd8d4a17bb5013bcadeb0da06ad086b98d90cba,ffe19d4560159e0170f1be5bcda651fc022fc7bb,ffe27d9f69d660f9e7f17972c9c4a84fabf32188,ffe503ccc641e9050f17be8ba3805575bdaa9559,fff694cf332ed701dccbf17f1d9595ba8ad69f22,fff9cc8f46f8bcd31402bb9f82e020b5dc387f4b
artist,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
!!!,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,...,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017
#####,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,...,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017
*nsync,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,...,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017
+44,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,...,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017
...and oceans,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,...,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ángeles del infierno,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,...,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017
Çileke?,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,...,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017
Édith piaf,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,...,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017
Ólafur arnalds,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,...,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017


Here is an example of how the SVD works

In [38]:
u, s, vh = np.linalg.svd(R_all3, full_matrices=False)

threshold = 0.1

# Retain singular values greater than the threshold
selected_singular_values = s[s > threshold]

In [39]:
# Indices of selected singular values
indices = np.where(s > threshold)[0]

# Collect U and V based on retained singular values
U = u[:, indices]
V = vh[indices, :].T

In [40]:
R_all3.shape

(5197, 9871)

In [41]:
print(U.shape)

(5197, 5185)


In [42]:
print(V.shape)

(9871, 5185)


In [43]:
from numpy import sqrt, mean

# calculate RMSE
def rmse(X, Y):
    return sqrt(mean((X - Y)**2))

error = [(0, rmse(R_all3.T, np.inner(V,U)))]

In [44]:
print(error)

[(0, 2.7000768448271013)]


In [45]:
# iterate through different thresholds
import time
max_iterations = 40  # maximum number of iterations
U, S, V = np.linalg.svd(R_all3, full_matrices=False)

In [46]:
print(S.shape)

(5197,)


In [47]:
# this will take some time...
error = []

for t in range(1, max_iterations):
    threshold = 10 / t    
    
    # Indices of selected singular values
    indices = np.where(S > threshold)[0]
    S1 = np.diag(S[indices])

    # Collect U and V based on retained singular values
    U1 = U[:, indices]
    V1 = V[indices, :]
    
    Q1 = np.matmul(U1, np.sqrt(S1))
    W1 = np.matmul(np.sqrt(S1), V1)
    print('threshold: ', threshold)
    print('Q1 shape: ', Q1.shape)
    print('W1 shape: ', W1.shape)
    # compute error after one sweep of updates
    error += [(threshold, rmse(R_all3, np.matmul(Q1, W1)))]
    
error = pd.DataFrame(error, columns=['threshold', 'rmse'])

threshold:  10.0
Q1 shape:  (5197, 4)
W1 shape:  (4, 9871)
threshold:  5.0
Q1 shape:  (5197, 155)
W1 shape:  (155, 9871)
threshold:  3.3333333333333335
Q1 shape:  (5197, 506)
W1 shape:  (506, 9871)
threshold:  2.5
Q1 shape:  (5197, 963)
W1 shape:  (963, 9871)
threshold:  2.0
Q1 shape:  (5197, 2162)
W1 shape:  (2162, 9871)
threshold:  1.6666666666666667
Q1 shape:  (5197, 3988)
W1 shape:  (3988, 9871)
threshold:  1.4285714285714286
Q1 shape:  (5197, 4017)
W1 shape:  (4017, 9871)
threshold:  1.25
Q1 shape:  (5197, 4275)
W1 shape:  (4275, 9871)
threshold:  1.1111111111111112
Q1 shape:  (5197, 4281)
W1 shape:  (4281, 9871)
threshold:  1.0
Q1 shape:  (5197, 4291)
W1 shape:  (4291, 9871)
threshold:  0.9090909090909091
Q1 shape:  (5197, 4312)
W1 shape:  (4312, 9871)
threshold:  0.8333333333333334
Q1 shape:  (5197, 4312)
W1 shape:  (4312, 9871)
threshold:  0.7692307692307693
Q1 shape:  (5197, 4312)
W1 shape:  (4312, 9871)
threshold:  0.7142857142857143
Q1 shape:  (5197, 4337)
W1 shape:  (4337, 

In [48]:
print(error)

    threshold      rmse
0   10.000000  0.023465
1    5.000000  0.020793
2    3.333333  0.017976
3    2.500000  0.015671
4    2.000000  0.010979
5    1.666667  0.004040
6    1.428571  0.003870
7    1.250000  0.002540
8    1.111111  0.002508
9    1.000000  0.002465
10   0.909091  0.002383
11   0.833333  0.002383
12   0.769231  0.002383
13   0.714286  0.002323
14   0.666667  0.000800
15   0.625000  0.000795
16   0.588235  0.000795
17   0.555556  0.000791
18   0.526316  0.000791
19   0.500000  0.000791
20   0.476190  0.000791
21   0.454545  0.000791
22   0.434783  0.000791
23   0.416667  0.000773
24   0.400000  0.000768
25   0.384615  0.000766
26   0.370370  0.000766
27   0.357143  0.000766
28   0.344828  0.000766
29   0.333333  0.000766
30   0.322581  0.000766
31   0.312500  0.000766
32   0.303030  0.000766
33   0.294118  0.000096
34   0.285714  0.000077
35   0.277778  0.000067
36   0.270270  0.000039
37   0.263158  0.000039
38   0.256410  0.000039


We want to choose a threshold for the size of our reduced SVD that will minimize error while also being computationally efficient by being as small as possible. While examining the above error rates, we heuristically choose a threshold of $\frac{2}{3}$. 

In [49]:
u, s, vh = np.linalg.svd(R_all3, full_matrices=False)

threshold = 2/3

# Retain singular values greater than the threshold
selected_singular_values = s[s > threshold]

In [50]:
# Indices of selected singular values
indices = np.where(s > threshold)[0]

# Collect U and V based on retained singular values
U = u[:, indices]
V = vh[indices, :].T

Now we will attempt cross validation instead as a way of choosing $k$.

## Cross Validation

In [87]:
def imputer(data, bias=3):
    data2 = data.copy()
    mean_value = np.nanmean(R_all.values)
    for i, row in data.iterrows():
        if row.mean() > mean_value:
            imput_val = mean_value + (row.mean() / (bias * mean_value))
        elif row.mean() < mean_value:
            imput_val = mean_value - (row.mean() / (bias * mean_value))
        data2.loc[i] = data2.loc[i].fillna(imput_val)
    return data2

In [85]:
# Fill NaN values using imputer function
R_all4 = imputer(R_all)

2.7001700170017
1.0
2.5767210003714545
3.0
3.0705170668924366
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
3.0
3.0705170668924366
2.0
2.453271983741209
4.0
3.193966083522682
2.0
2.453271983741209
2.0
2.453271983741209
1.0
2.5767210003714545
1.0
2.5767210003714545
5.0
3.3174151001529277
5.0
3.3174151001529277
1.0
2.5767210003714545
5.0
3.3174151001529277
2.5
2.391547475426086
5.0
3.3174151001529277
2.8
3.0458272635663874
1.0
2.5767210003714545
1.0
2.5767210003714545
1.7142857142857142
2.4885431313498505
3.0
3.0705170668924366
3.0
3.0705170668924366
2.0
2.453271983741209
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
2.0
2.453271983741209
3.0
3.0705170668924366
3.0
3.0705170668924366
4.0
3.193966083522682
1.3333333333333333
2.5355713281613728
1.0
2.5767210003714545
2.6666666666666665
2.370972639321045
2.0
2.453271983741209
4.0
3.193966083522682
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
5.0
3.3

4.0
3.193966083522682
1.0
2.5767210003714545
5.0
3.3174151001529277
2.6666666666666665
2.370972639321045
2.0
2.453271983741209
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
3.5
3.1322415752075594
1.0
2.5767210003714545
1.0
2.5767210003714545
3.0
3.0705170668924366
3.0
3.0705170668924366
2.1666666666666665
2.432697147636168
4.0
3.193966083522682
4.0
3.193966083522682
2.0
2.453271983741209
4.0
3.193966083522682
3.5
3.1322415752075594
5.0
3.3174151001529277
5.0
3.3174151001529277
3.25
3.1013793210499983
4.0
3.193966083522682
4.0
3.193966083522682
5.0
3.3174151001529277
1.5555555555555556
2.5081382133546515
2.0
2.453271983741209
1.0
2.5767210003714545
3.6
3.1445864768705842
3.0
3.0705170668924366
3.0
3.0705170668924366
5.0
3.3174151001529277
5.0
3.3174151001529277
1.0
2.5767210003714545
2.0
2.453271983741209
1.0
2.5767210003714545
5.0
3.3174151001529277
4.0
3.193966083522682
1.0
2.5767210003714545
1

2.5767210003714545
3.0
3.0705170668924366
5.0
3.3174151001529277
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
3.0
3.0705170668924366
1.8
2.477961787067258
4.666666666666667
3.276265427942846
2.0
2.453271983741209
1.5
2.5149964920563317
3.75
3.163103829365121
1.0
2.5767210003714545
3.6
3.1445864768705842
3.6
3.1445864768705842
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
3.6666666666666665
3.1528164113126005
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
5.0
3.3174151001529277
2.0
2.453271983741209
2.5
2.391547475426086
5.0
3.3174151001529277
5.0
3.3174151001529277
5.0
3.3174151001529277
2.0
2.453271983741209
4.0
3.193966083522682
1.0
2.5767210003714545
2.0
2.453271983741209
3.0
3.0705170668924366
2.0
2.453271983741209
1.0
2.5767210003714545
4.0
3.193966083522682
5.0
3.3174151001529277
1.0
2.5767210003714545
1.3333333333333333
2.5355713281613728
1.0
2.5767210003714545
3.5
3.1322415752075594
5.0
3.3174151001529277
2.0


5.0
3.3174151001529277
2.0
2.453271983741209
1.0
2.5767210003714545
5.0
3.3174151001529277
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
3.0
3.0705170668924366
2.6
2.3792025737630613
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
1.6666666666666667
2.4944216559512906
4.0
3.193966083522682
3.9
3.181621181859658
1.0
2.5767210003714545
3.6666666666666665
3.1528164113126005
1.3333333333333333
2.5355713281613728
4.0
3.193966083522682
4.0
3.193966083522682
5.0
3.3174151001529277
1.0
2.5767210003714545
1.0
2.5767210003714545
2.0
2.453271983741209
2.0
2.453271983741209
5.0
3.3174151001529277
2.0
2.453271983741209
4.0
3.193966083522682
1.0
2.5767210003714545
5.0
3.3174151001529277
2.0
2.453271983741209
5.0
3.3174151001529277
2.0
2.453271983741209
2.6666666666666665
2.370972639321045
3.0
3.0705170668924366
1.0
2.5767210003714545
1.0
2.5767210003714545
3.6666666666666665
3.1528164113126005
5.0
3.3174151001529277
1.0
2.5767210003714545
1.0
2.57672100037

2.5767210003714545
1.0
2.5767210003714545
5.0
3.3174151001529277
1.0
2.5767210003714545
4.0
3.193966083522682
2.0
2.453271983741209
2.0
2.453271983741209
5.0
3.3174151001529277
1.0
2.5767210003714545
5.0
3.3174151001529277
1.0
2.5767210003714545
1.0
2.5767210003714545
1.6666666666666667
2.4944216559512906
1.0
2.5767210003714545
2.0
2.453271983741209
5.0
3.3174151001529277
1.0
2.5767210003714545
2.3333333333333335
2.412122311531127
2.6666666666666665
2.370972639321045
1.0
2.5767210003714545
1.0
2.5767210003714545
1.8823529411764706
2.4677953974624143
1.0
2.5767210003714545
1.0
2.5767210003714545
5.0
3.3174151001529277
2.0
2.453271983741209
1.0
2.5767210003714545
5.0
3.3174151001529277
1.0
2.5767210003714545
2.0
2.453271983741209
5.0
3.3174151001529277
4.0
3.193966083522682
1.0
2.5767210003714545
1.0
2.5767210003714545
2.0
2.453271983741209
1.0
2.5767210003714545
5.0
3.3174151001529277
5.0
3.3174151001529277
1.0
2.5767210003714545
5.0
3.3174151001529277
2.75
3.0396548127348755
5.0
3.3174

1.0
2.5767210003714545
1.0
2.5767210003714545
4.0
3.193966083522682
1.0
2.5767210003714545
1.0
2.5767210003714545
5.0
3.3174151001529277
2.0
2.453271983741209
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
2.3333333333333335
2.412122311531127
4.0
3.193966083522682
4.0
3.193966083522682
2.0
2.453271983741209
1.0
2.5767210003714545
3.0
3.0705170668924366
4.5
3.255690591837805
1.0
2.5767210003714545
2.2
2.4285821804151597
5.0
3.3174151001529277
1.0
2.5767210003714545
2.3333333333333335
2.412122311531127
5.0
3.3174151001529277
3.0
3.0705170668924366
1.0
2.5767210003714545
5.0
3.3174151001529277
2.5
2.391547475426086
1.0
2.5767210003714545
1.0
2.5767210003714545
5.0
3.3174151001529277
2.0
2.453271983741209
2.5
2.391547475426086
1.0
2.5767210003714545
1.0
2.5767210003714545
3.0
3.0705170668924366
2.3333333333333335
2.412122311531127
1.0
2.5767210003714545
2.0
2.453271983741209
2.0
2.453271983741209
2.5
2.391547475426086
5.0
3.3174151001529277
1.0
2.5767210003714545
1.0


2.823529411764706
3.0487319463106286
1.0
2.5767210003714545
5.0
3.3174151001529277
2.0
2.453271983741209
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
4.5
3.255690591837805
5.0
3.3174151001529277
5.0
3.3174151001529277
1.0
2.5767210003714545
2.0
2.453271983741209
2.5
2.391547475426086
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
2.0
2.453271983741209
2.0
2.453271983741209
4.0
3.193966083522682
1.0
2.5767210003714545
2.0
2.453271983741209
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
2.75
3.0396548127348755
1.0
2.5767210003714545
5.0
3.3174151001529277
3.0
3.0705170668924366
3.5
3.1322415752075594
5.0
3.3174151001529277
3.4285714285714284
3.123423788305399
2.0
2.453271983741209
5.0
3.3174151001529277
3.0
3.0705170668924366
2.0
2.45327198

3.0
3.0705170668924366
2.0
2.453271983741209
1.0
2.5767210003714545
1.0
2.5767210003714545
2.0
2.453271983741209
5.0
3.3174151001529277
1.0
2.5767210003714545
2.0
2.453271983741209
2.0
2.453271983741209
3.0
3.0705170668924366
1.0
2.5767210003714545
2.5
2.391547475426086
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
2.0
2.453271983741209
3.0
3.0705170668924366
1.6666666666666667
2.4944216559512906
3.0
3.0705170668924366
3.5
3.1322415752075594
4.0
3.193966083522682
2.0
2.453271983741209
2.0
2.453271983741209
1.0
2.5767210003714545
5.0
3.3174151001529277
4.0
3.193966083522682
1.0
2.5767210003714545
5.0
3.3174151001529277
5.0
3.3174151001529277
2.0
2.453271983741209
1.0
2.5767210003714545
1.0
2.5767210003714545
3.5
3.1322415752075594
2.0
2.453271983741209
5.0
3.3174151001529277
1.0
2.5767210003714545
2.5
2.391547475426086
2.3333333333333335
2.412122311531127
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.576721000371

1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
2.0
2.453271983741209
1.0
2.5767210003714545
5.0
3.3174151001529277
2.0
2.453271983741209
5.0
3.3174151001529277
1.0
2.5767210003714545
1.0
2.5767210003714545
3.0
3.0705170668924366
5.0
3.3174151001529277
5.0
3.3174151001529277
1.5
2.5149964920563317
1.0
2.5767210003714545
1.0
2.5767210003714545
3.0
3.0705170668924366
5.0
3.3174151001529277
3.6
3.1445864768705842
5.0
3.3174151001529277
3.0
3.0705170668924366
3.0
3.0705170668924366
1.0
2.5767210003714545
2.7142857142857144
3.035245919283795
1.0
2.5767210003714545
3.5
3.1322415752075594
5.0
3.3174151001529277
2.75
3.0396548127348755
3.0
3.0705170668924366
1.0
2.5767210003714545
1.0
2.5767210003714545
5.0
3.3174151001529277
2.25
2.4224097295836473
3.0
3.0705170668924366
3.0
3.0705170668924366
3.0
3.0705170668924366
2.0
2.453271983741209
5.0
3.3174151001529277
5.0
3.3174151001529277
1.0
2.5767210003714545
3.0
3.0705170668924366
1.0
2.5767210003714545
2.0
2.453271983741209

3.1116667391025183
2.3333333333333335
2.412122311531127
1.0
2.5767210003714545
1.0
2.5767210003714545
5.0
3.3174151001529277
5.0
3.3174151001529277
1.0
2.5767210003714545
3.1666666666666665
3.0910919029974777
1.0
2.5767210003714545
2.3333333333333335
2.412122311531127
4.0
3.193966083522682
3.0
3.0705170668924366
3.0
3.0705170668924366
2.0
2.453271983741209
3.0
3.0705170668924366
5.0
3.3174151001529277
3.25
3.1013793210499983
2.0
2.453271983741209
1.0
2.5767210003714545
3.5
3.1322415752075594
1.0
2.5767210003714545
1.5
2.5149964920563317
1.0
2.5767210003714545
5.0
3.3174151001529277
5.0
3.3174151001529277
1.0
2.5767210003714545
2.0
2.453271983741209
2.0
2.453271983741209
1.0
2.5767210003714545
1.0
2.5767210003714545
3.0
3.0705170668924366
1.0
2.5767210003714545
2.4
2.4038923770891105
1.0
2.5767210003714545
3.75
3.163103829365121
3.0
3.0705170668924366
1.75
2.48413423789877
1.0
2.5767210003714545
2.3333333333333335
2.412122311531127
5.0
3.3174151001529277
5.0
3.3174151001529277
1.0
2.576

5.0
3.3174151001529277
4.0
3.193966083522682
1.0
2.5767210003714545
5.0
3.3174151001529277
1.0
2.5767210003714545
1.0
2.5767210003714545
3.0
3.0705170668924366
1.0
2.5767210003714545
1.0
2.5767210003714545
2.5
2.391547475426086
1.0
2.5767210003714545
3.5
3.1322415752075594
2.8
3.0458272635663874
2.0
2.453271983741209
1.0
2.5767210003714545
4.0
3.193966083522682
1.0
2.5767210003714545
1.0
2.5767210003714545
3.0
3.0705170668924366
4.0
3.193966083522682
1.0
2.5767210003714545
5.0
3.3174151001529277
2.0
2.453271983741209
1.5
2.5149964920563317
1.0
2.5767210003714545
2.5
2.391547475426086
3.0
3.0705170668924366
5.0
3.3174151001529277
3.0
3.0705170668924366
4.0
3.193966083522682
5.0
3.3174151001529277
1.0
2.5767210003714545
3.0
3.0705170668924366
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
3.0
3.0705170668924366
3.0
3.0705170668924366
1.0
2.5767210003714545
4.333333333333333
3.235115755732764
5.0
3.3174151001529277
3.5
3.1322415752075594
1.0
2.5767210003714545
3.3333

2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.5767210003714545
3.25
3.1013793210499983
3.5172413793103448
3.1343700065287705
1.0
2.5767210003714545
3.0
3.0705170668924366
3.0
3.0705170668924366
2.3333333333333335
2.412122311531127
3.0
3.0705170668924366
2.2857142857142856
2.4180008361325673
1.6666666666666667
2.4944216559512906
5.0
3.3174151001529277
1.0
2.5767210003714545
4.0
3.193966083522682
5.0
3.3174151001529277
1.0
2.5767210003714545
1.6666666666666667
2.4944216559512906
3.0
3.0705170668924366
2.5
2.391547475426086
1.25
2.545858746213893
1.0
2.5767210003714545
1.25
2.545858746213893
1.0
2.5767210003714545
1.0
2.5767210003714545
5.0
3.3174151001529277
1.0
2.5767210003714545
5.0
3.3174151001529277
5.0
3.3174151001529277
2.0
2.453271983741209
1.0
2.5767210003714545
3.0
3.0705170668924366
1.0
2.5767210003714545
5.0
3.3174151001529277
1.0
2.5767210003714545
1.0
2.5767210003714545
5.0
3.3174151001529277
4.0
3.193966083522682
1.0
2.5767210003714545
1.0
2.5767210

1.0
2.5767210003714545
5.0
3.3174151001529277
1.0
2.5767210003714545
3.0
3.0705170668924366
1.5
2.5149964920563317
1.0
2.5767210003714545
3.0
3.0705170668924366
5.0
3.3174151001529277
1.0
2.5767210003714545
2.0
2.453271983741209
3.142857142857143
3.0881526406967574
1.0
2.5767210003714545
3.0
3.0705170668924366
1.0
2.5767210003714545
4.0
3.193966083522682
1.0
2.5767210003714545
1.0
2.5767210003714545
2.8461538461538463
3.0515249104877835
3.1818181818181817
3.0929623426433905
5.0
3.3174151001529277
2.6666666666666665
2.370972639321045
5.0
3.3174151001529277
3.0
3.0705170668924366
1.6666666666666667
2.4944216559512906
3.0
3.0705170668924366
1.25
2.545858746213893
1.0
2.5767210003714545
4.666666666666667
3.276265427942846
1.0
2.5767210003714545
2.75
3.0396548127348755
4.0
3.193966083522682
2.6875
2.368400784807915
1.0
2.5767210003714545
2.5
2.391547475426086
1.5
2.5149964920563317
3.0
3.0705170668924366
5.0
3.3174151001529277
1.0
2.5767210003714545
1.0
2.5767210003714545
1.0
2.576721000371

2.0
2.453271983741209
1.0
2.5767210003714545
1.0
2.5767210003714545
2.0
2.453271983741209
5.0
3.3174151001529277
3.0
3.0705170668924366
2.5
2.391547475426086
2.0
2.453271983741209
1.0
2.5767210003714545
2.0
2.453271983741209
1.0
2.5767210003714545
5.0
3.3174151001529277
2.0
2.453271983741209
5.0
3.3174151001529277
1.0
2.5767210003714545
3.0
3.0705170668924366
1.0
2.5767210003714545
5.0
3.3174151001529277
1.0
2.5767210003714545
5.0
3.3174151001529277
3.6666666666666665
3.1528164113126005
4.0
3.193966083522682
1.0
2.5767210003714545
5.0
3.3174151001529277
2.0
2.453271983741209
3.0
3.0705170668924366
2.0
2.453271983741209
5.0
3.3174151001529277
5.0
3.3174151001529277
5.0
3.3174151001529277
5.0
3.3174151001529277
5.0
3.3174151001529277
5.0
3.3174151001529277
3.0
3.0705170668924366
3.0
3.0705170668924366
2.0
2.453271983741209
5.0
3.3174151001529277
3.5
3.1322415752075594
5.0
3.3174151001529277
2.0
2.453271983741209
1.0
2.5767210003714545
3.0
3.0705170668924366
5.0
3.3174151001529277
1.0
2.5

In [86]:
R_all4

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
userId,001a92a006061452ea53cadf1f1e1db985f3a51f,001e2ade35f2476b47c15cce7bcb39dafa89b97a,00277ccecc376837e57b6d6b58330d1bafc90c73,002d07853e3d855dc359da5bd23f10ff11444b36,0039f6a10a8afc639e621ec4a6601306bafd9adf,005a5746da2730d0b54578d629a2a48f785b4acf,00643c8f3e0931150a8d37d63ffbfb9620fd9ba3,006cc2d3a76f75a399098eee512b2f645a049fc1,0073e9df704415e9d72f75380ba88059fca4230a,007fcb24cab2dcbaa6dbf5ae4c7084c6538251b1,...,ffaffe99a3617739877b980a031a6c376236d2de,ffb274a3d8bcdfaa68355919cefd184b9ccb7c4a,ffccc2d709e6b0d19227412647fed266e67d1b08,ffd2a97221e5001ff3de05659f7d4a303d2c6d73,ffd8d4a17bb5013bcadeb0da06ad086b98d90cba,ffe19d4560159e0170f1be5bcda651fc022fc7bb,ffe27d9f69d660f9e7f17972c9c4a84fabf32188,ffe503ccc641e9050f17be8ba3805575bdaa9559,fff694cf332ed701dccbf17f1d9595ba8ad69f22,fff9cc8f46f8bcd31402bb9f82e020b5dc387f4b
artist,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
!!!,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,...,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721
#####,3.070517,3.070517,3.070517,3.070517,3.070517,3.070517,3.070517,3.070517,3.070517,3.070517,...,3.070517,3.070517,3.070517,3.070517,3.070517,3.070517,3.070517,3.070517,3.070517,3.070517
*nsync,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,...,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721
+44,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,...,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721
...and oceans,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,...,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ángeles del infierno,3.317415,3.317415,3.317415,3.317415,3.317415,3.317415,3.317415,3.317415,3.317415,3.317415,...,3.317415,3.317415,3.317415,3.317415,3.317415,3.317415,3.317415,3.317415,3.317415,3.317415
Çileke?,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,...,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721,2.576721
Édith piaf,3.152816,3.152816,3.152816,3.152816,3.152816,3.152816,3.152816,3.152816,3.152816,3.152816,...,3.152816,3.152816,3.152816,3.152816,3.152816,3.152816,3.152816,3.152816,3.152816,3.152816
Ólafur arnalds,3.317415,3.317415,3.317415,3.317415,3.317415,3.317415,3.317415,3.317415,3.317415,3.317415,...,3.317415,3.317415,3.317415,3.317415,3.317415,3.317415,3.317415,3.317415,3.317415,3.317415


In [103]:
# Find the indices of where the non NaN values were located
nonnull_indices = pd.DataFrame(np.transpose(np.where(R_all != None)))

In [104]:
nonnull_indices.columns = ['x', 'y']

In [105]:
len(nonnull_indices['x'])

51299587

In [106]:
unique_values = nonnull_indices['x'].unique()

In [107]:
len(unique_values)

5197

In [108]:
# Count occurrences of values in the first column
counts = nonnull_indices['x'].value_counts()

# Filter the DataFrame to include only the users who have multiple ratings across artists
nonnull_indices2 = nonnull_indices[nonnull_indices['x'].isin(counts[counts > 1].index)]

In [109]:
print(counts)

x
0       9871
3452    9871
3470    9871
3469    9871
3468    9871
        ... 
1731    9871
1730    9871
1729    9871
1728    9871
5196    9871
Name: count, Length: 5197, dtype: int64


In [110]:
print(nonnull_indices2.head())

   x  y
0  0  0
1  0  1
2  0  2
3  0  3
4  0  4


In [111]:
print(len(nonnull_indices2))

51299587


In [112]:
R_all4.iloc[1, 394]

5.0

In [113]:
duplicate_mask = nonnull_indices2.duplicated(subset='x', keep='first') & nonnull_indices2.duplicated(subset='x', keep=False)
nonnull_indices2 = nonnull_indices2[~duplicate_mask]

We will use leave-one-out cross validation to optimize the size of our non-negative matrix factorization. Now that we have identified the non-null indices of those artists who have multiple listeners in our dataframe, we will systematically iterate through this list for each potential choice of $k$ (from $3-10$, in line with the literature on recommender systems), and leave out a rating for this artist before performing the matrix factorization. We then take the dot product of the relevant artist row in $W$ and user column in $H$ to predict the missing value, and sum up the prediction error for every missing value across a single choice of $k$. We choose the value of $k$ associated with the lowest PRESS value (predicted residual sum of squares).

In [None]:
# leave-one-out cross validation
from sklearn.decomposition import NMF
for k in range(3,10):
    PRESS = 0
    for x_value in nonnull_indices2['x']:
        y_value = nonnull_indices2[nonnull_indices2['x'] == x_value]['y'].iloc[0]
        remove_coord = (x_value, y_value)
        R_cross_validated = R_all4[~R_all4.index.isin(remove_coord)]
        model = NMF(n_components=k, init='random', random_state=0)
        # Fit the model to the data
        W = model.fit_transform(R_cross_validated)  # Matrix of basis vectors (components)
   #     print(W)
        H = model.components_          # Matrix of coefficients
  #      print(H)
        predicted_value = np.dot(W[x_value], H[:, y_value])
        print(predicted_value)
        PRESS += np.abs(predicted_value - R_all4.iloc[x_value, y_value])
   #     error += rmse(R_all4, np.inner(W,H.T))
    print(k, PRESS)

2.7007576094484294
2.7002804278256916
2.701125657583377
2.7008912492325416
2.700162319588256
2.700245744970873
2.7000329709970137
2.7010409660323176
2.6989574903123503
2.7004253592848393
2.70091137603878
2.69843472382089
2.6994783847399226
2.700344024545627
2.701018423738948
2.6998709527133986
2.701288030604766
2.7018892249714335
2.7041754747929065
2.70526588655156
2.701567622121321
2.699879394941761
2.701866161257624
2.7015669546058048
2.6989017193036697
2.69998628722143
2.700338666150153
2.699573385026783
2.7007658229903213
2.6984757297013227
2.7000423290575
2.7004822629113825
2.7004389263882
2.700635287548331
2.6998209461177485
2.6998283853753855
2.699680536621944
2.6989335173768825
2.699995352736548
2.701121824647965
2.6994633297149253
2.7000959025152595
2.6985825505157774
2.699584277639451
2.704116172503945
2.698033697016295
2.6994725420583094
2.701289637675923
2.7002101521881334
2.700146841196931
2.6997175805339912
2.6997399931656445
2.7013500693618364
2.6996012484897443
2.699888

2.6992910034757878
2.700775465282193
2.700899249613471
2.6994127251482802
2.7016947185533753
2.699058795632972
2.6994137212121054
2.7012591871146077
2.6969356039003247
2.7004224180040426
2.699493008239751
2.7009018726925924
2.6999358680453702
2.699756289617248
2.70031838670981
2.6996900829068315
2.7014366269156316
2.7050145767390745
2.6997762517604533
2.699539404051277
2.70023082096298
2.6993193971070553
2.7006158773448314
2.6993473079919683
2.69967789430222
2.700119334448672
2.700957921434715
2.700957711662772
2.7018028851084055
2.699651972792874
2.70069610025231
2.7016957269903217
2.6992604554202853
2.6991540098411644
2.6999000998466247
2.701052914065896
2.699895488018108
2.7007574106872614
2.7032097314380805
2.6997026158072437
2.6983088190644713
2.699046562682627
2.6984035185720754
2.7017991847476264
2.6995751833321027
2.699813233114519
2.701678757562164
2.699931732339344
2.6992242320613062
2.699079906365143
2.7006336446663406
2.7011176284455365
2.699873810293578
2.699537219355804
2

2.6992336260572145
2.7004893496188394
2.7014114176270434
2.6996149426932234
2.700903396795521
2.7000093510588656
2.699812408718032
2.698051062389198
2.699095231299161
2.699083127686853
2.701463977955565
2.70111529049376
2.6992467658400785
2.6984727611124413
2.702425805467102
2.7018849209140328
2.7007362747162773
2.7004973773614807
2.7004901047267893
2.699287211593637
2.6993497269343645
2.7013122702635783
2.699923961338113
2.701180485789626
2.700457318980635
2.6993774787637363
2.699246096443133
2.698882705723641
2.699570143947967
2.699742718176511
2.6996592984246393
2.7002995055852574
2.699539995487869
2.700775176897978
2.6993953198452156
2.6994612227542927
2.7024472512040916
2.7000912820496237
2.7040749445592587
2.699248746958296
2.7003657392743143
2.6985993628368674
2.702575891234898
2.6993316262959315
2.701385574304881
2.700236540320964
2.700010829097701
2.7012677388459094
2.6999770984483935
2.699039754589545
2.6994537762065915
2.7011900734838594
2.700066028244836
2.6995699481798976


2.699482350827816
2.701068084070879
2.701115953400952
2.698803033868372
2.6998353584664265
2.699662313013217
2.698123459165413
2.7001129864096516
2.7001284461439568
2.701875078901141
2.6985173729299823
2.7017137473280695
2.7012819704340383
2.699808239192696
2.699398653203531
2.699807009908888
2.7008493092962955
2.6994992383687553
2.704968914598493
2.701132929356186
2.7022547927504803
2.7004524325919417
2.7009826562702486
2.6999107482811144
2.6987744222942562
2.6998860716447233
2.7007610839342977
2.7011580990216784
2.6992173066661307
2.701607388003686
2.7030842847748087
2.699994258621989
2.702274816790994
2.7003688477570664
2.6983466391876743
2.7007331497206586
2.7014222818867157
2.6994137478337
2.7002189386170317
2.6999583381644143
2.699534637839044
2.69935993728851
2.7016066882149237
2.6999111090997987
2.699129968051269
2.7009826311477214
2.7005972218529015
2.697546816864885
2.699856224339224
2.6995139190743958
2.7022400604012
2.699161492236919
2.699817498726812
2.7010722594663306
2.6

2.7012969965196336
2.7059650128923973
2.7008634978068393
2.700851406872408
2.7003053554087426
2.6999644734922392
2.6999577510611763
2.7001287922264474
2.7008669525049305
2.700002847592761
2.701287738909414
2.6992816892078455
2.700714697369383
2.699861397143569
2.7008001312003804
2.6993522279529496
2.6998645842304945
2.6996289181988704
2.69993514594674
2.7021534947919554
2.699461981411954
2.6998699054184527
2.7012356940436577
2.700530392185495
2.703167871604638
2.7011176320066674
2.702542140794635
2.7030733051788585
2.699838504179283
2.701521713267768
2.699682330126716
2.699579367443154
2.700418785955943
2.7015554121297667
2.6990238518855074
2.6992859003985368
2.699230949870507
2.699641319240733
2.7053853396140033
2.7001378068329984
2.7015593736491104
2.699752732339787
2.698895943893028
2.6987700235924867
2.7014372278758287
2.6989905083986363
2.6986778945857326
2.699278309740692
2.699701778943657
2.701221016016566
2.6983395210446472
2.6991477060613773
2.6982033005485553
2.70032021296816

2.700469776156481
2.6984358453725474
2.7002884521512716
2.700196473899634
2.699463956474694
2.699749504493966
2.6985603210820472
2.7000293560637534
2.6983882960509926
2.7006972550632504
2.7002461209656734
2.6988540130248264
2.699194522025056
2.7030662793039806
2.7012642483886893
2.6983667200977886
2.699905510629941
2.7027071685824637
2.701965188552615
2.700106241200556
2.6993284972130045
2.7015802648098766
2.699227420243547
2.700542365954023
2.6990526551978724
2.70026914454876
2.7002636578049852
2.7004095764038434
2.702630248844499
2.698962389897475
2.700780881112715
2.699767932157287
2.7039216134907353
2.7014194380617957
2.699401234364399
2.7009949944601237
2.6994431097046574
2.6990635525192195
2.7009947359010473
2.7015086170330376
2.700386282806049
2.6995356773608297
2.7007145789214824
2.699409831949381
2.699150014887718
2.6987604685247835
2.6995538574896805
2.7013152127977254
2.6995456111052247
2.699772683175991
2.699136479050652
2.701695560037573
2.7019041649408524
2.69915707479966

2.698814252919064
2.6992288475163693
2.701904233704119
2.6991556261175083
2.6985753694193466
2.702998983386105
2.699704396523679
2.6990407153456024
2.7010699217123566
2.698005551147776
2.699513074593322
2.7007099237787724
2.699917869155514
2.7003830176590817
2.6985888666765354
2.6994005262669787
2.700217989700998
2.699654735496228
2.699604298642882
2.69969133329975
2.706470133953368
2.701266399596502
2.700001328093933
2.698747966692035
2.6978886539364257
2.7016292620581677
2.6991413354621754
2.697980431289899
2.6989138997116635
2.697564603976543
2.699312539919556
2.6994355036040654
2.6993600298754408
2.7009148002062933
2.7034203359844557
2.69967288886647
2.6989663978522684
2.7010362407578405
2.700889152505345
2.704269899759564
2.6989572675246967
2.6996982128136473
2.701207190551015
2.6982417996977732
2.705753760811449
2.6984426810353854
2.6999738712345724
2.7021285994915583
2.700081403487983
2.7016340451462377
2.6984619136861268
2.7026968043284594
2.7032458505486217
2.6988923789614834


2.702016763188732
2.699987831575309
2.7021598160141256
2.701377484357716
2.699027758373843
2.7010433142083343
2.700144577795656
2.6993596610094817
2.700817460598542
2.6981177648265895
2.6990371502341954
2.7003151938788204
2.7005757491746345
2.7007070354107894
2.700000646425196
2.698965645040241
2.699515591350271
2.698971873978932
2.6991441787581945
2.7016728251190107
2.6991884329261993
2.7002752772252707
2.698126824187224
2.699274951854034
2.703360276053451
2.697978418532421
2.7001493925924
2.70126176285432
2.700489405459023
2.700113241334833
2.6995254480239828
2.6991905505208
2.701844218352457
2.6995214602891178
2.700425475421948
2.6991650493135606
2.697983611443975
2.700094082137638
2.6989097172447294
2.6985726167044133
2.7017459266098305
2.699784304553413
2.699120436822625
2.7000756732792714
2.6990744034623537
2.6995644380710777
2.7003819059993774
2.7004088079321016
2.7013245852522845
2.70081113988027
2.697519622372126
2.6998400677267322
2.701706136085764
2.699455187713161
2.7021424

In [106]:
# k-fold cross validation
# delete later
num_folds = 5
for k in range(3,10):
    PRESS = 0
    prev_fold = 0
    for fold in range(1, num_folds):
        indices = list(range(prev_fold, prev_fold + (len(nonzero_indices2) // num_folds)))
        print(indices)
        R_cross_validated = R_all4[~R_all4.index.isin(indices)]
        print(R_cross_validated.shape)
        model = NMF(n_components=k, init='random', random_state=0)
        # Fit the model to the data
        W = model.fit_transform(R_cross_validated)  # Matrix of basis vectors (components)
        H = model.components_          # Matrix of coefficients
        predicted_value = np.dot(W[x_value], H[:, y_value])
      #  print(predicted_value)
        PRESS += np.abs(predicted_value - R_all4.iloc[x_value, y_value])
        prev_fold = prev_fold + (len(nonzero_indices2) // num_folds)
   #     error += rmse(R_all4, np.inner(W,H.T))
    print(k, PRESS)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,

(5197, 9871)
2.70103930244725
[616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809,

(5197, 9871)
2.7009462185741837
5 9.196215125703265
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 21

(5197, 9871)
2.7022518489618568
[616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 80

(5197, 9871)
2.7039429281964598
8 9.184228287214161
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 21

## ARMAN'S PART

amaze me - Sammy

## Default Recommendation
    
We want to provide default recommendations to users with no previous reviews based solely based on secondary characteristics (sex, age, country).

In [31]:
rec_data['country'].value_counts().shape

(146,)

In [32]:
rec_data['age'].value_counts().shape

(85,)

In [33]:
one_hot_encoded = pd.get_dummies(rec_data['gender'], dtype=int)
#one_hot_encoded2 = pd.get_dummies(rec_data['age'], prefix='Age', dtype=int)
one_hot_encoded3 = pd.get_dummies(rec_data['country'], dtype=int)

# Concatenate the one-hot encoded columns with the original DataFrame
rec_data2 = pd.concat([rec_data, one_hot_encoded, one_hot_encoded3], axis=1)
rec_data2.drop(['Unnamed: 0', 'artist', 'userId', 'artistId', 'plays', 'gender', 'country', 'signupDate', 'rating'], axis = 1, inplace=True)

In [34]:
rec_data2.shape

(10000, 149)

In [35]:
rec_data2.head()

Unnamed: 0,age,f,m,Afghanistan,Algeria,Andorra,Antarctica,Argentina,Aruba,Australia,...,United States,United States Minor Outlying Islands,Uruguay,Uzbekistan,Vanuatu,Venezuela,Viet Nam,"Virgin Islands, British",Wallis and Futuna,Zimbabwe
1875,26.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
334,24.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8189,20.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6437,20.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
337,19.0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [36]:
pd.isnull(rec_data2).sum(axis = 0)

age                        2040
f                             0
m                             0
Afghanistan                   0
Algeria                       0
                           ... 
Venezuela                     0
Viet Nam                      0
Virgin Islands, British       0
Wallis and Futuna             0
Zimbabwe                      0
Length: 149, dtype: int64

In [37]:
# imputing mean age for those users whose ages are missing
rec_data2['age'] = rec_data2['age'].fillna(np.mean(rec_data2['age']))

In [38]:
from sklearn.cluster import KMeans

# Number of clusters (you can set this as per your requirement)
num_clusters = 5

# Apply K-means clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(rec_data2)

# Get cluster labels for each user
cluster_labels = kmeans.labels_

# Add cluster labels to the DataFrame
rec_data['Cluster'] = cluster_labels

  super()._check_params_vs_input(X, default_n_init=10)


In [39]:
rec_data['Cluster']

1875    3
334     3
8189    0
6437    0
337     0
       ..
3699    4
9650    0
3809    2
4756    2
6124    0
Name: Cluster, Length: 10000, dtype: int32

In [40]:
rec_data['Cluster'].value_counts()

Cluster
3    4799
0    3579
2    1317
4     258
1      47
Name: count, dtype: int64

In [41]:
rec_data.head()

Unnamed: 0.1,Unnamed: 0,userId,artistId,artist,plays,gender,age,country,signupDate,rating,Cluster
1875,5375648,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,f26c72d3-e52c-467b-b651-679c73d8e1a7,!!!,28,m,26.0,United Kingdom,20051221,1,3
334,10340416,9901740c9c7ba5ac45bfa4e044d7aff496b1fbab,537db97d-372b-4648-8633-f1dcf52b1f47,#####,41,m,24.0,Russian Federation,20080204,1,3
8189,772229,0b749f78e378d5212a95919d13835335e2c7c55d,537db97d-372b-4648-8633-f1dcf52b1f47,#####,1458,,20.0,Russian Federation,20051214,5,0
6437,3639551,36049eca9ae1a1deea28977ec092217059cd0cfe,603ba565-3967-4be1-931e-9cb945394e86,*nsync,9,f,20.0,Netherlands,20080918,1,0
337,13317975,c52649f81e88755be839d6aed0b549e2432326b2,c2a44e93-3a2b-44aa-bd8b-7a71bb76e3b5,+44,66,m,19.0,Australia,20080701,1,0


In [42]:
cluster0_artists = []
cluster1_artists = []
cluster2_artists = []
cluster3_artists = []
cluster4_artists = []

for i in range(len(rec_data)):
    if rec_data.loc[i]['Cluster'] == 0:
        cluster0_artists.append({'rating': rec_data.loc[i]['rating'], 'artist': rec_data.loc[i]['artist']})
    elif rec_data.loc[i]['Cluster'] == 1:
        cluster1_artists.append({'rating': rec_data.loc[i]['rating'], 'artist': rec_data.loc[i]['artist']})
    elif rec_data.loc[i]['Cluster'] == 2:
        cluster2_artists.append({'rating': rec_data.loc[i]['rating'], 'artist': rec_data.loc[i]['artist']})
    elif rec_data.loc[i]['Cluster'] == 3:
        cluster3_artists.append({'rating': rec_data.loc[i]['rating'], 'artist': rec_data.loc[i]['artist']})
    elif rec_data.loc[i]['Cluster'] == 4:
        cluster4_artists.append({'rating': rec_data.loc[i]['rating'], 'artist': rec_data.loc[i]['artist']})

In [43]:
print(cluster0_artists)

[{'rating': 1, 'artist': 'murat boz'}, {'rating': 5, 'artist': 'röyksopp'}, {'rating': 5, 'artist': 'blink-182'}, {'rating': 1, 'artist': 'red hot chili peppers'}, {'rating': 2, 'artist': 'cascada'}, {'rating': 3, 'artist': 'pendulum'}, {'rating': 3, 'artist': 'bløf'}, {'rating': 5, 'artist': 'the juliana theory'}, {'rating': 3, 'artist': 'peeping tom'}, {'rating': 5, 'artist': 'mr.children'}, {'rating': 1, 'artist': 'alesana'}, {'rating': 5, 'artist': 'nitin sawhney'}, {'rating': 1, 'artist': 'lady gaga'}, {'rating': 3, 'artist': 'have heart'}, {'rating': 3, 'artist': "guns n' roses"}, {'rating': 1, 'artist': 'katie melua'}, {'rating': 4, 'artist': 'trans-siberian orchestra'}, {'rating': 5, 'artist': 'mother vulpine'}, {'rating': 5, 'artist': 'the white stripes'}, {'rating': 4, 'artist': 'maxïmo park'}, {'rating': 1, 'artist': 'paolo nutini'}, {'rating': 5, 'artist': 'radiohead'}, {'rating': 5, 'artist': 'los bunkers'}, {'rating': 5, 'artist': 'deftones'}, {'rating': 4, 'artist': 'dei

In [44]:
cluster0_artists = sorted(cluster0_artists, key=lambda x: x['rating'], reverse=True)
cluster1_artists = sorted(cluster1_artists, key=lambda x: x['rating'], reverse=True)
cluster2_artists = sorted(cluster2_artists, key=lambda x: x['rating'], reverse=True)
cluster3_artists = sorted(cluster3_artists, key=lambda x: x['rating'], reverse=True)
cluster4_artists = sorted(cluster4_artists, key=lambda x: x['rating'], reverse=True)

In [45]:
print(cluster0_artists)

[{'rating': 5, 'artist': 'röyksopp'}, {'rating': 5, 'artist': 'blink-182'}, {'rating': 5, 'artist': 'the juliana theory'}, {'rating': 5, 'artist': 'mr.children'}, {'rating': 5, 'artist': 'nitin sawhney'}, {'rating': 5, 'artist': 'mother vulpine'}, {'rating': 5, 'artist': 'the white stripes'}, {'rating': 5, 'artist': 'radiohead'}, {'rating': 5, 'artist': 'los bunkers'}, {'rating': 5, 'artist': 'deftones'}, {'rating': 5, 'artist': 'stacie orrico'}, {'rating': 5, 'artist': 'mimi maura'}, {'rating': 5, 'artist': 'firebug'}, {'rating': 5, 'artist': 'widespread panic'}, {'rating': 5, 'artist': 'against me!'}, {'rating': 5, 'artist': 'the verve'}, {'rating': 5, 'artist': 'gangsta boo'}, {'rating': 5, 'artist': 'clutch'}, {'rating': 5, 'artist': 'ruiner'}, {'rating': 5, 'artist': 'l.a.o.s.'}, {'rating': 5, 'artist': 'my chemical romance'}, {'rating': 5, 'artist': 'soundtrack'}, {'rating': 5, 'artist': 'the killers'}, {'rating': 5, 'artist': 'neglected fields'}, {'rating': 5, 'artist': 'john wi

Here is an example of how we could assign recommendations to a new user without number of plays or artist information.

In [46]:
new_user = pd.DataFrame([[249130, '178e7gd3278dy238d732y823yd92', None, None, None, 'm', 21.0, 'United States', 'Dec 31, 2001']], columns=['Unnamed: 0', 'userId', 'artistId', 'artist', 'plays', 'gender', 'age',
       'country', 'signupDate'])

In [47]:
rec_data2.columns

Index(['age', 'f', 'm', 'Afghanistan', 'Algeria', 'Andorra', 'Antarctica',
       'Argentina', 'Aruba', 'Australia',
       ...
       'United States', 'United States Minor Outlying Islands', 'Uruguay',
       'Uzbekistan', 'Vanuatu', 'Venezuela', 'Viet Nam',
       'Virgin Islands, British', 'Wallis and Futuna', 'Zimbabwe'],
      dtype='object', length=149)

In [48]:
import pprint

# Convert DataFrame columns to a list
column_list = rec_data2.columns.tolist()

# Display the column names without truncation
pprint.pprint(column_list, width=200)  # Adjust width as needed

['age',
 'f',
 'm',
 'Afghanistan',
 'Algeria',
 'Andorra',
 'Antarctica',
 'Argentina',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Barbados',
 'Belarus',
 'Belgium',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Cambodia',
 'Canada',
 'Cayman Islands',
 'Chile',
 'China',
 'Christmas Island',
 'Cocos (Keeling) Islands',
 'Colombia',
 'Costa Rica',
 "Cote D'Ivoire",
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech Republic',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Estonia',
 'Faroe Islands',
 'Fiji',
 'Finland',
 'France',
 'French Southern Territories',
 'Georgia',
 'Germany',
 'Greece',
 'Guatemala',
 'Guinea-Bissau',
 'Haiti',
 'Heard Island and Mcdonald Islands',
 'Holy See (Vatican City State)',
 'Honduras',
 'Hong Kong',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran, Islamic Republic of',
 'Ire

In [49]:
new_columns = ['f','m','Afghanistan','Algeria','Andorra','Antarctica','Argentina','Aruba','Australia','Austria','Azerbaijan','Bahamas',
               'Bahrain','Barbados','Belarus','Belgium','Bermuda','Bhutan','Bolivia','Bosnia and Herzegovina','Botswana','Brazil',
               'Brunei Darussalam','Bulgaria','Burkina Faso','Cambodia','Canada','Cayman Islands','Chile','China','Christmas Island',
               'Cocos (Keeling) Islands','Colombia','Costa Rica',"Cote D'Ivoire",'Croatia','Cuba','Cyprus','Czech Republic','Denmark',
               'Djibouti','Dominica','Dominican Republic','Ecuador','Egypt','El Salvador','Estonia','Faroe Islands','Fiji','Finland',
               'France','French Southern Territories','Georgia','Germany','Greece','Guatemala','Guinea-Bissau','Haiti','Heard Island and Mcdonald Islands',
               'Holy See (Vatican City State)','Honduras','Hong Kong','Hungary','Iceland','India','Indonesia','Iran, Islamic Republic of',
               'Ireland','Israel','Italy','Jamaica','Japan','Kazakhstan','Kiribati',"Korea, Democratic People's Republic of",'Korea, Republic of',
               'Kuwait','Kyrgyzstan','Latvia','Lebanon','Libyan Arab Jamahiriya','Liechtenstein','Lithuania','Luxembourg','Macedonia',
               'Malaysia','Malta','Martinique','Mexico','Micronesia, Federated States of','Moldova','Mongolia','Montenegro','Morocco',
               'Nauru','Netherlands','Netherlands Antilles','New Zealand','Nicaragua','Niue','Norway','Pakistan','Panama','Papua New Guinea',
               'Paraguay','Peru','Philippines','Pitcairn','Poland','Portugal','Puerto Rico','Reunion','Romania','Russian Federation',
               'Sao Tome and Principe','Saudi Arabia','Serbia','Sierra Leone','Singapore','Slovakia','Slovenia','Solomon Islands',
               'Somalia','South Africa','Spain','Sri Lanka','Sweden','Switzerland','Taiwan','Thailand','Togo','Tunisia','Turkey',
               'Turks and Caicos Islands','Tuvalu','Ukraine','United Arab Emirates','United Kingdom','United States','United States Minor Outlying Islands',
               'Uruguay','Uzbekistan','Vanuatu','Venezuela','Viet Nam','Virgin Islands, British','Wallis and Futuna','Zimbabwe']

In [50]:
for col in new_columns:
    new_user[col] = 0

  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0


In [51]:
new_user

Unnamed: 0.1,Unnamed: 0,userId,artistId,artist,plays,gender,age,country,signupDate,f,...,United States,United States Minor Outlying Islands,Uruguay,Uzbekistan,Vanuatu,Venezuela,Viet Nam,"Virgin Islands, British",Wallis and Futuna,Zimbabwe
0,249130,178e7gd3278dy238d732y823yd92,,,,m,21.0,United States,"Dec 31, 2001",0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
for country in new_user.columns:
    if country == 'country':
        unique_countries = new_user['country'].unique()
        for c in unique_countries:
            new_user[c] = (new_user['country'] == c).astype(int)
        new_user.drop('country', axis=1, inplace=True)

In [53]:
for gender in new_user.columns:
    if gender == 'gender':
        unique_gender = new_user['gender'].unique()
        for g in unique_gender:
            new_user[g] = (new_user['gender'] == g).astype(int)
        new_user.drop('gender', axis=1, inplace=True)

In [54]:
new_user

Unnamed: 0.1,Unnamed: 0,userId,artistId,artist,plays,age,signupDate,f,m,Afghanistan,...,United States,United States Minor Outlying Islands,Uruguay,Uzbekistan,Vanuatu,Venezuela,Viet Nam,"Virgin Islands, British",Wallis and Futuna,Zimbabwe
0,249130,178e7gd3278dy238d732y823yd92,,,,21.0,"Dec 31, 2001",0,1,0,...,1,0,0,0,0,0,0,0,0,0


In [55]:
rec_data2.columns

Index(['age', 'f', 'm', 'Afghanistan', 'Algeria', 'Andorra', 'Antarctica',
       'Argentina', 'Aruba', 'Australia',
       ...
       'United States', 'United States Minor Outlying Islands', 'Uruguay',
       'Uzbekistan', 'Vanuatu', 'Venezuela', 'Viet Nam',
       'Virgin Islands, British', 'Wallis and Futuna', 'Zimbabwe'],
      dtype='object', length=149)

In [56]:
new_user.drop(['Unnamed: 0', 'userId', 'artistId', 'artist', 'plays', 'signupDate'], axis=1, inplace=True)

In [57]:
new_user

Unnamed: 0,age,f,m,Afghanistan,Algeria,Andorra,Antarctica,Argentina,Aruba,Australia,...,United States,United States Minor Outlying Islands,Uruguay,Uzbekistan,Vanuatu,Venezuela,Viet Nam,"Virgin Islands, British",Wallis and Futuna,Zimbabwe
0,21.0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [58]:
# Predict clusters for new user(s)
predicted_cluster = kmeans.predict(new_user)

In [59]:
print(predicted_cluster)

[0]


The new user has been categorized as belonging to cluster 0.

We want to find the top artists for each cluster, judged by criteria of highest rated, with more emphasis given to artists with more positive ratings.

The algorithm we have written iterates through each artist - user rating pair for a particular cluster, then updates the "ranking rating" for sorting the cluster. If both the current "ranking rating" and the user rating we are considering are below the mean rating of the dataset, we take the mean of the two, then substract one. If they are both above, we take the mean of the two then add one. If one is above the overall mean and the other is below, we simply take the mean of both values. This is to weigh the cumulative effect of the ratings more highly than simply taking the average over all ratings. So if an artist is getting repeatedly high ratings across multiple users in the cluster, we want to rank this artist more highly in our recommendations than an artist who only received one positive rating from the users in the cluster.

In [60]:
cluster0_artists

[{'rating': 5, 'artist': 'röyksopp'},
 {'rating': 5, 'artist': 'blink-182'},
 {'rating': 5, 'artist': 'the juliana theory'},
 {'rating': 5, 'artist': 'mr.children'},
 {'rating': 5, 'artist': 'nitin sawhney'},
 {'rating': 5, 'artist': 'mother vulpine'},
 {'rating': 5, 'artist': 'the white stripes'},
 {'rating': 5, 'artist': 'radiohead'},
 {'rating': 5, 'artist': 'los bunkers'},
 {'rating': 5, 'artist': 'deftones'},
 {'rating': 5, 'artist': 'stacie orrico'},
 {'rating': 5, 'artist': 'mimi maura'},
 {'rating': 5, 'artist': 'firebug'},
 {'rating': 5, 'artist': 'widespread panic'},
 {'rating': 5, 'artist': 'against me!'},
 {'rating': 5, 'artist': 'the verve'},
 {'rating': 5, 'artist': 'gangsta boo'},
 {'rating': 5, 'artist': 'clutch'},
 {'rating': 5, 'artist': 'ruiner'},
 {'rating': 5, 'artist': 'l.a.o.s.'},
 {'rating': 5, 'artist': 'my chemical romance'},
 {'rating': 5, 'artist': 'soundtrack'},
 {'rating': 5, 'artist': 'the killers'},
 {'rating': 5, 'artist': 'neglected fields'},
 {'rating

In [61]:
mean_value = np.nanmean(R_all.values)

artist_ratings0 = {}
artist_ratings1 = {}
artist_ratings2 = {}
artist_ratings3 = {}
artist_ratings4 = {}

for dict_ in cluster0_artists:
    if dict_['artist'] not in artist_ratings0.keys():
        artist_ratings0.update({dict_['artist']: dict_['rating']})
    else:
        prev_rating = artist_ratings0[dict_['artist']]
        artist_ratings0.pop(dict_['artist'])
        if (dict_['rating'] < mean_value) and (prev_rating < mean_value):
            artist_ratings0.update({dict_['artist']: ((prev_rating + dict_['rating']) / 2) - 1})
        elif (dict_['rating'] > mean_value) and (prev_rating > mean_value):
            artist_ratings0.update({dict_['artist']: ((prev_rating + dict_['rating']) / 2) + 1})
        else:
            artist_ratings0.update({dict_['artist']: ((prev_rating + dict_['rating']) / 2)})
            
for dict_ in cluster1_artists:
    if dict_['artist'] not in artist_ratings1.keys():
        artist_ratings1.update({dict_['artist']: dict_['rating']})
    else:
        prev_rating = artist_ratings1[dict_['artist']]
        artist_ratings1.pop(dict_['artist'])
        if (dict_['rating'] < mean_value) and (prev_rating < mean_value):
            artist_ratings1.update({dict_['artist']: ((prev_rating + dict_['rating']) / 2) - 1})
        elif (dict_['rating'] > mean_value) and (prev_rating > mean_value):
            artist_ratings1.update({dict_['artist']: ((prev_rating + dict_['rating']) / 2) + 1})
        else:
            artist_ratings1.update({dict_['artist']: ((prev_rating + dict_['rating']) / 2)})
            
for dict_ in cluster2_artists:
    if dict_['artist'] not in artist_ratings2.keys():
        artist_ratings2.update({dict_['artist']: dict_['rating']})
    else:
        prev_rating = artist_ratings2[dict_['artist']]
        artist_ratings2.pop(dict_['artist'])
        if (dict_['rating'] < mean_value) and (prev_rating < mean_value):
            artist_ratings2.update({dict_['artist']: ((prev_rating + dict_['rating']) / 2) - 1})
        elif (dict_['rating'] > mean_value) and (prev_rating > mean_value):
            artist_ratings2.update({dict_['artist']: ((prev_rating + dict_['rating']) / 2) + 1})
        else:
            artist_ratings2.update({dict_['artist']: ((prev_rating + dict_['rating']) / 2)})
            
for dict_ in cluster3_artists:
    if dict_['artist'] not in artist_ratings3.keys():
        artist_ratings3.update({dict_['artist']: dict_['rating']})
    else:
        prev_rating = artist_ratings3[dict_['artist']]
        artist_ratings3.pop(dict_['artist'])
        if (dict_['rating'] < mean_value) and (prev_rating < mean_value):
            artist_ratings3.update({dict_['artist']: ((prev_rating + dict_['rating']) / 2) - 1})
        elif (dict_['rating'] > mean_value) and (prev_rating > mean_value):
            artist_ratings3.update({dict_['artist']: ((prev_rating + dict_['rating']) / 2) + 1})
        else:
            artist_ratings3.update({dict_['artist']: ((prev_rating + dict_['rating']) / 2)})
            
for dict_ in cluster4_artists:
    if dict_['artist'] not in artist_ratings4.keys():
        artist_ratings4.update({dict_['artist']: dict_['rating']})
    else:
        prev_rating = artist_ratings4[dict_['artist']]
        artist_ratings4.pop(dict_['artist'])
        if (dict_['rating'] < mean_value) and (prev_rating < mean_value):
            artist_ratings4.update({dict_['artist']: ((prev_rating + dict_['rating']) / 2) - 1})
        elif (dict_['rating'] > mean_value) and (prev_rating > mean_value):
            artist_ratings4.update({dict_['artist']: ((prev_rating + dict_['rating']) / 2) + 1})
        else:
            artist_ratings4.update({dict_['artist']: ((prev_rating + dict_['rating']) / 2)})

In [62]:
print(artist_ratings0)

{'the juliana theory': 5, 'mr.children': 5, 'mother vulpine': 5, 'los bunkers': 5, 'stacie orrico': 5, 'mimi maura': 5, 'firebug': 5, 'widespread panic': 5, 'against me!': 5, 'gangsta boo': 5, 'ruiner': 5, 'l.a.o.s.': 5, 'soundtrack': 5, 'neglected fields': 5, 'john williams': 5, 'the avalanches': 5, 'the black heart procession': 5, 'vanessa da mata': 5, 'the berzerker': 5, 'kudai': 5, 'tamtrum': 5, 'does it offend you, yeah?': 5, 'exclaim': 5, 'al green': 5, 'blossom dearie': 5, 'treaty of paris': 5, 'the crimson ghosts': 5, 'angelus apatrida': 5, 'sloth': 5, 'the honorary title': 5, 'ott': 5, 'akercocke': 5, 'monster magnet': 5, 'cut copy': 5, 'his hero is gone': 5, '16volt': 5, 'azad': 5, 'yendri': 5, 'rebecca st. james': 5, 'the doobie brothers': 5, 'esbjörn svensson trio': 5, 'marcus miller': 5, 'black uhuru': 5, 'roger subirana': 5, 'pillar': 5, 'rumpistol': 5, 'the misfits': 5, 'lorentz & m.sakarias': 5, 'del tha funkee homosapien': 5, 'deathstars': 5, 'the cribs': 5, 'patti smi

In [63]:
sorted_artist_ratings0 = dict(sorted(artist_ratings0.items(), key=lambda item: item[1], reverse=True))
sorted_artist_ratings1 = dict(sorted(artist_ratings1.items(), key=lambda item: item[1], reverse=True))
sorted_artist_ratings2 = dict(sorted(artist_ratings2.items(), key=lambda item: item[1], reverse=True))
sorted_artist_ratings3 = dict(sorted(artist_ratings3.items(), key=lambda item: item[1], reverse=True))
sorted_artist_ratings4 = dict(sorted(artist_ratings4.items(), key=lambda item: item[1], reverse=True))

In [64]:
print(sorted_artist_ratings0)

{'devendra banhart': 6.75, 'rush': 6.5, 'bring me the horizon': 6.5, '????????': 6.5, 'the mars volta': 6.5, 'dream theater': 6.1875, 'foreigner': 6.0, 'ratatat': 6.0, 'the smashing pumpkins': 6.0, 'depeche mode': 6.0, 'trial by fire': 6.0, 'the national': 6.0, 'dir en grey': 6.0, 'thrice': 6.0, 'yellowcard': 6.0, 'pulp': 6.0, 'franz ferdinand': 6.0, 'forgive durden': 6.0, 'madball': 6.0, 'gary moore': 6.0, 'moonsorrow': 6.0, 'rilo kiley': 6.0, 'kaizers orchestra': 6.0, 'kapasiteettiyksikkö': 6.0, "blackmore's night": 6.0, 'taylor swift': 6.0, 'demons & wizards': 6.0, 'arch enemy': 6.0, 'capital inicial': 6.0, 'indios bravos': 6.0, 'dane cook': 6.0, 'horrorpops': 6.0, 'adolescents': 6.0, 'showbread': 6.0, 'cute is what we aim for': 6.0, 'the sorrow': 6.0, 'frank zappa': 6.0, 'flogging molly': 6.0, 'slayer': 6.0, 'alexisonfire': 6.0, 'three days grace': 5.75, 'billy talent': 5.75, 'three 6 mafia': 5.5, 'kasabian': 5.5, 'venetian snares': 5.5, 'bob marley & the wailers': 5.5, 'coil': 5.5

In [65]:
cluster0_top_artists = list(sorted_artist_ratings0.keys())[:10]
cluster1_top_artists = list(sorted_artist_ratings1.keys())[:10]
cluster2_top_artists = list(sorted_artist_ratings2.keys())[:10]
cluster3_top_artists = list(sorted_artist_ratings3.keys())[:10]
cluster4_top_artists = list(sorted_artist_ratings4.keys())[:10]

In [66]:
# prev way of getting top artists - delete later
'''
cluster0_top_artists = [artist for artist in cluster0_artists if artist.get('rating') == 5]
cluster1_top_artists = [artist for artist in cluster1_artists if artist.get('rating') == 5]
cluster2_top_artists = [artist for artist in cluster2_artists if artist.get('rating') == 5]
cluster3_top_artists = [artist for artist in cluster3_artists if artist.get('rating') == 5]
cluster4_top_artists = [artist for artist in cluster4_artists if artist.get('rating') == 5]
'''

"\ncluster0_top_artists = [artist for artist in cluster0_artists if artist.get('rating') == 5]\ncluster1_top_artists = [artist for artist in cluster1_artists if artist.get('rating') == 5]\ncluster2_top_artists = [artist for artist in cluster2_artists if artist.get('rating') == 5]\ncluster3_top_artists = [artist for artist in cluster3_artists if artist.get('rating') == 5]\ncluster4_top_artists = [artist for artist in cluster4_artists if artist.get('rating') == 5]\n"

In [67]:
print(cluster0_top_artists)

['devendra banhart', 'rush', 'bring me the horizon', '????????', 'the mars volta', 'dream theater', 'foreigner', 'ratatat', 'the smashing pumpkins', 'depeche mode']


Randomly select 5 recommendations from cluster0_top_artists - having a changing slate of recommendations contributes to user enjoyment

In [68]:
import random

k = 5 # change to how ever many recommendations you want to show
new_user_recommendations2 = []
new_user_recommendations = random.sample(cluster0_top_artists, k=k)
for recommendation in new_user_recommendations:
    new_user_recommendations2.append(recommendation)

In [69]:
print(new_user_recommendations2)

['foreigner', 'the smashing pumpkins', 'ratatat', '????????', 'bring me the horizon']
