In [1]:
import requests
import json
import numpy as np
import pandas as pd

In [2]:
rec_data = pd.read_csv('C:/Users/sammy/Downloads/merged-subset.csv', encoding='cp1252')

In [3]:
rec_data.shape

(10000, 9)

In [4]:
rec_data.head()

Unnamed: 0.1,Unnamed: 0,userId,artistId,artist,plays,gender,age,country,signupDate
0,249130,03b9c645bc4f578ea1dcb6a975e7ea71fab79da2,a16371b9-7d36-497a-a9d4-42b0a0440c5e,slowdive,108,m,28.0,Germany,"Sep 16, 2006"
1,16103327,ee48ac7de691d31176a2843d0f2f8f49cab78e6d,5a24bc1a-c093-4a82-84ed-8d7f2da0570d,skinny puppy,407,m,,Canada,"Dec 14, 2006"
2,12079167,b2e3d27a00fc3035edb439b14e6d30bcb6dddf69,68f1175b-592b-4f28-ab1c-85c7a438c636,turbonegro,91,m,29.0,United Kingdom,"Sep 23, 2005"
3,12210583,b4d4909255dc5bb404ff64866cac60164b465e9a,cfd3727e-7162-4e6a-a3f7-a59343ee6b8d,honey is cool,50,,,Denmark,"Dec 22, 2008"
4,7937614,758de8850b5a07875da1e23296e4021574be8596,3bb24e11-821a-4e4e-bd89-e0a2452474cf,useless id,108,m,25.0,Serbia,"Mar 15, 2009"


# DATA ANALYSIS

In [5]:
print('Min. number of plays: ', rec_data['plays'].min())
print('Max. number of plays: ', rec_data['plays'].max())
avg_plays = rec_data['plays'].mean()

Min. number of plays:  1
Max. number of plays:  12558


In [6]:
from datetime import datetime

# Parse the date string into a datetime object
rec_data['signupDate'] = rec_data['signupDate'].apply(lambda x: datetime.strptime(x, '%b %d, %Y'))

# Convert the datetime object to the desired output format "YYYYMMDD"
rec_data['signupDate'] = rec_data['signupDate'].apply(lambda x: x.strftime('%Y%m%d'))
rec_data['signupDate'] = rec_data['signupDate'].apply(int)

print(rec_data['signupDate'].head())  # Output: 20000101

0    20060916
1    20061214
2    20050923
3    20081222
4    20090315
Name: signupDate, dtype: int64


In [7]:
print('First signup date', datetime.strptime(str(rec_data['signupDate'].min()), '%Y%m%d'))
print('Last signup date', datetime.strptime(str(rec_data['signupDate'].max()), '%Y%m%d'))
last_date = rec_data['signupDate'].max()

First signup date 2002-10-29 00:00:00
Last signup date 2009-11-02 00:00:00


In [8]:
rec_data['userId'].value_counts()

userId
ee93d79f9b97cc03067dd3f4d90f6137cb301229    3
7de1e03e8a9b649bcf42ecde96ab301346601c53    3
32ff4f218c44472229c5d16de21a35e294fd5d10    2
a0549f97f461dc75b195aaa8324c05f0fa766851    2
45d0687e694fcef22de4660ef6e51e950cfb34b5    2
                                           ..
6371191a295fb7babeb98d813159bf2f4fb89c93    1
0b4c3386859abb3db2ca5f3c246156081baa6638    1
210c99699f2db1dfceb200e687c82da357e477a5    1
a2d2881ad3df0a75f450276c6b3287c04cfb0b39    1
57719e71d1265ba42aeca1e3bec5324f3d9a3714    1
Name: count, Length: 9871, dtype: int64

# Suliman-Keshavarz Algorithm

$$\text{rating(track, user)} = \text{max}(1, \text{min}(5, (\lfloor \frac{\text{num. plays(track)}}{\text{avg. plays}} \cdot 5) - k \cdot (\frac{\text{last signup date} - \text{signup date(user)}}{\text{last signup date}}) \rfloor))$$

We establish an algorithm to convert number of plays to a rating between 0 and 5 based on associating greater number of plays with greater song enjoyment, with the rating being proportional to the average number of plays recorded in the dataset. We use a weighting factor $k$ to minimize the influence of signing up earlier to the platform on a given user's ratings, as this may have resulted in some users having more opportunity to have their plays recorded in the dataset. We round down to the nearest whole number and take the max between this score and $1$ to negate the possibility of a negative or zero rating.

In [9]:
def suliman_keshavarz(data, k=0.05):
    return round(max(1, min(5, ((data['plays'] / avg_plays) * 5)) - k * ((last_date - data['signupDate']) / last_date)))

In [10]:
rec_data['rating'] = rec_data.apply(suliman_keshavarz, axis = 1)

In [11]:
rec_data.head()

Unnamed: 0.1,Unnamed: 0,userId,artistId,artist,plays,gender,age,country,signupDate,rating
0,249130,03b9c645bc4f578ea1dcb6a975e7ea71fab79da2,a16371b9-7d36-497a-a9d4-42b0a0440c5e,slowdive,108,m,28.0,Germany,20060916,2
1,16103327,ee48ac7de691d31176a2843d0f2f8f49cab78e6d,5a24bc1a-c093-4a82-84ed-8d7f2da0570d,skinny puppy,407,m,,Canada,20061214,5
2,12079167,b2e3d27a00fc3035edb439b14e6d30bcb6dddf69,68f1175b-592b-4f28-ab1c-85c7a438c636,turbonegro,91,m,29.0,United Kingdom,20050923,2
3,12210583,b4d4909255dc5bb404ff64866cac60164b465e9a,cfd3727e-7162-4e6a-a3f7-a59343ee6b8d,honey is cool,50,,,Denmark,20081222,1
4,7937614,758de8850b5a07875da1e23296e4021574be8596,3bb24e11-821a-4e4e-bd89-e0a2452474cf,useless id,108,m,25.0,Serbia,20090315,2


In [12]:
rec_data['rating'].value_counts()

rating
1    4015
5    2850
2    1400
3    1006
4     729
Name: count, dtype: int64

# DATA CLEANING

In [13]:
rec_data = rec_data.sort_values(by='artist')

In [14]:
ratings = rec_data[['userId', 'rating', 'artist']]

In [15]:
unique_users = rec_data['userId'].unique()
unique_artists = rec_data['artist'].unique()

In [16]:
user_artist_combinations = pd.DataFrame([(artist, user) for artist in unique_artists for user in unique_users],
                                        columns=['artist', 'userId'])

In [17]:
user_artist_combinations.head()

Unnamed: 0,artist,userId
0,!!!,4fb83e93ec1e10ef97e67fc6d912fb7e778339be
1,!!!,9901740c9c7ba5ac45bfa4e044d7aff496b1fbab
2,!!!,0b749f78e378d5212a95919d13835335e2c7c55d
3,!!!,36049eca9ae1a1deea28977ec092217059cd0cfe
4,!!!,c52649f81e88755be839d6aed0b549e2432326b2


In [18]:
gender_list = []
age_list = []
country_list = []
for user_id in unique_users:
    gender_list.append(rec_data[rec_data['userId'] == user_id]['gender'].values[0])
    age_list.append(rec_data[rec_data['userId'] == user_id]['age'].values[0])
    country_list.append(rec_data[rec_data['userId'] == user_id]['country'].values[0])

In [19]:
unique_users = pd.DataFrame(unique_users)
unique_users['Gender'] = gender_list
unique_users['Age'] = age_list
unique_users['Country'] = country_list

unique_users = unique_users.rename(columns={0: 'userId'})

In [20]:
unique_users.head()

Unnamed: 0,userId,Gender,Age,Country
0,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,m,26.0,United Kingdom
1,9901740c9c7ba5ac45bfa4e044d7aff496b1fbab,m,24.0,Russian Federation
2,0b749f78e378d5212a95919d13835335e2c7c55d,,20.0,Russian Federation
3,36049eca9ae1a1deea28977ec092217059cd0cfe,f,20.0,Netherlands
4,c52649f81e88755be839d6aed0b549e2432326b2,m,19.0,Australia


In [21]:
merged_data = pd.merge(user_artist_combinations, unique_users, on='userId')

In [22]:
merged_data.head()

Unnamed: 0,artist,userId,Gender,Age,Country
0,!!!,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,m,26.0,United Kingdom
1,#####,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,m,26.0,United Kingdom
2,*nsync,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,m,26.0,United Kingdom
3,+44,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,m,26.0,United Kingdom
4,...and oceans,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,m,26.0,United Kingdom


In [23]:
training_df = pd.merge(merged_data, ratings, on='userId', how='inner')

In [24]:
training_df.head()

Unnamed: 0,artist_x,userId,Gender,Age,Country,rating,artist_y
0,!!!,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,m,26.0,United Kingdom,1,!!!
1,#####,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,m,26.0,United Kingdom,1,!!!
2,*nsync,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,m,26.0,United Kingdom,1,!!!
3,+44,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,m,26.0,United Kingdom,1,!!!
4,...and oceans,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,m,26.0,United Kingdom,1,!!!


In [25]:
unique_user_artist_plays = ratings.groupby(['userId', 'artist'])['rating'].sum().reset_index()

# Merge merged_data with unique_user_artist_plays
training_df = pd.merge(merged_data, unique_user_artist_plays, on=['userId', 'artist'], how='left')

# Replace NaN values (where user hasn't listened to a specific artist) with 0
#training_df['rating'].fillna(0, inplace=True)

In [26]:
training_df.columns

Index(['artist', 'userId', 'Gender', 'Age', 'Country', 'rating'], dtype='object')

In [27]:
ratings_df = training_df.drop(columns=['Gender', 'Age', 'Country'])

In [28]:
ratings_df.head()

Unnamed: 0,artist,userId,rating
0,!!!,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,1.0
1,#####,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,
2,*nsync,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,
3,+44,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,
4,...and oceans,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,


In [29]:
ratings_df.set_index('artist', inplace=True)
ratings_df.set_index('userId', append=True, inplace=True)

In [30]:
R_all = ratings_df.unstack(['userId'])
R_all

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
userId,001a92a006061452ea53cadf1f1e1db985f3a51f,001e2ade35f2476b47c15cce7bcb39dafa89b97a,00277ccecc376837e57b6d6b58330d1bafc90c73,002d07853e3d855dc359da5bd23f10ff11444b36,0039f6a10a8afc639e621ec4a6601306bafd9adf,005a5746da2730d0b54578d629a2a48f785b4acf,00643c8f3e0931150a8d37d63ffbfb9620fd9ba3,006cc2d3a76f75a399098eee512b2f645a049fc1,0073e9df704415e9d72f75380ba88059fca4230a,007fcb24cab2dcbaa6dbf5ae4c7084c6538251b1,...,ffaffe99a3617739877b980a031a6c376236d2de,ffb274a3d8bcdfaa68355919cefd184b9ccb7c4a,ffccc2d709e6b0d19227412647fed266e67d1b08,ffd2a97221e5001ff3de05659f7d4a303d2c6d73,ffd8d4a17bb5013bcadeb0da06ad086b98d90cba,ffe19d4560159e0170f1be5bcda651fc022fc7bb,ffe27d9f69d660f9e7f17972c9c4a84fabf32188,ffe503ccc641e9050f17be8ba3805575bdaa9559,fff694cf332ed701dccbf17f1d9595ba8ad69f22,fff9cc8f46f8bcd31402bb9f82e020b5dc387f4b
artist,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
!!!,,,,,,,,,,,...,,,,,,,,,,
#####,,,,,,,,,,,...,,,,,,,,,,
*nsync,,,,,,,,,,,...,,,,,,,,,,
+44,,,,,,,,,,,...,,,,,,,,,,
...and oceans,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ángeles del infierno,,,,,,,,,,,...,,,,,,,,,,
Çileke?,,,,,,,,,,,...,,,,,,,,,,
Édith piaf,,,,,,,,,,,...,,,,,,,,,,
Ólafur arnalds,,,,,,,,,,,...,,,,,,,,,,


In [31]:
I = 15
M = 15

# retrieve movies/users combination that is not *too* sparse
top_users = R_all.agg('sum', axis=0).nlargest(10).tail(I).index
top_artists = R_all.agg('sum', axis=1).nlargest(10).tail(M).index

R = R_all.loc[top_artists, top_users]
R

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
userId,7de1e03e8a9b649bcf42ecde96ab301346601c53,0109c75eab891c15234f90511c56771e109844e7,157e08eed5176ec311d27e5612b83fe47a97d8b2,1695ed8057dff69da32ce8d2374631e874ba459e,16ecbbf3ed6e868c3131e09e71b3639a7b0818ae,18ade3f1ee326f07812ce546a998c2878fc28cfd,19a6ec8270f83f08483ff883857e75f355454ce8,1f075e304f813fa33ab52d6430b5d3e425026b72,2464090ca04654adeb5c48ea45554bdc195b08ca,24704506d5537eab278ba38aa40f14ba39134ec1
artist,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
radiohead,,,,,,,,,,
the beatles,,,,,,,,,,
coldplay,,,,,,,,,,
system of a down,,,,,,,,,,
the cure,,,,,,,,,,
foo fighters,,,,,,,,,,
pink floyd,,,,,,,,,,
rammstein,,,,,,,,,,
nine inch nails,,,,,,,,,,
????,,,,,,,,,,


In [32]:
# R_all.loc["????", :].sum()
# R_all2 = R_all.drop(index='????')  # drop row of missing artists

In [33]:
R_all.shape

(5197, 9871)

In [34]:
(np.isnan(R_all)).mean().mean()

0.999805086150109

We have a very sparse dataset.

In [35]:
R_all_filled = R_all.fillna(0)
(R_all_filled != 0).sum(axis = 0).value_counts()

1    9745
2     124
3       2
Name: count, dtype: int64

It seems that the overwhelming majority of users have only one artist that they've listened to, making it difficult to form a recommendation system.

## User and Movies Matrices

Model rating $r_{mi}$ of movie $m$ by user $i$:
$$ \hat r_{mi} = \sum_{k=1}^K v_{mk} u_{ik} = v_{m} u_{i}^T $$
* $K$ unobserved characteristics (latent factors)
* $v_m=(v_{m1},\dots,v_{mK})$: movie $m$ having characteristic $k=1,\dots,K$
* $u_i=(u_{i1},\dots,u_{iK})$: user $i$'s affinity to characteristic $k=1,\dots,K$
* Rating $r_{mi}$ is high if $v_m$ and $u_i$ are well-aligned

We want to find the optimal number of latent factors $k$ on which to apply this.

## Outline

We will trial-and-error several different ways to optimize $k$.

1. SVD after imputing median for missing values, threshhold on amount of variance 
2. Use cross-validation across several values for $k$, then choose the one that reduces the RMSE the most, post-gradient descent

## SVD

In [36]:
mean_value = np.nanmean(R_all.values)

# Fill NaN values with the calculated mean
R_all3 = R_all.fillna(mean_value)

In [37]:
R_all3

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
userId,001a92a006061452ea53cadf1f1e1db985f3a51f,001e2ade35f2476b47c15cce7bcb39dafa89b97a,00277ccecc376837e57b6d6b58330d1bafc90c73,002d07853e3d855dc359da5bd23f10ff11444b36,0039f6a10a8afc639e621ec4a6601306bafd9adf,005a5746da2730d0b54578d629a2a48f785b4acf,00643c8f3e0931150a8d37d63ffbfb9620fd9ba3,006cc2d3a76f75a399098eee512b2f645a049fc1,0073e9df704415e9d72f75380ba88059fca4230a,007fcb24cab2dcbaa6dbf5ae4c7084c6538251b1,...,ffaffe99a3617739877b980a031a6c376236d2de,ffb274a3d8bcdfaa68355919cefd184b9ccb7c4a,ffccc2d709e6b0d19227412647fed266e67d1b08,ffd2a97221e5001ff3de05659f7d4a303d2c6d73,ffd8d4a17bb5013bcadeb0da06ad086b98d90cba,ffe19d4560159e0170f1be5bcda651fc022fc7bb,ffe27d9f69d660f9e7f17972c9c4a84fabf32188,ffe503ccc641e9050f17be8ba3805575bdaa9559,fff694cf332ed701dccbf17f1d9595ba8ad69f22,fff9cc8f46f8bcd31402bb9f82e020b5dc387f4b
artist,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
!!!,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,...,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017
#####,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,...,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017
*nsync,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,...,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017
+44,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,...,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017
...and oceans,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,...,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ángeles del infierno,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,...,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017
Çileke?,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,...,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017
Édith piaf,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,...,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017
Ólafur arnalds,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,...,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017,2.70017


In [38]:
u, s, vh = np.linalg.svd(R_all3, full_matrices=False)

threshold = 0.1

# Retain singular values greater than the threshold
selected_singular_values = s[s > threshold]

In [40]:
# Indices of selected singular values
indices = np.where(s > threshold)[0]

# Collect U and V based on retained singular values
U = u[:, indices]
V = vh[indices, :].T

In [41]:
print(U.shape)

(5197, 5185)


In [42]:
print(V.shape)

(9871, 5185)


In [43]:
from numpy import sqrt, mean

# calculate RMSE
def rmse(X, Y):
    return sqrt(mean((X - Y)**2))

error = [(0, rmse(R_all3.T, np.inner(V,U)))]

In [44]:
print(error)

[(0, 2.7000768448271013)]


In [45]:
# iterate through different thresholds
import time
max_iterations = 40  # maximum number of iterations
U, S, V = np.linalg.svd(R_all3, full_matrices=False)

In [49]:
print(S.shape)

(5197,)


In [48]:
# this will take some time...
error = []

for t in range(1, max_iterations):
    threshold = 10 / t    
    
    # Indices of selected singular values
    indices = np.where(S > threshold)[0]
    print(t, indices.shape)

    # Collect U and V based on retained singular values
    U1 = U[:, indices]
    V1 = V[indices, :].T
        
    # compute error after one sweep of updates
    error += [(threshold, rmse(R_all3.T, np.inner(V1,U1)))]
    
error = pd.DataFrame(error, columns=['threshold', 'rmse'])

1 (4,)
2 (155,)
3 (506,)
4 (963,)
5 (2162,)
6 (3988,)
7 (4017,)
8 (4275,)
9 (4281,)
10 (4291,)
11 (4312,)
12 (4312,)
13 (4312,)
14 (4337,)
15 (4835,)
16 (4836,)
17 (4836,)
18 (4837,)
19 (4837,)
20 (4837,)
21 (4837,)
22 (4837,)
23 (4837,)
24 (4845,)
25 (4847,)
26 (4848,)
27 (4848,)
28 (4848,)
29 (4848,)
30 (4848,)
31 (4848,)
32 (4848,)
33 (4848,)
34 (5178,)
35 (5180,)
36 (5181,)
37 (5183,)
38 (5183,)
39 (5183,)
40 (5183,)
41 (5183,)
42 (5183,)
43 (5183,)
44 (5183,)
45 (5183,)
46 (5183,)
47 (5183,)
48 (5184,)
49 (5184,)
50 (5184,)
51 (5184,)
52 (5184,)
53 (5184,)
54 (5184,)
55 (5184,)
56 (5184,)
57 (5185,)
58 (5185,)
59 (5185,)
60 (5185,)
61 (5185,)
62 (5185,)
63 (5185,)
64 (5185,)
65 (5185,)
66 (5185,)
67 (5185,)
68 (5185,)
69 (5185,)
70 (5185,)
71 (5185,)
72 (5185,)
73 (5185,)
74 (5185,)
75 (5185,)
76 (5185,)
77 (5185,)
78 (5185,)
79 (5185,)
80 (5185,)
81 (5185,)
82 (5185,)
83 (5185,)
84 (5185,)
85 (5185,)
86 (5185,)
87 (5185,)
88 (5185,)
89 (5185,)
90 (5185,)
91 (5185,)
92 (5185,)
93 

In [50]:
print(error)

    threshold      rmse
0   10.000000  2.700134
1    5.000000  2.700127
2    3.333333  2.700119
3    2.500000  2.700111
4    2.000000  2.700095
..        ...       ...
94   0.105263  2.700077
95   0.104167  2.700077
96   0.103093  2.700077
97   0.102041  2.700077
98   0.101010  2.700077

[99 rows x 2 columns]


It appears the error rate is largely invariant to changes in the size of the decomposed matrices.

## Default Recommendation
    
We want to provide default recommendations to users with no previous reviews based solely based on secondary characteristics (sex, age, country).

In [339]:
rec_data['country'].value_counts().shape

(146,)

In [340]:
rec_data['age'].value_counts().shape

(85,)

In [341]:
one_hot_encoded = pd.get_dummies(rec_data['gender'], dtype=int)
#one_hot_encoded2 = pd.get_dummies(rec_data['age'], prefix='Age', dtype=int)
one_hot_encoded3 = pd.get_dummies(rec_data['country'], dtype=int)

# Concatenate the one-hot encoded columns with the original DataFrame
rec_data2 = pd.concat([rec_data, one_hot_encoded, one_hot_encoded3], axis=1)
rec_data2.drop(['Unnamed: 0', 'artist', 'userId', 'artistId', 'plays', 'gender', 'country', 'signupDate', 'rating'], axis = 1, inplace=True)

In [342]:
rec_data2.shape

(10000, 149)

In [343]:
rec_data2.head()

Unnamed: 0,age,f,m,Afghanistan,Algeria,Andorra,Antarctica,Argentina,Aruba,Australia,...,United States,United States Minor Outlying Islands,Uruguay,Uzbekistan,Vanuatu,Venezuela,Viet Nam,"Virgin Islands, British",Wallis and Futuna,Zimbabwe
1875,26.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
334,24.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8189,20.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6437,20.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
337,19.0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [344]:
pd.isnull(rec_data2).sum(axis = 0)

age                        2040
f                             0
m                             0
Afghanistan                   0
Algeria                       0
                           ... 
Venezuela                     0
Viet Nam                      0
Virgin Islands, British       0
Wallis and Futuna             0
Zimbabwe                      0
Length: 149, dtype: int64

In [345]:
# imputing mean age for those users whose ages are missing
rec_data2['age'] = rec_data2['age'].fillna(np.mean(rec_data2['age']))

In [346]:
from sklearn.cluster import KMeans

# Number of clusters (you can set this as per your requirement)
num_clusters = 5

# Apply K-means clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(rec_data2)

# Get cluster labels for each user
cluster_labels = kmeans.labels_

# Add cluster labels to the DataFrame
rec_data['Cluster'] = cluster_labels

  super()._check_params_vs_input(X, default_n_init=10)


In [347]:
rec_data['Cluster']

1875    3
334     3
8189    0
6437    0
337     0
       ..
3699    4
9650    0
3809    2
4756    2
6124    0
Name: Cluster, Length: 10000, dtype: int32

In [348]:
rec_data['Cluster'].value_counts()

Cluster
3    4799
0    3579
2    1317
4     258
1      47
Name: count, dtype: int64

In [349]:
rec_data.head()

Unnamed: 0.1,Unnamed: 0,userId,artistId,artist,plays,gender,age,country,signupDate,rating,Cluster
1875,5375648,4fb83e93ec1e10ef97e67fc6d912fb7e778339be,f26c72d3-e52c-467b-b651-679c73d8e1a7,!!!,28,m,26.0,United Kingdom,20051221,1,3
334,10340416,9901740c9c7ba5ac45bfa4e044d7aff496b1fbab,537db97d-372b-4648-8633-f1dcf52b1f47,#####,41,m,24.0,Russian Federation,20080204,1,3
8189,772229,0b749f78e378d5212a95919d13835335e2c7c55d,537db97d-372b-4648-8633-f1dcf52b1f47,#####,1458,,20.0,Russian Federation,20051214,5,0
6437,3639551,36049eca9ae1a1deea28977ec092217059cd0cfe,603ba565-3967-4be1-931e-9cb945394e86,*nsync,9,f,20.0,Netherlands,20080918,1,0
337,13317975,c52649f81e88755be839d6aed0b549e2432326b2,c2a44e93-3a2b-44aa-bd8b-7a71bb76e3b5,+44,66,m,19.0,Australia,20080701,1,0


In [350]:
cluster0_artists = []
cluster1_artists = []
cluster2_artists = []
cluster3_artists = []
cluster4_artists = []

for i in range(len(rec_data)):
    if rec_data.loc[i]['Cluster'] == 0:
        cluster0_artists.append({'rating': rec_data.loc[i]['rating'], 'artist': rec_data.loc[i]['artist']})
    elif rec_data.loc[i]['Cluster'] == 1:
        cluster1_artists.append({'rating': rec_data.loc[i]['rating'], 'artist': rec_data.loc[i]['artist']})
    elif rec_data.loc[i]['Cluster'] == 2:
        cluster2_artists.append({'rating': rec_data.loc[i]['rating'], 'artist': rec_data.loc[i]['artist']})
    elif rec_data.loc[i]['Cluster'] == 3:
        cluster3_artists.append({'rating': rec_data.loc[i]['rating'], 'artist': rec_data.loc[i]['artist']})
    elif rec_data.loc[i]['Cluster'] == 4:
        cluster4_artists.append({'rating': rec_data.loc[i]['rating'], 'artist': rec_data.loc[i]['artist']})

In [351]:
print(cluster0_artists)

[{'rating': 1, 'artist': 'murat boz'}, {'rating': 5, 'artist': 'röyksopp'}, {'rating': 5, 'artist': 'blink-182'}, {'rating': 1, 'artist': 'red hot chili peppers'}, {'rating': 2, 'artist': 'cascada'}, {'rating': 3, 'artist': 'pendulum'}, {'rating': 3, 'artist': 'bløf'}, {'rating': 5, 'artist': 'the juliana theory'}, {'rating': 3, 'artist': 'peeping tom'}, {'rating': 5, 'artist': 'mr.children'}, {'rating': 1, 'artist': 'alesana'}, {'rating': 5, 'artist': 'nitin sawhney'}, {'rating': 1, 'artist': 'lady gaga'}, {'rating': 3, 'artist': 'have heart'}, {'rating': 3, 'artist': "guns n' roses"}, {'rating': 1, 'artist': 'katie melua'}, {'rating': 4, 'artist': 'trans-siberian orchestra'}, {'rating': 5, 'artist': 'mother vulpine'}, {'rating': 5, 'artist': 'the white stripes'}, {'rating': 4, 'artist': 'maxïmo park'}, {'rating': 1, 'artist': 'paolo nutini'}, {'rating': 5, 'artist': 'radiohead'}, {'rating': 5, 'artist': 'los bunkers'}, {'rating': 5, 'artist': 'deftones'}, {'rating': 4, 'artist': 'dei

In [352]:
cluster0_artists = sorted(cluster0_artists, key=lambda x: x['rating'], reverse=True)
cluster1_artists = sorted(cluster1_artists, key=lambda x: x['rating'], reverse=True)
cluster2_artists = sorted(cluster2_artists, key=lambda x: x['rating'], reverse=True)
cluster3_artists = sorted(cluster3_artists, key=lambda x: x['rating'], reverse=True)
cluster4_artists = sorted(cluster4_artists, key=lambda x: x['rating'], reverse=True)

In [353]:
print(cluster0_artists)

[{'rating': 5, 'artist': 'röyksopp'}, {'rating': 5, 'artist': 'blink-182'}, {'rating': 5, 'artist': 'the juliana theory'}, {'rating': 5, 'artist': 'mr.children'}, {'rating': 5, 'artist': 'nitin sawhney'}, {'rating': 5, 'artist': 'mother vulpine'}, {'rating': 5, 'artist': 'the white stripes'}, {'rating': 5, 'artist': 'radiohead'}, {'rating': 5, 'artist': 'los bunkers'}, {'rating': 5, 'artist': 'deftones'}, {'rating': 5, 'artist': 'stacie orrico'}, {'rating': 5, 'artist': 'mimi maura'}, {'rating': 5, 'artist': 'firebug'}, {'rating': 5, 'artist': 'widespread panic'}, {'rating': 5, 'artist': 'against me!'}, {'rating': 5, 'artist': 'the verve'}, {'rating': 5, 'artist': 'gangsta boo'}, {'rating': 5, 'artist': 'clutch'}, {'rating': 5, 'artist': 'ruiner'}, {'rating': 5, 'artist': 'l.a.o.s.'}, {'rating': 5, 'artist': 'my chemical romance'}, {'rating': 5, 'artist': 'soundtrack'}, {'rating': 5, 'artist': 'the killers'}, {'rating': 5, 'artist': 'neglected fields'}, {'rating': 5, 'artist': 'john wi

Here is an example of how we could assign recommendations to a new user without number of plays or artist information.

In [354]:
new_user = pd.DataFrame([[249130, '178e7gd3278dy238d732y823yd92', None, None, None, 'm', 21.0, 'United States', 'Dec 31, 2001']], columns=['Unnamed: 0', 'userId', 'artistId', 'artist', 'plays', 'gender', 'age',
       'country', 'signupDate'])

In [355]:
rec_data2.columns

Index(['age', 'f', 'm', 'Afghanistan', 'Algeria', 'Andorra', 'Antarctica',
       'Argentina', 'Aruba', 'Australia',
       ...
       'United States', 'United States Minor Outlying Islands', 'Uruguay',
       'Uzbekistan', 'Vanuatu', 'Venezuela', 'Viet Nam',
       'Virgin Islands, British', 'Wallis and Futuna', 'Zimbabwe'],
      dtype='object', length=149)

In [356]:
import pprint

# Convert DataFrame columns to a list
column_list = rec_data2.columns.tolist()

# Display the column names without truncation
pprint.pprint(column_list, width=200)  # Adjust width as needed

['age',
 'f',
 'm',
 'Afghanistan',
 'Algeria',
 'Andorra',
 'Antarctica',
 'Argentina',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Barbados',
 'Belarus',
 'Belgium',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Cambodia',
 'Canada',
 'Cayman Islands',
 'Chile',
 'China',
 'Christmas Island',
 'Cocos (Keeling) Islands',
 'Colombia',
 'Costa Rica',
 "Cote D'Ivoire",
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech Republic',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Estonia',
 'Faroe Islands',
 'Fiji',
 'Finland',
 'France',
 'French Southern Territories',
 'Georgia',
 'Germany',
 'Greece',
 'Guatemala',
 'Guinea-Bissau',
 'Haiti',
 'Heard Island and Mcdonald Islands',
 'Holy See (Vatican City State)',
 'Honduras',
 'Hong Kong',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran, Islamic Republic of',
 'Ire

In [357]:
new_columns = ['f','m','Afghanistan','Algeria','Andorra','Antarctica','Argentina','Aruba','Australia','Austria','Azerbaijan','Bahamas',
               'Bahrain','Barbados','Belarus','Belgium','Bermuda','Bhutan','Bolivia','Bosnia and Herzegovina','Botswana','Brazil',
               'Brunei Darussalam','Bulgaria','Burkina Faso','Cambodia','Canada','Cayman Islands','Chile','China','Christmas Island',
               'Cocos (Keeling) Islands','Colombia','Costa Rica',"Cote D'Ivoire",'Croatia','Cuba','Cyprus','Czech Republic','Denmark',
               'Djibouti','Dominica','Dominican Republic','Ecuador','Egypt','El Salvador','Estonia','Faroe Islands','Fiji','Finland',
               'France','French Southern Territories','Georgia','Germany','Greece','Guatemala','Guinea-Bissau','Haiti','Heard Island and Mcdonald Islands',
               'Holy See (Vatican City State)','Honduras','Hong Kong','Hungary','Iceland','India','Indonesia','Iran, Islamic Republic of',
               'Ireland','Israel','Italy','Jamaica','Japan','Kazakhstan','Kiribati',"Korea, Democratic People's Republic of",'Korea, Republic of',
               'Kuwait','Kyrgyzstan','Latvia','Lebanon','Libyan Arab Jamahiriya','Liechtenstein','Lithuania','Luxembourg','Macedonia',
               'Malaysia','Malta','Martinique','Mexico','Micronesia, Federated States of','Moldova','Mongolia','Montenegro','Morocco',
               'Nauru','Netherlands','Netherlands Antilles','New Zealand','Nicaragua','Niue','Norway','Pakistan','Panama','Papua New Guinea',
               'Paraguay','Peru','Philippines','Pitcairn','Poland','Portugal','Puerto Rico','Reunion','Romania','Russian Federation',
               'Sao Tome and Principe','Saudi Arabia','Serbia','Sierra Leone','Singapore','Slovakia','Slovenia','Solomon Islands',
               'Somalia','South Africa','Spain','Sri Lanka','Sweden','Switzerland','Taiwan','Thailand','Togo','Tunisia','Turkey',
               'Turks and Caicos Islands','Tuvalu','Ukraine','United Arab Emirates','United Kingdom','United States','United States Minor Outlying Islands',
               'Uruguay','Uzbekistan','Vanuatu','Venezuela','Viet Nam','Virgin Islands, British','Wallis and Futuna','Zimbabwe']

In [358]:
for col in new_columns:
    new_user[col] = 0

  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0
  new_user[col] = 0


In [359]:
new_user

Unnamed: 0.1,Unnamed: 0,userId,artistId,artist,plays,gender,age,country,signupDate,f,...,United States,United States Minor Outlying Islands,Uruguay,Uzbekistan,Vanuatu,Venezuela,Viet Nam,"Virgin Islands, British",Wallis and Futuna,Zimbabwe
0,249130,178e7gd3278dy238d732y823yd92,,,,m,21.0,United States,"Dec 31, 2001",0,...,0,0,0,0,0,0,0,0,0,0


In [360]:
for country in new_user.columns:
    if country == 'country':
        unique_countries = new_user['country'].unique()
        for c in unique_countries:
            new_user[c] = (new_user['country'] == c).astype(int)
        new_user.drop('country', axis=1, inplace=True)

In [361]:
for gender in new_user.columns:
    if gender == 'gender':
        unique_gender = new_user['gender'].unique()
        for g in unique_gender:
            new_user[g] = (new_user['gender'] == g).astype(int)
        new_user.drop('gender', axis=1, inplace=True)

In [362]:
new_user

Unnamed: 0.1,Unnamed: 0,userId,artistId,artist,plays,age,signupDate,f,m,Afghanistan,...,United States,United States Minor Outlying Islands,Uruguay,Uzbekistan,Vanuatu,Venezuela,Viet Nam,"Virgin Islands, British",Wallis and Futuna,Zimbabwe
0,249130,178e7gd3278dy238d732y823yd92,,,,21.0,"Dec 31, 2001",0,1,0,...,1,0,0,0,0,0,0,0,0,0


In [363]:
rec_data2.columns

Index(['age', 'f', 'm', 'Afghanistan', 'Algeria', 'Andorra', 'Antarctica',
       'Argentina', 'Aruba', 'Australia',
       ...
       'United States', 'United States Minor Outlying Islands', 'Uruguay',
       'Uzbekistan', 'Vanuatu', 'Venezuela', 'Viet Nam',
       'Virgin Islands, British', 'Wallis and Futuna', 'Zimbabwe'],
      dtype='object', length=149)

In [364]:
new_user.drop(['Unnamed: 0', 'userId', 'artistId', 'artist', 'plays', 'signupDate'], axis=1, inplace=True)

In [365]:
new_user

Unnamed: 0,age,f,m,Afghanistan,Algeria,Andorra,Antarctica,Argentina,Aruba,Australia,...,United States,United States Minor Outlying Islands,Uruguay,Uzbekistan,Vanuatu,Venezuela,Viet Nam,"Virgin Islands, British",Wallis and Futuna,Zimbabwe
0,21.0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [366]:
# Predict clusters for new user(s)
predicted_cluster = kmeans.predict(new_user)

In [367]:
print(predicted_cluster)

[0]


The new user has been categorized as belonging to cluster 0.

In [369]:
cluster0_top_artists = [artist for artist in cluster0_artists if artist.get('rating') == 5]
cluster1_top_artists = [artist for artist in cluster1_artists if artist.get('rating') == 5]
cluster2_top_artists = [artist for artist in cluster2_artists if artist.get('rating') == 5]
cluster3_top_artists = [artist for artist in cluster3_artists if artist.get('rating') == 5]
cluster4_top_artists = [artist for artist in cluster4_artists if artist.get('rating') == 5]

In [370]:
print(cluster0_top_artists)

[{'rating': 5, 'artist': 'röyksopp'}, {'rating': 5, 'artist': 'blink-182'}, {'rating': 5, 'artist': 'the juliana theory'}, {'rating': 5, 'artist': 'mr.children'}, {'rating': 5, 'artist': 'nitin sawhney'}, {'rating': 5, 'artist': 'mother vulpine'}, {'rating': 5, 'artist': 'the white stripes'}, {'rating': 5, 'artist': 'radiohead'}, {'rating': 5, 'artist': 'los bunkers'}, {'rating': 5, 'artist': 'deftones'}, {'rating': 5, 'artist': 'stacie orrico'}, {'rating': 5, 'artist': 'mimi maura'}, {'rating': 5, 'artist': 'firebug'}, {'rating': 5, 'artist': 'widespread panic'}, {'rating': 5, 'artist': 'against me!'}, {'rating': 5, 'artist': 'the verve'}, {'rating': 5, 'artist': 'gangsta boo'}, {'rating': 5, 'artist': 'clutch'}, {'rating': 5, 'artist': 'ruiner'}, {'rating': 5, 'artist': 'l.a.o.s.'}, {'rating': 5, 'artist': 'my chemical romance'}, {'rating': 5, 'artist': 'soundtrack'}, {'rating': 5, 'artist': 'the killers'}, {'rating': 5, 'artist': 'neglected fields'}, {'rating': 5, 'artist': 'john wi

In [378]:
# Randomly select 5 recommendations from cluster0_top_artists
import random

k = 5 # change to how ever many recommendations you want to show
new_user_recommendations2 = []
new_user_recommendations = random.sample(cluster0_top_artists, k=k)
for recommendation in new_user_recommendations:
    new_user_recommendations2.append(recommendation['artist'])

In [379]:
print(new_user_recommendations2)

['kaizers orchestra', 'gojira', 'judas priest', 'rush', 'edward shearmur']
