# Preprocessing the data

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('spotify.csv', index_col=[0])
df.head()

Unnamed: 0,song_1,song_2,song_3,song_4,song_5,song_6,song_7,song_8,song_9,song_10,...,song_4991,song_4992,song_4993,song_4994,song_4995,song_4996,song_4997,song_4998,song_4999,song_5000
user_1,2,2,8,8,13,1,4,9,1,2,...,14,2,14,1,9,0,11,6,14,7
user_2,13,5,5,5,12,8,10,10,2,2,...,10,6,11,1,1,5,12,8,3,0
user_3,3,9,2,8,0,1,11,7,3,7,...,9,5,7,15,12,13,14,5,0,14
user_4,2,6,7,8,14,0,12,7,8,1,...,9,15,9,14,10,6,11,13,6,0
user_5,11,12,8,6,13,7,0,7,3,13,...,12,14,11,11,11,7,3,6,11,7


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, user_1 to user_1000
Columns: 5000 entries, song_1 to song_5000
dtypes: int64(5000)
memory usage: 38.2+ MB


In [4]:
print('Length of columns: ',len(df.columns)) # Length of columns
print('Length of users: ',len(df)) # Length of users

Length of columns:  5000
Length of users:  1000


In [5]:
print(df.iloc[9,:].values)

[12  3 10 ... 10 13 14]


In [6]:
# Top 5 songs heard by user 10?
user10 = np.argsort(df.iloc[9,:].values)[:5]
user10

array([2280, 1212, 1954, 4421, 2990], dtype=int64)

In [7]:
# Write a funtion to return top5 songs heard by all users

In [8]:
#Convert the data into numpy array because we can perform matrix operation on data now
X = np.asarray(df)
print(X.shape)
X

(1000, 5000)


array([[ 2,  2,  8, ...,  6, 14,  7],
       [13,  5,  5, ...,  8,  3,  0],
       [ 3,  9,  2, ...,  5,  0, 14],
       ...,
       [ 6,  2,  1, ..., 13,  6, 13],
       [ 7,  0,  8, ..., 11,  9, 12],
       [ 6, 15,  0, ..., 15,  4,  5]], dtype=int64)

In [9]:
# IR Service - Information retriver only takes care of collecting songs
# Decompose the user song matrix into user matrix and song matrix
# NMF - Non-negative matrix factorization 
# We can mention any value for n-components ( uses the following library)
from sklearn.decomposition import NMF

In [10]:
nmf  = NMF(n_components=100, max_iter=50, random_state=0)

In [11]:
nmf.fit(X)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=50,
    n_components=100, random_state=0, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [12]:
# Extracting the component of row matrix or User matrix
user_mat = nmf.transform(X)
user_mat.shape
#user_mat
# 1000 users and 100 components, ( Here the concept of song is lost because of factorisation)

(1000, 100)

In [13]:
# Extracting the column matrix components or song matrix
song_mat = nmf.components_
song_mat.shape

(100, 5000)

In [14]:
song_mat = song_mat.T
song_mat.shape

(5000, 100)

In [15]:
# Preprocessing is done till now

# Building a recommendation engine

In [16]:
# Steps to build
# 1. Find the euclidian distance between the given user1 and every other users
# 2. Find 5 closest users
# 3. Find songs that these 5 closest listen to
# 4. Recommend these songs to the given user
# 5. Pow(sum([pow(vector1[idx]-vector2[idx]),2) for idx in range (len(vector1))]), 0.5)

In [17]:
# Just an example to check if function step5 working
v1=[1,1]
v2=[1,2]
pow(sum([pow(v1[idx]-v2[idx],2) for idx in range( len(v1))]), 0.5)

1.0

In [18]:
def euc_dist (v1,v2):
    return pow(sum([pow(v1[idx]-v2[idx],2) for idx in range( len(v1))]), 0.5)

In [19]:
eucl_dist_list = []
type(eucl_dist_list)

list

In [20]:
user1_vect = user_mat[0]
for idx in range (1000):
    other_user = user_mat[idx]
    eu = euc_dist (user1_vect, other_user)
    eucl_dist_list.append(eu)

In [21]:
candidate_index = np.argsort(eucl_dist_list)
candidate_index[0:6]

array([  0, 352, 737, 284, 501, 609], dtype=int64)

In [22]:
# Finding the top 5 closest user
eucl_dist_list.sort()
print(eucl_dist_list[0:6])

[0.0, 8.459183342410208, 9.092154910089086, 9.127038882370476, 9.223700380884242, 9.271531945996875]


In [23]:
# Finding euclidian dist between the given user to all the users
import sys
eucl_dist_list1 = []
user_id = 0
counter = 0
for any_user in user_mat[:]:
    if user_id == counter:
        eucl_dist_list1.append(sys.float_info.max)
    else:
        user1 = user_mat[0]
        temp = euc_dist(user1, any_user)
        eucl_dist_list1.append(temp)
    counter += 1

In [24]:
# Songs heard by the 5 users with lowest euc_dist

closest_to_user1 = [np.argsort(eucl_dist_list1)[:5]]
closest_to_user1

[array([352, 737, 284, 501, 609], dtype=int64)]

In [25]:
# Convert this entire code into fuction so that the function output is the above results

In [26]:
# What song do these 5 users listen to?

for idx in closest_to_user1:
    temp = pd.DataFrame(df.iloc[idx])
    print(temp[:].index)
temp.head()

Index(['user_353', 'user_738', 'user_285', 'user_502', 'user_610'], dtype='object')


Unnamed: 0,song_1,song_2,song_3,song_4,song_5,song_6,song_7,song_8,song_9,song_10,...,song_4991,song_4992,song_4993,song_4994,song_4995,song_4996,song_4997,song_4998,song_4999,song_5000
user_353,9,2,1,12,4,1,7,10,10,3,...,6,6,12,4,2,1,13,9,0,0
user_738,14,12,2,11,10,0,13,1,13,12,...,15,1,14,12,8,0,12,6,12,6
user_285,7,14,5,2,2,6,10,10,1,9,...,11,8,7,2,11,8,5,15,3,8
user_502,2,1,9,7,15,12,9,7,8,8,...,6,5,9,10,14,10,0,3,0,4
user_610,4,10,0,14,12,9,9,2,1,1,...,10,7,1,12,14,4,7,9,5,15


# Implementation of K-Means

In [27]:
# Assume that user1 is listening to song1
#Steps to implement K-means
# 1. Define the number of clusters for different songs 
k = 10
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=k)
kmeans.fit(song_mat)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=10, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [28]:
# 2. Find which cluster does the song1 belong to
# Find the vector for song1
index_in_df = list(df.columns).index('song_1')
index_in_df

0

In [29]:
song1_vector = song_mat[index_in_df]
kmeans.predict([song1_vector])

array([8])

In [30]:
cluster_id = 7 #song1 belongs to cluster 7

In [31]:
all_labels = list(kmeans.predict(song_mat))
len(all_labels)

5000

In [32]:
predicted_cluster_id = [idx for idx in range(len(all_labels)) if all_labels[idx] == cluster_id]
predicted_cluster_candidate = song_mat[predicted_cluster_id]
print(predicted_cluster_id)
len(predicted_cluster_id)

[18, 24, 39, 49, 83, 88, 89, 91, 108, 110, 122, 123, 142, 149, 151, 156, 161, 191, 194, 199, 211, 216, 235, 256, 286, 291, 296, 314, 315, 319, 321, 332, 342, 351, 375, 379, 418, 429, 481, 489, 493, 497, 508, 520, 528, 535, 542, 548, 552, 574, 592, 593, 633, 659, 661, 666, 667, 676, 684, 701, 703, 708, 778, 796, 806, 823, 835, 840, 842, 846, 848, 853, 855, 873, 886, 896, 911, 923, 927, 936, 968, 973, 974, 977, 980, 982, 1006, 1010, 1016, 1018, 1025, 1026, 1035, 1063, 1110, 1118, 1120, 1146, 1148, 1151, 1152, 1153, 1158, 1190, 1203, 1204, 1209, 1221, 1225, 1234, 1257, 1277, 1286, 1297, 1316, 1324, 1367, 1374, 1377, 1391, 1424, 1430, 1448, 1464, 1467, 1473, 1489, 1493, 1499, 1521, 1524, 1527, 1537, 1543, 1569, 1575, 1576, 1617, 1626, 1637, 1648, 1655, 1675, 1688, 1708, 1709, 1736, 1738, 1749, 1751, 1795, 1797, 1801, 1806, 1811, 1819, 1844, 1867, 1898, 1904, 1924, 1933, 1940, 1952, 1958, 1963, 1968, 1969, 1973, 1989, 1990, 1992, 2043, 2053, 2079, 2086, 2087, 2090, 2121, 2124, 2131, 2134, 2

437

In [33]:
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(n_neighbors=5)

In [34]:
knn.fit(predicted_cluster_candidate)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [35]:
knn.kneighbors([song_mat[index_in_df]])

(array([[1.93680729, 1.9449503 , 1.95899739, 2.00942528, 2.01079771]]),
 array([[ 45,  95, 234, 222,  54]], dtype=int64))

Song_45, Song_95, Song_234, Song_222 & Song_54 can be suggested after current song