# Preprocessing the data

In [180]:
# https://www.youtube.com/watch?v=ZspR5PZemcs&feature=youtu.be
import numpy as np
import pandas as pd

In [182]:
df = pd.read_csv('spotify.csv', index_col=[0])
df.head()

Unnamed: 0,song_1,song_2,song_3,song_4,song_5,song_6,song_7,song_8,song_9,song_10,...,song_4991,song_4992,song_4993,song_4994,song_4995,song_4996,song_4997,song_4998,song_4999,song_5000
user_1,2,2,8,8,13,1,4,9,1,2,...,14,2,14,1,9,0,11,6,14,7
user_2,13,5,5,5,12,8,10,10,2,2,...,10,6,11,1,1,5,12,8,3,0
user_3,3,9,2,8,0,1,11,7,3,7,...,9,5,7,15,12,13,14,5,0,14
user_4,2,6,7,8,14,0,12,7,8,1,...,9,15,9,14,10,6,11,13,6,0
user_5,11,12,8,6,13,7,0,7,3,13,...,12,14,11,11,11,7,3,6,11,7


In [183]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, user_1 to user_1000
Columns: 5000 entries, song_1 to song_5000
dtypes: int64(5000)
memory usage: 38.2+ MB


In [137]:
print('Length of columns: ',len(df.columns)) # Length of columns
print('Length of users: ',len(df)) # Length of users

Length of columns:  5000
Length of users:  1000


In [186]:
print(df.iloc[9,:].values)

[12  3 10 ... 10 13 14]


In [192]:
# Top 5 songs heard by user 10?
user10 = np.argsort(df.iloc[9,:].values)[:5]
user10

array([2280, 1212, 1954, 4421, 2990], dtype=int64)

In [140]:
# Write a funtion to return top5 songs heard by all users

In [141]:
#Convert the data into numpy array because we can perform matrix operation on data now
X = np.asarray(df)
print(X.shape)
X

(1000, 5000)


array([[ 2,  2,  8, ...,  6, 14,  7],
       [13,  5,  5, ...,  8,  3,  0],
       [ 3,  9,  2, ...,  5,  0, 14],
       ...,
       [ 6,  2,  1, ..., 13,  6, 13],
       [ 7,  0,  8, ..., 11,  9, 12],
       [ 6, 15,  0, ..., 15,  4,  5]], dtype=int64)

In [142]:
# IR Service - Information retriver only takes care of collecting songs
# Decompose the user song matrix into user matrix and song matrix
# NMF - Non-negative matrix factorization 
# We can mention any value for n-components ( uses the following library)
from sklearn.decomposition import NMF

In [143]:
nmf  = NMF(n_components=100, max_iter=50, random_state=0)

In [144]:
nmf.fit(X)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=50,
    n_components=100, random_state=0, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [195]:
# Extracting the component of row matrix or User matrix
user_mat = nmf.transform(X)
user_mat.shape
#user_mat
# 1000 users and 100 components, ( Here the concept of song is lost because of factorisation)

(1000, 100)

In [146]:
# Extracting the column matrix components or song matrix
song_mat = nmf.components_
song_mat.shape

(100, 5000)

In [147]:
song_mat = song_mat.T
song_mat.shape

(5000, 100)

In [148]:
# Preprocessing is done till now

# Building a recommendation engine

In [149]:
# Steps to build
# 1. Find the euclidian distance between the given user1 and every other users
# 2. Find 5 closest users
# 3. Find songs that these 5 closest listen to
# 4. Recommend these songs to the given user
# 5. Pow(sum([pow(vector1[idx]-vector2[idx]),2) for idx in range (len(vector1))]), 0.5)

In [150]:
# Just an example to check if function step5 working
v1=[1,1]
v2=[1,2]
pow(sum([pow(v1[idx]-v2[idx],2) for idx in range( len(v1))]), 0.5)

1.0

In [151]:
def euc_dist (v1,v2):
    return pow(sum([pow(v1[idx]-v2[idx],2) for idx in range( len(v1))]), 0.5)

In [152]:
eucl_dist_list = []
type(eucl_dist_list)

list

In [198]:
user1_vect = user_mat[0]
for idx in range (1000):
    other_user = user_mat[idx]
    eu = euc_dist (user1_vect, other_user)
    eucl_dist_list.append(eu)

In [200]:
candidate_index = np.argsort(eucl_dist_list)
candidate_index[0:6]

array([0, 1, 2, 3, 4, 5], dtype=int64)

In [201]:
# Finding the top 5 closest user
eucl_dist_list.sort()
print(eucl_dist_list[0:6])

[0.0, 0.0, 8.459183342410208, 8.459183342410208, 9.092154910089086, 9.092154910089086]


In [157]:
# Finding euclidian dist between the given user to all the users
import sys
eucl_dist_list1 = []
user_id = 0
counter = 0
for any_user in user_mat[:]:
    if user_id == counter:
        eucl_dist_list1.append(sys.float_info.max)
    else:
        user1 = user_mat[0]
        temp = euc_dist(user1, any_user)
        eucl_dist_list1.append(temp)
    counter += 1

In [158]:
# Songs heard by the 5 users with lowest euc_dist

closest_to_user1 = [np.argsort(eucl_dist_list1)[:5]]
closest_to_user1

[array([352, 737, 284, 501, 609], dtype=int64)]

In [159]:
# Convert this entire code into fuction so that the function output is the above results

In [160]:
# What song do these 5 users listen to?

for idx in closest_to_user1:
    temp = pd.DataFrame(df.iloc[idx])
    print(temp[:].index)
temp.head()

Index(['user_353', 'user_738', 'user_285', 'user_502', 'user_610'], dtype='object')


Unnamed: 0,song_1,song_2,song_3,song_4,song_5,song_6,song_7,song_8,song_9,song_10,...,song_4991,song_4992,song_4993,song_4994,song_4995,song_4996,song_4997,song_4998,song_4999,song_5000
user_353,9,2,1,12,4,1,7,10,10,3,...,6,6,12,4,2,1,13,9,0,0
user_738,14,12,2,11,10,0,13,1,13,12,...,15,1,14,12,8,0,12,6,12,6
user_285,7,14,5,2,2,6,10,10,1,9,...,11,8,7,2,11,8,5,15,3,8
user_502,2,1,9,7,15,12,9,7,8,8,...,6,5,9,10,14,10,0,3,0,4
user_610,4,10,0,14,12,9,9,2,1,1,...,10,7,1,12,14,4,7,9,5,15


# Implementation of K-Means

In [161]:
# Assume that user1 is listening to song1
#Steps to implement K-means
# 1. Define the number of clusters for different songs 
k = 10
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=k)
kmeans.fit(song_mat)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=10, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [162]:
# 2. Find which cluster does the song1 belong to
# Find the vector for song1
index_in_df = list(df.columns).index('song_1')
index_in_df

0

In [165]:
song1_vector = song_mat[index_in_df]
kmeans.predict([song1_vector])

array([7])

In [166]:
cluster_id = 7 #song1 belongs to cluster 7

In [202]:
all_labels = list(kmeans.predict(song_mat))
len(all_labels)

5000

In [203]:
predicted_cluster_id = [idx for idx in range(len(all_labels)) if all_labels[idx] == cluster_id]
predicted_cluster_candidate = song_mat[predicted_cluster_id]
print(predicted_cluster_id)
len(predicted_cluster_id)

[0, 4, 6, 12, 14, 15, 20, 26, 28, 43, 48, 50, 66, 68, 76, 94, 111, 113, 129, 130, 165, 178, 180, 198, 202, 203, 213, 223, 231, 233, 242, 250, 258, 259, 266, 339, 344, 350, 354, 355, 356, 361, 367, 400, 407, 413, 423, 430, 450, 451, 458, 469, 470, 511, 515, 518, 519, 529, 533, 534, 540, 544, 550, 554, 555, 558, 568, 584, 588, 612, 621, 622, 623, 647, 652, 678, 680, 688, 690, 709, 713, 720, 724, 730, 745, 747, 749, 758, 767, 773, 776, 779, 780, 781, 797, 798, 813, 817, 818, 827, 845, 852, 864, 877, 892, 893, 901, 903, 934, 938, 943, 949, 954, 970, 981, 984, 988, 1013, 1014, 1048, 1060, 1072, 1077, 1078, 1094, 1100, 1113, 1125, 1129, 1133, 1134, 1161, 1171, 1175, 1189, 1197, 1200, 1203, 1224, 1242, 1245, 1250, 1254, 1255, 1272, 1284, 1292, 1297, 1311, 1313, 1315, 1329, 1332, 1333, 1335, 1336, 1364, 1367, 1376, 1378, 1379, 1412, 1419, 1426, 1444, 1453, 1458, 1462, 1468, 1470, 1478, 1480, 1483, 1487, 1497, 1503, 1506, 1518, 1520, 1525, 1531, 1533, 1539, 1549, 1550, 1553, 1564, 1567, 1572, 1

503

In [176]:
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(n_neighbors=5)

In [177]:
knn.fit(predicted_cluster_candidate)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [178]:
knn.kneighbors([song_mat[index_in_df]])

(array([[0.        , 1.90104376, 1.91097529, 1.94473839, 1.9504807 ]]),
 array([[  0,  30, 112, 150,  43]], dtype=int64))

In [179]:
# Modularise the entire code is the task

# 