## Import

In [13]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

## DataFrame

In [3]:
df1 = pd.read_excel('cleaned_data1.xlsx')
display(df1.head(2) )
df2 = pd.read_excel('cleaned_data2.xlsx')
display(df2.head(2) )

Unnamed: 0,Song,use_count,Name,Followers,Posts,Bio,URL,ER,title,release,artist_name,year,Delta,Curr_ER,Comments,Likes,Time
0,SOAKIMP12A8C130995,1,tejaswini wagh,83789,348,‚Äú You'll never find peace of mind until you li...,https://www.instagram.com/waghtejaswini/,9.9,The Cove,Thicker Than Water,Jack Johnson,0,-7.719191,2.180809,1827,16445,3d
1,SOAKIMP12A8C130995,1,Shounak Nayak,7151,505,UK/India All posts my own unless stated. Email:,https://www.instagram.com/shounaknayak/,3.4,The Cove,Thicker Than Water,Jack Johnson,0,1.337458,4.737458,161,967,1d


Unnamed: 0,Song,use_count,Name,Followers,Posts,Bio,URL,ER,title,release,artist_name,year,Delta,Curr_ER,Comments,Likes,Time
0,SOPZKGR12A6D4F3F3A,1,RIDHIMA,3322,400,Wooden works by hand Shop: +998 97 543 0000 Ma...,https://www.instagram.com/ridz5014/,0.7,Red Red Wine (Edit),Original Hits - Party,UB40,2008,4.623247,5.323247,2750,13754,4w
1,SOPZKGR12A6D4F3F3A,1,,26241,912,Si vede bene solo con il cuore. L'essenziale √®...,https://www.instagram.com/lady_golf_mk4/,0.3,Red Red Wine (Edit),Original Hits - Party,UB40,2008,3.376269,3.676269,9325,83928,4w


In [4]:
df = df1.append(df2, ignore_index=False)
df.tail(2)

Unnamed: 0,Song,use_count,Name,Followers,Posts,Bio,URL,ER,title,release,artist_name,year,Delta,Curr_ER,Comments,Likes,Time
44904,SOGCHYZ12AF72A69EC,2,Riya Deepsi,25955,515,Human being Email for any collaboration or enq...,https://www.instagram.com/riya_d_m/,3.2,That Tree (feat. Kid Cudi),That Tree Featuring Kid Cudi,Snoop Dogg featuring Kid Cudi,2010,-0.05021,3.14979,7721,38605,2w
44905,SOSSZPW12A8C13843D,20,Kashika Kapur Makeupartist,49130,1921,Dreamer of making this world a better placeüôèüèª ...,https://www.instagram.com/kashikakapurmua/,2.9,Figures,Dreams,The Whitest Boy Alive,2006,-2.33451,0.56549,92,833,1d


In [5]:
n_users = df.URL.nunique()
n_items = df.Song.nunique()

print('Num. of Users: '+ str(n_users))
print('Num of Movies: '+str(n_items))

Num. of Users: 4218
Num of Movies: 9921


In [14]:
#Let us create Ids for user

user_codes = df.URL.drop_duplicates().reset_index()
user_codes.rename(columns={'index':'user_old_index'}, inplace=True)
user_codes['user_new_index'] = list(user_codes.index)
user_codes.head(3)

Unnamed: 0,user_old_index,URL,user_new_index
0,0,https://www.instagram.com/waghtejaswini/,0
1,1,https://www.instagram.com/shounaknayak/,1
2,2,https://www.instagram.com/jdinstitute/,2


In [15]:
#Let us create Ids for Item

song_codes = df.Song.drop_duplicates().reset_index()
song_codes.rename(columns={'index':'song_old_index'}, inplace=True)
song_codes['song_new_index'] = list(song_codes.index)
song_codes.head(7)

Unnamed: 0,song_old_index,Song,song_new_index
0,0,SOAKIMP12A8C130995,0
1,9,SOBBMDR12A8C13253B,1
2,14,SOBXHDL12A81C204C0,2
3,62,SOBYHAJ12A6701BF1D,3
4,89,SODACBL12A8C13C273,4
5,134,SODDNQT12A6D4F5F7E,5
6,140,SODXRTY12AB0180F3B,6


In [8]:
# Merge song ids, and User ids

df = pd.merge(df, song_codes,how='left')
df = pd.merge(df, user_codes,how='left')
df.head(3)

Unnamed: 0,Song,use_count,Name,Followers,Posts,Bio,URL,ER,title,release,...,year,Delta,Curr_ER,Comments,Likes,Time,song_old_index,song_new_index,user_old_index,user_new_index
0,SOAKIMP12A8C130995,1,tejaswini wagh,83789,348,‚Äú You'll never find peace of mind until you li...,https://www.instagram.com/waghtejaswini/,9.9,The Cove,Thicker Than Water,...,0,-7.719191,2.180809,1827,16445,3d,0,0,0,0
1,SOAKIMP12A8C130995,1,Shounak Nayak,7151,505,UK/India All posts my own unless stated. Email:,https://www.instagram.com/shounaknayak/,3.4,The Cove,Thicker Than Water,...,0,1.337458,4.737458,161,967,1d,0,0,1,1
2,SOAKIMP12A8C130995,3,JD Institute of Fashion,16047,3120,"Empowering creative minds since 1988, JD insti...",https://www.instagram.com/jdinstitute/,0.5,The Cove,Thicker Than Water,...,0,0.130806,0.630806,1012,9110,4w,0,0,2,2


# Collaborative Filtering Using k-Nearest Neighbors (kNN)

kNN is a machine learning algorithm to find clusters of similar users based on common book ratings, and make predictions using the average rating of top-k nearest neighbors. For example, we first present ratings in a matrix with the matrix having one row for each item (song) and one column for each user,

In [18]:
pivot = df.pivot_table(index = ["Song"],columns = ["URL"],values = "Delta").fillna(0)

# What is the Recommendation System?
Based on previous(past) behaviours, it predicts the likelihood that a user would prefer an item.
For example, Netflix uses recommendation system. It suggest people new movies according to their past activities that are like watching and voting movies.
The purpose of recommender systems is recommending new things that are not seen before from people.
There are several methods in recommendation systems. In this kernel, I used to collaborative filtering method.
## Collaborative Filtering
Collaborative filtering is making recommend according to combination of your experience and experiences of other people. There are two collaborative filtering methods: User Based CF and Item Based CF

### User Based Collaborative Filtering
It is calculated similarity people in user vs item matrix. For example, we let think that there are two people. First one watched 2 movies that are lord of the rings and hobbit. Second one watched only lord of the rings movie. it recommends hobbit movie to second one.

User based collaborative filtering has some problems. In this system, each row of matrix is user. Therefore, comparing and finding similarity between of them is computationaly hard and spend too much computational power. Also, habits of people can be changed. Therefore making correct and useful recommendation can be hard in time.

In order to solve these problems, lets look at another recommender system that is item based collaborative filtering

### Item Based Collaborative Filtering
It is calculated similarity items in user vs item matrix. For example, we let think that there are two movies: Lord of the Rings and Hobbit. Three people watched lord of the rings and hobbit. If fourth person watched lord of the rings. He/she could like Hobbit. So that the system recommends Hobbit to fourth people.

In general recommendation systems use to item based collaborative filtering. Item based CF improved to solve the problem of user based CF. As people minds and habits can change and items doesn't change. It is prefered.

In [22]:
# We choose random song.
query_index = np.random.choice(pivot.shape[0])
print("Choosen Song is: ",pivot.index[query_index])

Choosen Song is:  SOXKVWC12A6701FB97


KNN is used for both classification and regression problems. In classification problems to predict the label of a instance we first find k closest instances to the given one based on the distance metric and based on the majority voting scheme or weighted majority voting(neighbors which are closer are weighted higher) we predict the labels.

K-nearest neighbor finds the k most similar items to a particular instance based on a given distance metric like euclidean, jaccard similarity , minkowsky or custom distance measures. In this my model, I used to cosine as metric.

In [24]:
pivot_matrix = csr_matrix(pivot.values)
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(pivot_matrix)
distances, indices = model_knn.kneighbors(pivot.iloc[query_index,:].values.reshape(1,-1), n_neighbors = 6)

In [26]:
song = []
distance = []

for i in range(0, len(distances.flatten())):
    if i != 0:
        song.append(pivot.index[indices.flatten()[i]])
        distance.append(distances.flatten()[i])    

m=pd.Series(song,name='song')
d=pd.Series(distance,name='distance')
recommend = pd.concat([m,d], axis=1)
recommend = recommend.sort_values('distance',ascending=False)

print('Recommendations for Song {0}:\n'.format(pivot.index[query_index]))
for i in range(0,recommend.shape[0]):
    print('{0}: {1}, with distance of {2}'.format(i, recommend["song"].iloc[i], recommend["distance"].iloc[i]))

Recommendations for SOXKVWC12A6701FB97:

0: SOPVMHA12A67ADC096, with distance of 0.6096763306989301
1: SOOMGGT12AB01810FB, with distance of 0.5785761351158816
2: SOTPVCT12A8C135D16, with distance of 0.5503086364649933
3: SONIKQT12A8AE475DF, with distance of 0.5171437725660789
4: SOODPSC12A6D4F6220, with distance of 0.437410524117549
