# Building a song recommender to users using collaborative filtering and KNN


#Import pandas 

In [1]:
import pandas as pd

#Load music data

In [2]:
song_data = pd.read_csv('song_data.csv')

# Explore data

Music data shows how many times a user listened to a song, as well as the details of the song.

In [3]:
song_data.head()

Unnamed: 0,user_id,song_id,listen_count,title,artist,song
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Jack Johnson,The Cove - Jack Johnson
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Paco De Lucia,Entre Dos Aguas - Paco De Lucia
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Kanye West,Stronger - Kanye West
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,Jack Johnson,Constellations - Jack Johnson
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,Foo Fighters,Learn To Fly - Foo Fighters


##Showing the most popular songs in the dataset

In [5]:
song_data['song'].value_counts()

Sehr kosmisch - Harmonia                                                                                                                        5970
Undo - Björk                                                                                                                                    5281
You\'re The One - Dwight Yoakam                                                                                                                 4806
Dog Days Are Over (Radio Edit) - Florence + The Machine                                                                                         4536
Revelry - Kings Of Leon                                                                                                                         4339
Horn Concerto No. 4 in E flat K495: II. Romance (Andante cantabile) - Barry Tuckwell/Academy of St Martin-in-the-Fields/Sir Neville Marriner    3949
Secrets - OneRepublic                                                                                     

In [6]:
len(song_data)

1116609

##Count number of unique users in the dataset

In [7]:
users = song_data['user_id'].unique()

In [8]:
len(users)

66346


# Create a song recommender

# Using Collaborative Filtering

Create a matrix with number of users * number of songs shape.
Each entry in the array represent the number of times songs listened by a particular user

In [9]:
song=song_data.drop_duplicates(['user_id','song','title','artist'])
song=song.reset_index(drop=True)

Use only 50000 items in song_data (jupyter notebook has memory limitatons)

In [10]:
song=song.iloc[0:50000,]
song.shape

(50000, 6)

In [11]:
song.head()

Unnamed: 0,user_id,song_id,listen_count,title,artist,song
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Jack Johnson,The Cove - Jack Johnson
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Paco De Lucia,Entre Dos Aguas - Paco De Lucia
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Kanye West,Stronger - Kanye West
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,Jack Johnson,Constellations - Jack Johnson
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,Foo Fighters,Learn To Fly - Foo Fighters


This is going to be the filtering table

In [107]:
song_pivot=song.pivot(index='user_id',columns='song_id',values='listen_count')

In [108]:
song_pivot.shape

(2952, 9242)

In [109]:
song_pivot=song_pivot.fillna(0)

In [110]:
song_pivot[song_pivot.SOBYHAJ12A6701BF1D==1]

song_id,SOAAAGQ12A8C1420C8,SOAACPJ12A81C21360,SOAACSG12AB018DC80,SOAAEJI12AB0188AB5,SOAAFAC12A67ADF7EB,SOAAFYH12A8C13717A,SOAAJMQ12A6D4F7D17,SOAAKPM12A58A77210,SOAALWN12A6D4F7FDA,SOAAMOW12AB018149B,...,SOZZEID12A58A7AB89,SOZZHQT12AB018B714,SOZZIOH12A67ADE300,SOZZKPR12A6D4F8147,SOZZLZN12A8AE48D6D,SOZZRHE12A6702165F,SOZZTCU12AB0182C58,SOZZTNF12A8C139916,SOZZVWB12AB0189C30,SOZZWZV12A67AE140F
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1645b689f873529ab85e3b72742be44813e82bd3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2a8a8f48fd4eb5ca4b64874162df4fdf584d89c4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33280fc74b168e2667a2da5c6ab4df4cc6edfb23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
572da9d6331782b8c48924968f0778a331170c20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6f8453b0d9d2199f98c1992995a8445ad6837fd8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7a4b8e7d2905d13422418b4f48cc85100892e013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7ff09036b3ba258b13621875d18d43511ed2e09c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8d0bd6922dc3d0379444207a5ae3c02943dd5fc6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
b2cbcf5ea3c6ea3ee41ceac0ef247c2b1ddedbdc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
b80344d063b5ccb3212f76538f3d9e43d87dca9e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Train a Knn model using pivot_table

In [111]:
from sklearn.neighbors import NearestNeighbors
knn=NearestNeighbors(n_neighbors=10,metric='cosine')
Model=knn.fit(song_pivot)

In [117]:
song_pivot[song_pivot.index == 1]

song_id,SOAAAGQ12A8C1420C8,SOAACPJ12A81C21360,SOAACSG12AB018DC80,SOAAEJI12AB0188AB5,SOAAFAC12A67ADF7EB,SOAAFYH12A8C13717A,SOAAJMQ12A6D4F7D17,SOAAKPM12A58A77210,SOAALWN12A6D4F7FDA,SOAAMOW12AB018149B,...,SOZZEID12A58A7AB89,SOZZHQT12AB018B714,SOZZIOH12A67ADE300,SOZZKPR12A6D4F8147,SOZZLZN12A8AE48D6D,SOZZRHE12A6702165F,SOZZTCU12AB0182C58,SOZZTNF12A8C139916,SOZZVWB12AB0189C30,SOZZWZV12A67AE140F
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


# Predict neighbors for a particular user

In [118]:
user = song_pivot.iloc[1,]

In [119]:
distances, indices = Model.kneighbors([user])
distances, indices

(array([[0.        , 0.71428571, 0.72196466, 0.73917973, 0.75658779,
         0.77757173, 0.81488636, 0.85245778, 0.85857864, 0.86516003]]),
 array([[   1, 1931, 2879, 2724,  505,  884, 2542,  196, 1655,   40]],
       dtype=int64))

The indices contains only the index values of users from the user table. It doesnt show user_id.
We've to find out user_id of neighbors

In [137]:
neighbors=[]
for item in indices[0][1:]:
    neighbors.append(song[song.index == item].user_id.values[0])
neighbors

['e0388a836f13f4c1f8aa600ae61625f087f40353',
 '116a4c95d63623a967edf2f3456c90ebbf964e6f',
 'b3de3bf6a0d2e5953d3095a58ee99fd362b97e8f',
 '523a8a39456d5a96ae8f4d5e8b8b60f3bfb31528',
 '8cbb5066924ec788e3fea9a4aae59586f46f38fa',
 '999902c7302ca251b23f5c0d643debce084076d3',
 '0afaa5d9d04bf85af720fe8cc566a41ca3e41c97',
 'd775b2f1e62284a3ff407b91fe53d08525f6f086',
 'b80344d063b5ccb3212f76538f3d9e43d87dca9e']

Make a dataframe with details of only the neighbors.


In [191]:
neighbor_songs=pd.DataFrame(columns=['user_id','song_id','listen_count','title','artist','song'])
for item in neighbors:
    neighbor_songs=neighbor_songs.append(song[song.user_id == item],ignore_index = True)
neighbor_songs

Unnamed: 0,user_id,song_id,listen_count,title,artist,song
0,e0388a836f13f4c1f8aa600ae61625f087f40353,SOALRMF12A8C1436A7,1,Soldier On,The Temper Trap,Soldier On - The Temper Trap
1,e0388a836f13f4c1f8aa600ae61625f087f40353,SOCEMSF12A8C145464,1,Resurrection,The Temper Trap,Resurrection - The Temper Trap
2,e0388a836f13f4c1f8aa600ae61625f087f40353,SOFBFVG12A6D4F61A7,1,Trouble (Album Version),Ray LaMontagne,Trouble (Album Version) - Ray LaMontagne
3,e0388a836f13f4c1f8aa600ae61625f087f40353,SOFEJPJ12A8C145455,2,Fools,The Temper Trap,Fools - The Temper Trap
4,e0388a836f13f4c1f8aa600ae61625f087f40353,SOFVGPP12A8C141FA6,2,Up From Below,Edward Sharpe & The Magnetic Zeros,Up From Below - Edward Sharpe & The Magnetic Z...
5,e0388a836f13f4c1f8aa600ae61625f087f40353,SOFWMNN12A8C13C3F2,1,Blindsided,Bon Iver,Blindsided - Bon Iver
6,e0388a836f13f4c1f8aa600ae61625f087f40353,SOGBFOO12A6D4FC933,8,Adrenalin,Eisbrecher,Adrenalin - Eisbrecher
7,e0388a836f13f4c1f8aa600ae61625f087f40353,SOHALKJ12AB017FCC2,3,Speak,Godsmack,Speak - Godsmack
8,e0388a836f13f4c1f8aa600ae61625f087f40353,SOHTTNB12A8C1357E9,3,Homeward Bound,Simon & Garfunkel,Homeward Bound - Simon & Garfunkel
9,e0388a836f13f4c1f8aa600ae61625f087f40353,SOINYMI12A8C13C3F8,1,Creature Fear,Bon Iver,Creature Fear - Bon Iver


For each song in neighbor_songs find the total listen count of all the neighbors

In [193]:
neighbor_songs=pd.DataFrame({'Count':neighbor_songs['listen_count'],'Song':neighbor_songs.index.tolist()})

Unnamed: 0_level_0,listen_count
song,Unnamed: 1_level_1
(I Just) Died In Your Arms - Cutting Crew,2
83 - John Mayer,1
A Beggar On A Beach Of Gold - Mike And The Mechanics,7
A Time To Be So Small - Interpol,1
ATWA - System of a Down,1
Across The Universe - Jim Sturgess,3
Adrenalin - Eisbrecher,8
Again Again - Lady GaGa,1
Ain\'t No Rest For The Wicked (Original Version) - Cage The Elephant,2
Alejandro - Lady GaGa,1


In [202]:
neighbor_songs=neighbor_songs.sort_values('Count',ascending=False)
neighbor_songs

Unnamed: 0_level_0,Count,Song
song,Unnamed: 1_level_1,Unnamed: 2_level_1
You\'re The One - Dwight Yoakam,25,You\'re The One - Dwight Yoakam
Undo - Björk,23,Undo - Björk
NAISEN KANSSA - Sami Saari,15,NAISEN KANSSA - Sami Saari
Revelry - Kings Of Leon,11,Revelry - Kings Of Leon
All My Loving - Jim Sturgess,11,All My Loving - Jim Sturgess
Relax - Frankie Goes To Hollywood,10,Relax - Frankie Goes To Hollywood
Super Freak - Rick James,9,Super Freak - Rick James
Behind Blue Eyes - Limp Bizkit,9,Behind Blue Eyes - Limp Bizkit
Moonshine - Jack Johnson,8,Moonshine - Jack Johnson
Rain - Subhumans,8,Rain - Subhumans


In [205]:
neighbor_songs=neighbor_songs.reset_index(drop=True)
neighbor_songs

Unnamed: 0,Count,Song
0,25,You\'re The One - Dwight Yoakam
1,23,Undo - Björk
2,15,NAISEN KANSSA - Sami Saari
3,11,Revelry - Kings Of Leon
4,11,All My Loving - Jim Sturgess
5,10,Relax - Frankie Goes To Hollywood
6,9,Super Freak - Rick James
7,9,Behind Blue Eyes - Limp Bizkit
8,8,Moonshine - Jack Johnson
9,8,Rain - Subhumans


In [207]:
Recommended_Song=neighbor_songs['Song'][0:10]
print(Recommended_Song)

0      You\'re The One - Dwight Yoakam
1                         Undo - Björk
2           NAISEN KANSSA - Sami Saari
3              Revelry - Kings Of Leon
4         All My Loving - Jim Sturgess
5    Relax - Frankie Goes To Hollywood
6             Super Freak - Rick James
7       Behind Blue Eyes - Limp Bizkit
8             Moonshine - Jack Johnson
9                     Rain - Subhumans
Name: Song, dtype: object


# The Top 10 song reccomendations for the user!!!