In [1]:
# Imports: third-party libraries first, then the project-local recommender module.
import numpy as np
import pandas as pd

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18, and
# the old sklearn.cross_validation module was removed in 0.20. Try the current
# location first and fall back only for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

import recommenderClass as recommender

In [4]:
# Dataset locations (remote copies).
ids_file='https://static.turi.com/datasets/millionsong/10000.txt' # user id, song id and listen count per row
metadata_loc='https://static.turi.com/datasets/millionsong/song_data.csv' # song ids with song names and artist details


def _read_with_fallback(reader, local_path, remote_url, **reader_kwargs):
    """Read `local_path` with `reader`; if that file does not exist, read `remote_url` instead.

    reader        -- a pandas reader such as pd.read_csv or pd.read_table
    local_path    -- file name tried first
    remote_url    -- location used only when the local file is missing
    reader_kwargs -- forwarded unchanged to `reader`
    """
    try:
        return reader(local_path, **reader_kwargs)
    except FileNotFoundError:
        return reader(remote_url, **reader_kwargs)


# Local files are still preferred, exactly as before; the URLs declared above are
# only used when a local file is absent, so a fresh checkout can run this cell.
# NOTE(review): assumes 'song_metadata.txt' / 'song_data.csv' are downloads of the
# two URLs above -- confirm before relying on the fallback.
song_pd=_read_with_fallback(pd.read_table,'song_metadata.txt',ids_file,header=None)
song_pd.columns=['user_id','song_id','listen_count']
metadata_pd=_read_with_fallback(pd.read_csv,'song_data.csv',metadata_loc)

# Left-join the metadata onto the listen records by song id. Metadata is reduced
# to one row per song_id first so the join cannot multiply listen rows.
song_df=pd.merge(song_pd,metadata_pd.drop_duplicates(['song_id']), on='song_id', how='left')

In [3]:
# preview the first five rows of the merged listen/metadata frame
song_df.head(n=5)

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Thicker Than Water,Jack Johnson,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,In Between Dreams,Jack Johnson,2005
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999


In [4]:
# number of rows in the merged frame
song_df.shape[0]

2000000

In [5]:
# Keep only the first 10,000 rows and add a 'song' label of the form
# "<title>-<artist_name>".
# .assign() returns a new frame, so the column is never written into a slice of
# the full table; assigning onto the result of head() is the pattern that can
# trigger pandas' SettingWithCopyWarning.
song_df=song_df.head(10000).assign(
    song=lambda frame: frame['title'].map(str)+"-"+frame['artist_name'].map(str)
)


In [10]:
# Per-song tally: 'listen_count' here is the number of rows recorded for each
# song (a row count, not the sum of the original listen_count values), and
# 'percentage' is that tally as a fraction of all rows (0..1, not multiplied by 100).
songs_count=(
    song_df.groupby(['song'])
    .agg({'listen_count':'count'})
    .reset_index()
)
grouped_sum=songs_count['listen_count'].sum()
songs_count['percentage']=songs_count['listen_count'].div(grouped_sum)
# display most-frequent songs first; ties are ordered alphabetically by label
songs_count.sort_values(by=['listen_count','song'], ascending=[False,True])


Unnamed: 0,song,listen_count,percentage
3660,Sehr kosmisch-Harmonia,45,0.0045
4678,Undo-Björk,32,0.0032
5105,You're The One-Dwight Yoakam,32,0.0032
1071,Dog Days Are Over (Radio Edit)-Florence + The ...,28,0.0028
3655,Secrets-OneRepublic,28,0.0028
4378,The Scientist-Coldplay,27,0.0027
4712,Use Somebody-Kings Of Leon,27,0.0027
3476,Revelry-Kings Of Leon,26,0.0026
1387,Fireflies-Charttraxx Karaoke,24,0.0024
1862,Horn Concerto No. 4 in E flat K495: II. Romanc...,23,0.0023


In [15]:
# distinct user ids in order of first appearance, followed by how many there are
unique_users=pd.unique(song_df['user_id'])
len(unique_users)

76353

In [11]:
# Sanity check: 'percentage' is each song's count divided by the total count,
# so the column should add up to 1 apart from floating-point rounding.
songs_count['percentage'].sum()

0.999999999999944

In [13]:
# how many distinct 'song' labels exist (a missing label, if any, counts once)
song_df['song'].nunique(dropna=False)


5151

In [5]:
# Prepare the data for the song recommenders (popularity-based and collaborative).
# Hold out 20% of the rows as test data; random_state is fixed so the split is
# the same on every run.
trainData,testData=train_test_split(song_df,test_size=0.20,random_state=0)
# Leave the preview as the cell's last expression so the notebook renders it as
# a table; print() produces a wrapped plain-text dump instead.
trainData.head(5)

                                          user_id             song_id  \
608812   7b8fbe766a49e5d7618452149dfab920621fc4fb  SOJJYDE12AF729FC16   
623729   d24956cd68ff84b6d0271286ae6866ee1c89ff77  SOPQGWI12A8C135DDB   
583106   da7b91b6cab1ca11227ee7720c4d2e03e8c31579  SOCOIIG12A58A7D151   
435735   5f633da6ad4845350949c3c76ce6c4ef6f167476  SOQQTBB12AB0182F1D   
1361953  01ad0fabd01af750700a1e80bb0055abcb3edd28  SOVYNVS12AC3DF64AB   

         listen_count                      title  \
608812              1     Two Is Better Than One   
623729              2              Royal Gregory   
583106              1                 Mr Sandman   
435735              2  A Days Work (feat. P.O.S)   
1361953             2      Rockin' Rollin' Stone   

                              release                             artist_name  \
608812                     Love Drunk  Boys Like Girls featuring Taylor Swift   
623729                             LP                               Holy Fuck   
58310

In [8]:
# column labels of the training frame
trainData.columns

Index(['user_id', 'song_id', 'listen_count', 'title', 'release', 'artist_name',
       'year'],
      dtype='object')

In [9]:
# distinct song ids present in the training rows, in order of first appearance
pd.unique(trainData['song_id'])

array(['SOJJYDE12AF729FC16', 'SOPQGWI12A8C135DDB', 'SOCOIIG12A58A7D151',
       ..., 'SOSNGXL12A6D4F7A09', 'SOMPKDT12AAA8C6759',
       'SOHWSIS12A8C136B46'], dtype=object)

In [10]:
# distinct user ids present in the training rows, in order of first appearance
pd.unique(trainData['user_id'])

array(['7b8fbe766a49e5d7618452149dfab920621fc4fb',
       'd24956cd68ff84b6d0271286ae6866ee1c89ff77',
       'da7b91b6cab1ca11227ee7720c4d2e03e8c31579', ...,
       '0d36fd1b02deb79387e41b33178b8dc6abdec809',
       'fcb1536a33388d2dbc0b2c3c9d60b934e5d59aba',
       'f74d483555f960fdfd0b958ec6e0fc7856d04814'], dtype=object)

In [20]:
# Instantiate popularityRecommender from the project-local recommenderClass module
# (imported above as `recommender`).
# NOTE(review): the class is not shown in this notebook -- see recommenderClass for its contract.
p_recommender=recommender.popularityRecommender()

In [22]:
# Pass the training rows to the recommender together with the names of the
# user and song id columns.
# NOTE(review): create() is defined in recommenderClass (not shown here); presumably
# it precomputes per-song popularity scores from trainData -- confirm there.
p_recommender.create(trainData,'user_id','song_id')


In [23]:
# Take the user at position 5 of unique_users (looks like an arbitrary sample
# pick) and ask the recommender for that user's recommendations.
# NOTE(review): unique_users was built from song_df rather than trainData, so this
# user is not guaranteed to appear in the training rows -- confirm that is intended.
user_id=unique_users[5]
p_recommender.recommend(user_id)

Unnamed: 0,song_id,score,rank
2220,SOFRQTD12A81C233C0,6630,1.0
317,SOAUWYT12A81C206F1,5639,2.0
352,SOAXGDH12A8C13F8A1,5592,3.0
614,SOBONKR12A58A7A7E0,5143,4.0
7416,SOSXLTC12AF72A7F54,4938,5.0
5531,SONYKOW12AB01849C9,4627,6.0
1664,SOEGIYH12A6D4FC0E3,4368,7.0
4448,SOLFXKT12AB017E3E0,3835,8.0
1334,SODJWHY12A8C142CCE,3819,9.0
2115,SOFLJQZ12A6D4FADA6,3707,10.0
