In [1]:
#imports
import numpy as np
import pandas as pd

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Try the modern
# location first and fall back so legacy environments keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Project-local module; provides popularityRecommender used further down.
import recommenderClass as recommender

In [2]:
#define dataset locations
# Remote copies of the two input files. They are used as a fallback when the
# local copies read below are not present in the working directory.
ids_file='https://static.turi.com/datasets/millionsong/10000.txt' # is a file loc with song ids, user ids and listen count
metadata_loc='https://static.turi.com/datasets/millionsong/song_data.csv' # is a file loc which contains song names, song ids, artist details

# Interaction triplets: prefer the local file, otherwise fetch from ids_file.
# NOTE(review): assumes 'song_metadata.txt' is a local copy of ids_file - confirm.
try:
    song_pd=pd.read_table('song_metadata.txt',header=None)
except FileNotFoundError:
    song_pd=pd.read_table(ids_file,header=None)
song_pd.columns=['user_id','song_id','listen_count']

# Song metadata: prefer the local file, otherwise fetch from metadata_loc.
try:
    metadata_pd=pd.read_csv('song_data.csv')
except FileNotFoundError:
    metadata_pd=pd.read_csv(metadata_loc)

#combine both of these dataframes such that there exists 1 dataframe with prominence given to song id, user id and listen count
# Metadata is de-duplicated on song_id first so the left join cannot multiply
# interaction rows; every row of song_pd is kept even without a metadata match.
song_df=pd.merge(song_pd,metadata_pd.drop_duplicates(['song_id']), on='song_id', how='left')

In [None]:
# Peek at the first five rows of the merged interactions + metadata frame
song_df.head(n=5)

In [None]:
# Number of rows in the merged frame
song_df.shape[0]

In [3]:
#define a new column entry called 'song' that combines title and artist name
# Keep only the first 10000 rows. head() returns a slice of the merged frame,
# so take an explicit copy before adding a column; assigning onto the slice
# raises SettingWithCopyWarning in pandas.
song_df=song_df.head(10000).copy()
# map(str) stringifies every value (missing values included) so the
# concatenation never fails on non-string entries.
song_df['song']=song_df['title'].map(str)+"-"+song_df['artist_name'].map(str)


In [4]:
# Per-song record counts. Note that 'listen_count' in the result is the number
# of rows for each song (a count), not the sum of the original listen counts.
songs_count = song_df.groupby('song')['listen_count'].count().reset_index()
# Each song's share of all records, expressed as a fraction of the total.
grouped_sum = songs_count['listen_count'].sum()
songs_count['percentage'] = songs_count['listen_count'].div(grouped_sum)
# Display sorted by count (descending), ties broken by song name (ascending).
# The sorted frame is only displayed; songs_count itself stays unsorted.
songs_count.sort_values(by=['listen_count', 'song'], ascending=[False, True])


Unnamed: 0,song,listen_count,percentage
3660,Sehr kosmisch-Harmonia,45,0.0045
4678,Undo-Björk,32,0.0032
5105,You're The One-Dwight Yoakam,32,0.0032
1071,Dog Days Are Over (Radio Edit)-Florence + The ...,28,0.0028
3655,Secrets-OneRepublic,28,0.0028
4378,The Scientist-Coldplay,27,0.0027
4712,Use Somebody-Kings Of Leon,27,0.0027
3476,Revelry-Kings Of Leon,26,0.0026
1387,Fireflies-Charttraxx Karaoke,24,0.0024
1862,Horn Concerto No. 4 in E flat K495: II. Romanc...,23,0.0023


In [5]:
# Distinct user ids present in the working subset, and how many there are
unique_users = pd.unique(song_df['user_id'])
unique_users.shape[0]

365

In [None]:
# Sanity check: the per-song shares should add up to (approximately) 1
songs_count.loc[:, 'percentage'].sum()

In [None]:
# Number of distinct 'song' labels (dropna=False keeps parity with len(unique()))
song_df['song'].nunique(dropna=False)


In [6]:
# Build a song recommender based on popularity-based and collaborative systems.
# Hold out 20% of the rows as test data; random_state=0 fixes the split.
split_frames = train_test_split(song_df, test_size=0.20, random_state=0)
trainData, testData = split_frames
print(trainData.head(5))

                                       user_id             song_id  \
7389  94d5bdc37683950e90c56c9b32721edb5d347600  SOXNZOW12AB017F756   
9275  1012ecfd277b96487ed8357d02fa8326b13696a5  SOXHYVQ12AB0187949   
2995  15415fa2745b344bce958967c346f2a89f792f63  SOOSZAZ12A6D4FADF8   
5316  ffadf9297a99945c0513cd87939d91d8b602936b  SOWDJEJ12A8C1339FE   
356   5a905f000fc1ff3df7ca807d57edb608863db05d  SOAMPRJ12A8AE45F38   

      listen_count                 title  \
7389             2      Half Of My Heart   
9275             1  The Beautiful People   
2995             1     Sanctify Yourself   
5316             4     Heart Cooks Brain   
356             20                 Rorol   

                                                release      artist_name  \
7389                                     Battle Studies       John Mayer   
9275             Antichrist Superstar (Ecopac Explicit)   Marilyn Manson   
2995                             Glittering Prize 81/92     Simple Minds   
5316  Ever

In [None]:
# Column labels of the training frame (DataFrame.keys() returns the columns)
trainData.columns

In [None]:
# Distinct song ids that appear in the training split
trainData.song_id.unique()

In [None]:
# Distinct user ids that appear in the training split
trainData.user_id.unique()

In [7]:
# Instantiate the popularity recommender from the local recommenderClass module
p_recommender = recommender.popularityRecommender()

In [8]:
# Fit the recommender on the training split, naming the user and song columns.
# NOTE(review): create() is defined in recommenderClass (not shown here); the
# walkthrough cells further down presumably mirror what it does - confirm.
user_column, song_column = 'user_id', 'song'
p_recommender.create(trainData, user_column, song_column)


printing 2 rows of training data
                        song  score
0          & Down-Boys Noize      3
1  '97 Bonnie & Clyde-Eminem      2


In [9]:
# Pick one user (position 5 in the unique-user array) and request
# recommendations for that user from the popularity model.
user_id = unique_users[5]
p_recommender.recommend(user_id)

generating recommendations for user_id 4bd88bfb25263a75bbdd467e74018f4ae570e5df


Unnamed: 0,user_id,song,score,rank
2241,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Love Don't Live Here Anymore-Rose Royce,1,1716.0
2653,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Novocaine For The Soul-Eels,1,1717.0
2654,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Now Behold The Lamb-Kirk Franklin & The Family,1,1718.0
2655,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Now I'm High_ Really High-Triple Six Mafia,1,1719.0
2656,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Now My Feet Won't Touch The Ground-Coldplay,1,1720.0
2658,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Nuestra Cancion-Ruben Gonzalez,1,1721.0
2661,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Numb-U2,1,1722.0
2662,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Number One-John Legend featuring Kanye west,1,1723.0
2663,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Nut City (2000 Digital Remaster) (The Rudy Van...,1,1724.0
2666,4bd88bfb25263a75bbdd467e74018f4ae570e5df,O Meu Amor Anda Em Fama-Camané,1,1725.0


Let's walk through how the method `popularityRecommender.create()` is written.

In [None]:

# Recall which columns the training frame offers before grouping
trainData.columns

In [None]:
# Count the user_id entries recorded for every song in the training split;
# reset_index turns 'song' back into an ordinary column.
groupedData = trainData.groupby('song')['user_id'].count().reset_index()
groupedData

In [None]:
# The per-song user count serves as the popularity score, so label it 'score'
groupedData = groupedData.rename(columns={'user_id': 'score'})
groupedData

In [None]:
# Order songs from highest to lowest score
groupedData = groupedData.sort_values(by='score', ascending=False)

In [None]:
# Rank songs by score, best first; method='first' breaks ties by row order so
# every song receives a distinct rank.
score_rank = groupedData['score'].rank(method='first', ascending=False)
groupedData['rank'] = score_rank
groupedData

In [None]:
# Build the per-user recommendation table from the ranked songs.
# Copy first: a plain assignment would only alias groupedData, and adding the
# user_id column below would then silently modify groupedData as well.
user_recommendations=groupedData.copy()
# Placeholder user id for this walkthrough; every row gets the same value.
user_id=5
user_recommendations['user_id']=user_id
# Column labels as a plain list, with the newly added 'user_id' last.
cols=user_recommendations.columns.tolist()
cols

In [None]:
# Rotate the column list right by one so the last label comes first
trailing_col, leading_cols = cols[-1:], cols[:-1]
trailing_col + leading_cols