In [7]:
#imports
import numpy as np
import pandas as pd

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. The legacy
# module is kept only as a fallback for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Project-local module providing the recommender classes used below
import recommenderClass as recommender

In [2]:
#define dataset locations
ids_file='https://static.turi.com/datasets/millionsong/10000.txt' # is a file loc with song ids, user ids and listen count
metadata_loc='https://static.turi.com/datasets/millionsong/song_data.csv' # is a file loc which contains song names, song ids, artist details

# Local copies of the two files above; they are read first when present.
ids_local='song_metadata.txt'
metadata_local='song_data.csv'


def _read_local_or_remote(reader, local_path, remote_loc, **kwargs):
    """Read a table with `reader`, preferring the local copy.

    Parameters
    ----------
    reader : callable
        A pandas reader such as `pd.read_csv` or `pd.read_table`.
    local_path : str
        Path of the local copy, tried first.
    remote_loc : str
        Location used when the local copy does not exist.
    **kwargs
        Forwarded unchanged to `reader`.

    Returns
    -------
    Whatever `reader` returns (a DataFrame for the pandas readers).
    """
    try:
        return reader(local_path, **kwargs)
    except FileNotFoundError:
        # Previously the URLs above were defined but never used, so a missing
        # local file simply broke the notebook on a fresh checkout.
        return reader(remote_loc, **kwargs)


#combine both of these dataframes such that there exists 1 dataframe with prominence given to song id, user id and listen count
# The listen-count file has no header row, so column names are supplied here.
song_pd=_read_local_or_remote(
    pd.read_table, ids_local, ids_file,
    header=None, names=['user_id','song_id','listen_count'],
)
metadata_pd=_read_local_or_remote(pd.read_csv, metadata_local, metadata_loc)

# Left join keeps every listen record; metadata is de-duplicated on song_id
# first so that the join cannot multiply listen rows.
song_df=pd.merge(song_pd,metadata_pd.drop_duplicates(['song_id']), on='song_id', how='left')

In [None]:
# Preview the first five rows of the merged listen/metadata table
song_df.head(n=5)

In [None]:
# Number of rows in the merged table
song_df.shape[0]

In [3]:
# Restrict the working set to the first 10,000 rows, then add a 'song' label
# of the form "<title>-<artist_name>" (both parts coerced to str).
song_df=song_df.head(10000)
song_df=song_df.assign(
    song=song_df['title'].map(str)+"-"+song_df['artist_name'].map(str)
)


In [4]:
# Count the listen records per song (row count, not the sum of plays), express
# each count as a share of all records, and display the songs from most to
# least frequent, breaking ties alphabetically by song.
songs_count=(
    song_df
    .groupby(['song'])
    .agg({'listen_count':'count'})
    .reset_index()
)
grouped_sum=songs_count['listen_count'].sum()
# NOTE: despite its name, 'percentage' holds a fraction (count / total), not a
# value scaled by 100.
songs_count['percentage']=songs_count['listen_count'].div(grouped_sum)
# The sorted frame is only displayed; songs_count itself keeps its order.
songs_count.sort_values(by=['listen_count','song'], ascending=[False,True])


Unnamed: 0,song,listen_count,percentage
3660,Sehr kosmisch-Harmonia,45,0.0045
4678,Undo-Björk,32,0.0032
5105,You're The One-Dwight Yoakam,32,0.0032
1071,Dog Days Are Over (Radio Edit)-Florence + The ...,28,0.0028
3655,Secrets-OneRepublic,28,0.0028
4378,The Scientist-Coldplay,27,0.0027
4712,Use Somebody-Kings Of Leon,27,0.0027
3476,Revelry-Kings Of Leon,26,0.0026
1387,Fireflies-Charttraxx Karaoke,24,0.0024
1862,Horn Concerto No. 4 in E flat K495: II. Romanc...,23,0.0023


In [5]:
# Distinct user ids present in the working subset, and how many there are
unique_users=song_df.loc[:, 'user_id'].unique()
unique_users.shape[0]

365

In [None]:
# Sanity check: total of the per-song shares computed above
songs_count.loc[:, 'percentage'].sum()

In [None]:
# Number of distinct song labels (missing values, if any, count as one label)
song_df['song'].nunique(dropna=False)


In [8]:
# Prepare data for the popularity-based and collaborative recommenders below.
# 20% of the rows are held out as test data; random_state=0 makes the split
# repeatable across runs.
trainData, testData = train_test_split(
    song_df,
    test_size=0.20,
    random_state=0,
)
print(trainData.head(5))

                                       user_id             song_id  \
7389  94d5bdc37683950e90c56c9b32721edb5d347600  SOXNZOW12AB017F756   
9275  1012ecfd277b96487ed8357d02fa8326b13696a5  SOXHYVQ12AB0187949   
2995  15415fa2745b344bce958967c346f2a89f792f63  SOOSZAZ12A6D4FADF8   
5316  ffadf9297a99945c0513cd87939d91d8b602936b  SOWDJEJ12A8C1339FE   
356   5a905f000fc1ff3df7ca807d57edb608863db05d  SOAMPRJ12A8AE45F38   

      listen_count                 title  \
7389             2      Half Of My Heart   
9275             1  The Beautiful People   
2995             1     Sanctify Yourself   
5316             4     Heart Cooks Brain   
356             20                 Rorol   

                                                release      artist_name  \
7389                                     Battle Studies       John Mayer   
9275             Antichrist Superstar (Ecopac Explicit)   Marilyn Manson   
2995                             Glittering Prize 81/92     Simple Minds   
5316  Ever

In [None]:
# Column labels available in the training split
trainData.columns

In [None]:
# Distinct song ids appearing in the training split
trainData.loc[:, 'song_id'].unique()

In [None]:
# Distinct user ids appearing in the training split
trainData.loc[:, 'user_id'].unique()

# Let's look at how the popularity-based recommendation model works


In [None]:
# Instantiate the popularity recommender from the local recommenderClass module
p_recommender=recommender.popularityRecommender()

In [None]:
# Build the model from the training split. 'user_id' and 'song' are presumably
# the user and item column names -- confirm against recommenderClass.
p_recommender.create(trainData,'user_id','song')


In [None]:
# Pick the entry at index 5 of unique_users as the example user
user_id=unique_users[5]
# Ask the popularity model for this user's recommendations
p_recommender.recommend(user_id)

Let's try to understand how the code in the method `popularityRecommender.create()` is written.

In [None]:

# Column labels of the training split, for reference in the walkthrough below
trainData.columns

In [None]:
# Count the training rows recorded for each song; the count lands in the
# 'user_id' column and 'song' becomes a regular column again.
groupedData=(
    trainData
    .groupby(['song'])
    .agg({'user_id':'count'})
    .reset_index()
)
groupedData

In [None]:
# The per-song row count serves as the popularity score, so name it 'score'
groupedData=groupedData.rename(columns={'user_id':'score'})
groupedData

In [None]:
# Order songs from highest to lowest score
groupedData=groupedData.sort_values(by='score', ascending=False)

In [None]:
# Rank 1 goes to the highest score; method='first' breaks ties by row order
score_rank=groupedData['score'].rank(method='first', ascending=False)
groupedData['rank']=score_rank
groupedData

In [None]:
# Mirror the recommend step: attach the requesting user's id to the ranked
# song table.
# Work on a copy: plain assignment would only alias groupedData, so adding the
# 'user_id' column would silently modify groupedData as well.
user_recommendations=groupedData.copy()
# Use a real user id, as in the recommend() call earlier in this notebook;
# the bare integer 5 is not a valid entry of the 'user_id' column.
user_id=unique_users[5]
user_recommendations['user_id']=user_id
# Column labels as a plain list; 'user_id' was appended last
cols=user_recommendations.columns.tolist()
cols

In [None]:
# Rotate the column list so that the last label ('user_id') comes first
[*cols[-1:], *cols[:-1]]

# Let's look at how the item-based similarity recommendation model works

In [9]:
# Instantiate the item-similarity recommender from the local recommenderClass module
irecommender=recommender.item_similarity_recommender()


In [10]:
# Build the model from the training split. 'user_id' and 'song' are presumably
# the user and item column names -- confirm against recommenderClass.
irecommender.create(trainData,'user_id','song')

In [11]:
# Pick the entry at index 5 of unique_users as the example user
user_id=unique_users[5]
# Show what get_unique_songs_for_user returns for this user (presumably the
# distinct songs recorded for them in the training data -- confirm in
# recommenderClass).
irecommender.get_unique_songs_for_user(user_id)

['Just Lose It-Eminem',
 'Without Me-Eminem',
 '16 Candles-The Crests',
 'Speechless-Lady GaGa',
 'Push It-Salt-N-Pepa',
 "Ghosts 'n' Stuff (Original Instrumental Mix)-Deadmau5",
 "Say My Name-Destiny's Child",
 "My Dad's Gone Crazy-Eminem / Hailie Jade",
 'The Real Slim Shady-Eminem',
 'Somebody To Love-Justin Bieber',
 'Forgive Me-Leona Lewis',
 'Missing You-John Waite',
 'Ya Nada Queda-Kudai']

In [16]:
# Songs recorded for the example user in the training split (one entry per row)
is_example_user=trainData['user_id']==user_id
inputlist=trainData.loc[is_example_user, 'song'].tolist()


In [12]:
# Ask the item-similarity model for this user's recommendations.
# NOTE(review): the recorded run printed 0 non-zero co-occurrence values and
# every returned score was 0.0, so the ranking shown carries no signal --
# investigate the co-occurrence computation in recommenderClass.
irecommender.recommend(user_id)

No. of unique songs for the user: 13
no. of unique songs in the training set: 4483
non zero value in coocurence matrix 0


Unnamed: 0,user_id,song,score,rank
0,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Cover My Eyes-La Roux,0.0,1.0
1,4bd88bfb25263a75bbdd467e74018f4ae570e5df,The Carpal Tunnel Of Love-Fall Out Boy,0.0,2.0
2,4bd88bfb25263a75bbdd467e74018f4ae570e5df,The Whole World-Outkast Featuring Killer Mike,0.0,3.0
3,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Go With The Flow-Queens Of The Stone Age,0.0,4.0
4,4bd88bfb25263a75bbdd467e74018f4ae570e5df,She Just Likes To Fight-Four Tet,0.0,5.0
5,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Mourning Air-Portishead,0.0,6.0
6,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Break Through-Colbie Caillat,0.0,7.0
7,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Creepin Up The Backstairs-The Fratellis,0.0,8.0
8,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Warning Sign-Coldplay,0.0,9.0
9,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Your Arms Feel Like home-3 Doors Down,0.0,10.0


In [20]:
# For each song in inputlist, collect the 'user_id' Series of every training
# row for that song, i.e. the users who listened to it.
user_list=[
    trainData.loc[trainData['song']==song_name, 'user_id']
    for song_name in inputlist
]
user_list
#list of users who listen to songs derived by inputlist

[83      4bd88bfb25263a75bbdd467e74018f4ae570e5df
 3563    bd64f193f0f53f09d44ff48fd52830ff2fded392
 6120    c1fc436b58e28b3e3f1b43a4e955baa19d8a69ba
 Name: user_id, dtype: object, 6123    c1fc436b58e28b3e3f1b43a4e955baa19d8a69ba
 85      4bd88bfb25263a75bbdd467e74018f4ae570e5df
 6640    6b7a5895d266599bc414b9eb8fbad59c95b9a99d
 1695    930d2be6c85315d72cab9823ec0f7bfe7e477794
 7938    9c4e14bbd043846b632d3737f08104dcaeb88142
 7785    e427f647c231c1bde8881eca5b2f5db9b3bcb2b4
 7769    390c2e81bc9cf885608a0891c0a7eb13f1fd3336
 1307    a58de017cbeda1763ea002fe027ed41b4ed53109
 Name: user_id, dtype: object, 6122    c1fc436b58e28b3e3f1b43a4e955baa19d8a69ba
 7936    9c4e14bbd043846b632d3737f08104dcaeb88142
 2866    3ff7a31452eeabd7a4e07f0d243c674e3d0adf46
 84      4bd88bfb25263a75bbdd467e74018f4ae570e5df
 2385    c2cffe9ccaa09a327e8134e9a1f24901801fb2f8
 1694    930d2be6c85315d72cab9823ec0f7bfe7e477794
 1360    2b6c2f33bc0e887ea7c4411f58106805a1923280
 7137    0c306ce1440dec3b5b07b425880e43b