In [16]:
import numpy as np
import pandas

In [44]:
class popularity_recommender_py():
    """Popularity based recommender.

    Items are scored by the number of non-null user ids recorded for them in
    the training data; the ten highest scoring items are stored and returned
    for every user that asks for recommendations.
    """

    def __init__(self):
        self.train_data = None
        self.user_id = None
        self.item_id = None
        self.popularity_recommendations = None

    #Create the popularity based recommender system model
    def create(self, train_data, user_id, item_id):
        """Build the top-10 popularity table.

        Parameters
        ----------
        train_data : pandas.DataFrame
            Interaction data containing the ``user_id`` and ``item_id`` columns.
        user_id : str
            Name of the column holding user identifiers.
        item_id : str
            Name of the column holding item identifiers.
        """
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

        #Get a count of user ids for each unique item as recommendation score.
        #The counted column is renamed through self.user_id so that the model
        #also works when the user column is not literally called 'user_id'.
        train_data_grouped = (
            train_data.groupby(self.item_id)
            .agg({self.user_id: 'count'})
            .reset_index()
            .rename(columns={self.user_id: 'score'})
        )

        #Sort the items by score (descending), ties broken by item (ascending)
        train_data_sort = train_data_grouped.sort_values(
            ['score', self.item_id], ascending=[False, True]
        )

        #Generate a recommendation rank based upon score
        train_data_sort['Rank'] = train_data_sort['score'].rank(ascending=False, method='first')

        #Keep the top 10 recommendations as an independent frame
        self.popularity_recommendations = train_data_sort.head(10).copy()

    #Use the popularity based recommender system model to
    #make recommendations
    def recommend(self, user_id):
        """Return the stored top-10 table with a leading ``user_id`` column.

        The stored table is copied first, so the model state is not modified
        by asking for recommendations.
        """
        user_recommendations = self.popularity_recommendations.copy()

        #Add user_id column for which the recommendations are being generated
        user_recommendations['user_id'] = user_id

        #Bring user_id column to the front
        cols = ['user_id'] + [c for c in user_recommendations.columns if c != 'user_id']

        return user_recommendations[cols]

In [45]:
#Class for Item similarity based Recommender System model
class item_similarity_recommender_py():
    """Item similarity based recommender.

    Similarity between two items is the Jaccard index of the sets of users
    that appear with each item in the training data. Candidate items are
    scored by their average similarity to a set of seed items (the user's own
    items, or an explicit item list) and the ten best unseen items are returned.
    """

    def __init__(self):
        self.train_data = None
        self.user_id = None
        self.item_id = None
        #The four attributes below are initialised here but are not assigned
        #by any method of this class.
        self.cooccurence_matrix = None
        self.songs_dict = None
        self.rev_songs_dict = None
        self.item_similarity_recommendations = None

    #Get unique items (songs) corresponding to a given user
    def get_user_items(self, user):
        """Return the list of unique items recorded for ``user``."""
        user_data = self.train_data[self.train_data[self.user_id] == user]
        user_items = list(user_data[self.item_id].unique())

        return user_items

    #Get unique users for a given item (song)
    def get_item_users(self, item):
        """Return the set of unique users recorded for ``item``."""
        item_data = self.train_data[self.train_data[self.item_id] == item]
        item_users = set(item_data[self.user_id].unique())

        return item_users

    #Get unique items (songs) in the training data
    def get_all_items_train_data(self):
        """Return the list of unique items in the training data."""
        all_items = list(self.train_data[self.item_id].unique())

        return all_items

    #Map every item to the set of users recorded for it
    def _item_users_index(self):
        """Return ``{item: set(users)}`` built in one pass over the training data."""
        grouped = self.train_data.groupby(self.item_id, sort=False)[self.user_id]
        return {item: set(users.unique()) for item, users in grouped}

    #Construct cooccurence matrix
    def construct_cooccurence_matrix(self, user_songs, all_songs):
        """Return the Jaccard similarity matrix between seed and catalogue items.

        Parameters
        ----------
        user_songs : list
            Seed items; one matrix row per entry.
        all_songs : list
            Catalogue items; one matrix column per entry.

        Returns
        -------
        numpy.matrix
            Matrix of shape ``(len(user_songs), len(all_songs))`` where entry
            ``[j, i]`` is ``|users_j & users_i| / |users_j | users_i|`` and 0
            when the two items share no user.
        """
        #Group the training data once instead of filtering the whole frame
        #again for every item.
        item_users = self._item_users_index()

        #Users for all songs in user_songs; items that do not occur in the
        #training data have no users.
        user_songs_users = [item_users.get(song, set()) for song in user_songs]

        #Initialize the item cooccurence matrix of size
        #len(user_songs) X len(all_songs)
        cooccurence_matrix = np.matrix(np.zeros(shape=(len(user_songs), len(all_songs))), float)

        #Calculate similarity between user songs and all unique songs
        #in the training data
        for i, song in enumerate(all_songs):
            users_i = item_users.get(song, set())
            if not users_i:
                #No listeners means no overlap; the column stays 0
                continue

            for j, users_j in enumerate(user_songs_users):
                shared = len(users_i & users_j)
                if shared:
                    #|A u B| = |A| + |B| - |A n B|
                    union_size = len(users_i) + len(users_j) - shared
                    cooccurence_matrix[j, i] = float(shared) / float(union_size)

        return cooccurence_matrix

    #Use the cooccurence matrix to make top recommendations
    def generate_top_recommendations(self, user, cooccurence_matrix, all_songs, user_songs):
        """Turn a cooccurence matrix into a table of at most ten recommendations.

        Parameters
        ----------
        user : object
            Value written to the ``user_id`` column of the result.
        cooccurence_matrix : numpy.matrix or 2-D array
            Similarity scores, one row per seed item and one column per entry
            of ``all_songs``.
        all_songs : list
            Catalogue items matching the matrix columns.
        user_songs : list
            Seed items; these are never recommended.

        Returns
        -------
        pandas.DataFrame or int
            Frame with columns ``user_id``, ``song``, ``score``, ``rank``, or
            ``-1`` when no recommendation could be produced.
        """
        print("Non zero values in cooccurence_matrix :%d" % np.count_nonzero(cooccurence_matrix))

        columns = ['user_id', 'song', 'score', 'rank']
        max_recommendations = 10
        records = []

        scores = np.asarray(cooccurence_matrix, dtype=float)
        seed_count = scores.shape[0]

        #Without seed items there is nothing to average (and the division
        #below would be 0/0), so the result stays empty.
        if seed_count > 0:
            #Average the scores in cooccurence matrix over all seed songs
            user_sim_scores = (scores.sum(axis=0) / float(seed_count)).tolist()

            #Sort (score, index) pairs in descending order; ties are therefore
            #resolved in favour of the higher column index
            sort_index = sorted(((e, i) for i, e in enumerate(user_sim_scores)), reverse=True)

            already_known = set(user_songs)
            for score, idx in sort_index:
                if len(records) == max_recommendations:
                    #The list is full; no need to look at the remaining songs
                    break
                song = all_songs[idx]
                if np.isnan(score) or song in already_known:
                    continue
                records.append([user, song, score, len(records) + 1])

        df = pandas.DataFrame(records, columns=columns)

        #Handle the case where there are no recommendations
        if df.shape[0] == 0:
            print("The current user has no songs for training the item similarity based recommendation model.")
            return -1
        else:
            return df

    #Create the item similarity based recommender system model
    def create(self, train_data, user_id, item_id):
        """Store the training data and the names of its user and item columns."""
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

    #Shared pipeline behind recommend() and get_similar_items()
    def _recommend_from_seed(self, user, user_songs):
        """Score all training items against ``user_songs`` and return the top list."""
        #Get all unique items (songs) in the training data
        all_songs = self.get_all_items_train_data()

        print("no. of unique songs in the training set: %d" % len(all_songs))

        #Construct item cooccurence matrix of size
        #len(user_songs) X len(all_songs)
        cooccurence_matrix = self.construct_cooccurence_matrix(user_songs, all_songs)

        #Use the cooccurence matrix to make recommendations
        return self.generate_top_recommendations(user, cooccurence_matrix, all_songs, user_songs)

    #Use the item similarity based recommender system model to
    #make recommendations
    def recommend(self, user):
        """Return recommendations for ``user`` based on that user's own items.

        Returns the value produced by ``generate_top_recommendations``: a
        DataFrame, or ``-1`` when nothing can be recommended.
        """
        #Get all unique songs for this user
        user_songs = self.get_user_items(user)

        print("No. of unique songs for the user: %d" % len(user_songs))

        return self._recommend_from_seed(user, user_songs)

    #Get similar items to given items
    def get_similar_items(self, item_list):
        """Return items similar to those in ``item_list``.

        The ``user_id`` column of the result is filled with the empty string.
        """
        return self._recommend_from_seed("", item_list)

In [18]:
import pandas
from sklearn.model_selection import train_test_split
import numpy as np
import time
import Recommenders as Recommenders

In [19]:
#Load the (user_id, song_id, listen_count) triplets from the local CSV file
#and give the three columns explicit names
song_df_a = pandas.read_csv('triplets_file.csv')
song_df_a.columns = ['user_id', 'song_id', 'listen_count']

#Load the song metadata from the local CSV file
song_df_b = pandas.read_csv('song_data.csv')

#Left-join the metadata (one row per song_id) onto the triplets to create
#the input dataframe for the recommender systems
song_df1 = song_df_a.merge(
    song_df_b.drop_duplicates(['song_id']),
    how="left",
    on="song_id",
)
song_df1.head()

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Thicker Than Water,Jack Johnson,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,In Between Dreams,Jack Johnson,2005
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999


In [20]:
#Number of rows in the merged frame
print("Total no of songs:", song_df1.shape[0])

Total no of songs: 2000000


In [21]:
#Work on the first 100000 rows only. The explicit copy makes the subset an
#independent frame, so adding a column below does not write to a slice of
#the full dataset (which is what triggers pandas' SettingWithCopyWarning).
song_df1 = song_df1.head(100000).copy()

#Merge song title and artist_name columns to make a new column
song_df1['song'] = song_df1['title'].map(str) + " - " + song_df1['artist_name']

In [22]:
#Number of rows per song, as a two-column frame (song, listen_count)
song_gr = song_df1.groupby('song')['listen_count'].count().reset_index()

#Share of each song in the total number of counted rows, in percent
grouped_sum = song_gr['listen_count'].sum()
song_gr['percentage'] = song_gr['listen_count'] / grouped_sum * 100

#Most frequent songs first; ties ordered alphabetically by song
song_gr.sort_values(by=['listen_count', 'song'], ascending=[False, True])

Unnamed: 0,song,listen_count,percentage
7079,Sehr kosmisch - Harmonia,427,0.427
9030,Undo - Björk,367,0.367
2052,Dog Days Are Over (Radio Edit) - Florence + Th...,363,0.363
9818,You're The One - Dwight Yoakam,317,0.317
7067,Secrets - OneRepublic,308,0.308
...,...,...,...
9746,You Dont Understand Me - The Raconteurs,1,0.001
9788,You Shouldn't Kiss Me Like This - Toby Keith,1,0.001
9811,You're In My Heart - Rhonda Vincent,1,0.001
9853,Your Time Has Come - Audioslave,1,0.001


In [23]:
#Distinct user ids, in order of first appearance in song_df1
u = song_df1['user_id'].unique()
print("The no. of unique users:", u.size)

The no. of unique users: 3863


In [24]:
#Hold out 20% of the rows for testing; the fixed random_state keeps the
#split identical between runs
train, test_data = train_test_split(
    song_df1,
    test_size=0.20,
    random_state=0,
)
print(train.head())

                                        user_id             song_id  \
10382  ec0bb33707cdc61a6999b41585a0e1f39d0ae6d3  SORGAPD12AB017D9BB   
73171  a110dde1926c639c3230efc90fe7abfa52997f73  SOCJHPS12A6D4F8523   
30938  bf19818e6cd5b15250fc490fd7049429e728ffa4  SOJMVJD12A6701DE81   
99310  0e2f6624a9205bbfec266299bdfcf01c1fbef6d9  SOOFYTN12A6D4F9B35   
58959  103a439d41748ecc78ad5510d2a3c9042281ed4f  SOWUJPS12AB018ACA8   

       listen_count                         title          release  \
10382             1               Osaka Loop Line               LP   
73171             1  The Diary Of Jane (Acoustic)           Phobia   
30938            14               Quality Control  Quality Control   
99310             4                    Représente    Fat Come Back   
58959             2                        Broken    Plastic Beach   

             artist_name  year  \
10382          Discovery     0   
73171  Breaking Benjamin  2006   
30938         Jurassic 5  2000   
99310    Allianc

In [27]:
#Build the popularity model on the training split, passing 'user_id' as the
#user column and 'song' as the item column.
#NOTE(review): the class is taken from the imported Recommenders module, not
#from the class defined earlier in this notebook - confirm both are in sync.
pm = Recommenders.popularity_recommender_py()                               #create an instance of the class
pm.create(train, 'user_id', 'song')

#u[5] is the sixth entry of the unique-user array built from song_df1
user_id1 = u[5]                                                          #Recommended songs list for a user
pm.recommend(user_id1)

Unnamed: 0,user_id,song,score,Rank
7005,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Sehr kosmisch - Harmonia,345,1.0
2034,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Dog Days Are Over (Radio Edit) - Florence + Th...,290,2.0
8939,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Undo - Björk,289,3.0
9719,4bd88bfb25263a75bbdd467e74018f4ae570e5df,You're The One - Dwight Yoakam,255,4.0
6994,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Secrets - OneRepublic,249,5.0
6659,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Revelry - Kings Of Leon,245,6.0
3559,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Horn Concerto No. 4 in E flat K495: II. Romanc...,213,7.0
2672,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Fireflies - Charttraxx Karaoke,204,8.0
8704,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Tive Sim - Cartola,204,9.0
3432,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Hey_ Soul Sister - Train,183,10.0


In [28]:
#Ask the same popularity model for recommendations for another user (u[8])
user_id2 = u[8]
pm.recommend(user_id2)

Unnamed: 0,user_id,song,score,Rank
7005,9bb911319fbc04f01755814cb5edb21df3d1a336,Sehr kosmisch - Harmonia,345,1.0
2034,9bb911319fbc04f01755814cb5edb21df3d1a336,Dog Days Are Over (Radio Edit) - Florence + Th...,290,2.0
8939,9bb911319fbc04f01755814cb5edb21df3d1a336,Undo - Björk,289,3.0
9719,9bb911319fbc04f01755814cb5edb21df3d1a336,You're The One - Dwight Yoakam,255,4.0
6994,9bb911319fbc04f01755814cb5edb21df3d1a336,Secrets - OneRepublic,249,5.0
6659,9bb911319fbc04f01755814cb5edb21df3d1a336,Revelry - Kings Of Leon,245,6.0
3559,9bb911319fbc04f01755814cb5edb21df3d1a336,Horn Concerto No. 4 in E flat K495: II. Romanc...,213,7.0
2672,9bb911319fbc04f01755814cb5edb21df3d1a336,Fireflies - Charttraxx Karaoke,204,8.0
8704,9bb911319fbc04f01755814cb5edb21df3d1a336,Tive Sim - Cartola,204,9.0
3432,9bb911319fbc04f01755814cb5edb21df3d1a336,Hey_ Soul Sister - Train,183,10.0


In [34]:
#Instantiate the item similarity recommender from the imported Recommenders
#module and hand it the training split, with 'user_id' as the user column
#and 'song' as the item column.
#NOTE(review): presumably the same implementation as the class defined
#earlier in this notebook - confirm.
is_model = Recommenders.item_similarity_recommender_py()
is_model.create(train, 'user_id', 'song')

In [35]:
#List the training songs of user u[5], then show the personalized
#recommendations for that user
user_id1 = u[5]
user_items1 = is_model.get_user_items(user_id1)

#Heading: rule, title, rule - one per line
print(
    "------------------------------------------------------------------------------------",
    "Songs played by first user %s:" % user_id1,
    "------------------------------------------------------------------------------------",
    sep="\n",
)

for user_item in user_items1:
    print(user_item)

#Heading for the recommendation table
print(
    "----------------------------------------------------------------------",
    "Similar songs recommended for the first user:",
    "----------------------------------------------------------------------",
    sep="\n",
)

#Recommend songs for the user using personalized model
is_model.recommend(user_id1)

------------------------------------------------------------------------------------
Songs played by first user 4bd88bfb25263a75bbdd467e74018f4ae570e5df:
------------------------------------------------------------------------------------
The Real Slim Shady - Eminem
16 Candles - The Crests
Ghosts 'n' Stuff (Original Instrumental Mix) - Deadmau5
Forgive Me - Leona Lewis
Just Lose It - Eminem
Missing You - John Waite
Without Me - Eminem
Push It - Salt-N-Pepa
Say My Name - Destiny's Child
My Dad's Gone Crazy - Eminem / Hailie Jade
Speechless - Lady GaGa
Somebody To Love - Justin Bieber
----------------------------------------------------------------------
Similar songs recommended for the first user:
----------------------------------------------------------------------
No. of unique songs for the user: 12
no. of unique songs in the training set: 9791
Non zero values in cooccurence_matrix :13056


Unnamed: 0,user_id,song,score,rank
0,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Mockingbird - Eminem,0.054953,1
1,4bd88bfb25263a75bbdd467e74018f4ae570e5df,My Name Is - Eminem,0.05275,2
2,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Superman - Eminem / Dina Rae,0.048068,3
3,4bd88bfb25263a75bbdd467e74018f4ae570e5df,U Smile - Justin Bieber,0.042689,4
4,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Terre Promise - O'Rosko Raricim,0.041974,5
5,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Hailie's Song - Eminem,0.040898,6
6,4bd88bfb25263a75bbdd467e74018f4ae570e5df,'Till I Collapse - Eminem / Nate Dogg,0.040557,7
7,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Marry Me - Train,0.039575,8
8,4bd88bfb25263a75bbdd467e74018f4ae570e5df,The Only Exception (Album Version) - Paramore,0.039518,9
9,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Lucky (Album Version) - Jason Mraz & Colbie Ca...,0.0393,10


In [38]:
#List the training songs of user u[7], then show the personalized
#recommendations for that user
user_id2 = u[7]
user_items2 = is_model.get_user_items(user_id2)

#Heading: rule, title, rule - one per line
print(
    "------------------------------------------------------------------------------------",
    "Songs played by second user %s:" % user_id2,
    "------------------------------------------------------------------------------------",
    sep="\n",
)

for user_item in user_items2:
    print(user_item)

#Heading for the recommendation table
print(
    "----------------------------------------------------------------------",
    "Similar songs recommended for the second user:",
    "----------------------------------------------------------------------",
    sep="\n",
)

#Recommend songs for the user using personalized model
is_model.recommend(user_id2)

------------------------------------------------------------------------------------
Songs played by second user 9d6f0ead607ac2a6c2460e4d14fb439a146b7dec:
------------------------------------------------------------------------------------
Trouble - Coldplay
Strawberry Swing - Coldplay
Speed Of Sound - Coldplay
Life In Technicolor - Coldplay
Clocks - Coldplay
The Scientist - Coldplay
Swallowed In The Sea - Coldplay
Lost! - Coldplay
----------------------------------------------------------------------
Similar songs recommended for the second user:
----------------------------------------------------------------------
No. of unique songs for the user: 8
no. of unique songs in the training set: 9791
Non zero values in cooccurence_matrix :12357


Unnamed: 0,user_id,song,score,rank
0,9d6f0ead607ac2a6c2460e4d14fb439a146b7dec,Politik - Coldplay,0.142727,1
1,9d6f0ead607ac2a6c2460e4d14fb439a146b7dec,Warning Sign - Coldplay,0.124591,2
2,9d6f0ead607ac2a6c2460e4d14fb439a146b7dec,Life In Technicolor ii - Coldplay,0.112341,3
3,9d6f0ead607ac2a6c2460e4d14fb439a146b7dec,A Whisper - Coldplay,0.11219,4
4,9d6f0ead607ac2a6c2460e4d14fb439a146b7dec,Cemeteries Of London - Coldplay,0.10884,5
5,9d6f0ead607ac2a6c2460e4d14fb439a146b7dec,42 - Coldplay,0.108349,6
6,9d6f0ead607ac2a6c2460e4d14fb439a146b7dec,Yellow - Coldplay,0.105721,7
7,9d6f0ead607ac2a6c2460e4d14fb439a146b7dec,A Rush Of Blood To The Head - Coldplay,0.101951,8
8,9d6f0ead607ac2a6c2460e4d14fb439a146b7dec,God Put A Smile Upon Your Face - Coldplay,0.101542,9
9,9d6f0ead607ac2a6c2460e4d14fb439a146b7dec,In My Place - Coldplay,0.099949,10


In [42]:
def user(user__id):
    """Print a user's training songs and return that user's recommendations.

    Parameters
    ----------
    user__id : int
        Position of the user in the notebook-level array ``u`` of unique user ids.

    Returns
    -------
    The value returned by ``is_model.recommend`` for that user. Returning it
    (instead of discarding it) lets a notebook cell that ends with
    ``user(...)`` display the recommendation table.
    """
    user_ids = u[user__id]
    user_itemss = is_model.get_user_items(user_ids)
    print("------------------------------------------------------------------------------------")
    print("Songs played by  user %s:" % user_ids)
    print("------------------------------------------------------------------------------------")

    for user_item in user_itemss:
        print(user_item)

    print("----------------------------------------------------------------------")
    print("Similar songs recommended for the  user:")
    print("----------------------------------------------------------------------")

    #Recommend songs for the user using personalized model
    return is_model.recommend(user_ids)
    

In [43]:
#Run the per-user report for position 5 of the unique-user array u
user(5)

------------------------------------------------------------------------------------
Songs played by  user 4bd88bfb25263a75bbdd467e74018f4ae570e5df:
------------------------------------------------------------------------------------
The Real Slim Shady - Eminem
16 Candles - The Crests
Ghosts 'n' Stuff (Original Instrumental Mix) - Deadmau5
Forgive Me - Leona Lewis
Just Lose It - Eminem
Missing You - John Waite
Without Me - Eminem
Push It - Salt-N-Pepa
Say My Name - Destiny's Child
My Dad's Gone Crazy - Eminem / Hailie Jade
Speechless - Lady GaGa
Somebody To Love - Justin Bieber
----------------------------------------------------------------------
Similar songs recommended for the  user:
----------------------------------------------------------------------
No. of unique songs for the user: 12
no. of unique songs in the training set: 9791
Non zero values in cooccurence_matrix :13056
