# Music Recommender System

#### A music recommender system to be deployed via a Flask app. This notebook contains a popularity-based model, which overcomes the cold-start problem for new users. At a later stage this notebook will be updated with a collaborative filtering model.

#### This project makes use of the Million Song Dataset.
To improve the performance of the recommender system, the raw data is pre-processed first. The detailed pre-processing code is kept, commented out, in the cell below.

In [1]:
'''
import numpy as np
import pandas as pd
import sqlite3

# Loading Data
users_df=pd.read_csv('Dataset/train_triplets.txt',sep='\t',header=None)
users_df.columns=['user_id','song_id','play_count']

conn=sqlite3.connect('Dataset/track_metadata.db')
meta_df=pd.read_sql_query('SELECT * FROM songs',conn)
conn.close()

#meta_df consist of:
['track_id',
 'title',
 'song_id',
 'release',
 'artist_id',
 'artist_mbid',
 'artist_name',
 'duration',
 'artist_familiarity',
 'artist_hotttnesss',
 'year',
 'track_7digitalid',
 'shs_perf',
 'shs_work']

genre_df=pd.read_csv('Dataset/MAGD_generes.cls',sep='\t',header=None)
genre_df.columns=['track_id','genre']

#Merging user_data with song_metadata
main_df=users_df.merge(meta_df.drop_duplicates(['song_id']),how='left',on='song_id')

#Creating a popularity based dataset
popularity_df=users_df[['song_id','play_count']] 
popularity_df=popularity_df.groupby(['song_id']).agg({'play_count': 'count'}).reset_index()
popularity_df=popularity_df.merge(meta_df[['track_id','song_id','title','artist_name','artist_familiarity']].drop_duplicates(['song_id']),on="song_id",how="left")  

#In order to choose good songs play_count>=50 is used. You can increase threshold to reduce the dataset.
popularity_df=popularity_df[popularity_df.play_count>=50].sort_values(by=['play_count'],ascending=False)

#Addition of genres to popularity based data.
popularity_df=popularity_df.merge(genre_df,on='track_id',how='left')

#play_count in popularity_df is the number of user records per song (agg 'count' above), not the summed number of plays,
#while play_count in users_df is the number of times a single user played the song
users_combined=popularity_df[['song_id','title','artist_name']]
users_combined['song_popularity']=range(1,len(users_combined)+1)    
users_combined=users_df.merge(users_combined,on="song_id",how="left") 
users_combined=users_combined.dropna()

#To get rich data filled with balanced amount of users and items(songs)
#This step can be improved further...
users_combined=users_combined.sort_values(by=['user_id','song_popularity'],ascending=[True,True])  
test=users_combined.head(1000000)

#Saving datasets
popularity_df.to_csv('Dataset/popularity_based.csv',index=True)
test.to_csv('Dataset/user_data.csv',index=True)

'''

'\nimport numpy as np\nimport pandas as pd\nimport sqlite3\n\n# Loading Data\nusers_df=pd.read_csv(\'Dataset/train_triplets.txt\',sep=\'\t\',header=None)\nusers_df.columns=[\'user_id\',\'song_id\',\'play_count\']\n\nconn=sqlite3.connect(\'Dataset/track_metadata.db\')\nmeta_df=pd.read_sql_query(\'SELECT * FROM songs\',conn)\nconn.close()\n\n#meta_df consist of:\n[\'track_id\',\n \'title\',\n \'song_id\',\n \'release\',\n \'artist_id\',\n \'artist_mbid\',\n \'artist_name\',\n \'duration\',\n \'artist_familiarity\',\n \'artist_hotttnesss\',\n \'year\',\n \'track_7digitalid\',\n \'shs_perf\',\n \'shs_work\']\n\ngenre_df=pd.read_csv(\'Dataset/MAGD_generes.cls\',sep=\'\t\',header=None)\ngenre_df.columns=[\'track_id\',\'genre\']\n\n#Merging user_data with song_metadata\nmain_df=users_df.merge(meta_df.drop_duplicates([\'song_id\']),how=\'left\',on=\'song_id\')\n\n#Creating a popularity based dataset\npopularity_df=users_df[[\'song_id\',\'play_count\']] \npopularity_df=popularity_df.groupby([\'

#### Creating popularity based model for music recommendation.

In [16]:
# Popularity-based recommender that works on a pre-processed dataset.
# The pre-processing code is the commented-out cell above.
# The data is derived from the full million_song_dataset_metadata, echonest_taste_profile_data and Top_MAGD_dataset.
class popularity_recommender():
    """Popularity-based song recommender over a pre-built song table.

    The table lives in ``self.df`` and is expected to provide at least the
    columns ``song_id``, ``title``, ``artist_name`` and ``genre``.
    Recommendations are taken from the top of the table in its current row
    order, so sort ``self.df`` (for example by a score column) before calling
    :meth:`recommend`.
    """

    def __init__(self):
        """Load the popularity table from ``Dataset/popularity_based.csv``."""
        self.df = pd.read_csv('Dataset/popularity_based.csv')
        # The CSV may carry a saved index column, which pandas reads back as
        # 'Unnamed: 0'.  Drop it when present; do not fail when it is absent.
        self.df = self.df.drop('Unnamed: 0', axis=1, errors='ignore')

    def normalize_columns(self, list_of_col):
        """Min-max normalize the given columns of ``self.df`` in place.

        Each column becomes ``(x - x.min()) / (x.max() - x.min())``.

        Args:
            list_of_col: iterable of column names to normalize.
        """
        for feature in list_of_col:
            column = self.df[feature]
            lowest = column.min()
            spread = column.max() - lowest
            if spread == 0:
                # A constant column would give 0/0 = NaN for every row;
                # map it to 0.0 so later arithmetic on it stays finite.
                self.df[feature] = 0.0
            else:
                self.df[feature] = (column - lowest) / spread

    def get_columns(self):
        """Return the column names of the dataset as a list."""
        return list(self.df.columns)

    def get_unique_genres(self, listed=False):
        """Return the distinct genres, ignoring songs with a missing genre.

        Args:
            listed: when true, return the list of genre names (in order of
                first appearance); otherwise return how many there are.
        """
        # dropna() copes with any number of missing genres, including none.
        genres = list(self.df['genre'].dropna().unique())
        if listed:
            return genres
        return len(genres)

    def get_unique_songs(self):
        """Return the number of distinct ``song_id`` values in the dataset."""
        return len(self.df['song_id'].unique())

    def get_unique_artists(self):
        """Return the number of distinct ``artist_name`` values in the dataset."""
        return len(self.df['artist_name'].unique())

    def _top_matches(self, limit, artist_list, genre_list):
        """Return the first ``limit`` rows of ``self.df`` that pass the filters."""
        if limit <= 0:
            return self.df.iloc[0:0]

        rows = self.df
        if artist_list is not None:
            rows = rows[rows['artist_name'].isin(list(artist_list))]
        if genre_list is not None:
            genre_ok = rows['genre'].isin(list(genre_list))
            if artist_list is not None:
                # When an artist filter is also given, songs by those artists
                # whose genre is unknown are still accepted.
                genre_ok = genre_ok | rows['genre'].isna()
            rows = rows[genre_ok]
        return rows.head(limit)

    def recommend(self, number_of_recommendations=20, artist_list=None, genre_list=None):
        """Print up to ``number_of_recommendations`` songs as ``title - artist``.

        Songs are taken in the current row order of ``self.df``.  Neither
        ``self.df`` nor the lists passed in are modified.

        Args:
            number_of_recommendations: maximum number of songs to print;
                zero or a negative value prints nothing.
            artist_list: if given, only songs whose ``artist_name`` is in this
                collection are considered.
            genre_list: if given, only songs whose ``genre`` is in this
                collection are considered.  When ``artist_list`` is given as
                well, songs with a missing genre are also accepted.
        """
        picks = self._top_matches(number_of_recommendations, artist_list, genre_list)
        for title, artist in zip(picks['title'], picks['artist_name']):
            print(title, "-", artist)

#### Creating our recommender and Analyzing the data.

In [4]:
# Importing Dependencies
import numpy as np
import pandas as pd

In [18]:
# Build the recommender (its constructor reads Dataset/popularity_based.csv)
# and preview the first ten rows of the loaded table
pr=popularity_recommender()
pr.df.head(10)

Unnamed: 0,song_id,play_count,track_id,title,artist_name,artist_familiarity,genre
0,SOFRQTD12A81C233C0,110479,TRDMBIJ128F4290431,Sehr kosmisch,Harmonia,0.505949,Pop_Rock
1,SOAUWYT12A81C206F1,90476,TRGXQES128F42BA5EB,Undo,Björk,0.853154,
2,SOAXGDH12A8C13F8A1,90444,TRHKJNX12903CEFCDF,Dog Days Are Over (Radio Edit),Florence + The Machine,0.818804,
3,SOBONKR12A58A7A7E0,84000,TRAEHHJ12903CF492F,You're The One,Dwight Yoakam,0.77537,Country
4,SOSXLTC12AF72A7F54,80656,TRONYHY128F92C9D11,Revelry,Kings Of Leon,0.845769,Pop_Rock
5,SONYKOW12AB01849C9,78353,TROAQBZ128F9326213,Secrets,OneRepublic,0.74441,Pop_Rock
6,SOEGIYH12A6D4FC0E3,69487,TRLGMFJ128F4217DBE,Horn Concerto No. 4 in E flat K495: II. Romanc...,Barry Tuckwell/Academy of St Martin-in-the-Fie...,0.340367,
7,SOLFXKT12AB017E3E0,64229,TRVSBTV12903CC6670,Fireflies,Charttraxx Karaoke,0.830279,
8,SODJWHY12A8C142CCE,63809,TRVCUSW128F92F20C6,Hey_ Soul Sister,Train,0.838886,
9,SOFLJQZ12A6D4FADA6,58610,TRTNDNE128F1486812,Tive Sim,Cartola,0.629097,


In [7]:
# List the column names of the loaded dataset
pr.get_columns()

['song_id',
 'play_count',
 'track_id',
 'title',
 'artist_name',
 'artist_familiarity',
 'genre']

In [8]:
# Report how many distinct songs and artists the dataset contains
song_total=pr.get_unique_songs()
print("Number of unique Songs:", song_total)
artist_total=pr.get_unique_artists()
print("Number of unique Artists:", artist_total)

Number of unique Songs: 98485
Number of unique Artists: 19065


In [10]:
# Number of distinct genres (the missing-genre placeholder is not counted)
print("Number of different genres:", pr.get_unique_genres())
# The genre names themselves, returned as a list
pr.get_unique_genres(listed=True)

Number of different genres: 21


['Pop_Rock',
 'Country',
 'Electronic',
 'RnB',
 'Rap',
 'Comedy_Spoken',
 'Jazz',
 'Folk',
 'Religious',
 'Latin',
 'Reggae',
 'Blues',
 'New Age',
 'Vocal',
 'Easy_Listening',
 'International',
 'Stage ',
 'Classical',
 'Holiday',
 'Avant_Garde',
 'Children']

#### Since this is a popularity-based model, the data must be put in order.
Here the songs are ordered by a new variable, `score`, which is a function of the features. You can change which features are taken into consideration, and change their weights, to improve the model.

In [19]:
# The features must be normalized so that they are on a common scale.
# Min-max normalization is used: x_norm = (x - x.min()) / (x.max() - x.min())
# The features being normalized are play_count and artist_familiarity.
pr.normalize_columns(['play_count','artist_familiarity'])

Normalized data looks like...

In [12]:
# Preview the first ten rows of the table
pr.df.head(10)

Unnamed: 0,song_id,play_count,track_id,title,artist_name,artist_familiarity,genre
0,SOFRQTD12A81C233C0,1.0,TRDMBIJ128F4290431,Sehr kosmisch,Harmonia,0.752974,Pop_Rock
1,SOAUWYT12A81C206F1,0.818861,TRGXQES128F42BA5EB,Undo,Björk,0.926577,
2,SOAXGDH12A8C13F8A1,0.818571,TRHKJNX12903CEFCDF,Dog Days Are Over (Radio Edit),Florence + The Machine,0.909402,
3,SOBONKR12A58A7A7E0,0.760217,TRAEHHJ12903CF492F,You're The One,Dwight Yoakam,0.887685,Country
4,SOSXLTC12AF72A7F54,0.729935,TRONYHY128F92C9D11,Revelry,Kings Of Leon,0.922884,Pop_Rock
5,SONYKOW12AB01849C9,0.70908,TROAQBZ128F9326213,Secrets,OneRepublic,0.872205,Pop_Rock
6,SOEGIYH12A6D4FC0E3,0.628793,TRLGMFJ128F4217DBE,Horn Concerto No. 4 in E flat K495: II. Romanc...,Barry Tuckwell/Academy of St Martin-in-the-Fie...,0.670184,
7,SOLFXKT12AB017E3E0,0.581179,TRVSBTV12903CC6670,Fireflies,Charttraxx Karaoke,0.915139,
8,SODJWHY12A8C142CCE,0.577376,TRVCUSW128F92F20C6,Hey_ Soul Sister,Train,0.919443,
9,SOFLJQZ12A6D4FADA6,0.530295,TRTNDNE128F1486812,Tive Sim,Cartola,0.814549,


In [20]:
# Score every song as 2*play_count + 3*artist_familiarity,
# then rank the table by that score, highest first
pr.df=(
    pr.df
    .assign(score=lambda frame: 2*frame['play_count']+3*frame['artist_familiarity'])
    .sort_values(by='score',ascending=False)
)

Data after sorting by score...

In [14]:
# Preview the first ten rows of the table
pr.df.head(10)

Unnamed: 0,song_id,play_count,track_id,title,artist_name,artist_familiarity,genre,score
1,SOAUWYT12A81C206F1,0.818861,TRGXQES128F42BA5EB,Undo,Björk,0.926577,,4.417452
2,SOAXGDH12A8C13F8A1,0.818571,TRHKJNX12903CEFCDF,Dog Days Are Over (Radio Edit),Florence + The Machine,0.909402,,4.365348
0,SOFRQTD12A81C233C0,1.0,TRDMBIJ128F4290431,Sehr kosmisch,Harmonia,0.752974,Pop_Rock,4.258923
4,SOSXLTC12AF72A7F54,0.729935,TRONYHY128F92C9D11,Revelry,Kings Of Leon,0.922884,Pop_Rock,4.228523
3,SOBONKR12A58A7A7E0,0.760217,TRAEHHJ12903CF492F,You're The One,Dwight Yoakam,0.887685,Country,4.183489
5,SONYKOW12AB01849C9,0.70908,TROAQBZ128F9326213,Secrets,OneRepublic,0.872205,Pop_Rock,4.034775
8,SODJWHY12A8C142CCE,0.577376,TRVCUSW128F92F20C6,Hey_ Soul Sister,Train,0.919443,,3.91308
12,SOUVTSM12AC468F6A7,0.461582,TREQNRF12903CF2405,Drop The World,Lil Wayne / Eminem,0.994969,,3.908071
7,SOLFXKT12AB017E3E0,0.581179,TRVSBTV12903CC6670,Fireflies,Charttraxx Karaoke,0.915139,,3.907776
10,SOUSMXX12AB0185C24,0.481848,TRSLDDC12903CC36E7,OMG,Usher featuring will.i.am,0.928225,,3.74837


##### Now testing our model with our own preferences...

In [21]:
# Recommend songs without any artist or genre preference
# (prints up to the default of 20 songs, in the table's current order)
pr.recommend()

Undo - Björk
Dog Days Are Over (Radio Edit) - Florence + The Machine
Sehr kosmisch - Harmonia
Revelry - Kings Of Leon
You're The One - Dwight Yoakam
Secrets - OneRepublic
Hey_ Soul Sister - Train
Drop The World - Lil Wayne / Eminem
Fireflies - Charttraxx Karaoke
OMG - Usher featuring will.i.am
Use Somebody - Kings Of Leon
Alejandro - Lady GaGa
The Scientist - Coldplay
Marry Me - Train
Just Dance - Lady GaGa / Colby O'Donis
The Only Exception (Album Version) - Paramore
Clocks - Coldplay
Creep (Explicit) - Radiohead
Tive Sim - Cartola
Yellow - Coldplay


In [29]:
# Preferences used by the filtered recommendation calls:
# genres to keep and artists to keep
genre_list=['Pop_Rock','Electronic','New Age','Rap']
artist_list=['Avril Lavigne','Coldplay','Taylor Swift']

In [23]:
# Recommendations restricted to songs whose genre is in genre_list
pr.recommend(genre_list=genre_list)

Sehr kosmisch - Harmonia
Revelry - Kings Of Leon
Secrets - OneRepublic
Use Somebody - Kings Of Leon
Alejandro - Lady GaGa
The Scientist - Coldplay
Clocks - Coldplay
Creep (Explicit) - Radiohead
Yellow - Coldplay
Uprising - Muse
Bring Me To Life - Evanescence
Bulletproof - La Roux
Party In The U.S.A. - Miley Cyrus
Heartbreak Warfare - John Mayer
Canada - Five Iron Frenzy
Invalid - Tub Ring
Seven Nation Army - The White Stripes
Without Me - Eminem
In The End (Album Version) - Linkin Park
Tighten Up - The Black Keys


In [30]:
# Recommendations restricted to songs whose artist is in artist_list
pr.recommend(artist_list=artist_list)

The Scientist - Coldplay
Clocks - Coldplay
Yellow - Coldplay
Love Story - Taylor Swift
Fix You - Coldplay
You Belong With Me - Taylor Swift
Shiver - Coldplay
In My Place - Coldplay
My Happy Ending - Avril Lavigne
Speed Of Sound - Coldplay
Tim McGraw - Taylor Swift
Our Song - Taylor Swift
I'm With You - Avril Lavigne
Don't Panic - Coldplay
When You're Gone - Avril Lavigne
Complicated - Avril Lavigne
Tomorrow - Avril Lavigne
Alice - Avril Lavigne
Fall To Pieces - Avril Lavigne
Things I'll Never Say - Avril Lavigne


In [31]:
# Recommendations restricted by both artist and genre; songs by the chosen
# artists whose genre is missing are matched as well
pr.recommend(artist_list=artist_list, genre_list=genre_list)

The Scientist - Coldplay
Clocks - Coldplay
Yellow - Coldplay
Love Story - Taylor Swift
Fix You - Coldplay
Shiver - Coldplay
In My Place - Coldplay
My Happy Ending - Avril Lavigne
Speed Of Sound - Coldplay
I'm With You - Avril Lavigne
Don't Panic - Coldplay
When You're Gone - Avril Lavigne
Complicated - Avril Lavigne
Tomorrow - Avril Lavigne
Alice - Avril Lavigne
Fall To Pieces - Avril Lavigne
Things I'll Never Say - Avril Lavigne
Sparks - Coldplay
Brothers & Sisters - Coldplay
Lost! - Coldplay
