In [1]:
#Import Statements
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import numpy as np

## Dataset

In [2]:
# Location of the tab-separated (user, song, play_count) triplets file.
filepath = '../data/train_triplets.txt'

# The file has no header row, so the three column names are supplied explicitly.
triplet_dataset = pd.read_csv(
    filepath,
    sep = '\t',
    header = None,
    names = ['user','song','play_count'],
)

In [3]:
# Preview the first rows to confirm the three columns were parsed as expected.
triplet_dataset.head()

Unnamed: 0,user,song,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1


In [4]:
# (rows, columns) of the full triplet table.
triplet_dataset.shape

(48373586, 3)

## Exploratory Data Analysis (EDA):


In [5]:
#Total play count per user.
#The per-user totals let us concentrate on the users that constitute a large
#percentage of all play counts.
#The file is streamed line by line, so only the running totals are kept in memory here.
#NOTE(review): triplet_dataset (loaded above) holds the same rows, so a groupby on
#'user' would presumably give the same totals without a second pass over the file.

output_dict = {}

with open(filepath) as f:
    for line in f:
        #Each line is "user<TAB>song<TAB>play_count"; split it only once.
        fields = line.split('\t')
        user = fields[0]
        #One read-modify-write per line; int() tolerates the trailing newline.
        output_dict[user] = output_dict.get(user, 0) + int(fields[2])

#The file is closed at this point: the steps below only need output_dict.
#Passing `columns` keeps the frame well-formed even when the file is empty.
user_count_df = pd.DataFrame(list(output_dict.items()),columns = ['user','play_count'])
#Most active users first, renumbered 0..n-1.
user_count_df = user_count_df.sort_values(by='play_count',ascending = False).reset_index(drop = True)
#Optional cache of the result:
#user_count_df.to_csv(path_or_buf = '../data/user_playcount_df.csv',index = False)

In [6]:
# Users with the highest total play counts (the frame is sorted descending).
user_count_df.head()

Unnamed: 0,user,play_count
0,093cb74eb3c517c5179ae24caf0ebec51b24d2a2,13132
1,119b7c88d58d0c6eb051365c103da5caf817bea6,9884
2,3fa44653315697f42410a30cb766a4eb102080bb,8210
3,a2679496cd0af9779a92a13ff7c6af5c81ea8c7b,7015
4,d7d2d888ae04d16e994d6964214a1de81392ee04,6494


In [None]:
#Total play count per song.
#The per-song totals let us concentrate on the songs that constitute a large
#percentage of all play counts.
#The file is streamed line by line, so only the running totals are kept in memory here.
#NOTE(review): triplet_dataset (loaded above) holds the same rows, so a groupby on
#'song' would presumably give the same totals without another pass over the file.

output_dict = {}

with open(filepath) as f:
    for line in f:
        #Each line is "user<TAB>song<TAB>play_count"; split it only once.
        fields = line.split('\t')
        song = fields[1]
        #One read-modify-write per line; int() tolerates the trailing newline.
        output_dict[song] = output_dict.get(song, 0) + int(fields[2])

#The file is closed at this point: the steps below only need output_dict.
#Passing `columns` keeps the frame well-formed even when the file is empty.
song_count_df = pd.DataFrame(list(output_dict.items()),columns = ['song','play_count'])
#Most played songs first, renumbered 0..n-1.
song_count_df = song_count_df.sort_values(by='play_count',ascending = False).reset_index(drop = True)
#Optional cache of the result:
#song_count_df.to_csv(path_or_buf = '../data/song_playcount_df.csv',index = False)

In [None]:

# Songs with the highest total play counts (the frame is sorted descending).
song_count_df.head()

In [None]:
# One row per distinct user, so the row count is the number of unique users.
user_count_df.shape

In [None]:
# One row per distinct song, so the row count is the number of unique songs.
song_count_df.shape

In [None]:
#Percentage of all plays contributed by the 100,000 most active users.
#user_count_df is sorted by play_count descending, so head() selects the top users.
#n = 100000 is the cut-off the notebook uses to reach roughly 40% of play counts.
total_play_count = user_count_df.play_count.sum()  #vectorised sum instead of Python's builtin sum()
(float(user_count_df.head(n=100000).play_count.sum())/total_play_count)*100

In [None]:
#Percentage of all plays contributed by the 30,000 most played songs.
#song_count_df is sorted by play_count descending, so head() selects the top songs.
#n = 30000 is the cut-off the notebook uses to reach roughly 80% of play counts.
total_play_count = song_count_df.play_count.sum()  #vectorised sum instead of Python's builtin sum()
(float(song_count_df.head(n=30000).play_count.sum())/total_play_count)*100

In [None]:
#Restrict the analysis to the heaviest users and the most played songs.
#Both count frames are ordered by play_count descending, so head() keeps the top entries.
user_count_subset = user_count_df.head(n=100000)
song_count_subset = song_count_df.head(n=30000)

#Identifier columns used to filter the triplet table.
user_subset = user_count_subset['user']
song_subset = song_count_subset['song']

In [None]:
#Keep only the triplets whose user AND song both belong to the selected subsets.
#Step 1: rows of the selected users.
triplet_dataset_sub = triplet_dataset[triplet_dataset.user.isin(user_subset)]
#The full table is released as soon as it has been filtered.
#NOTE: because of this del, re-running the cell requires reloading triplet_dataset first.
del triplet_dataset

#Step 2: of those rows, the ones for the selected songs, renumbered 0..n-1.
triplet_dataset_sub_song = (
    triplet_dataset_sub[triplet_dataset_sub.song.isin(song_subset)]
    .reset_index(drop = True)
)
del triplet_dataset_sub

In [None]:
#Preview of the final subset (triplets restricted to the selected users and songs)
triplet_dataset_sub_song.head()

In [None]:
#Number of rows and columns in the final subset, i.e. after both the user and song filters
triplet_dataset_sub_song.shape

### Enhancing The Data:


In [None]:
# Open the SQLite track-metadata database; `conn` is reused by the query in the next cell.
conn = sqlite3.connect('../data/track_metadata.db')
cur = conn.cursor()
# List the tables stored in the database (sqlite_master is SQLite's schema table).
cur.execute(
    "select name from sqlite_master where type = 'table'"
).fetchall()

In [None]:
# Load every row and column of the `songs` table into a DataFrame.
track_metadata_df = pd.read_sql_query("SELECT * from songs", conn)


In [None]:
# Preview the raw metadata columns before trimming them.
track_metadata_df.head()

In [None]:
# Discard the metadata columns that the merge and plots in this notebook do not read.
track_metadata_df = track_metadata_df.drop(
    columns = [
        'track_id',
        'artist_mbid',
        'artist_id',
        'duration',
        'artist_familiarity',
        'artist_hotttnesss',
        'track_7digitalid',
        'shs_perf',
        'shs_work',
    ]
)

In [None]:
# Keep a single metadata row per song_id (the first one encountered), so that
# joining on song_id cannot multiply triplet rows.
track_metadata_df = track_metadata_df.drop_duplicates(subset = ['song_id'])

In [None]:
# Attach track metadata to every triplet. A left join keeps triplets whose song
# has no metadata row; their metadata columns are left empty.
triple_dataset_merged = triplet_dataset_sub_song.merge(
    track_metadata_df,
    how = 'left',
    left_on = 'song',
    right_on = 'song_id',
)

In [None]:
# Preview the merged table: triplet columns followed by the joined metadata columns.
triple_dataset_merged.head()

In [None]:
# Rename play_count to listen_count and drop song_id, which duplicates the
# `song` column it was joined on.
triple_dataset_merged = (
    triple_dataset_merged
    .rename(columns = {'play_count':'listen_count'})
    .drop(columns = ['song_id'])
)
triple_dataset_merged.head()

### Visual Analysis

In [None]:
# Restore matplotlib's default rc settings before plotting.
plt.rcdefaults()

# Total listen_count per song title.
# NOTE(review): grouping by title alone adds together distinct songs that share
# a title - confirm this is intended.
popular_songs = (
    triple_dataset_merged
    .groupby('title')['listen_count']
    .sum()
    .reset_index()
)

In [None]:
# Preview the per-title totals (ordered by title at this point, not by listen_count).
popular_songs.head()

In [None]:
# The 20 titles with the highest total listen_count, renumbered 0..19.
popular_songs_top_20 = (
    popular_songs
    .sort_values('listen_count',ascending = False)
    .head(n=20)
    .reset_index(drop = True)
)

In [None]:
# Preview the most listened titles.
popular_songs_top_20.head()

In [None]:
# Inputs for the bar chart drawn in the next cell:
# bar labels, bar heights, and one x position per bar.
objects = popular_songs_top_20['title'].tolist()
performance = popular_songs_top_20['listen_count'].tolist()
y_pos = np.arange(len(objects))

In [None]:
# Bar chart of total listens for the titles in popular_songs_top_20.
fig, ax = plt.subplots()
ax.bar(y_pos,performance,align = 'center',alpha = 0.5)
# One tick per bar, labelled with the title and rotated so long titles stay legible.
ax.set_xticks(y_pos)
ax.set_xticklabels(objects,rotation = 'vertical')
ax.set_ylabel('Number of times listened')
ax.set_title('Most popular songs')
plt.show()

#### Most Popular Artists

In [None]:
# Restore matplotlib's default rc settings before plotting.
plt.rcdefaults()

# Total listen_count per artist name.
popular_artists = (
    triple_dataset_merged
    .groupby('artist_name')['listen_count']
    .sum()
    .reset_index()
)
# The 20 artists with the highest totals, renumbered 0..19.
popular_artists_top_20 = (
    popular_artists
    .sort_values('listen_count',ascending = False)
    .head(n=20)
    .reset_index(drop = True)
)

# Bar labels, bar heights, and one x position per bar.
objects = popular_artists_top_20['artist_name'].tolist()
performance = popular_artists_top_20['listen_count'].tolist()
y_pos = np.arange(len(objects))

fig, ax = plt.subplots()
ax.bar(y_pos,performance,align = 'center',alpha = 0.5)
# One tick per bar, labelled with the artist and rotated so long names stay legible.
ax.set_xticks(y_pos)
ax.set_xticklabels(objects,rotation = 'vertical')
ax.set_ylabel('Number of times listened')
ax.set_title('Most popular artists')
plt.show()

### Popularity Based Recommender

In [None]:
def create_popularity_recommendation(train_data,user_id,item_id,top_n = 20):
    """Rank items by how many rows of ``train_data`` mention them.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Interaction table containing at least the ``user_id`` and ``item_id`` columns.
    user_id : str
        Name of the column that is counted per item. Non-null entries are
        counted, so a user appearing on several rows for the same item is
        counted once per row.
    item_id : str
        Name of the column identifying the item to recommend.
    top_n : int, optional
        Number of recommendations to return. Defaults to 20, the value that
        was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The ``top_n`` highest-scoring items with columns ``item_id``,
        ``score`` (the count) and ``Rank`` (1.0 for the best item). Rows are
        ordered by score descending, ties broken by ``item_id`` ascending;
        the index still holds the positions from the grouped frame.
    """
    #Count of user_id entries for each unique item, used as the recommendation score
    train_data_grouped = (
        train_data
        .groupby([item_id])
        .agg({user_id:'count'})
        .reset_index()
        .rename(columns = {user_id:'score'})
    )

    #Best score first; item_id ascending makes the order of ties deterministic
    train_data_sort = train_data_grouped.sort_values(['score',item_id],ascending = [False,True])

    #method='first' numbers ties in their sorted order, so Rank runs 1, 2, 3, ...
    train_data_sort['Rank'] = train_data_sort['score'].rank(ascending = False,method = 'first')

    #Keep only the requested number of recommendations
    popularity_recommendations = train_data_sort.head(top_n)
    return popularity_recommendations

In [None]:
recommendations = create_popularity_recommendation(triple_dataset_merged,'user','title')
#Top titles ranked by popularity. The score is the number of rows (user entries)
#recorded for each title, not the summed listen_count.
recommendations