In [2]:
import pandas as pd
import json
import os
import numpy as np
import sys

# ignore warnings - iterating through the grid search will throw a depreciation error
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

In [95]:
# first, extract the data we need from the various JSON files

file_locations = [r'C:\Users\roald\Documents\lastfm_test', r'C:\Users\roald\Documents\lastfm_train']

files_to_mine = []
for f in file_locations:
    for f1 in os.listdir(f):
        folder_location_1 = f + '\\' + f1
        for f2 in os.listdir(folder_location_1):
            folder_location_2 = folder_location_1 + '\\' + f2
            for f3 in os.listdir(folder_location_2):
                folder_location_3 = folder_location_2 + '\\' + f3
                for json_music_file in os.listdir(folder_location_3):
                    full_file_location = folder_location_3 + '\\' + json_music_file
                    files_to_mine.append(full_file_location)

print(len(files_to_mine), 'music files have been downloaded')

943334 music files have been downloaded


In [96]:
# extract the data from the json files and load into a normalized data model

all_songs = {}
for file in files_to_mine:
    with open(file, 'r') as f:
        music_file_contents = json.load(f)
        track_id = music_file_contents['track_id']
        all_songs[track_id] = music_file_contents

all_songs_dataframe = pd.DataFrame.from_dict(all_songs, orient='index')
print(all_songs_dataframe)

# write a csv file for the 'songs' table
song_table = all_songs_dataframe[['artist', 'timestamp', 'title']]
song_table.to_csv('song_table.csv')
print('song table has been generated and saved to csv')

# write a csv file for the similarity of songs
similar_songs = []
similars_dataframe = all_songs_dataframe[['similars']]
for song_id, row in similars_dataframe.iterrows():
    if len(row['similars']) > 0:
        for similar_song_info in row['similars']:
            info_to_append = [song_id] + similar_song_info
            similar_songs.append(info_to_append)
similars_dataframe = pd.DataFrame(similar_songs, columns=['song_id', 'similar_song_id', 'similarity_score']).set_index('song_id')
similars_dataframe.to_csv('song_similarity_table.csv')
print('similar song table has been generated and saved to csv')

# write a csv file for the tags
song_tags = []
tags_dataframe = all_songs_dataframe[['tags']]
for song_id, row in tags_dataframe.iterrows():
    if len(row['tags']) > 0:
        for song_tag_info in row['tags']:
            info_to_append = [song_id] + song_tag_info
            song_tags.append(info_to_append)
tags_dataframe = pd.DataFrame(song_tags, columns=['song_id', 'tag', 'tag_frequency']).set_index('song_id')
tags_dataframe.to_csv('song_tags_table.csv')
print('song tag table has been generated and saved to csv')


                                               artist  \
TRAAAAK128F9318786                       Adelitas Way   
TRAAAAV128F421A322                  Western Addiction   
TRAAAAW128F429D538                             Casual   
TRAAABD128F429CF47                       The Box Tops   
TRAAACV128F423E09E                       Super Deluxe   
TRAAADJ128F4287B47  Big Brother & The Holding Company   
TRAAADT12903CCC339                      Stanley Black   
TRAAADZ128F9348C2E                La Sonora Santanera   
TRAAAEA128F935A30D                      Jonathan King   
TRAAAED128E0783FAB                       Jamie Cullum   
TRAAAEF128F4273421                           Adam Ant   
TRAAAEM128F93347B9                           Son Kite   
TRAAAEW128F42930C0                    Broken Spindles   
TRAAAFD128F92F423A                                Gob   
TRAAAFI12903CE4F0E                  Minni The Moocher   
TRAAAFP128F931B4E3         F.L.Y. (Fly Life Yungstaz)   
TRAAAGF12903CEC202             

song table has been generated and saved to csv
similar song table has been generated and saved to csv
song tag table has been generated and saved to csv


In [100]:
# load the various raw dataframes into memory
songs = pd.read_csv('song_table.csv', encoding='latin-1')
song_tags = pd.read_csv('song_tags_table.csv', encoding='latin-1')
song_ratings = pd.read_csv('C:/Users/roald/Documents/music_ratings.csv')

print(list(songs.columns))
print(list(song_tags.columns))
print(list(song_ratings.columns))

# merge the songs, song tags and song ratings tables
rated_songs = pd.merge(left=song_ratings, right=songs, left_on=['artist', 'song'], right_on=['artist', 'title'], how='inner')
rated_songs['song_id'] = rated_songs['Unnamed: 0']
rated_songs = rated_songs[['song_id', 'rating', 'artist', 'title', 'timestamp']]
rated_songs = pd.merge(left=rated_songs, right=song_tags, left_on='song_id', right_on='song_id')

# produce a dataframe with one song per row, the rating, artist and the various song tags as columns
rated_songs['value'] = 1
rated_songs = rated_songs.pivot_table(index=['song_id', 'rating', 'artist', 'title'], columns='tag', values='value', aggfunc=np.sum)

# remove any tags that appear fewer than 3 times to reduce the dimensionality of the dataset
rated_songs.drop([col for col, val in rated_songs.count().iteritems() if val < 3], axis=1, inplace=True)
rated_songs = rated_songs.fillna(0).reset_index()
rated_songs = rated_songs.drop_duplicates(subset='song_id')

# add in dummy variables for the artist name
dummified_artists = pd.get_dummies(rated_songs['artist'])
rated_songs = pd.concat([rated_songs, dummified_artists], axis=1)

rated_songs.to_csv('full_song_featureset.csv')
print(rated_songs.head())

['Unnamed: 0', 'artist', 'timestamp', 'title']
['song_id', 'tag', 'tag_frequency']
['rating', 'artist', 'song']
              song_id  rating                 artist                  title  \
0  TRAAKAS128F4246013       7         Porcupine Tree                Waiting   
1  TRAALAH128E078234A       7              The Verve  Bitter Sweet Symphony   
2  TRABIOA128F42A6DB1       9            Marvin Gaye        What's Going On   
3  TRABPUY12903CE380F       7  Crosby, Stills & Nash         Long Time Gone   
4  TRACWWF128F9346965       8             Nick Drake  Things Behind The Sun   

    -3   -4  -chill-trip-lounge-down-  -pop-and-hip-hop-  -yulunga-  00s  \
0  0.0  0.0                       0.0                0.0        0.0  0.0   
1  0.0  0.0                       0.0                0.0        1.0  0.0   
2  0.0  0.0                       0.0                0.0        0.0  0.0   
3  0.0  0.0                       0.0                0.0        0.0  0.0   
4  0.0  0.0                      

In [3]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import re

rated_songs = pd.read_csv('full_song_featureset.csv', encoding='latin-1')
y = rated_songs['rating']
X = rated_songs.drop(['song_id', 'rating', 'artist', 'title'], axis=1)

# xgboost does not allow for the feature names to contain the symbols [, ], or < - replace these by an underscore
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
X.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X.columns.values]

# split the dataset into a train and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# instantiate a 'vanilla' xgboost classifier to get an idea of baseline performance of this model
xg_model = xgb.XGBClassifier()
xg_model.fit(X_train, y_train)
predictions = xg_model.predict(X_test)

print(classification_report(y_test, predictions))


             precision    recall  f1-score   support

          1       1.00      1.00      1.00         1
          3       0.00      0.00      0.00         0
          4       1.00      0.33      0.50         9
          5       1.00      0.43      0.60         7
          6       0.81      0.63      0.71        35
          7       0.60      0.78      0.68        50
          8       0.62      0.72      0.67        39
          9       0.71      0.57      0.63        21

avg / total       0.71      0.67      0.66       162



  if diff:


In [12]:
# now, let's see if we can improve on this baseline by performing a grid search with cross validation

from sklearn.grid_search import GridSearchCV

# instatiate a fresh, unfitted xgboost classifier
xgb_model = xgb.XGBClassifier()


# we will follow the hyperparameter tuning logic laid out in the following article:
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
# Step 1 - run a relatively broad grid search for max_depth and min_child_weight

param_test1 = {
 'max_depth': [5, 7, 9],
 'min_child_weight': [1, 3, 5]
}

gsearch1 = GridSearchCV(estimator = xgb_model, param_grid = param_test1, cv=5)

gsearch1.fit(X_train, y_train)
print(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)




In [120]:
# at this stage, the best max_depth appears to be 7, and the best min_child_weight is 1.
# following the logic of the article, let's move to step 2 - optimizing these values within a tighter range

param_test2 = {
 'max_depth': [6, 7, 8],
 'min_child_weight': [0.5, 1, 1.5]
}

gsearch2 = GridSearchCV(estimator = xgb_model, param_grid = param_test2, cv=5)

gsearch2.fit(X_train, y_train)
print(gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_)

[mean: 0.59756, std: 0.05325, params: {'max_depth': 6, 'min_child_weight': 0.5}, mean: 0.59756, std: 0.05659, params: {'max_depth': 6, 'min_child_weight': 1}, mean: 0.59146, std: 0.03353, params: {'max_depth': 6, 'min_child_weight': 1.5}, mean: 0.60976, std: 0.06375, params: {'max_depth': 7, 'min_child_weight': 0.5}, mean: 0.61280, std: 0.05285, params: {'max_depth': 7, 'min_child_weight': 1}, mean: 0.60976, std: 0.04745, params: {'max_depth': 7, 'min_child_weight': 1.5}, mean: 0.59756, std: 0.04602, params: {'max_depth': 8, 'min_child_weight': 0.5}, mean: 0.59756, std: 0.05728, params: {'max_depth': 8, 'min_child_weight': 1}, mean: 0.59451, std: 0.05489, params: {'max_depth': 8, 'min_child_weight': 1.5}] {'max_depth': 7, 'min_child_weight': 1} 0.6128048780487805


In [125]:
# after step 2 of hyperparameter optimization, the optimal max_depth still.seems to be 7 and the best min_child weight is still 1.
# now, let's move to step 3 - tuning gamma. Gamma specifies the minimum loss reduction required to make a split. In other words, it is a regularization parameter. The higher gamma is, the more regularization there is.

param_test3 = {
                'max_depth':[7],
                'min_child_weight':[1],
                'gamma':[0, 0.3, 0.6, 1]
                }

gsearch3 = GridSearchCV(estimator = xgb_model, param_grid = param_test3, cv=5)

gsearch3.fit(X_train, y_train)
print(gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_)

[mean: 0.62805, std: 0.03377, params: {'gamma': 0, 'max_depth': 7, 'min_child_weight': 1}, mean: 0.64024, std: 0.02339, params: {'gamma': 0.3, 'max_depth': 7, 'min_child_weight': 1}, mean: 0.63415, std: 0.02804, params: {'gamma': 0.6, 'max_depth': 7, 'min_child_weight': 1}, mean: 0.57622, std: 0.03290, params: {'gamma': 1, 'max_depth': 7, 'min_child_weight': 1}] {'gamma': 0.3, 'max_depth': 7, 'min_child_weight': 1} 0.6402439024390244


In [127]:
# the optimal gamma appears to be 0.3. In the next round, we will tune subsample and colsample_bytree

param_test4 = {
                'max_depth':[7],
                'min_child_weight':[1],
                'gamma':[0.3],
                'subsample':[0.6, 0.8, 1],
                'colsample_bytree':[0.6, 0.8, 1]
}

gsearch4 = GridSearchCV(estimator = xgb_model, param_grid = param_test4, cv=5)

gsearch4.fit(X_train, y_train)
print(gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_)
    

[mean: 0.60671, std: 0.02543, params: {'colsample_bytree': 0.6, 'gamma': 0.3, 'max_depth': 7, 'min_child_weight': 1, 'subsample': 0.6}, mean: 0.62500, std: 0.05322, params: {'colsample_bytree': 0.6, 'gamma': 0.3, 'max_depth': 7, 'min_child_weight': 1, 'subsample': 0.8}, mean: 0.62195, std: 0.03361, params: {'colsample_bytree': 0.6, 'gamma': 0.3, 'max_depth': 7, 'min_child_weight': 1, 'subsample': 1}, mean: 0.58841, std: 0.03862, params: {'colsample_bytree': 0.8, 'gamma': 0.3, 'max_depth': 7, 'min_child_weight': 1, 'subsample': 0.6}, mean: 0.61280, std: 0.04957, params: {'colsample_bytree': 0.8, 'gamma': 0.3, 'max_depth': 7, 'min_child_weight': 1, 'subsample': 0.8}, mean: 0.61585, std: 0.03374, params: {'colsample_bytree': 0.8, 'gamma': 0.3, 'max_depth': 7, 'min_child_weight': 1, 'subsample': 1}, mean: 0.59146, std: 0.04011, params: {'colsample_bytree': 1, 'gamma': 0.3, 'max_depth': 7, 'min_child_weight': 1, 'subsample': 0.6}, mean: 0.60366, std: 0.03930, params: {'colsample_bytree': 1,

In [128]:
# the optimal values for subsample and colsample_bytree are both 1. now, let's tune reg_alpha - another regularization parameter 

param_test5 = {
                'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05],
                'max_depth':[7],
                'min_child_weight':[1],
                'gamma':[0.3],
                'subsample':[1],
                'colsample_bytree':[1]
                }

gsearch5 = GridSearchCV(estimator = xgb_model, param_grid = param_test5, cv=5)

gsearch5.fit(X_train, y_train)
print(gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_)

[mean: 0.64024, std: 0.02339, params: {'colsample_bytree': 1, 'gamma': 0.3, 'max_depth': 7, 'min_child_weight': 1, 'reg_alpha': 0, 'subsample': 1}, mean: 0.64329, std: 0.03423, params: {'colsample_bytree': 1, 'gamma': 0.3, 'max_depth': 7, 'min_child_weight': 1, 'reg_alpha': 0.001, 'subsample': 1}, mean: 0.63720, std: 0.02990, params: {'colsample_bytree': 1, 'gamma': 0.3, 'max_depth': 7, 'min_child_weight': 1, 'reg_alpha': 0.005, 'subsample': 1}, mean: 0.63110, std: 0.02428, params: {'colsample_bytree': 1, 'gamma': 0.3, 'max_depth': 7, 'min_child_weight': 1, 'reg_alpha': 0.01, 'subsample': 1}, mean: 0.63415, std: 0.04240, params: {'colsample_bytree': 1, 'gamma': 0.3, 'max_depth': 7, 'min_child_weight': 1, 'reg_alpha': 0.05, 'subsample': 1}] {'colsample_bytree': 1, 'gamma': 0.3, 'max_depth': 7, 'min_child_weight': 1, 'reg_alpha': 0.001, 'subsample': 1} 0.6432926829268293


In [14]:
# almost done! finally, let's tune the optimal number of estimators. 

param_test6 = {
                'reg_alpha':[0.001],
                'max_depth':[7],
                'min_child_weight':[1],
                'gamma':[0.3],
                'subsample':[1],
                'colsample_bytree':[1],
                'n_estimators':[10, 20, 40, 60, 100]
                }

gsearch6 = GridSearchCV(estimator = xgb_model, param_grid = param_test6, cv=10)

gsearch6.fit(X_train, y_train)
print(gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


[mean: 0.54268, std: 0.05221, params: {'colsample_bytree': 1, 'gamma': 0.3, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 10, 'reg_alpha': 0.001, 'subsample': 1}, mean: 0.58841, std: 0.04654, params: {'colsample_bytree': 1, 'gamma': 0.3, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 20, 'reg_alpha': 0.001, 'subsample': 1}, mean: 0.62195, std: 0.06135, params: {'colsample_bytree': 1, 'gamma': 0.3, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 40, 'reg_alpha': 0.001, 'subsample': 1}, mean: 0.62195, std: 0.04949, params: {'colsample_bytree': 1, 'gamma': 0.3, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 60, 'reg_alpha': 0.001, 'subsample': 1}, mean: 0.63415, std: 0.05649, params: {'colsample_bytree': 1, 'gamma': 0.3, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0.001, 'subsample': 1}] {'colsample_bytree': 1, 'gamma': 0.3, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0.001, 'subsample': 1} 0.6341463

In [15]:
# the optimal value for reg_alpha is 0.001. we could continue optimizing the hyperparameters into oblivion, but will settle for the current specifications.
# now, train the model on the full training set and evaluate the performance on the test set. This should compare favorably to the initial benchmark!
# we will set a lower learning rate than the standard to (hopefully!) find the best solution.

xg_model = xgb.XGBClassifier(eta=0.01, colsample_bytree=1, gamma=0.3, max_depth=7, min_child_weight=1, reg_alpha=0.001, subsample=1, n_estimators=100)
xg_model.fit(X_train, y_train)

predictions = xg_model.predict(X_test)

print(classification_report(y_test, predictions))

             precision    recall  f1-score   support

          1       1.00      1.00      1.00         1
          4       1.00      0.33      0.50         9
          5       1.00      0.43      0.60         7
          6       0.81      0.63      0.71        35
          7       0.62      0.78      0.69        50
          8       0.65      0.77      0.71        39
          9       0.74      0.67      0.70        21

avg / total       0.72      0.69      0.69       162



  if diff:


NameError: name '_________________' is not defined

In [None]:
# now, train the model on the full dataset. 

xg_model = xgb.XGBClassifier(eta=0.01, colsample_bytree=1, gamma=0.3, max_depth=7, min_child_weight=1, reg_alpha=0.001, subsample=1, n_estimators=100)
xg_model.fit(X, y)

In [None]:
# now, generate predictions for the full dataset using the trained model. Let's see which new songs I should be listening to!
# we will only generate predictions for songs that have artist names and tags that also appeared in our original dataset
# the other songs will have zero values for all independent variables, thus yielding the same prediction.

# extract a list with the names of all the independent variables
full_song_featureset = pd.read_csv('full_song_featureset.csv', encoding='latin-1')
relevant_features = list(full_song_featureset.columns)[5:]

songs = pd.read_csv('song_table.csv', encoding='latin-1', names=['song_id', 'artist', 'timestamp', 'title'], skiprows=1)
songs = songs[['song_id', 'artist']]
songs = songs[songs['artist'].isin(relevant_features)]
songs.columns = ['song_id', 'attribute']

song_tags = pd.read_csv('song_tags_table.csv', encoding='latin-1')
song_tags = song_tags[['song_id', 'tag']]
song_tags = song_tags[song_tags['tag'].isin(relevant_features)]
song_tags.columns = ['song_id', 'attribute']
                     
all_song_info = pd.concat([songs, song_tags], axis=0)
all_song_info['count'] = 1
all_song_info = all_song_info.drop_duplicates()

all_song_info = all_song_info.pivot_table(index=['song_id'], columns='attribute', values='count', aggfunc=np.sum)
all_song_info = all_song_info[relevant_features]       

In [None]:
song_predictions = xg_model.predict(all_song_info)
print(song_predictions)