# Final Tutorial: Spotify Data

In [127]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels as sm
import statsmodels.graphics.regressionplots as smg
import sklearn
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.svm import LinearSVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor

# Set the Agg backend's path chunk size to 10000.
# NOTE(review): presumably needed so that very large line plots render without
# an Agg overflow error -- confirm against the plots further down the notebook.
mpl.rcParams['agg.path.chunksize'] = 10000

In [128]:
pip install recordlinkage

Note: you may need to restart the kernel to use updated packages.


In [129]:
import recordlinkage

In [130]:
# Load the Billboard Hot 100 export and discard the columns listed below
# before any further processing.
top_path = "./BillboardFromLast20/billboardHot100_1999-2019.csv"
unused_columns = ['Unnamed: 0', 'Writing.Credits', 'Lyrics', 'Features']
top_data = pd.read_csv(top_path).drop(columns=unused_columns)
top_data.head()

ValueError: Index Billboard_Index invalid

In [None]:
# Load the Spotify track attributes and keep only tracks whose 'year' is 2018
# or later.
attr_path = "./spotify-dataset-19212020-160k-tracks/data.csv"
attr_data = pd.read_csv(attr_path)
# .copy() makes the filtered result an independent frame: a later cell assigns
# a new 'release_date' column into attr_data, and doing that on a boolean-mask
# slice triggers pandas' SettingWithCopyWarning.
attr_data = attr_data[attr_data['year'] >= 2018].copy()
attr_data.head()

In [None]:
# Convert the date columns of both datasets to datetimes, then derive the
# chart year of every Billboard row from its parsed 'Date'.
attr_data['release_date'] = pd.to_datetime(attr_data['release_date'])
top_data['Date'] = pd.to_datetime(top_data['Date'])
top_data['Year'] = top_data['Date'].map(lambda chart_date: chart_date.year)

In [None]:
# Keep only the chart rows whose Year is 2018 or later.
recent_mask = top_data['Year'] >= 2018
top_data = top_data.loc[recent_mask]

In [None]:
# Collapse the chart rows into one row per (Artists, Name, Year):
# lowest weekly rank and peak position, highest weeks-on-chart count, and the
# first Genre value seen in each group.
f = {
    'Weekly.rank': 'min',
    'Peak.position': 'min',
    'Weeks.on.chart': 'max',
    'Genre': 'first',
}
chart_groups = top_data.groupby(['Artists', 'Name', 'Year'], as_index=False)
top_data = chart_groups.agg(f)

In [None]:
# Collapse the Spotify rows into one row per (artists, name, year).
# Most attributes are averaged; 'explicit' takes the group maximum, while
# 'key' and 'mode' take the first value of the group.
# Keyword order is kept as-is because it determines the output column order.
f = dict(
    valence='mean',
    acousticness='mean',
    danceability='mean',
    duration_ms='mean',
    energy='mean',
    explicit='max',
    instrumentalness='mean',
    key='first',
    liveness='mean',
    loudness='mean',
    mode='first',
    popularity='mean',
    speechiness='mean',
    tempo='mean',
)
track_groups = attr_data.groupby(['artists', 'name', 'year'], as_index=False)
attr_data = track_groups.agg(f)

In [None]:
def first_artist_top(artists):
    """Return the part of a Billboard artist credit before its first comma.

    The whole string is returned unchanged when it contains no comma.
    No whitespace stripping is performed.
    """
    lead_artist, _, _ = artists.partition(',')
    return lead_artist

def first_artist_attr(artists):
    """Return the first artist named in a Spotify ``artists`` cell.

    The cell is expected to contain the text of a Python list of names, such
    as ``"['Drake', 'Future']"``. The text is parsed as a literal, so names
    that contain an apostrophe (written with double quotes in the list text,
    e.g. ``["Guns N' Roses"]``) are returned intact instead of being cut at
    the apostrophe.

    Text that cannot be parsed as a non-empty list of strings falls back to
    the previous behaviour: the substring between the first two single quotes.

    Raises:
        IndexError: if the text is not a parsable list and contains no single
            quote.
    """
    import ast  # local import so this cell does not depend on the import cell

    try:
        parsed = ast.literal_eval(artists)
    except (ValueError, SyntaxError):
        parsed = None

    if isinstance(parsed, (list, tuple)) and parsed and isinstance(parsed[0], str):
        return parsed[0]

    # Fallback keeps the old result for inputs that are not list literals.
    return artists.split('\'')[1]

In [None]:
# Derive the single-artist key used for matching: the first credited artist
# of each Billboard row and of each Spotify row.
attr_data['join_artists'] = attr_data['artists'].map(first_artist_attr)
top_data['join_artists'] = top_data['Artists'].map(first_artist_top)

In [None]:
# Build the candidate pairs, blocking on the Billboard 'Year' column and the
# Spotify 'year' column, then report how many pairs were produced.
indexer = recordlinkage.Index()
indexer.block(left_on='Year', right_on='year')
candidates = indexer.index(top_data, attr_data)
candidate_count = len(candidates)
print(candidate_count)

In [None]:
# Score every candidate pair on two string comparisons, both with a 0.85
# threshold: song title (library default method) and first artist
# (Jaro-Winkler).
compare = recordlinkage.Compare()
compare.string(
    'Name', 'name',
    threshold=0.85, label='Name',
)
compare.string(
    'join_artists', 'join_artists',
    method='jarowinkler', threshold=0.85, label='join_artists',
)
features = compare.compute(candidates, top_data, attr_data)

In [None]:
# Count how many candidate pairs reach each total score, highest score first.
pair_scores = features.sum(axis=1)
pair_scores.value_counts().sort_index(ascending=False)

In [None]:
# Keep the pairs whose summed comparison score exceeds 1, move the pair index
# into ordinary columns, and store the per-pair total in a 'Score' column.
passes_cutoff = features.sum(axis=1) > 1
potential_matches = features.loc[passes_cutoff].reset_index()
comparison_columns = potential_matches.loc[:, 'Name':'join_artists']
potential_matches['Score'] = comparison_columns.sum(axis=1)

In [None]:
# Display the retained candidate pairs together with their Score column.
potential_matches

In [None]:
# Inspect the Billboard row with index label 2419.
top_data.loc[2419]

In [None]:
# Inspect the Spotify row with index label 140498.
attr_data.loc[140498]

In [None]:
# Write the matched pairs to matched_data_2019.zip; inside the archive the
# CSV member is named matched_data_2019.csv. The row index is not written.
compression_opts = {
    'method': 'zip',
    'archive_name': 'matched_data_2019.csv',
}
potential_matches.to_csv(
    'matched_data_2019.zip',
    index=False,
    compression=compression_opts,
)

In [None]:
# Reload the matched pairs saved by the export cell.
# The export cell only writes matched_data_2019.zip (the .csv exists solely as
# a member inside that archive), so read the archive directly; pandas infers
# zip compression from the file extension.
data_path = "./matched_data_2019.zip"
data = pd.read_csv(data_path)
data.head()

In [None]:
# Build an "artist - title" label for every row of both datasets.
# Plain Series concatenation replaces the row-wise apply(axis=1) + str.join:
# it is vectorised, and it still yields a Series when a frame has no rows
# (apply(axis=1) hands back a DataFrame in that case, which cannot be assigned
# to a single column). Missing values now propagate as NaN instead of raising
# TypeError from str.join.
top_data['Song'] = top_data['join_artists'] + ' - ' + top_data['Name']
attr_data['Song'] = attr_data['join_artists'] + ' - ' + attr_data['name']

In [None]:
# Single-column frames holding only the 'Song' label of each dataset; they
# keep the original row index so they can be joined on it.
attr_data_lookup = attr_data.loc[:, ['Song']]
top_data_lookup = top_data.loc[:, ['Song']]

In [None]:
# Attach the Billboard 'Song' label to each matched pair by looking up the
# pair's 'level_0' value in the index of top_data_lookup.
merged_data = data.join(
    top_data_lookup,
    on='level_0',
    rsuffix='_top',
)

In [None]:
# Attach the Spotify 'Song' label by looking up 'level_1' in the index of
# attr_data_lookup; overlapping column names get '_top' on the existing side
# and '_attr' on the newly joined side.
merged_data = merged_data.join(
    attr_data_lookup,
    on='level_1',
    lsuffix='_top',
    rsuffix='_attr',
)

In [None]:
# Display the matched pairs after both Song-label joins.
merged_data

In [None]:
# Inspect the Billboard row with index label 1.
top_data.loc[1]

In [None]:
# Inspect the Spotify row with index label 19560.
attr_data.loc[19560]

In [None]:
# Inspect the Spotify row with index label 38309.
attr_data.loc[38309]