# Predictive Modeling
### Kwame V. Taylor

I will set the baseline and create the first ML model to predict song popularity.

## Set up Environment

In [1]:
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, explained_variance_score, mean_absolute_error
from sklearn.linear_model import LinearRegression, TweedieRegressor
from sklearn.feature_selection import RFE
from sklearn.preprocessing import PolynomialFeatures

import warnings
warnings.filterwarnings("ignore")

In [2]:
from prepare import handle_nulls
from preprocessing import spotify_split, split_df, scale_data

## Acquire data

In [3]:
# Read 'full-playlist.csv'; column 0 of the file is used as the row index.
df = pd.read_csv(filepath_or_buffer='full-playlist.csv', index_col=0)

In [4]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

Unnamed: 0,artist,album,release_date,track_name,track_id,danceability,energy,key,loudness,mode,speechiness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,explicit,popularity,disc_number
0,Tay-K,TRAPMAN,2020-07-12,TRAPMAN,6mecZbKK3JDeMdFRNxsCV5,0.792,0.594,2.0,-8.544,1.0,0.3,0.0,0.244,0.351,82.512,232803.0,4.0,True,43.0,1.0
1,Lil Wyte,Doubt Me Now,2003-03-04,Oxy Cotton,5PtMwNq8Dp31uYdGGacVJE,0.816,0.578,9.0,-6.912,1.0,0.233,0.0,0.114,0.265,148.077,193920.0,4.0,True,61.0,1.0
2,Kamelen,KINGPIN SLIM,2019-11-29,Kingpin O.G - Remix,6s8EhlBn2PIoESylkXnwYc,0.649,0.798,0.0,-6.45,0.0,0.145,0.0,0.409,0.717,160.011,254390.0,4.0,True,22.0,1.0
3,Waka Flocka Flame,Flockaveli,2010-10-01,Grove St. Party (feat. Kebo Gotti),2e9EZ2V5QGGZPMJacO3y0Y,0.705,0.702,0.0,-4.783,0.0,0.108,0.0,0.364,0.771,140.059,250493.0,4.0,True,62.0,1.0
4,Project Pat,Mista Don't Play: Everythangs Workin',2001-02-13,Don't Save Her (feat. Crunchy Black),3ZRd5Z0fiYtASLdEPPb16m,0.838,0.793,11.0,-5.47,0.0,0.0773,1e-06,0.106,0.8,160.003,261933.0,4.0,True,45.0,1.0


In [5]:
# (rows, columns) of the data as loaded, before any null handling
df.shape

(6074, 20)

## Prepare data

In [6]:
# Clean up missing values with the project helper; `df` is rebound to the result.
df = handle_nulls(df)

# Partition the data with the project helper, passing 'popularity' as the
# column to predict. It returns X/y pairs for each partition followed by the
# three partition frames themselves.
(
    X_train, y_train,
    X_validate, y_validate,
    X_test, y_test,
    train, validate, test,
) = spotify_split(df, 'popularity')

# Preview the first five rows of the training partition.
train.head(n=5)

Shape of train: (4250, 19) | Shape of validate: (912, 19) | Shape of test: (912, 19)
Percent train: 70.0 | Percent validate: 15.0 | Percent test: 15.0


Unnamed: 0,artist,album,release_date,track_name,track_id,danceability,energy,key,loudness,mode,speechiness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,explicit,popularity,disc_number
2473,ABRA,Rose,2015-06-22,Fruit,4SExof7SNMmYgTmgE8TBrE,0.724,0.528,7.0,-12.726,0.0,0.0354,0.0415,0.106,0.607,120.008,341000.0,4.0,False,55.0,1.0
3779,Foxy Brown,Ill Nana 2: The Fever,2003-03-21,Superfreak,1yeRbC7hSDK4bHwnIfdfZe,0.785,0.64,2.0,-8.15,1.0,0.391,0.0,0.118,0.809,99.927,159133.0,4.0,True,14.0,1.0
5020,M.I.A.,Matangi,2013-11-05,Sexodus,0JMVQxxBhBQn9Ms0M7sU4l,0.549,0.84,7.0,-5.213,0.0,0.077,0.000254,0.117,0.382,157.962,291680.0,4.0,True,31.0,1.0
3516,Cardi B,Gangsta Bitch Music Vol 1,2016-03-07,I Gotta Hurt You,7kGvSuocFQxrcneBebUA5l,0.647,0.651,0.0,-12.089,0.0,0.167,0.213,0.224,0.152,125.987,251533.0,4.0,True,34.0,1.0
4529,Linn da Quebrada,Pajubá,2017-10-06,(Muito +) Talento,4Dq3hEZc9WryOSyHEXrzPN,0.611,0.819,7.0,-5.031,0.0,0.231,0.0094,0.233,0.501,129.947,178859.0,4.0,False,24.0,1.0


In [7]:
# show every column in df: the candidate features plus identifier columns
# (artist, album, track_name, track_id, ...) and the 'popularity' column
# that is passed to spotify_split as the value to predict
df.columns

Index(['artist', 'album', 'release_date', 'track_name', 'track_id',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'time_signature', 'explicit', 'popularity', 'disc_number'],
      dtype='object')

## Set the baseline

## Model 1