Step 1: Imports

In [7]:
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from etl_music_data import MUSE_SQL as MS
import pandas as pd
import plotly.express as px
import numpy as np


In [8]:
# Load every row of the `music` table through the project's MUSE_SQL helper.
mq = MS()
music_query = 'select * from music'
df = mq.query(music_query)

Dropping high cardinality and irrelevant columns for the model

In [9]:
# Remove identifier-like / high-cardinality columns that should not be model features.
# errors='ignore' keeps this cell re-runnable: `df` is rebound to its own result,
# so a second execution would otherwise raise KeyError on the already-dropped columns.
df = df.drop(
    columns=['index', 'instance_id', 'obtained_date', 'artist_name', 'track_name'],
    errors='ignore',
)
df.head()


Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,music_genre
0,31.0,0.0127,0.622,218293.0,0.89,0.95,D,0.124,-7.043,Minor,0.03,115.002,0.531,Electronic
1,28.0,0.00306,0.62,215613.0,0.755,0.0118,G#,0.534,-4.617,Major,0.0345,127.994,0.333,Electronic
2,34.0,0.0254,0.774,166875.0,0.7,0.00253,C#,0.157,-4.498,Major,0.239,128.014,0.27,Electronic
3,32.0,0.00465,0.638,222369.0,0.587,0.909,F#,0.157,-6.266,Major,0.0413,145.036,0.323,Electronic
4,46.0,0.0289,0.572,214408.0,0.803,8e-06,B,0.106,-4.294,Major,0.351,149.995,0.23,Electronic


Build Linear Model to predict popularity of a song.

Create a baseline: predict the mean popularity for every song.

In [10]:
# Naive baseline: the overall mean popularity, repeated once per row of `df`.
mean_popularity = df['popularity'].mean()
baseline = [mean_popularity] * len(df)
print(f'Baseline Popularity is: {round(baseline[0],2)}')


Baseline Popularity is: 44.97


In [11]:
# Show column dtypes and non-null counts before building the feature matrix.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39792 entries, 0 to 39791
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   popularity        39792 non-null  float64
 1   acousticness      39792 non-null  float64
 2   danceability      39792 non-null  float64
 3   duration_ms       39792 non-null  float64
 4   energy            39792 non-null  float64
 5   instrumentalness  39792 non-null  float64
 6   key               39792 non-null  object 
 7   liveness          39792 non-null  float64
 8   loudness          39792 non-null  float64
 9   mode              39792 non-null  object 
 10  speechiness       39792 non-null  float64
 11  tempo             39792 non-null  float64
 12  valence           39792 non-null  float64
 13  music_genre       39792 non-null  object 
dtypes: float64(11), object(3)
memory usage: 4.3+ MB


Create the feature matrix and the target vector.

In [12]:
# Target vector: the column the model should predict.
y = df['popularity']
# Feature matrix: every remaining column.
X = df.drop(columns=['popularity'])

Split into train and test

In [13]:
# 80/20 train/test split; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=72,
)

Pipeline

In [14]:
# One-hot encode (keeping category names in the new column names),
# then fit a linear regression on the training split.
onehot = OneHotEncoder(use_cat_names=True)
linear = LinearRegression()
model_lr = make_pipeline(onehot, linear)

model_lr.fit(x_train, y_train)

In [15]:
# Mean-popularity baseline, one prediction per training row.
# Use the training mean only, so nothing from the test split leaks into the baseline.
train_b = [y_train.mean()] * len(y_train)

In [16]:
# MAE of the constant baseline on the training targets.
mean_absolute_error(y_true=y_train, y_pred=train_b)

12.12371759641513

In [17]:
# Training MAE of the one-hot linear model.
mean_absolute_error(y_true=y_train, y_pred=model_lr.predict(x_train))

6.769737834000614

In [18]:
# Test MAE of the one-hot linear model.
mean_absolute_error(y_true=y_test, y_pred=model_lr.predict(x_test))

6.690396715128737

Let's try removing all the categorical columns (`key`, `mode`, `music_genre`).

In [19]:
# Target vector.
y = df['popularity']
# Numeric-only feature matrix: drop the target plus the three categorical columns.
categorical_cols = ['key', 'mode', 'music_genre']
X = df.drop(columns=['popularity'] + categorical_cols)

In [21]:
# Re-split the numeric-only features (same seed and proportions as before),
# then fit a plain linear regression with no encoder step.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=72
)
model_lr = make_pipeline(LinearRegression())
model_lr.fit(x_train, y_train)

In [22]:
# Training MAE of the numeric-only linear model.
mean_absolute_error(y_true=y_train, y_pred=model_lr.predict(x_train))

10.392550378252517

In [23]:
# Very consistent between train and test, but worse results without the categorical fields.
mean_absolute_error(y_true=y_test, y_pred=model_lr.predict(x_test))

10.257651588975444

I am going to try a Random Forest and add the categorical fields back in.

In [26]:
# Target vector: the column the model should predict.
y = df['popularity']
# Feature matrix: every column except the target (categorical columns included).
X = df.drop(columns=['popularity'])

In [27]:
# Three-way split: 80% train, then the 20% hold-out is halved into validation and test.
# The hold-out gets its own names so `X` and `y` are never overwritten; rebinding `y`
# here would make a re-run of this cell split the hold-out instead of the full data.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

In [28]:
# Ordinal-encode the features, then fit a seeded random-forest regressor
# on the training split.
ordinal = OrdinalEncoder()
forest = RandomForestRegressor(random_state=42)
model_rf = make_pipeline(ordinal, forest)

model_rf.fit(X_train, y_train)

In [29]:
# Mean-popularity baseline, one prediction per training row.
# Use the training mean only, so the validation/test rows do not leak into the baseline.
train_base = [y_train.mean()] * len(y_train)

In [30]:
# The baseline MAE is almost an entire standard deviation of popularity,
# so this may not be the best data for the task.
baseline_mae = mean_absolute_error(y_train, train_base)
popularity_std = df['popularity'].std()
print(baseline_mae)
print([popularity_std])

12.10261614013326
[14.725112279184602]


In [31]:
# Training MAE of the random forest.
mean_absolute_error(y_true=y_train, y_pred=model_rf.predict(X_train))

2.506057317728084

In [32]:
# Validation MAE of the random forest.
mean_absolute_error(y_true=y_val, y_pred=model_rf.predict(X_val))

6.898145537883411

In [33]:
# Test MAE of the random forest.
mean_absolute_error(y_true=y_test, y_pred=model_rf.predict(X_test))

6.7755406197654935

In [34]:
# Training R^2 of the random forest.
r2_score(y_true=y_train, y_pred=model_rf.predict(X_train))

0.9510678655047562

In [35]:
# Test R^2 of the random forest.
r2_score(y_true=y_test, y_pred=model_rf.predict(X_test))

0.6509246089340155

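The random forest fits the training data far better than the linear regression (train MAE 2.51 vs. 6.77), but on held-out data it is not clearly more accurate: its test MAE is about 6.78 versus 6.69 for the one-hot linear model (the two were evaluated on different splits, so the comparison is only approximate). The gap between train and test R² (0.95 vs. 0.65) suggests the forest is overfitting.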