In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
#Import our data
songs_df = pd.read_csv('SpotifyFeatures.csv')

In [3]:
#Create subsetted dataframe for our genre
genre_df = songs_df[songs_df['genre'] == 'Rap']
genre_df.head()

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
86951,Rap,Post Malone,Wow.,6MWtB6iiXyIwun0YzU6DFP,99,0.163,0.833,149520,0.539,2e-06,B,0.101,-7.399,Minor,0.178,99.947,4/4,0.385
86952,Rap,J. Cole,MIDDLE CHILD,2JvzF1RMd7lE3KmFlsyZD8,96,0.149,0.837,213594,0.364,0.0,G#,0.271,-11.713,Major,0.276,123.984,4/4,0.463
86953,Rap,Post Malone,Sunflower - Spider-Man: Into the Spider-Verse,3KkXRkHbMCARz0aVfEt68P,97,0.556,0.76,158040,0.479,0.0,D,0.0703,-5.574,Major,0.0466,89.911,4/4,0.913
86954,Rap,Travis Scott,SICKO MODE,2xLMifQCjDGFmkHkpNLD9h,94,0.00513,0.834,312820,0.73,0.0,G#,0.124,-3.714,Major,0.222,155.008,4/4,0.446
86955,Rap,Meek Mill,Going Bad (feat. Drake),2IRZnDFmlqMuOrYOLnZZyc,95,0.259,0.889,180522,0.496,0.0,E,0.252,-6.365,Minor,0.0905,86.003,4/4,0.544


In [4]:
#Start prepping data for our model
#Drop columns not relevant to the model
genre_df = genre_df.drop(columns=['genre','artist_name','track_name','track_id'])

genre_df.head()

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
86951,99,0.163,0.833,149520,0.539,2e-06,B,0.101,-7.399,Minor,0.178,99.947,4/4,0.385
86952,96,0.149,0.837,213594,0.364,0.0,G#,0.271,-11.713,Major,0.276,123.984,4/4,0.463
86953,97,0.556,0.76,158040,0.479,0.0,D,0.0703,-5.574,Major,0.0466,89.911,4/4,0.913
86954,94,0.00513,0.834,312820,0.73,0.0,G#,0.124,-3.714,Major,0.222,155.008,4/4,0.446
86955,95,0.259,0.889,180522,0.496,0.0,E,0.252,-6.365,Minor,0.0905,86.003,4/4,0.544


In [5]:
#Encode our categorical variables
genre_df_encoded = pd.get_dummies(genre_df, columns = ['key','mode','time_signature'])
genre_df_encoded

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,...,key_F#,key_G,key_G#,mode_Major,mode_Minor,time_signature_0/4,time_signature_1/4,time_signature_3/4,time_signature_4/4,time_signature_5/4
86951,99,0.16300,0.833,149520,0.539,0.000002,0.1010,-7.399,0.1780,99.947,...,0,0,0,0,1,0,0,0,1,0
86952,96,0.14900,0.837,213594,0.364,0.000000,0.2710,-11.713,0.2760,123.984,...,0,0,1,1,0,0,0,0,1,0
86953,97,0.55600,0.760,158040,0.479,0.000000,0.0703,-5.574,0.0466,89.911,...,0,0,0,1,0,0,0,0,1,0
86954,94,0.00513,0.834,312820,0.730,0.000000,0.1240,-3.714,0.2220,155.008,...,0,0,1,1,0,0,0,0,1,0
86955,95,0.25900,0.889,180522,0.496,0.000000,0.2520,-6.365,0.0905,86.003,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122392,51,0.04790,0.736,225613,0.581,0.000000,0.1670,-7.700,0.0697,126.006,...,0,0,0,0,1,0,0,0,1,0
122393,58,0.08030,0.554,370133,0.887,0.000000,0.1500,-3.367,0.5030,172.233,...,0,1,0,1,0,0,0,0,1,0
122394,49,0.10700,0.738,180293,0.515,0.000000,0.0889,-10.120,0.0984,75.040,...,0,0,0,1,0,0,0,0,1,0
122395,51,0.00597,0.750,168607,0.546,0.000032,0.3770,-6.951,0.3840,75.443,...,1,0,0,0,1,0,0,0,1,0


In [6]:
#Scale and Normalize our data for the MLR model, convert array back into DataFrame for train/test split
data_scaler = StandardScaler()
genre_data_scaled = data_scaler.fit_transform(genre_df_encoded)
genre_scaled_df = pd.DataFrame(genre_data_scaled, columns = genre_df_encoded.columns, index = genre_df_encoded.index)
genre_scaled_df

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,...,key_F#,key_G,key_G#,mode_Major,mode_Minor,time_signature_0/4,time_signature_1/4,time_signature_3/4,time_signature_4/4,time_signature_5/4
86951,4.704320,-0.026770,0.962040,-1.215013,-0.711778,-0.141516,-0.642892,-0.297522,-0.076517,-0.715725,...,-0.277253,-0.305272,-0.313470,-1.128172,1.128172,-0.010408,-0.055156,-0.158771,0.221834,-0.139811
86952,4.337428,-0.100544,0.990386,-0.108139,-1.828723,-0.141547,0.473028,-2.057963,0.659654,0.097551,...,-0.277253,-0.305272,3.190103,0.886390,-0.886390,-0.010408,-0.055156,-0.158771,0.221834,-0.139811
86953,4.459725,2.044161,0.444722,-1.067830,-1.094731,-0.141547,-0.844414,0.447217,-1.063588,-1.055287,...,-0.277253,-0.305272,-0.313470,0.886390,-0.886390,-0.010408,-0.055156,-0.158771,0.221834,-0.139811
86954,4.092833,-0.858674,0.969127,1.605984,0.507287,-0.141547,-0.491914,1.206239,0.254009,1.147228,...,-0.277253,-0.305272,3.190103,0.886390,-0.886390,-0.010408,-0.055156,-0.158771,0.221834,-0.139811
86955,4.215130,0.479106,1.358887,-0.679455,-0.986228,-0.141547,0.348308,0.124429,-0.733813,-1.187511,...,-0.277253,-0.305272,-0.313470,-1.128172,1.128172,-0.010408,-0.055156,-0.158771,0.221834,-0.139811
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122392,-1.165959,-0.633295,0.274645,0.099489,-0.443712,-0.141547,-0.209652,-0.420353,-0.890062,0.165964,...,-0.277253,-0.305272,-0.313470,-1.128172,1.128172,-0.010408,-0.055156,-0.158771,0.221834,-0.139811
122393,-0.309877,-0.462562,-1.015108,2.596062,1.509347,-0.141547,-0.321244,1.347842,2.364868,1.730024,...,-0.277253,3.275763,-0.313470,0.886390,-0.886390,-0.010408,-0.055156,-0.158771,0.221834,-0.139811
122394,-1.410554,-0.321865,0.288818,-0.683411,-0.864959,-0.141547,-0.722319,-1.407898,-0.674469,-1.558437,...,-0.277253,-0.305272,-0.313470,0.886390,-0.886390,-0.010408,-0.055156,-0.158771,0.221834,-0.139811
122395,-1.165959,-0.854247,0.373856,-0.885286,-0.667100,-0.141064,1.168837,-0.114704,1.470946,-1.544802,...,3.606814,-0.305272,-0.313470,-1.128172,1.128172,-0.010408,-0.055156,-0.158771,0.221834,-0.139811


In [7]:
#Split our data into target and feature variables
X = genre_scaled_df.drop(columns=['popularity']).values
y= genre_scaled_df['popularity'].values

In [8]:
#Create Test/Train Splits
from sklearn.model_selection import train_test_split
X_test, X_train, y_test, y_train = train_test_split(X, y, random_state=1)

In [10]:
#Create and fit Linear Regression Model
mlr = LinearRegression()

mlr.fit(X_train, y_train)

LinearRegression()

In [13]:
#Create predictions; This is just a mockup so we will not be evaluating our model
y_pred = mlr.predict(X_test)
print(y_pred)

[-0.165802   -0.08621216  0.05587769 ...  0.16720581  0.22067261
 -0.04275513]
