Spotify preprocessing

In [1]:
# Importing libraries

import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer



In [2]:
# loading the data

Merged_prep = pd.read_csv('Merged_eda.csv')
Merged_prep.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,genre,year,track_name,popularity,danceability,loudness,acousticness,instrumentalness,liveness,tempo,duration_ms,GDP_year,GDP,duration_min
0,0,0,acoustic,2000,Easy Tonight,37,0.47,-8.018,0.259,0.0,0.204,84.843,246600,2000.0,10252300000000.0,4.11
1,1,1,opera,2000,"Blumenlieder, Op. 500: No. 7, Herbstzeitlosen",0,0.258,-29.012,0.991,0.00119,0.079,89.835,157920,2001.0,10581800000000.0,2.632
2,2,2,opera,2000,"Blumenlieder, Op. 500: No. 15, Feuernelken",0,0.432,-26.235,0.989,0.0022,0.0618,136.726,193187,2002.0,10936400000000.0,3.219783
3,3,3,opera,2000,Die Gräfin Mariza: Einmal möchte ich wieder ta...,0,0.327,-14.306,0.963,0.000832,0.0888,97.87,208040,2003.0,11458200000000.0,3.467333
4,4,4,opera,2000,"Blumenlieder, Op. 500: No. 12, Anemonen",0,0.385,-24.274,0.992,0.00249,0.0712,83.645,169893,2004.0,12213700000000.0,2.83155


In [3]:
# Dropping Unnamed columns

Merged_prep= Merged_prep.drop(columns= ['Unnamed: 0.1','Unnamed: 0'], axis =1)


In [4]:
Merged_prep.head()

Unnamed: 0,genre,year,track_name,popularity,danceability,loudness,acousticness,instrumentalness,liveness,tempo,duration_ms,GDP_year,GDP,duration_min
0,acoustic,2000,Easy Tonight,37,0.47,-8.018,0.259,0.0,0.204,84.843,246600,2000.0,10252300000000.0,4.11
1,opera,2000,"Blumenlieder, Op. 500: No. 7, Herbstzeitlosen",0,0.258,-29.012,0.991,0.00119,0.079,89.835,157920,2001.0,10581800000000.0,2.632
2,opera,2000,"Blumenlieder, Op. 500: No. 15, Feuernelken",0,0.432,-26.235,0.989,0.0022,0.0618,136.726,193187,2002.0,10936400000000.0,3.219783
3,opera,2000,Die Gräfin Mariza: Einmal möchte ich wieder ta...,0,0.327,-14.306,0.963,0.000832,0.0888,97.87,208040,2003.0,11458200000000.0,3.467333
4,opera,2000,"Blumenlieder, Op. 500: No. 12, Anemonen",0,0.385,-24.274,0.992,0.00249,0.0712,83.645,169893,2004.0,12213700000000.0,2.83155


In [5]:
Merged_prep.isnull().sum()

genre                     0
year                      0
track_name                1
popularity                0
danceability              0
loudness                  0
acousticness              0
instrumentalness          0
liveness                  0
tempo                     0
duration_ms               0
GDP_year            1159740
GDP                 1159740
duration_min              0
dtype: int64

The first stage of preprocessing requires preparing columns before modelling. This will be done using ColumnTransformer which allows different columns or column subsets of the input to be transformed separately and the features generated by each transformer to be concatenated to form a single feature space


In [6]:
# Defining X by dropping all columns with NaN values and genre column beacuse it has variable values

X= Merged_prep[['danceability','loudness','acousticness','instrumentalness','tempo','duration_min']]


In [7]:
# Defining y

y= Merged_prep['popularity']

In [8]:
# Determine numerical and categorical columns

num= X.select_dtypes(include=['int', 'float']).columns
cat= X.select_dtypes(include=['object','bool']).columns


Transforming columns using OneHotEncoder for categorical columns and StandardScaler for numerical columns

In [9]:
# Define the data preparation for the columns

t= [('cat', OneHotEncoder(), cat), ('num', StandardScaler(), num)]
col_transform = ColumnTransformer(transformers=t)

The second stage of preprocessing requires that data is split into training and testing sets. To do this, the train/test split from sklearn.model_selection will be used.

In [10]:
# Using train/test split to prepare training data

len(Merged_prep) * 0.75, len(Merged_prep)* 0.25

(869823.0, 289941.0)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(Merged_prep.drop(columns=['popularity','genre','track_name'],axis=1), 
                                                    Merged_prep.popularity, test_size=0.25, 
                                                    random_state=47)

In [12]:
X_train.shape, X_test.shape

((869823, 11), (289941, 11))

In [13]:
y_train.shape, y_test.shape

((869823,), (289941,))

In [14]:
# Verify X_train values are all numeric

X_train.dtypes

year                  int64
danceability        float64
loudness            float64
acousticness        float64
instrumentalness    float64
liveness            float64
tempo               float64
duration_ms           int64
GDP_year            float64
GDP                 float64
duration_min        float64
dtype: object

In [15]:
# Verify X_test values are all numeric

X_test.dtypes

year                  int64
danceability        float64
loudness            float64
acousticness        float64
instrumentalness    float64
liveness            float64
tempo               float64
duration_ms           int64
GDP_year            float64
GDP                 float64
duration_min        float64
dtype: object

At the third stage, models will be choosen and a pipeline which contains all the steps required in preparing the data and the model will be defined. 

For the proposed predictive model, linear regression and Random forest models will be used. For the linear regression model, it is necessary to apply Ridge regression instead because of the detection of multicollinearity between some of the predictor variables at eda stage of the project.

For both the Ridge Regression and RandomForest models that will be tested this dataset, cross_validate from sklearn.model_selection will be used to fit and assess the models performance.
Both models will be assessed using: 
1. Mean Absolute Error scores which will be calculated for both.
2. Coefficient of determination/r2
   

In [16]:
# Defining a Ridge Regression model

RR = Ridge(alpha = 0.1)


In [17]:
# Defining the data preparation and RR model pipeline

RR_pipeline = make_pipeline(col_transform,RR)


In [18]:
RR_pipeline.fit(X_train,y_train)


In [19]:
y_pred = RR_pipeline.predict(X_test)

In [20]:
RR_cv_results= cross_validate(RR_pipeline, X_train, y_train, cv=5)

In [21]:
# Evaluate the RR_ pipeline using cross validation and calculate MAE

RR_cv_scores = RR_cv_results['test_score']

# convert MAE scores to positive values

scores = np.abs(RR_cv_scores)

# summarize the model performance

print('MAE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))


MAE: 0.040 (0.001)


In [22]:
y_tr_pred = RR_pipeline.predict(X_train)
y_te_pred = RR_pipeline.predict(X_test)

In [23]:
mean_absolute_error(y_train, y_tr_pred), mean_absolute_error(y_test, y_te_pred)

(12.840940694671293, 12.823548471071229)

In [24]:
r2_score(y_train, y_tr_pred), r2_score(y_test, y_te_pred)

(0.039691455583923396, 0.039897379098701724)

In [25]:
# Defining a RandomForest model with 8 n_estimators

Rf_8 = RandomForestClassifier(n_estimators = 8)

In [26]:
# Defining Rf_8 pipeline

Rf_8= make_pipeline(col_transform, Rf_8)
Rf_8

In [27]:
Rf_8.fit(X_train,y_train)

In [28]:
y_preds = Rf_8.predict(X_test)

In [29]:
# convert MAE scores to positive values

score = np.abs(y_preds)

# summarize the model performance

print('MAE: %.3f (%.3f)' % (np.mean(score), np.std(score)))

MAE: 8.673 (12.388)


In [30]:
y_tr_preds = Rf_8.predict(X_train)
y_te_preds = Rf_8.predict(X_test)

In [31]:
mean_absolute_error(y_train, y_tr_preds), mean_absolute_error(y_test, y_te_preds)

(0.30658306345084, 15.722360756153838)

In [32]:
r2_score(y_train, y_tr_preds), r2_score(y_test, y_te_preds)

(0.9667469618354008, -0.764085630827751)

In [33]:
# Defining a RandomForest model with 16 n_estimators

Rf_16 = RandomForestClassifier(n_estimators = 16)

In [34]:
# Defining Rf_16 pipeline

Rf_16= make_pipeline(col_transform, Rf_16)

In [35]:
Rf_16.fit(X_train,y_train)

In [36]:
y_predi = Rf_16.predict(X_test)

In [37]:
# convert MAE scores to positive values

score = np.abs(y_predi)

# summarize the model performance

print('MAE: %.3f (%.3f)' % (np.mean(score), np.std(score)))

MAE: 9.048 (12.493)


In [38]:
y_tr_predi = Rf_16.predict(X_train)
y_te_predi = Rf_16.predict(X_test)

In [39]:
mean_absolute_error(y_train, y_tr_predi), mean_absolute_error(y_test, y_te_predi)

(0.0624517861679905, 15.319623647569678)

In [40]:
r2_score(y_train, y_tr_predi), r2_score(y_test, y_te_predi)

(0.993463075439188, -0.6874762410338038)

1. Comparing the two models, Ridge Regression(Linear Regression) model performs better than the RandomForest models in terms of MAE scores. Although cross validation was not used to fit RF models, the MAE of the Rf_8 is very high. Increasing the number of n_estimators to 16 made the MAE even higher. High MAE scores is an indication that the models predictions do not align with actual data.
2. The MAE score of the training data was lower than the score in the test data in the RR model which shows that the model is functioning well in predicting with lower error. Both Rf models show MAE scores in the training data to be very low but the scores in the test data is very high which shows that the model is not performing well. 
3. In terms of R2 values, the RR model shows no overfitting of training data. Both RF models  show very poor R2 scores in test data which is another signal that this model is not performing well.
   