Spotify preprocessing

In [1]:
# Importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline




In [2]:
# Read the merged EDA output from disk and preview its first rows

Merged_prep = pd.read_csv(filepath_or_buffer='Merged_eda.csv')
Merged_prep.head(n=5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,genre,year,track_name,popularity,danceability,loudness,acousticness,instrumentalness,liveness,tempo,duration_ms,GDP_year,GDP,duration_min
0,0,0,acoustic,2000,Easy Tonight,37,0.47,-8.018,0.259,0.0,0.204,84.843,246600,2000.0,10252300000000.0,4.11
1,1,1,opera,2000,"Blumenlieder, Op. 500: No. 7, Herbstzeitlosen",0,0.258,-29.012,0.991,0.00119,0.079,89.835,157920,2001.0,10581800000000.0,2.632
2,2,2,opera,2000,"Blumenlieder, Op. 500: No. 15, Feuernelken",0,0.432,-26.235,0.989,0.0022,0.0618,136.726,193187,2002.0,10936400000000.0,3.219783
3,3,3,opera,2000,Die Gräfin Mariza: Einmal möchte ich wieder ta...,0,0.327,-14.306,0.963,0.000832,0.0888,97.87,208040,2003.0,11458200000000.0,3.467333
4,4,4,opera,2000,"Blumenlieder, Op. 500: No. 12, Anemonen",0,0.385,-24.274,0.992,0.00249,0.0712,83.645,169893,2004.0,12213700000000.0,2.83155


In [3]:
# Dropping Unnamed columns
# (presumably index columns left over from earlier CSV round-trips - TODO confirm)

# `columns=` already identifies the axis, so no `axis` argument is needed.
# errors='ignore' keeps the cell re-runnable: once the columns are gone, a
# second execution is a no-op instead of raising KeyError.
Merged_prep = Merged_prep.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')


In [4]:
# Preview the frame again to confirm the 'Unnamed' columns were dropped
Merged_prep.head()

Unnamed: 0,genre,year,track_name,popularity,danceability,loudness,acousticness,instrumentalness,liveness,tempo,duration_ms,GDP_year,GDP,duration_min
0,acoustic,2000,Easy Tonight,37,0.47,-8.018,0.259,0.0,0.204,84.843,246600,2000.0,10252300000000.0,4.11
1,opera,2000,"Blumenlieder, Op. 500: No. 7, Herbstzeitlosen",0,0.258,-29.012,0.991,0.00119,0.079,89.835,157920,2001.0,10581800000000.0,2.632
2,opera,2000,"Blumenlieder, Op. 500: No. 15, Feuernelken",0,0.432,-26.235,0.989,0.0022,0.0618,136.726,193187,2002.0,10936400000000.0,3.219783
3,opera,2000,Die Gräfin Mariza: Einmal möchte ich wieder ta...,0,0.327,-14.306,0.963,0.000832,0.0888,97.87,208040,2003.0,11458200000000.0,3.467333
4,opera,2000,"Blumenlieder, Op. 500: No. 12, Anemonen",0,0.385,-24.274,0.992,0.00249,0.0712,83.645,169893,2004.0,12213700000000.0,2.83155


In [5]:
# Count the missing values in each column
Merged_prep.isnull().sum()

genre                     0
year                      0
track_name                1
popularity                0
danceability              0
loudness                  0
acousticness              0
instrumentalness          0
liveness                  0
tempo                     0
duration_ms               0
GDP_year            1159740
GDP                 1159740
duration_min              0
dtype: int64

In [6]:
# track_name contains a missing value; the whole column is removed from the frame

Merged_prep = Merged_prep.drop(columns=['track_name'])

In [7]:
# Handling missing GDP_year

# Missing GDP_year values are replaced with 0. The result is assigned back to
# the column rather than calling fillna(..., inplace=True) on the selected
# column: the in-place form operates on an intermediate object and does not
# update Merged_prep when pandas Copy-on-Write is active.
#
# The former follow-up `.apply(lambda x: year.values if x == '0' else x)` was
# removed. GDP_year is numeric, so the comparison with the string '0' never
# matched (the apply returned the column unchanged), and `year` is not defined
# as a variable anywhere in the visible notebook.
# NOTE(review): that lambda looks like an attempt to backfill GDP_year from the
# `year` column - confirm the intent before relying on the 0 placeholder.
Merged_prep['GDP_year'] = Merged_prep['GDP_year'].fillna(0)
Merged_prep['GDP_year'].head(5)

0    2000.0
1    2001.0
2    2002.0
3    2003.0
4    2004.0
Name: GDP_year, dtype: float64

In [8]:
# Cast the float GDP_year column to an integer dtype

Merged_prep = Merged_prep.astype({'GDP_year': 'int'})

In [9]:
# Handling GDP values

# Missing GDP values are replaced with 0. The result is assigned back to the
# column rather than calling fillna(..., inplace=True) on the selected column,
# which does not update Merged_prep when pandas Copy-on-Write is active.
#
# The former follow-up `.apply(lambda x: GDP.values if x == '0' else x)` was
# removed: GDP is a float column, so the comparison with the string '0' never
# matched, and `GDP` is not defined as a variable anywhere in the visible notebook.
# NOTE(review): the null counts show GDP present on only 24 of ~1.16M rows, and
# in the preview GDP_year increases row by row while `year` stays at 2000. That
# suggests the GDP table was attached by row position rather than joined on
# year - confirm and fix in the upstream merge; the 0 fill is only a placeholder.
Merged_prep['GDP'] = Merged_prep['GDP'].fillna(0)

In [10]:
# Checking that the fill and dtype changes have taken effect

Merged_prep.head()

Unnamed: 0,genre,year,popularity,danceability,loudness,acousticness,instrumentalness,liveness,tempo,duration_ms,GDP_year,GDP,duration_min
0,acoustic,2000,37,0.47,-8.018,0.259,0.0,0.204,84.843,246600,2000,10252300000000.0,4.11
1,opera,2000,0,0.258,-29.012,0.991,0.00119,0.079,89.835,157920,2001,10581800000000.0,2.632
2,opera,2000,0,0.432,-26.235,0.989,0.0022,0.0618,136.726,193187,2002,10936400000000.0,3.219783
3,opera,2000,0,0.327,-14.306,0.963,0.000832,0.0888,97.87,208040,2003,11458200000000.0,3.467333
4,opera,2000,0,0.385,-24.274,0.992,0.00249,0.0712,83.645,169893,2004,12213700000000.0,2.83155


In [11]:
# Inspect column dtypes and non-null counts
Merged_prep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1159764 entries, 0 to 1159763
Data columns (total 13 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   genre             1159764 non-null  object 
 1   year              1159764 non-null  int64  
 2   popularity        1159764 non-null  int64  
 3   danceability      1159764 non-null  float64
 4   loudness          1159764 non-null  float64
 5   acousticness      1159764 non-null  float64
 6   instrumentalness  1159764 non-null  float64
 7   liveness          1159764 non-null  float64
 8   tempo             1159764 non-null  float64
 9   duration_ms       1159764 non-null  int64  
 10  GDP_year          1159764 non-null  int32  
 11  GDP               1159764 non-null  float64
 12  duration_min      1159764 non-null  float64
dtypes: float64(8), int32(1), int64(3), object(1)
memory usage: 110.6+ MB


In [12]:
# Re-count missing values per column after the handling steps
Merged_prep.isnull().sum()

genre               0
year                0
popularity          0
danceability        0
loudness            0
acousticness        0
instrumentalness    0
liveness            0
tempo               0
duration_ms         0
GDP_year            0
GDP                 0
duration_min        0
dtype: int64

In [13]:
# Separate the target column (y) from the predictor columns (X)

y = Merged_prep['popularity']
X = Merged_prep.drop(columns='popularity')

The second stage of preprocessing requires that data is split into training and testing sets. To do this, the train/test split from sklearn.model_selection will be used.

In [14]:
# Expected row counts for a 75% training / 25% test partition

n_rows = len(Merged_prep)
n_rows * 0.75, n_rows * 0.25

(869823.0, 289941.0)

In [15]:
# Split the predictors (X) and target (y) defined in the "Defining X and y"
# cell into 75% train / 25% test, instead of re-deriving them from Merged_prep
# (the old call also passed a redundant axis=1 alongside columns=).
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=47
)

In [16]:
# Row/column counts of the training and test feature frames
X_train.shape, X_test.shape

((869823, 12), (289941, 12))

In [17]:
# Lengths of the training and test target series
y_train.shape, y_test.shape

((869823,), (289941,))

In [18]:
# Inspect X_train dtypes: every column is numeric except genre (object),
# which is one-hot encoded by the ColumnTransformer defined further down

X_train.dtypes

genre                object
year                  int64
danceability        float64
loudness            float64
acousticness        float64
instrumentalness    float64
liveness            float64
tempo               float64
duration_ms           int64
GDP_year              int32
GDP                 float64
duration_min        float64
dtype: object

In [19]:
# Inspect X_test dtypes: every column is numeric except genre (object),
# which is one-hot encoded by the ColumnTransformer defined further down

X_test.dtypes

genre                object
year                  int64
danceability        float64
loudness            float64
acousticness        float64
instrumentalness    float64
liveness            float64
tempo               float64
duration_ms           int64
GDP_year              int32
GDP                 float64
duration_min        float64
dtype: object

In [20]:
# Determine numerical and categorical columns

# 'number' is the pandas selector for every numeric dtype, so the result does
# not depend on which integer/float widths the columns happen to be stored in.
num = X.select_dtypes(include='number').columns
cat = X.select_dtypes(include=['object', 'bool']).columns



In [21]:
# Build the preprocessing step: StandardScaler for the chosen numerical columns and
# OneHotEncoder for the categorical column.
# ColumnTransformer applies each transformer to its own columns and concatenates the
# generated features into a single feature space.

scaled_columns = ['year', 'acousticness', 'liveness', 'instrumentalness', 'duration_min', 'GDP']
encoded_columns = ['genre']

col_transform = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), scaled_columns),
        ('encoder', OneHotEncoder(), encoded_columns),
    ]
)

In [22]:
# Fit the ColumnTransformer defined above on the training split only and transform it.
# Fitting on the full Merged_prep frame would let the test rows influence the
# scaler statistics and the encoder categories (data leakage). The test split
# should later go through col_transform.transform(...), never fit_transform.

col_transformed = col_transform.fit_transform(X_train)

At the next stage of the project, models will be chosen and a pipeline containing all the steps required to prepare the data and fit the model will be defined. 

For the proposed predictive model, linear regression and logistic regression models will be tested. For the linear regression model, Ridge regression will be applied instead, because multicollinearity between some of the predictor variables was detected at the EDA stage of the project.

For both the Ridge regression and logistic regression models that will be tested on this dataset, cross_validate from sklearn.model_selection will be used to fit the models and assess their performance.
Both models will be assessed using: 
1. Mean Absolute Error scores which will be calculated for both.
2. Coefficient of determination/r2
   

In [24]:
# Persist the train/test splits to disk as one pickled dictionary

dataset_dict = dict(
    zip(
        ("X_train", "X_test", "y_train", "y_test"),
        (X_train, X_test, y_train, y_test),
    )
)

with open('dataset_dict.pickle', 'wb') as pickle_out:
    pickle.dump(dataset_dict, pickle_out)

# SUMMARY

At the preprocessing stage of the project, missing values were handled by dropping an entire column (track_name) and by replacing missing values with 0 in the GDP and GDP_year columns.

X and y variables were defined and data was split into training and test sets in the ratio of 75:25.

Numerical and categorical columns were defined. Numerical columns were transformed using the StandardScaler and categorical columns were transformed using the OneHotEncoder. These transformations were combined into a single feature space with ColumnTransformer.

ColumnTransformer was used to fit and transform dataframe.

Training and test sets, together with Col_Transformer, were saved using the pickle module.