Spotify preprocessing

In [None]:
# Importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline




In [None]:
# Read the CSV produced at the EDA stage into a dataframe and preview it

Merged_prep = pd.read_csv(filepath_or_buffer='Merged_eda.csv')
Merged_prep.head(n=5)

In [None]:
# Discard the two 'Unnamed' columns that were read in from the CSV.
# `columns=` already identifies the axis, so no separate `axis` argument is needed.

Merged_prep = Merged_prep.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])


In [None]:
# Preview the first five rows after dropping the 'Unnamed' columns
Merged_prep.head(n=5)

In [None]:
# Count the missing values in each column
Merged_prep.isna().sum()

In [None]:
# Missing track_name values are handled by removing the whole column

Merged_prep = Merged_prep.drop(columns=['track_name'])

In [None]:
# Handling missing GDP_year
#
# Missing values are replaced with 0. The result is assigned back to the
# column rather than calling fillna(..., inplace=True) on the selected
# Series: the in-place form is chained assignment, which pandas warns about
# and which does not update the dataframe when copy-on-write is enabled.
#
# The previous follow-up step
#     .apply(lambda x: year.values if x == '0' else x)
# has been removed. It compared each value with the *string* '0', so it never
# matched the numeric 0 written by fillna, and it referenced an undefined name
# (`year`), so it would have raised NameError had it ever matched.
# NOTE(review): if the intent was to fall back on the track's own 'year'
# column, use Merged_prep['GDP_year'].fillna(Merged_prep['year']) instead.

Merged_prep['GDP_year'] = Merged_prep['GDP_year'].fillna(0)
Merged_prep['GDP_year'].head(5)

In [None]:
# Converting float GDP_year values to int

Merged_prep['GDP_year'] = Merged_prep['GDP_year'].astype(int)

In [None]:
# Handling GDP values
#
# Missing GDP values are replaced with 0. The filled Series is assigned back
# to the column instead of using fillna(..., inplace=True) on the selected
# Series, which is chained assignment and does not update the dataframe when
# pandas copy-on-write is enabled.
#
# The previous follow-up step
#     .apply(lambda x: GDP.values if x == '0' else x)
# has been removed: it compared against the string '0' (never equal to the
# numeric 0 written by fillna) and referenced an undefined name (`GDP`), so it
# was either a no-op or a NameError.

Merged_prep['GDP'] = Merged_prep['GDP'].fillna(0)

In [None]:
# Preview the first five rows to confirm the changes took effect

Merged_prep.head(n=5)

In [None]:
# Print the column names, non-null counts and dtypes of the dataframe
Merged_prep.info()

In [None]:
# Re-count the missing values per column after the handling steps above
Merged_prep.isna().sum()

In [None]:
# Separate the target ('popularity') from the predictor columns

y = Merged_prep['popularity']
X = Merged_prep.drop(columns=['popularity'])

The second stage of preprocessing requires the data to be split into training and testing sets. To do this, `train_test_split` from sklearn.model_selection will be used.

In [None]:
# Expected number of rows in the training (75%) and test (25%) partitions

n_rows = len(Merged_prep)
n_rows * 0.75, n_rows * 0.25

In [None]:
# Split predictors and target 75/25; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    Merged_prep.drop(columns=['popularity']),
    Merged_prep['popularity'],
    test_size=0.25,
    random_state=47,
)

In [None]:
# Shapes of the training and test feature sets
tuple(part.shape for part in (X_train, X_test))

In [None]:
# Shapes of the training and test targets
tuple(part.shape for part in (y_train, y_test))

In [None]:
# Inspect the dtype of every X_train column
# NOTE(review): this only displays the dtypes; it does not check that they are
# numeric. 'genre' is passed to OneHotEncoder further down, so it is presumably
# non-numeric at this point.

X_train.dtypes

In [None]:
# Inspect the dtype of every X_test column
# NOTE(review): this only displays the dtypes; it does not check that they are
# numeric. 'genre' is passed to OneHotEncoder further down, so it is presumably
# non-numeric at this point.

X_test.dtypes

In [None]:
# Collect the names of the numerical and the categorical predictor columns

numeric_view = X.select_dtypes(include=['int', 'float'])
categorical_view = X.select_dtypes(include=['object', 'bool'])
num = numeric_view.columns
cat = categorical_view.columns


In [None]:
# Build the ColumnTransformer: the listed numerical columns are standardised
# with StandardScaler and 'genre' is one-hot encoded with OneHotEncoder.
# ColumnTransformer applies each transformer to its own columns and
# concatenates the generated features into a single feature space.
#
# handle_unknown='ignore' makes the encoder emit an all-zero row for a genre
# that was not present when the transformer was fitted, instead of raising
# when such a genre shows up in data transformed later (e.g. the test split).

col_transform = ColumnTransformer(transformers=[
    ('scaler', StandardScaler(),
     ['year', 'acousticness', 'liveness', 'instrumentalness', 'duration_min', 'GDP']),
    ('encoder', OneHotEncoder(handle_unknown='ignore'), ['genre']),
])

In [None]:
# Fit the ColumnTransformer on the training features only and transform them.
#
# Fitting on the full Merged_prep dataframe would let the test rows influence
# the scaler statistics and the encoder categories (data leakage), and this
# fitted transformer is the object pickled alongside both splits below.
# NOTE(review): the test split should only ever go through
# col_transform.transform(X_test). If a genre can occur only in the test rows,
# the OneHotEncoder must be created with handle_unknown='ignore'.

col_transformed = col_transform.fit_transform(X_train)

At the next stage of the project, models will be chosen and a pipeline containing all the steps required to prepare the data and fit the model will be defined.

For the proposed predictive model, linear regression and logistic regression models will be tested. Because multicollinearity between some of the predictor variables was detected at the EDA stage of the project, Ridge regression will be applied in place of plain linear regression.

For both the Ridge regression and logistic regression models that will be tested on this dataset, cross_validate from sklearn.model_selection will be used to fit the models and assess their performance.
Both models will be assessed using: 
1. Mean Absolute Error scores which will be calculated for both.
2. Coefficient of determination/r2
   

In [None]:
# Persist each split, bundled with the column transformer, to its own pickle file

train_bundle = (X_train, y_train, col_transform)
test_bundle = (X_test, y_test, col_transform)

for pkl_path, bundle in (('train.pkl', train_bundle), ('test.pkl', test_bundle)):
    with open(pkl_path, 'wb') as f:
        pickle.dump(bundle, f)

# SUMMARY

At the preprocessing stage of the project, missing values were handled by dropping an entire column (track_name) and by replacing missing values with 0 in the GDP and GDP_year columns.

X and y variables were defined and data was split into training and test sets in the ratio of 75:25.

Numerical and categorical columns were defined. Numerical columns were transformed using the StandardScaler and categorical columns were transformed using the OneHotEncoder. These transformations were combined into a single feature space with ColumnTransformer.

ColumnTransformer was used to fit and transform the dataframe.

The training and test sets, together with the ColumnTransformer (`col_transform`), were saved using the pickle module.