# Preprocessing and data cleaning

In [293]:
import numpy as np 
import pandas as pd

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import feather

### Load the training data

In [294]:
# load data
url1 = "https://raw.githubusercontent.com/PicoRadia/sdataProjet/main/dataset/Spotify_train_dataset.csv"
df = pd.read_csv(url1)#31728 rows x 20 columns

## Preprocessing :
* One hot encoding for the discrete variables 
* Robust scaler for the continuous variables

#### Splitting the target variabe from the training data

In [295]:
df['genre']

0                   trap
1                 techno
2                 techno
3              Dark Trap
4        Underground Rap
              ...       
31723    Underground Rap
31724         Trap Metal
31725                dnb
31726          Dark Trap
31727                Rap
Name: genre, Length: 31728, dtype: object

In [296]:
dff = df.copy()
dff.drop_duplicates(subset ="id", keep = False, inplace = True) 
y = dff['genre'].copy()
df.drop(columns=['genre'], inplace = True ) 

### Encoding the target variable

In [297]:
encoder = LabelEncoder()
y = encoder.fit_transform(y)
y = pd.DataFrame(y)
y.columns = ['target']
y

Unnamed: 0,target
0,12
1,12
2,0
3,12
4,0
...,...
24733,5
24734,7
24735,8
24736,0


In [298]:
def preprocess(dff):
    cols = ['type', 'uri', 'track_href', 'analysis_url', 'song_name','type']
    num = [col for col in dff.columns if dff[col].dtype != "O"]
    dscr = [col for col in dff[num].columns if len(dff[num].groupby(col)[col].unique()) < 20]
    conti = [val for val in num if val not in dscr]
    
    contdf = dff[conti].copy()
    cols = contdf.columns
    # convert the array back to a dataframe
    transform = RobustScaler()
    data = transform.fit_transform(contdf)
    
    # convert the array back to a dataframe
    
    contdf = pd.DataFrame(data)
    contdf.columns = cols

    dscrdf = dff[dscr].copy()
    key_ = pd.get_dummies(dscrdf.key, prefix='Key')

    time_signature_ = pd.get_dummies(dscrdf.time_signature, prefix='time_signature_')

    tmp = pd.get_dummies(dff,columns = ['mode'])
    result = pd.concat([dff,key_, time_signature_,tmp[['mode_0','mode_1']]], axis=1,join="inner")
    
    # dropping duplicates
    result.drop_duplicates(subset ="id", keep = False, inplace = True) 
    # dropping columns
    cols = ['type', 'uri', 'track_href', 'analysis_url','id' ,'song_name','type','key','time_signature','mode']
    result.drop(cols, axis = 1,inplace=True)
    return result


In [299]:
data = preprocess(df)

In [300]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24738 entries, 1 to 31727
Data columns (total 28 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   danceability       24738 non-null  float64
 1   energy             24738 non-null  float64
 2   loudness           24738 non-null  float64
 3   speechiness        24738 non-null  float64
 4   acousticness       24738 non-null  float64
 5   instrumentalness   24738 non-null  float64
 6   liveness           24738 non-null  float64
 7   valence            24738 non-null  float64
 8   tempo              24738 non-null  float64
 9   duration_ms        24738 non-null  int64  
 10  Key_0              24738 non-null  uint8  
 11  Key_1              24738 non-null  uint8  
 12  Key_2              24738 non-null  uint8  
 13  Key_3              24738 non-null  uint8  
 14  Key_4              24738 non-null  uint8  
 15  Key_5              24738 non-null  uint8  
 16  Key_6              247

In [239]:
data

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,Key_8,Key_9,Key_10,Key_11,time_signature__1,time_signature__3,time_signature__4,time_signature__5,mode_0,mode_1
1,0.618,0.654,5,-14.305,0,0.4220,0.226000,0.864,0.1110,0.1760,...,0,0,0,0,0,0,1,0,1,0
2,0.696,0.896,7,-5.915,1,0.1080,0.016400,0.866,0.1030,0.5700,...,0,0,0,0,0,0,1,0,0,1
3,0.769,0.506,0,-11.658,1,0.0452,0.002730,0.000,0.0832,0.1020,...,0,0,0,0,0,0,1,0,0,1
5,0.568,0.779,7,-9.779,1,0.0323,0.000519,0.929,0.0660,0.0852,...,0,0,0,0,0,0,1,0,0,1
6,0.784,0.852,8,-2.511,0,0.1660,0.001630,0.000,0.0781,0.5290,...,1,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31722,0.384,0.667,0,-8.126,0,0.5160,0.062100,0.000,0.3300,0.7200,...,0,0,0,0,0,0,1,0,1,0
31723,0.855,0.586,6,-7.134,1,0.1670,0.200000,0.000,0.2530,0.7510,...,0,0,0,0,0,0,1,0,0,1
31725,0.464,0.838,8,-6.762,1,0.0401,0.000151,0.756,0.0671,0.1510,...,1,0,0,0,0,0,1,0,0,1
31726,0.683,0.590,0,-4.949,1,0.3590,0.215000,0.000,0.1350,0.0401,...,0,0,0,0,0,0,1,0,0,1


In [301]:
data.shape

(24738, 28)

### Saving clean data in feather format

In [306]:
data.reset_index().to_feather('./data')
y.to_feather('./target')

### Splitting data into train and test set

In [307]:
X = pd.read_feather('./data')

In [308]:
X.drop("index",axis = 1,inplace=True)

In [309]:
y = pd.read_feather('./target')

In [310]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.33, random_state=42)

# Conclusion

We now have clean data that we can use to trin our models and that is fast to load too because it's saved in feather format.
We can do some feature engineering to make our mode better.