# Preprocessing and data cleaning

In [100]:
import numpy as np 
import pandas as pd

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from pandas.api.types import is_numeric_dtype , is_string_dtype , is_categorical_dtype

import feather

### Load the training data

In [101]:
# Load the training set (CSV) from the project's GitHub repository.
url1 = "https://raw.githubusercontent.com/PicoRadia/sdataProjet/main/dataset/Spotify_train_dataset.csv"
df = pd.read_csv(url1)  # 31728 rows x 20 columns

## Preprocessing :
* One hot encoding for the discrete variables 
* Robust scaler for the continuous variables (note: the output shown further down still contains the unscaled values)
* Feature engineering


In [102]:
def preprocess(dff, scale_continuous=False):
    """Build the model-ready feature frame from a raw Spotify track table.

    Steps, in order:

    1. One-hot encode ``key`` (``Key_*``), ``time_signature``
       (``time_signature__*``) and ``mode`` (``mode_0`` / ``mode_1``).
    2. Optionally robust-scale the continuous numeric columns.
    3. Add engineered features: ``mins``, ``song_name_nan``, ``has_trap``,
       ``has_trance`` and ``has_rap``.
    4. Remove every row whose ``id`` occurs more than once
       (``keep=False``: all copies are removed, not just the extras).
    5. Drop the identifier/text columns and the columns that were re-encoded.

    Parameters
    ----------
    dff : pandas.DataFrame
        Raw track table. Must contain the columns ``key``, ``time_signature``,
        ``mode``, ``duration_ms``, ``song_name``, ``genre``, ``id``, ``type``,
        ``uri``, ``track_href`` and ``analysis_url``; ``mode`` must take both
        values 0 and 1. The frame is *not* modified.
    scale_continuous : bool, default False
        When True, numeric columns with at least 20 distinct values that
        survive into the output are replaced by their ``RobustScaler``
        transform (fitted on ``dff`` itself). ``mins`` is always derived from
        the raw ``duration_ms`` and stays in minutes. The default keeps the
        raw values, which is what this function has always returned: the
        scaler output used to be computed and then thrown away.

    Returns
    -------
    pandas.DataFrame
        New frame, indexed by the surviving original row labels.

    Raises
    ------
    KeyError
        If a required column (or ``mode_0`` / ``mode_1``) is missing.
    """
    # Identifiers, free text and columns replaced by an encoded version.
    drop_cols = ['type', 'uri', 'genre', 'track_href', 'analysis_url', 'id',
                 'song_name', 'key', 'time_signature', 'mode', 'duration_ms']

    # --- one-hot encoding of the discrete variables -------------------------
    key_ = pd.get_dummies(dff['key'], prefix='Key')
    # The prefix already ends with "_", so columns read "time_signature__4";
    # kept as-is because downstream code may rely on these names.
    time_signature_ = pd.get_dummies(dff['time_signature'], prefix='time_signature_')
    mode_ = pd.get_dummies(dff['mode'], prefix='mode')[['mode_0', 'mode_1']]

    result = pd.concat([dff, key_, time_signature_, mode_], axis=1, join="inner")

    # --- optional robust scaling of the continuous variables ----------------
    if scale_continuous:
        numeric = [col for col in dff.columns if is_numeric_dtype(dff[col])]
        continuous = [col for col in numeric
                      if dff[col].nunique() >= 20 and col not in drop_cols]
        if continuous:
            # NOTE(review): the scaler is fitted on whatever frame is passed
            # in; a train/test pair would be scaled independently.
            result[continuous] = RobustScaler().fit_transform(dff[continuous])

    # --- feature engineering ------------------------------------------------
    # Track length in minutes.
    result["mins"] = dff["duration_ms"] * 0.001 / 60

    # Despite its name, this flag is 1 when a song name is PRESENT and 0 when
    # it is missing (historical behaviour, preserved for downstream users).
    result['song_name_nan'] = dff['song_name'].notna().astype('int64').to_numpy()

    # Genre-family indicators, read from the frame that was passed in (not
    # from a notebook-level global).
    # NOTE(review): these are derived from `genre`; if `genre` is the
    # prediction target they leak the label -- confirm before modelling.
    genre = dff['genre']
    result['has_trap'] = genre.isin(['trap', 'Dark Trap', 'Trap Metal']).astype('int64').to_numpy()
    result['has_trance'] = genre.isin(['trance', 'psytrance']).astype('int64').to_numpy()
    result['has_rap'] = genre.isin(['Rap', 'Underground Rap']).astype('int64').to_numpy()

    # --- row / column clean-up ----------------------------------------------
    # keep=False removes every occurrence of a repeated id.
    result = result.drop_duplicates(subset="id", keep=False)
    result = result.drop(columns=drop_cols)
    return result

In [103]:
# Work on a copy so the frame loaded above (`df`) is left as read from the CSV.
dff = df.copy()

In [104]:
# Run the preprocessing function defined above on the copied frame.
x = preprocess(dff)

In [105]:
# Summary of the preprocessed frame: column names, non-null counts and dtypes.
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24738 entries, 1 to 31727
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   danceability       24738 non-null  float64
 1   energy             24738 non-null  float64
 2   loudness           24738 non-null  float64
 3   speechiness        24738 non-null  float64
 4   acousticness       24738 non-null  float64
 5   instrumentalness   24738 non-null  float64
 6   liveness           24738 non-null  float64
 7   valence            24738 non-null  float64
 8   tempo              24738 non-null  float64
 9   Key_0              24738 non-null  uint8  
 10  Key_1              24738 non-null  uint8  
 11  Key_2              24738 non-null  uint8  
 12  Key_3              24738 non-null  uint8  
 13  Key_4              24738 non-null  uint8  
 14  Key_5              24738 non-null  uint8  
 15  Key_6              24738 non-null  uint8  
 16  Key_7              247

In [109]:
# Last 10 rows, transposed so each feature is a row and each track a column.
x.tail(10).T

Unnamed: 0,31717,31718,31719,31720,31721,31722,31723,31725,31726,31727
danceability,0.619,0.463,0.665,0.804,0.746,0.384,0.855,0.464,0.683,0.868
energy,0.591,0.83,0.878,0.762,0.879,0.667,0.586,0.838,0.59,0.654
loudness,-8.232,-8.483,-4.745,-13.689,-4.006,-8.126,-7.134,-6.762,-4.949,-4.832
speechiness,0.0625,0.045,0.206,0.0747,0.101,0.516,0.167,0.0401,0.359,0.0766
acousticness,0.262,0.101,0.00325,0.09,0.00118,0.0621,0.2,0.000151,0.215,0.208
instrumentalness,0.000145,7e-06,0.0,0.867,0.00848,0.0,0.0,0.756,0.0,0.0
liveness,0.107,0.286,0.0663,0.0981,0.18,0.33,0.253,0.0671,0.135,0.187
valence,0.214,0.116,0.394,0.661,0.171,0.72,0.751,0.151,0.0401,0.299
tempo,130.624,149.745,186.094,125.002,174.069,187.767,138.132,173.995,209.986,138.02
Key_0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


## Saving result in feather format

In [107]:
# reset_index() moves the current row labels into a regular column and gives
# the frame a fresh 0..n-1 index before it is written to './data2'.
# NOTE(review): presumably needed because to_feather expects a default index -- confirm.
x.reset_index().to_feather('./data2')

# Conclusion

We now have clean data that we can use to train our models, and it is fast to load because it is saved in Feather format.
We can do some further feature engineering to make our model better.