# Preprocessing and data cleaning

In [14]:
import numpy as np 
import pandas as pd

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from pandas.api.types import is_numeric_dtype , is_string_dtype , is_categorical_dtype

import feather

# A - Preprocessing the training data set

### Load the training data

In [23]:
# List the distinct target labels in the training data.
# NOTE(review): this cell carries a later execution count than the data-loading
# cell that follows it; on a fresh top-to-bottom run `df` is not defined yet at
# this point, so this cell belongs after the load cell.
df.genre.unique()

array(['trap', 'techno', 'Dark Trap', 'Underground Rap', 'hardstyle',
       'Trap Metal', 'techhouse', 'RnB', 'psytrance', 'Emo', 'dnb',
       'trance', 'Rap', 'Hiphop', 'Pop'], dtype=object)

In [15]:
# Load the raw training data straight from the project's GitHub repository
# (requires network access).
url1 = "https://raw.githubusercontent.com/PicoRadia/sdataProjet/main/dataset/Spotify_train_dataset.csv"
df = pd.read_csv(url1)  # 31728 rows x 20 columns

## Preprocessing
* One-hot encoding for the discrete variables
* Robust scaling for the continuous variables (note: the frame returned by `preprocess1` below keeps the continuous columns unscaled)
* Feature engineering
* Feature selection

In [21]:
def preprocess1(dff):
    """Turn a raw Spotify audio-features frame into a feature frame.

    Steps, in order:

    1. One-hot encode ``time_signature`` into ``time_signature__<value>``
       columns (the double underscore comes from the ``'time_signature_'``
       prefix and is kept so existing consumers of the output still find the
       same column names) and ``mode`` into ``mode_0`` / ``mode_1``.
    2. Add ``mins``: ``duration_ms`` converted to minutes.
    3. Add ``song_name_nan``: 1 when ``song_name`` is present, 0 when it is
       missing (the name reads the other way round; it is kept as-is because
       it is part of the output schema).
    4. Remove every row whose ``id`` occurs more than once. ``keep=False``
       discards *all* copies of a duplicated id, not only the repeats.
    5. Drop identifier/metadata columns and the raw columns that were encoded.

    The continuous columns are returned **unscaled**. No scaler is fitted
    here: a scaler should be fitted on the training features only and then
    applied to the test features, which a per-frame function cannot do.

    Parameters
    ----------
    dff : pandas.DataFrame
        Raw frame. Must contain ``id``, ``mode``, ``time_signature``,
        ``duration_ms`` and ``song_name``. Columns such as ``genre`` that
        exist only in the training data are optional. The frame is not
        modified.

    Returns
    -------
    pandas.DataFrame
        New frame indexed by the surviving original row labels.

    Raises
    ------
    KeyError
        If a required column is missing, or if ``mode`` does not contain both
        the values 0 and 1 (so ``mode_0`` / ``mode_1`` cannot both be built).
    """
    # Columns removed from the output. Names absent from the input (e.g.
    # 'genre' in the unlabeled test set) are skipped instead of raising.
    drop_cols = [
        'type', 'uri', 'genre', 'track_href', 'key', 'analysis_url', 'id',
        'song_name', 'time_signature', 'mode', 'duration_ms',
    ]

    time_signature_ = pd.get_dummies(dff['time_signature'], prefix='time_signature_')
    mode_ = pd.get_dummies(dff['mode'], prefix='mode')[['mode_0', 'mode_1']]

    result = pd.concat([dff, time_signature_, mode_], axis=1, join="inner")
    result["mins"] = dff["duration_ms"] * 0.001 / 60

    # Positional assignment (plain array), computed without touching `dff`.
    result['song_name_nan'] = dff['song_name'].notna().astype('int64').to_numpy()

    # Ambiguous tracks (same id on several rows) are removed entirely.
    result = result.drop_duplicates(subset="id", keep=False)

    return result.drop(columns=drop_cols, errors="ignore")

In [17]:
# Work on a copy so the raw training frame `df` stays available unchanged.
dff = df.copy()

In [18]:
# Build the training feature frame.
x = preprocess1(dff)

In [19]:
# Sanity check: column names, dtypes and non-null counts of the processed frame.
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24738 entries, 1 to 31727
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   danceability       24738 non-null  float64
 1   energy             24738 non-null  float64
 2   loudness           24738 non-null  float64
 3   speechiness        24738 non-null  float64
 4   acousticness       24738 non-null  float64
 5   instrumentalness   24738 non-null  float64
 6   liveness           24738 non-null  float64
 7   valence            24738 non-null  float64
 8   tempo              24738 non-null  float64
 9   time_signature__1  24738 non-null  uint8  
 10  time_signature__3  24738 non-null  uint8  
 11  time_signature__4  24738 non-null  uint8  
 12  time_signature__5  24738 non-null  uint8  
 13  mode_0             24738 non-null  uint8  
 14  mode_1             24738 non-null  uint8  
 15  mins               24738 non-null  float64
 16  song_name_nan      247

In [None]:
# Show the last 10 rows, transposed so that each feature is displayed as a row.
x.tail(10).T

## Saving result in feather format

In [20]:
# reset_index() (without drop=True) turns the surviving original row labels into
# an 'index' column of the saved file.
# NOTE(review): presumably kept so rows can be matched back to `df` (e.g. to
# recover `genre`, which preprocess1 drops) — confirm it is not used as a feature.
x.reset_index().to_feather('./fin')

# B - Preprocessing the test set

In [24]:
# Load the raw test data from the project's GitHub repository (requires network
# access). Note that `url1` is rebound here and no longer points at the training file.
# NOTE(review): the previous "31728 rows x 20 columns" remark was identical to the
# one on the training load; the test-set shape is not shown in this notebook — confirm.
url1 = "https://raw.githubusercontent.com/PicoRadia/sdataProjet/main/dataset/Spotify_test_dataset.csv"
test = pd.read_csv(url1)

In [25]:
# Preview the first rows of the raw test data.
test.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,song_name
0,0.859,0.792,2,-4.007,1,0.141,0.065,0.0,0.3,0.766,206.402,audio_features,6tYL1tBaqTcaqXIykzfF7M,spotify:track:6tYL1tBaqTcaqXIykzfF7M,https://api.spotify.com/v1/tracks/6tYL1tBaqTca...,https://api.spotify.com/v1/audio-analysis/6tYL...,276813,4,"All About U (ft. Nate Dogg, Snoop Dogg, Fatal,..."
1,0.829,0.619,1,-7.258,1,0.198,0.29,0.000371,0.0975,0.351,192.07,audio_features,4X5AMbgW6whnAuDX10fhgq,spotify:track:4X5AMbgW6whnAuDX10fhgq,https://api.spotify.com/v1/tracks/4X5AMbgW6whn...,https://api.spotify.com/v1/audio-analysis/4X5A...,148093,4,Revenge
2,0.461,0.834,3,-3.387,0,0.142,0.261,0.0,0.0918,0.517,129.976,audio_features,2J176Rj3ZTTLKeTirMzQ4M,spotify:track:2J176Rj3ZTTLKeTirMzQ4M,https://api.spotify.com/v1/tracks/2J176Rj3ZTTL...,https://api.spotify.com/v1/audio-analysis/2J17...,193542,4,Changes
3,0.691,0.984,1,-4.828,1,0.061,0.000924,0.513,0.596,0.756,126.979,audio_features,08oufzbacifu4V5zQcoNMG,spotify:track:08oufzbacifu4V5zQcoNMG,https://api.spotify.com/v1/tracks/08oufzbacifu...,https://api.spotify.com/v1/audio-analysis/08ou...,204091,4,
4,0.685,0.798,6,-4.683,0,0.0342,0.0131,7.8e-05,0.111,0.831,133.994,audio_features,3ZuoDMBWEnM5kuve9Y55tv,spotify:track:3ZuoDMBWEnM5kuve9Y55tv,https://api.spotify.com/v1/tracks/3ZuoDMBWEnM5...,https://api.spotify.com/v1/audio-analysis/3Zuo...,193647,5,Don't Believe Me


In [26]:
# Work on a copy so the raw test frame `test` stays available unchanged.
# This rebinds `dff`, which previously held the copy of the training data.
dff = test.copy()

In [None]:
# Build the test feature frame. This rebinds `x`, replacing the training
# features computed earlier in the notebook.
x = preprocess1(dff)

###  Saving file in feather format

In [None]:
# reset_index() (without drop=True) stores the original row labels as an
# 'index' column in the saved file.
x.reset_index().to_feather('./testfin')

# Conclusion

We now have clean data that we can use to train our models. Because it is saved in Feather format, it is also fast to load.
Further feature engineering could make our model better.