# One Hot Encoding (OHE) avec Scikit-Learn

In [None]:
import random
import pandas as pd

In [None]:
# Ten random colour draws; 'rouge' is listed twice in the pool, so it is
# twice as likely to be picked as 'vert' or 'bleu'.
col1 = [
    random.choice(('rouge', 'vert', 'bleu', 'rouge'))
    for _draw in range(10)
]

In [None]:
col1

['rouge',
 'vert',
 'rouge',
 'rouge',
 'bleu',
 'rouge',
 'rouge',
 'bleu',
 'vert',
 'rouge']

In [None]:
# Wrap the sampled colours in a one-column DataFrame named 'Couleur'.
df = pd.DataFrame({'Couleur': col1})

In [None]:

df

Unnamed: 0,Couleur
0,rouge
1,vert
2,rouge
3,rouge
4,bleu
5,rouge
6,rouge
7,bleu
8,vert
9,rouge


# Faire le OHE Sans Pipeline

In [None]:
# Show the 'Couleur' column cast to the pandas categorical dtype.
# The result is only displayed: it is not assigned back, so df is unchanged.
df['Couleur'].astype('category')

0    rouge
1     vert
2    rouge
3    rouge
4     bleu
5    rouge
6    rouge
7     bleu
8     vert
9    rouge
Name: Couleur, dtype: category
Categories (3, object): ['bleu', 'rouge', 'vert']

In [None]:
# One-hot encode with pandas alone: 'Couleur' is expanded into one
# indicator column per observed value (Couleur_bleu, Couleur_rouge, Couleur_vert).
pd.get_dummies(df)

Unnamed: 0,Couleur_bleu,Couleur_rouge,Couleur_vert
0,0,1,0
1,0,0,1
2,0,1,0
3,0,1,0
4,1,0,0
5,0,1,0
6,0,1,0
7,1,0,0
8,0,0,1
9,0,1,0


In [None]:
# Add the dummy_na=True argument: the output gains a Couleur_nan column
# that flags missing values.
pd.get_dummies(df,dummy_na=True)

Unnamed: 0,Couleur_bleu,Couleur_rouge,Couleur_vert,Couleur_nan
0,0,1,0,0
1,0,0,1,0
2,0,1,0,0
3,0,1,0,0
4,1,0,0,0
5,0,1,0,0
6,0,1,0,0
7,1,0,0,0
8,0,0,1,0
9,0,1,0,0


# Faire le OHE avec Pipeline

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# This standalone usage is for demonstration only; it is not how
# OneHotEncoder will ultimately be used in this notebook.
# - Dense (non-sparse) output is requested so the result is a plain array.
#   The keyword for this is `sparse_output` in scikit-learn >= 1.2 and
#   `sparse` in older releases (the old name was removed in 1.4), so the new
#   keyword is tried first with a fallback to the old one.
# - handle_unknown='ignore' lets the encoder cope with categories that only
#   show up later (e.g. in production) instead of raising an error.
try:
    encoder = OneHotEncoder(sparse_output=False,
                            handle_unknown='ignore')
except TypeError:  # older scikit-learn: the keyword is still named `sparse`
    encoder = OneHotEncoder(sparse=False,
                            handle_unknown='ignore')

In [None]:
# Learn the categories present in 'Couleur'. The double brackets select a
# one-column DataFrame rather than a Series.
encoder.fit(df[['Couleur']])

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='ignore', sparse=False)

In [None]:
# Encode the column the encoder was fitted on: one row per sample,
# one 0/1 column per learned category.
encoder.transform(df[['Couleur']])

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [None]:
# Second sample drawn from a different pool: 'cian' and 'violet' are
# categories that do not exist in col1's pool.
col2 = [
    random.choice(('rouge', 'cian', 'bleu', 'violet'))
    for _draw in range(10)
]


In [None]:
# Same single-column layout as df, built from the second colour sample;
# the frame is then shown as the cell output.
df2 = pd.DataFrame({'Couleur': col2})
df2

Unnamed: 0,Couleur
0,bleu
1,bleu
2,bleu
3,rouge
4,violet
5,violet
6,rouge
7,rouge
8,cian
9,cian


In [None]:
encoder.transform(df2[['Couleur']])
# When the encoder meets a category it does not know, it outputs zeros for that row.

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

# Imputer avec scikit Learn

In [None]:
from sklearn.impute import SimpleImputer
from numpy import nan
import pandas as pd

In [None]:
def aleatoire():
  """Return either a fresh random.random() draw or NaN.

  The drawn number appears twice next to a single nan in the pool handed to
  random.choice, so about one call in three returns NaN.
  """
  tirage = random.random()
  candidats = (tirage, tirage, nan)
  return random.choice(candidats)

In [None]:
# Twenty draws from aleatoire() stored in a single 'nombre' column.
# Note: this rebinds df, which previously held the colour frame.
df = pd.DataFrame(
    {'nombre': [aleatoire() for _draw in range(20)]}
)

In [None]:
df

Unnamed: 0,nombre
0,0.157648
1,
2,0.742035
3,
4,
5,
6,0.173335
7,0.293642
8,
9,0.17823


In [None]:
df['nombre'] # single brackets: the result is a Series

0     0.157648
1          NaN
2     0.742035
3          NaN
4          NaN
5          NaN
6     0.173335
7     0.293642
8          NaN
9     0.178230
10    0.474030
11         NaN
12    0.881708
13    0.366637
14    0.893554
15    0.788311
16         NaN
17    0.794935
18    0.669006
19         NaN
Name: nombre, dtype: float64

In [None]:
df[['nombre']] # double brackets: the result is a DataFrame

Unnamed: 0,nombre
0,0.157648
1,
2,0.742035
3,
4,
5,
6,0.173335
7,0.293642
8,
9,0.17823


In [None]:
# Replace every NaN with the constant 4 and, because add_indicator=True,
# append a 0/1 column marking which rows were originally missing.
imputer = SimpleImputer(
    add_indicator=True,
    fill_value=4,
    strategy='constant',
)
imputer.fit(df[['nombre']])

SimpleImputer(add_indicator=True, copy=True, fill_value=4, missing_values=nan,
              strategy='constant', verbose=0)

In [None]:
# First output column: the values with NaN replaced by 4.
# Second output column: the missing-value indicator (1 where the value was NaN).
imputer.transform(df[['nombre']])

array([[0.15764793, 0.        ],
       [4.        , 1.        ],
       [0.7420354 , 0.        ],
       [4.        , 1.        ],
       [4.        , 1.        ],
       [4.        , 1.        ],
       [0.17333452, 0.        ],
       [0.29364228, 0.        ],
       [4.        , 1.        ],
       [0.1782301 , 0.        ],
       [0.47402978, 0.        ],
       [4.        , 1.        ],
       [0.88170818, 0.        ],
       [0.36663673, 0.        ],
       [0.89355372, 0.        ],
       [0.788311  , 0.        ],
       [4.        , 1.        ],
       [0.79493482, 0.        ],
       [0.66900626, 0.        ],
       [4.        , 1.        ]])

In [None]:
# Mean of the 'nombre' column; the result is a finite number even though
# the column contains NaN entries.
df[['nombre']].mean()

nombre    0.534423
dtype: float64

# Créer une pipeline de Preprocessing avec Scikit Learn

In [None]:
# Location of the CSV file used for the preprocessing pipeline example.
url = "https://bit.ly/missing-values"

In [None]:
# Load the CSV found at `url` into df (the name is rebound once more) and display it.
df=pd.read_csv(url)
df

Unnamed: 0,color,rating
0,,1.16
1,Red,
2,Blue,2.54
3,Red,1.51
4,Red,
...,...,...
912,Blue,3.40
913,Red,
914,,1.24
915,Yellow,2.64


In [None]:
# Distinct values of 'color'; nan appears in the output, so the column has missing entries.
df['color'].unique()

array([nan, 'Red', 'Blue', 'Orange', 'Yellow'], dtype=object)

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [None]:
# Dense one-hot encoder that tolerates categories unseen during fit.
# The dense-output keyword is `sparse_output` in scikit-learn >= 1.2 and
# `sparse` in older releases (removed in 1.4): try the new name first and
# fall back to the old one.
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:  # older scikit-learn: the keyword is still named `sparse`
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
# Missing categories are filled with the most frequent value of the column.
imputer = SimpleImputer(strategy='most_frequent')

In [None]:
# Chain the two transformations by hand: impute first, then one-hot encode
# the imputed result.
data = encoder.fit_transform(
    imputer.fit_transform(df[['color']])
)

In [None]:
data

array([[1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.]])

In [None]:
from sklearn.pipeline import make_pipeline

In [None]:
# Bundle the imputer -> encoder sequence into a single pipeline object.
gestion_missing_cat = make_pipeline(imputer, encoder)

In [None]:
# Fit the pipeline on the 'color' column and keep its imputed, one-hot encoded output.
color=gestion_missing_cat.fit_transform(df[['color']])

## Gestion de la colonne rating

In [None]:
# Numeric column: missing ratings are filled with the median of the column.
imputer2 = SimpleImputer(strategy='median')
rating = imputer2.fit_transform(df[['rating']])

In [None]:
import numpy as np
# Put the encoded colours and the imputed rating side by side and check the resulting shape.
np.hstack((color, rating)).shape

(917, 5)

# Ce que l'on va faire


In [None]:
## Importer les outils pour le preprocessing en tant que tel
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [None]:
## Importer les outils qui nous permettent de faire ça clean
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [None]:
df

Unnamed: 0,color,rating
0,,1.16
1,Red,
2,Blue,2.54
3,Red,1.51
4,Red,
...,...,...
912,Blue,3.40
913,Red,
914,,1.24
915,Yellow,2.64


In [None]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into a dense array, ignoring unseen categories.
# The dense-output keyword is `sparse_output` in scikit-learn >= 1.2 and
# `sparse` in older releases (removed in 1.4): try the new name first and
# fall back to the old one.
try:
    cat_nan_pipeline = make_pipeline(
        SimpleImputer(strategy="most_frequent"),
        OneHotEncoder(sparse_output=False,
                      handle_unknown='ignore'),
    )
except TypeError:  # older scikit-learn: the keyword is still named `sparse`
    cat_nan_pipeline = make_pipeline(
        SimpleImputer(strategy="most_frequent"),
        OneHotEncoder(sparse=False,
                      handle_unknown='ignore'),
    )

In [None]:
# Route each column to its own preprocessing step: 'color' goes through the
# categorical pipeline, 'rating' gets median imputation.
preprocessing = make_column_transformer(
    (cat_nan_pipeline, ['color']),
    (SimpleImputer(strategy='median'), ['rating']),
)

In [None]:
# Fit and apply the whole preprocessing on df: the output holds the four
# one-hot colour columns followed by the imputed rating.
preprocessing.fit_transform(df)

array([[1.  , 0.  , 0.  , 0.  , 1.16],
       [0.  , 0.  , 1.  , 0.  , 3.1 ],
       [1.  , 0.  , 0.  , 0.  , 2.54],
       ...,
       [1.  , 0.  , 0.  , 0.  , 1.24],
       [0.  , 0.  , 0.  , 1.  , 2.64],
       [1.  , 0.  , 0.  , 0.  , 3.1 ]])