# ***Encodage de variables qualitatives***
- **Importation des librairies**


In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer

In [4]:
data = sb.load_dataset('titanic')
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [5]:
df = data[['sex','embark_town','alive','class','age','sibsp','parch','fare','survived']]
print(df.shape)
df.head()

(891, 9)


Unnamed: 0,sex,embark_town,alive,class,age,sibsp,parch,fare,survived
0,male,Southampton,no,Third,22.0,1,0,7.25,0
1,female,Cherbourg,yes,First,38.0,1,0,71.2833,1
2,female,Southampton,yes,Third,26.0,0,0,7.925,1
3,female,Southampton,yes,First,35.0,1,0,53.1,1
4,male,Southampton,no,Third,35.0,0,0,8.05,0


***
# **PREMIERE PARTIE**
- **pandas.get_dummies(...)**

In [6]:
cat_features = ['sex','embark_town','alive','class']

In [7]:
dummy_df = pd.get_dummies(df[cat_features], dummy_na = True)
print(dummy_df.shape)
dummy_df.head()

(891, 14)


Unnamed: 0,sex_female,sex_male,sex_nan,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,embark_town_nan,alive_no,alive_yes,alive_nan,class_First,class_Second,class_Third,class_nan
0,0,1,0,0,0,1,0,1,0,0,0,0,1,0
1,1,0,0,1,0,0,0,0,1,0,1,0,0,0
2,1,0,0,0,0,1,0,0,1,0,0,0,1,0
3,1,0,0,0,0,1,0,0,1,0,1,0,0,0
4,0,1,0,0,0,1,0,1,0,0,0,0,1,0


---
- **SUPPRESSION DES DEMI-VARIABLES**

In [9]:
dummy_df = pd.get_dummies(df[cat_features],drop_first = True) 
print(dummy_df.shape)
dummy_df.head()

(891, 6)


Unnamed: 0,sex_male,embark_town_Queenstown,embark_town_Southampton,alive_yes,class_Second,class_Third
0,1,0,1,0,0,1
1,0,0,0,1,0,0
2,0,0,1,1,0,1
3,0,0,1,1,0,0
4,1,0,1,0,0,1


***
- **FUSION OU CONCATENATION**


In [10]:
df2 = pd.concat([df, dummy_df], axis = 1)
df2.head()

Unnamed: 0,sex,embark_town,alive,class,age,sibsp,parch,fare,survived,sex_male,embark_town_Queenstown,embark_town_Southampton,alive_yes,class_Second,class_Third
0,male,Southampton,no,Third,22.0,1,0,7.25,0,1,0,1,0,0,1
1,female,Cherbourg,yes,First,38.0,1,0,71.2833,1,0,0,0,1,0,0
2,female,Southampton,yes,Third,26.0,0,0,7.925,1,0,0,1,1,0,1
3,female,Southampton,yes,First,35.0,1,0,53.1,1,0,0,1,1,0,0
4,male,Southampton,no,Third,35.0,0,0,8.05,0,1,0,1,0,0,1


***
- **SUPPRESSION DES VARIABLES ORIGINELLES**


In [11]:
final_df = df2.drop(cat_features, axis = 1)

In [12]:
final_df

Unnamed: 0,age,sibsp,parch,fare,survived,sex_male,embark_town_Queenstown,embark_town_Southampton,alive_yes,class_Second,class_Third
0,22.0,1,0,7.2500,0,1,0,1,0,0,1
1,38.0,1,0,71.2833,1,0,0,0,1,0,0
2,26.0,0,0,7.9250,1,0,0,1,1,0,1
3,35.0,1,0,53.1000,1,0,0,1,1,0,0
4,35.0,0,0,8.0500,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
886,27.0,0,0,13.0000,0,1,0,1,0,1,0
887,19.0,0,0,30.0000,1,0,0,1,1,0,0
888,,1,2,23.4500,0,0,0,1,0,0,1
889,26.0,0,0,30.0000,1,1,0,0,1,0,0


***
- ### **Ecrire ses propres méthodes pour regrouper l'ensemble des opérations que nous venons de réaliser précedemment**


In [13]:
def categorical_features_encoder(df, cat_feature):
    for col in cat_feature:
        dummy_df = pd.get_dummies(df[col], dummy_na = False, drop_first = True)
        df = df.drop(col, axis = 'columns')
        df = pd.concat([df, dummy_df], axis = 'columns')
    return df

In [14]:
df = categorical_features_encoder(df, cat_features)

In [15]:
print(df.shape)
df

(891, 11)


Unnamed: 0,age,sibsp,parch,fare,survived,male,Queenstown,Southampton,yes,Second,Third
0,22.0,1,0,7.2500,0,1,0,1,0,0,1
1,38.0,1,0,71.2833,1,0,0,0,1,0,0
2,26.0,0,0,7.9250,1,0,0,1,1,0,1
3,35.0,1,0,53.1000,1,0,0,1,1,0,0
4,35.0,0,0,8.0500,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
886,27.0,0,0,13.0000,0,1,0,1,0,1,0
887,19.0,0,0,30.0000,1,0,0,1,1,0,0
888,,1,2,23.4500,0,0,0,1,0,0,1
889,26.0,0,0,30.0000,1,1,0,0,1,0,0


****
# DEUXIEME PARTIE


In [18]:
df = pd.DataFrame({"Couleur":['Noir','Bleu','Blanc','Rouge','jaune'],
                   "Mention":['Excellent','Tres-bien','Bien','Assez-bien','Passable']})

In [19]:
df

Unnamed: 0,Couleur,Mention
0,Noir,Excellent
1,Bleu,Tres-bien
2,Blanc,Bien
3,Rouge,Assez-bien
4,jaune,Passable


- **Pour une variable qualitative NOMINALE( OneHotEncoder(....) )**

In [26]:
onehot_encoder = OneHotEncoder(sparse = False)
onehot_encoder.fit_transform(df[['Couleur']]) 

array([[0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]])

In [27]:
#onehot_encoder.categories_
onehot_encoder.get_feature_names()

array(['x0_Blanc', 'x0_Bleu', 'x0_Noir', 'x0_Rouge', 'x0_jaune'],
      dtype=object)

- **Pour une variable qualitative ORDINALE( OrdinalEncoder(....) )**

In [28]:
ordinal_encoder = OrdinalEncoder(categories = [['Assez-bien','Bien','Tres-bien','Excellent','Passable']])
ordinal_encoder.fit_transform(df[['Mention']])

array([[3.],
       [2.],
       [1.],
       [0.],
       [4.]])

In [30]:
ordinal_encoder.categories_

[array(['Assez-bien', 'Bien', 'Tres-bien', 'Excellent', 'Passable'],
       dtype=object)]

# PARTIE 3
 - **TITANIC DATASET** 

In [31]:
import seaborn as sb
from sklearn.compose import make_column_transformer

In [32]:
data = sb.load_dataset('titanic')

In [33]:
df = data[['sex','embark_town','alive','class','age','sibsp','parch','who','fare','survived']]
df.head(3)

Unnamed: 0,sex,embark_town,alive,class,age,sibsp,parch,who,fare,survived
0,male,Southampton,no,Third,22.0,1,0,man,7.25,0
1,female,Cherbourg,yes,First,38.0,1,0,woman,71.2833,1
2,female,Southampton,yes,Third,26.0,0,0,woman,7.925,1


In [34]:
nominal_features = ['sex','alive','who']
ordinal_features = ['class']

In [35]:
onehot_encoder = OneHotEncoder()
ordinal_encoder = OrdinalEncoder()

In [36]:
mct = make_column_transformer(
        (onehot_encoder, nominal_features),
        (ordinal_encoder, ordinal_features),
)

In [37]:
mct.fit_transform(df) 

array([[0., 1., 1., ..., 1., 0., 2.],
       [1., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 1., 2.],
       ...,
       [1., 0., 1., ..., 0., 1., 2.],
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 1., 1., ..., 1., 0., 2.]])

- **Lien pour s'abonner à la chaine:** **https://www.youtube.com/channel/UCE-613S-bsuLukwHDhnRxIA/?sub_confirmation=1**
- **Lien de la vidéo associée à ce notebook:** **https://www.youtube.com/watch?v=4tcJpfSmfNw**
- **Merci d'ajouter aussi une étoile sur mon github si ce notebook vous est utile.**