![](1.jpg)

In [98]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from joblib import dump, load
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler

In [99]:
# Load seaborn's "titanic" example dataset into a DataFrame and preview its first rows.
df = sns.load_dataset("titanic")
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


---
**ETAPE 1**   
 * Identifier puis supprimer les données dupliquées.

In [100]:
# Boolean mask flagging every row that exactly repeats an earlier row
# (the first occurrence of each group is not flagged).
is_duplicate = df.duplicated()
df[is_duplicate]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
47,1,3,female,,0,0,7.7500,Q,Third,woman,False,,Queenstown,yes,True
76,0,3,male,,0,0,7.8958,S,Third,man,True,,Southampton,no,True
77,0,3,male,,0,0,8.0500,S,Third,man,True,,Southampton,no,True
87,0,3,male,,0,0,8.0500,S,Third,man,True,,Southampton,no,True
95,0,3,male,,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
870,0,3,male,26.0,0,0,7.8958,S,Third,man,True,,Southampton,no,True
877,0,3,male,19.0,0,0,7.8958,S,Third,man,True,,Southampton,no,True
878,0,3,male,,0,0,7.8958,S,Third,man,True,,Southampton,no,True
884,0,3,male,25.0,0,0,7.0500,S,Third,man,True,,Southampton,no,True


In [101]:
# Remove rows that exactly repeat an earlier row, keeping the first occurrence.
# The result is re-bound to `df` rather than mutated with inplace=True, so the
# statement stays chainable and the data lineage is explicit.
# NOTE(review): the frame has no passenger identifier column, so identical rows
# may be distinct passengers — confirm that dropping them is desired.
df = df.drop_duplicates()

In [102]:
# Sanity check: after de-duplication this selection is expected to be empty.
still_duplicated = df.duplicated()
df[still_duplicated]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone


---
**ETAPE 4**
- Traiter les outliers en choisissant la méthode la plus appropriée.
  - Voir le cours et les exercices corrigés, disponibles sur [GitHub](https://www.github.com/ousmanhamit/D.I.T).

---
**ETAPE 2** 
* Données d'entrée **X** et variable cible **y**  
* Découpage du **DATASET** en données **d'entraînement (Xtrain, ytrain)** et données de **test (Xtest, ytest)**.

In [103]:
# Target: the `survived` column.
y = df['survived']

# Features: every column except the target and the ones listed below.
# NOTE(review): in the preview above, `alive`, `class` and `embarked` look like
# re-encodings of `survived`, `pclass` and `embark_town`; `who` / `adult_male`
# presumably derive from sex and age — confirm before relying on this.
dropped_columns = ['survived', 'embarked', 'class', 'who', 'adult_male', 'alive']
X = df.drop(columns=dropped_columns)

In [104]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,deck,embark_town,alone
0,3,male,22.0,1,0,7.25,,Southampton,False
1,1,female,38.0,1,0,71.2833,C,Cherbourg,False
2,3,female,26.0,0,0,7.925,,Southampton,True
3,1,female,35.0,1,0,53.1,C,Southampton,False
4,3,male,35.0,0,0,8.05,,Southampton,True


In [105]:
# Preview the first values of the target.
y.head()

0    0
1    1
2    1
3    1
4    0
Name: survived, dtype: int64

In [116]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=0)

---
**ETAPE 3**
- Définir la liste des variables qualitatives (nominales, ordinales)
- Définir la liste des variables numériques (continues, discrètes)

In [107]:
# Column names grouped by dtype; each group is later routed to its own preprocessing pipeline.
# NOTE(review): `deck` and `alone` are columns of X but end up in neither list
# (see the two outputs below) — confirm that leaving them out of the model is intended.
numeric_dtypes = ['int64', 'float64']
num_cols = Xtrain.select_dtypes(include=numeric_dtypes).columns
cat_cols = Xtrain.select_dtypes(include='object').columns

In [108]:
# Names of the columns selected as numeric features.
num_cols

Index(['pclass', 'age', 'sibsp', 'parch', 'fare'], dtype='object')

In [109]:
# Names of the columns selected as categorical features.
cat_cols

Index(['sex', 'embark_town'], dtype='object')

----
**ETAPE 5**: FEATURE SCALING et ENCODAGE de variables
- Créer un pipeline **cat_pipeline** pour les variables catégorielles (**make_pipeline**).
- Créer un pipeline **num_pipeline** pour les variables quantitatives (**make_pipeline**).

In [110]:
# Numeric features: fill missing values with the column median, then scale with RobustScaler.
num_pipeline = make_pipeline(SimpleImputer(strategy='median'), RobustScaler())

# Categorical features: fill missing values with the most frequent category, then one-hot encode.
# handle_unknown='ignore' keeps transform() from failing on categories not seen during fit;
# drop='if_binary' encodes a two-category feature as a single column.
categorical_imputer = SimpleImputer(strategy='most_frequent')
categorical_encoder = OneHotEncoder(handle_unknown='ignore', drop='if_binary')
cat_pipeline = make_pipeline(categorical_imputer, categorical_encoder)

In [133]:
# Display the numeric pipeline to inspect its steps (median imputation, then robust scaling).
# To preview its output, fit it on the training split only, e.g.
# num_pipeline.fit_transform(Xtrain[num_cols]); fitting on the full `df` would also use test rows.
num_pipeline

In [140]:
# Display the categorical pipeline to inspect its steps.
cat_pipeline
# To preview its output, fit it on the training split only, e.g.
# cat_pipeline.fit_transform(Xtrain[cat_cols]); fitting on the full `df` would also use test rows.

---
**ETAPE 6**
- Créer un **make_column_transformer** pour combiner les 2 **pipelines** intermédiaires précédents.

In [113]:
# Apply num_pipeline to the numeric columns and cat_pipeline to the categorical columns;
# the column transformer combines both outputs into one feature matrix.
numeric_branch = (num_pipeline, num_cols)
categorical_branch = (cat_pipeline, cat_cols)
mct = make_column_transformer(numeric_branch, categorical_branch)

---
**ETAPE 7**
- Choisir l’algorithme d'apprentissage correspondant au type de données (**estimator**).

---
**ETAPE 8**
- Combiner toutes les étapes précédentes en une seule séquence (**make_pipeline**), sur laquelle on appliquera la méthode **fit()** pour entraîner l’algorithme et créer le modèle.

In [114]:
# End-to-end model: column-wise preprocessing followed by a logistic-regression classifier.
classifier = LogisticRegression()
full_pipeline = make_pipeline(mct, classifier)

---
**ETAPE 9**
- Entraîner l'algorithme d'apprentissage avec la méthode **fit**


In [81]:
# Fit the preprocessing steps and the classifier on the training split only.
full_pipeline.fit(Xtrain, ytrain)

---
- Évaluer le modèle avec la méthode **score**

In [82]:
# Accuracy on the held-out test split, expressed as a percentage.
# The built-in round() works whether score() returns a built-in float or a NumPy scalar
# (the `.round` method exists only on the latter). Scaling to a percentage before rounding
# avoids float artefacts such as round(0.57, 2) * 100 == 56.99999999999999.
round(100 * full_pipeline.score(Xtest, ytest), 0)

78.0

In [83]:
# Report the test accuracy as a percentage.
# The built-in round() works whether score() returns a built-in float or a NumPy scalar
# (the `.round` method exists only on the latter). Scaling to a percentage before rounding
# avoids float artefacts such as round(0.57, 2) * 100 == 56.99999999999999.
print('Le score obtenu est de :', round(100 * full_pipeline.score(Xtest, ytest), 0), '%')


Le score obtenu est de : 78.0 %


---
- **Sauvegarder le modèle pré-entraîné**

In [84]:
# Persist the fitted pipeline (preprocessing + classifier) to 'modele.joblib'.
# `dump` and `load` are imported with the other dependencies in the import cell
# at the top of the notebook, so all imports live in one place.
dump(full_pipeline, 'modele.joblib')

['modele.joblib']

In [85]:
# Reload the saved pipeline from disk and predict on the test split to check the round trip.
# NOTE: joblib.load can execute arbitrary code, so only load files from a trusted source.
modele = load('modele.joblib') 
modele.predict(Xtest)

array([0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0])

In [86]:
# Report the test accuracy of the reloaded model as a percentage.
# Scaling to a percentage before rounding avoids float artefacts such as
# round(0.57, 2) * 100 == 56.99999999999999.
print('Le score obtenu est de:', round(100 * modele.score(Xtest, ytest), 0), '%')

Le score obtenu est de: 78.0 %
