In [24]:
import pandas as pd 
import seaborn as sb
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
import seaborn as sb

In [25]:
titanic_dataset = sb.load_dataset('titanic')  

In [26]:
titanic_dataset.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [27]:
titanic_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [28]:
y = titanic_dataset.survived
X = titanic_dataset.drop(['survived',
                          'embarked',
                          'class',
                          'who',
                          'adult_male',
                          'alive'], axis = 'columns')

In [29]:
X.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,deck,embark_town,alone
0,3,male,22.0,1,0,7.25,,Southampton,False
1,1,female,38.0,1,0,71.2833,C,Cherbourg,False
2,3,female,26.0,0,0,7.925,,Southampton,True
3,1,female,35.0,1,0,53.1,C,Southampton,False
4,3,male,35.0,0,0,8.05,,Southampton,True


 **2. DIVISION DU DATASET EN DONNEES D'ENTRAINEMENT ET DONNEES DU TEST**
- **(Xtrain, ytrain) = donnees d'apprentissage**
- **(Xtest, ytest)   = donnees du test**

In [30]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X,
                                                y,
                                                test_size = .2,
                                                random_state = 0)

**3. SEPARATION DES COLONNES EN DEUX GROUPES**
- **NUMERIQUES**
- **QUALITATIVES**

In [31]:
num_cols = Xtrain.select_dtypes(include = ['int','float']).columns
cat_cols = Xtrain.select_dtypes(exclude = 'number').columns 

In [32]:
num_cols

Index(['pclass', 'age', 'sibsp', 'parch', 'fare'], dtype='object')

In [33]:
cat_cols

Index(['sex', 'deck', 'embark_town', 'alone'], dtype='object')

 **3. CREATION DES PIPELINES INTERMEDIAIRES**
 - **Pipeline pour les colonnes NUMERIQUES**
 -**Un autre Pipeline pour les colonnes QUALITATIVES**

In [34]:
num_pipeline = make_pipeline(
                             SimpleImputer(strategy = 'median'),
                             MinMaxScaler(),
                            )

cat_pipeline = make_pipeline(
                     SimpleImputer(strategy = 'most_frequent'),
                     OneHotEncoder()
                    )

In [35]:
num_pipeline

Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                ('minmaxscaler', MinMaxScaler())])

In [36]:
cat_pipeline

Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')),
                ('onehotencoder', OneHotEncoder())])

**3. TRANSFORMATEUR FINAL**

In [37]:
mct = make_column_transformer(
                               (num_pipeline, num_cols),
                               (cat_pipeline, cat_cols)
                             )  

In [38]:
mct

ColumnTransformer(transformers=[('pipeline-1',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('minmaxscaler',
                                                  MinMaxScaler())]),
                                 Index(['pclass', 'age', 'sibsp', 'parch', 'fare'], dtype='object')),
                                ('pipeline-2',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehotencoder',
                                                  OneHotEncoder())]),
                                 Index(['sex', 'deck', 'embark_town', 'alone'], dtype='object'))])

**4. CREATION DE PIPELINE FINAL**

In [39]:
full_pipeline = make_pipeline(mct, LogisticRegression())

In [40]:
full_pipeline

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('minmaxscaler',
                                                                   MinMaxScaler())]),
                                                  Index(['pclass', 'age', 'sibsp', 'parch', 'fare'], dtype='object')),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder())

In [41]:
full_pipeline.fit(Xtrain, ytrain)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('minmaxscaler',
                                                                   MinMaxScaler())]),
                                                  Index(['pclass', 'age', 'sibsp', 'parch', 'fare'], dtype='object')),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder())

In [42]:
full_pipeline.score(Xtest,ytest)

0.8044692737430168

In [43]:
import numpy as np 
np.round(full_pipeline.score(Xtest,ytest), 2)

0.8

**5. SAUVEGARDER LE MODELE**

In [44]:
from joblib import dump, load

dump(full_pipeline, 'modele.joblib') 

['modele.joblib']

In [45]:
modele = load('modele.joblib') 
modele.predict(Xtest)

array([0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0])

In [46]:
np.round(modele.score(Xtest,ytest), 2)

0.8

- REJOINDRE LA COMMUNAUTE OPENCLASS4ALL:
  - 👉 https://www.youtube.com/channel/UCE-613S-bsuLukwHDhnRxIA/?sub_confirmation=1

- VIDEO ASSOCIEE A CE NOTEBOOK:
 -  👉 https://youtu.be/t7ZoE9XrF1M

SI VOUS TROUVEZ QUE CE DEPOT EST UTILE, MERCI D'AJOUTER UNE ETOILE 👆 MERCI POUR SOUTENIR LA CHAINE