In [14]:
# NUMPY
import numpy as np

# STATS
import scipy.stats as stats
from scipy.stats import norm, skew

# MATPLOTLIB
import matplotlib as mlp
import matplotlib.pyplot as plt
%matplotlib inline 
plt.style.use('fivethirtyeight') 

# PANDAS
import pandas as pd 
pd.set_option("display.max_rows", None, "display.max_columns", None) 

# SEABRON
import seaborn as sns

# SCIKIT-LEARN: PRE-PROCESSING
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder # encodage des variables catégorielles ordinales
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder # encodage des variables catégorielles nominales
from sklearn.preprocessing import StandardScaler # standardisation des variables numériques
from sklearn.preprocessing import MinMaxScaler # normalisation des variables numériques
from sklearn.preprocessing import RobustScaler # normalisation des variables numériques
from sklearn.impute import SimpleImputer # remplissage des valeurs manquantes
from sklearn.impute import KNNImputer # remplissage des valeurs manquantes


# SCIKIT-LEARN: MODELES
from sklearn.linear_model import LogisticRegression # régréssion logistique
from sklearn.svm import LinearSVC, SVC # machines à vecteurs de support
from sklearn.tree import DecisionTreeClassifier # arbres classification
from sklearn.tree import DecisionTreeRegressor # arbres de décision

# SCIKIT-LEARN: VALIDATION CROISEE + OPTIMISATION
from sklearn.model_selection import train_test_split # 
from sklearn.model_selection import cross_val_score # validation croisée pour comparaison entre modèles
from sklearn.model_selection import validation_curve # courbe de validation: visulaisr les scores lors du choix d'un hyperparamétre
from sklearn.model_selection import GridSearchCV # tester plusieurs hyper_paramètres
from sklearn.model_selection import RandomizedSearchCV # tester arbitrairement plusieurs hyperparamètres
from sklearn.model_selection import learning_curve # courbe d'apprentissage: visualisation les scores du train et du validation sets en fonction des quanitiés des données
 
## EVALUATION
from sklearn.metrics import accuracy_score # exactitude (accuracy)
from sklearn.metrics import f1_score # F1-score
from sklearn.metrics import confusion_matrix # matrice de confusion
from sklearn.metrics import plot_confusion_matrix # graphique de la matrice de confusion
from sklearn.metrics import classification_report # rapport pour le modèle de classification

## EVALUATION: COURBE ROC
from sklearn.metrics import auc # aire sous la courbe 
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve 


# SCIKIT-LEARN: PIPELINE
from sklearn.pipeline import make_pipeline

# WARNINGS
import warnings
warnings.filterwarnings('ignore')

In [15]:
data = pd.read_excel('./data/titanic3.xls')

In [21]:
# copie du dataframe data
df = data.copy()

In [22]:
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [23]:
df.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')

In [24]:
# premier tri
df.drop(['name','sibsp','parch','ticket','cabin','embarked','fare', 'body', 'boat', 'home.dest'],axis='columns',inplace=True)

In [25]:
# separation
features = df.drop('survived',axis='columns')
y = df.survived
features.head()
#Features.shape

Unnamed: 0,pclass,sex,age
0,1,female,29.0
1,1,male,0.9167
2,1,female,2.0
3,1,male,30.0
4,1,female,25.0


In [26]:
# pre-traitement raide des features
features.sex = features.sex.map({'male': 1, 'female': 2})
features.age = features.age.fillna(features.age.mean())
features.head()

Unnamed: 0,pclass,sex,age
0,1,2,29.0
1,1,1,0.9167
2,1,2,2.0
3,1,1,30.0
4,1,2,25.0


In [27]:
# Construire le Modèle d'arbre de décision

In [28]:
from sklearn import tree

In [30]:
#Création jeu de train et de test

X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2)
modelTree = tree.DecisionTreeClassifier(random_state=0, criterion='gini',max_depth=6 )
modelTree.fit(X_train,y_train)
accuracyTreeReel = modelTree.score(X_test,y_test)
accuracyTreeTrain = modelTree.score(X_train,y_train)

print('Accuracy Arbre X_test: ', accuracyTreeReel)
print('Accuracy Arbre X_train: ', accuracyTreeTrain)

Accuracy Arbre x_test:  0.7786259541984732
Accuracy Arbre x_train:  0.8166189111747851


In [31]:
reglog = LogisticRegression(random_state = 0, solver='newton-cg')
reglog.fit(X_train,y_train)
accuracyreglogTest = reglog.score(X_test,y_test)
accuracyreglogTrain = reglog.score(X_train,y_train)

print('Accuracy RL X_test: ', accuracyreglogTest )
print('Accuracy RL X_train : ', accuracyreglogTrain)

Accuracy RL X_test:  0.7938931297709924
Accuracy RL X_train :  0.7822349570200573


In [33]:
from os import system
# Graphviz - Logiciel de visualisation de graphes 
# https://graphviz.org/

dotfile = open("test.dot", 'w')
tree.export_graphviz(modelTree, out_file=dotfile, 
                      feature_names=['pclass','genre','age'],  
                      class_names =['mort','vivant'],
                      filled=True, rounded=True,  
                      special_characters=True)  

dotfile.close()
system("dot -Tpng test.dot -o dtree2.png")

0