In [27]:
import pandas as pd
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten

In [28]:
# Read the passenger workbook into a DataFrame, then preview its first rows.
titanic = pd.read_excel(io='titanic3.xls')
titanic.head(n=5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


# Data preparation

In [29]:
# Remove the columns that are not studied below, leaving pclass, survived,
# sex and age.
# The frame is rebound (no inplace=True) and errors='ignore' is passed so the
# cell can be re-run safely: dropping in place raised KeyError on a second
# execution because the columns were already gone.
titanic = titanic.drop(
    columns=[
        'name', 'home.dest', 'ticket', 'cabin', 'embarked',
        'boat', 'body', 'sibsp', 'parch', 'fare',
    ],
    errors='ignore',
)
# Count the missing values left in each remaining column.
titanic.isna().sum()

pclass        0
survived      0
sex           0
age         263
dtype: int64

In [30]:
# Discard every row that still contains a missing value, then show the frame.
titanic.dropna(axis=0, how='any', inplace=True)
titanic

Unnamed: 0,pclass,survived,sex,age
0,1,1,female,29.0000
1,1,1,male,0.9167
2,1,0,female,2.0000
3,1,0,male,30.0000
4,1,0,female,25.0000
...,...,...,...,...
1301,3,0,male,45.5000
1304,3,0,female,14.5000
1306,3,0,male,26.5000
1307,3,0,male,27.0000


# ----------------------------------------------------------------------------

# Feature importance
Survival rate and ratio

In [31]:
# Class balance of the target: 0 = did not survive, 1 = survived.
titanic['survived'].value_counts()

survived
0    619
1    427
Name: count, dtype: int64

In [32]:
# Overall survival rate over all remaining passengers, displayed to 2 decimals.
global_survived_rate = titanic['survived'].mean()
round(global_survived_rate, 2)

0.41

In [33]:
# Mean survival per sex.
titanic.groupby('sex')['survived'].mean()

sex
female    0.752577
male      0.205167
Name: survived, dtype: float64

In [34]:
# Mean survival per passenger class.
titanic.groupby('pclass')['survived'].mean()

pclass
1    0.637324
2    0.440613
3    0.261477
Name: survived, dtype: float64

In [35]:
# Bucket ages into four labelled ranges; pd.cut's default closes each bin on
# the right, i.e. (0, 18], (18, 30], (30, 50], (50, 100].
age_bins = [0, 18, 30, 50, 100]
age_labels = [
    'Enfant',
    'Jeune Adulte',
    'Adulte',
    'Personne Âgée',
]
titanic['group_age'] = pd.cut(x=titanic['age'], bins=age_bins, labels=age_labels)

In [36]:
# Mean survival per age group; observed=True keeps only categories present in the data.
titanic.groupby('group_age', observed=True)['survived'].mean()

group_age
Enfant           0.492228
Jeune Adulte     0.367788
Adulte           0.412281
Personne Âgée    0.400000
Name: survived, dtype: float64

Le groupe d'âge a peu d'influence, mais il y en a une.

In [37]:
# Remove the temporary age-group column so it does not end up among the model
# features.
# Rebinding with errors='ignore' keeps the cell re-runnable: the in-place drop
# raised KeyError on a second execution because the column was already gone.
titanic = titanic.drop(columns='group_age', errors='ignore')

In [38]:
# Mean survival for every (sex, passenger class) combination.
titanic.groupby(['sex', 'pclass'])['survived'].mean()

sex     pclass
female  1         0.962406
        2         0.893204
        3         0.473684
male    1         0.350993
        2         0.145570
        3         0.169054
Name: survived, dtype: float64

In [39]:
from IPython.display import display

# Baseline every group is compared against.
global_survived_rate = titanic['survived'].mean()
categorical_features = ['sex', 'pclass']

# For each categorical feature, show per-group survival rate and group size,
# plus the absolute difference from ('diff') and ratio to ('risk') the baseline.
for c in categorical_features:
    print(c)
    df_group = titanic.groupby(c)['survived'].agg(['mean', 'count'])
    df_group = df_group.assign(
        diff=df_group['mean'] - global_survived_rate,
        risk=df_group['mean'] / global_survived_rate,
    )
    display(df_group)
    print("---")

sex


Unnamed: 0_level_0,mean,count,diff,risk
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.752577,388,0.344356,1.84355
male,0.205167,658,-0.203055,0.502588


---
pclass


Unnamed: 0_level_0,mean,count,diff,risk
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.637324,284,0.229102,1.56122
2,0.440613,261,0.032391,1.079347
3,0.261477,501,-0.146745,0.640527


---


Les paramètres qui ont le plus d'influence sur la probabilité de survie :
1. Le sexe
2. La classe
3. L'âge

# ----------------------------------------------------------------------------

In [40]:
# Encode sex as an integer flag: 1 for 'female', 0 for anything else.
# The dtype guard makes the cell safe to re-run: once the column holds 1/0,
# comparing against 'female' again would silently turn every value into 0.
if not pd.api.types.is_numeric_dtype(titanic['sex']):
    titanic['sex'] = titanic['sex'].eq('female').astype('int64')

In [41]:
# Separate the label column from the model inputs.
target = titanic['survived']
data = titanic.drop(columns=['survived'])

In [42]:
from sklearn.model_selection import train_test_split

In [43]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=1,
)

In [44]:
# Row counts of the four split parts, in the order they were returned.
tuple(map(len, (X_train, X_test, y_train, y_test)))

(836, 210, 836, 210)

# EDA

In [45]:
# Renumber each split part from 0, discarding the row labels inherited from
# the original frame.
for _split in (X_train, X_test, y_train, y_test):
    _split.reset_index(drop=True, inplace=True)

In [46]:
# Display the training features (pclass, sex, age) after the index reset;
# the rendered table is truncated in the middle.
X_train

Unnamed: 0,pclass,sex,age
0,3,0,22.0
1,2,1,20.0
2,2,0,23.0
3,3,0,25.0
4,1,1,27.0
...,...,...,...
831,3,0,39.0
832,3,0,28.0
833,3,0,33.0
834,3,1,20.0


# Construction du réseau de neurones

In [48]:
model = Sequential()