In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))
#
# Any results you write to the current directory are saved as output.

In [2]:
# Load libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV

In [3]:
# Read the Titanic training and test sets from CSV into DataFrames
X_orig = pd.read_csv("Titanic_train.csv")
X_test_orig = pd.read_csv("Titanic_test.csv")

# Work on copies so the frames loaded above stay untouched
X, X_test = X_orig.copy(), X_test_orig.copy()

# Split the target off the predictors: pop() returns the 'Survived' column
# and removes it from X in the same step
y = X.pop('Survived')

In [4]:
# Display the column labels of the predictor frame X
X.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
# Remove the columns that are not used as predictors.
# Both frames are rebuilt from the untouched originals rather than dropped in
# place, so this cell can be re-run without raising a KeyError, and the column
# list is written once so the train and test frames cannot drift apart.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked']
# The target is excluded here as well because X_orig still contains it.
X = X_orig.drop(columns=['Survived'] + unused_columns)
X_test = X_test_orig.drop(columns=unused_columns)

In [6]:
# Preview the first rows of the predictor frame X
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,male,22.0,1,0,7.25
1,1,female,38.0,1,0,71.2833
2,3,female,26.0,0,0,7.925
3,1,female,35.0,1,0,53.1
4,3,male,35.0,0,0,8.05


In [7]:
# Numerical and categorical columns transformers

# Numeric columns: fill missing values with the column median, then standardise.
numeric_features = ['Age', 'Fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: fill missing values with the constant 'missing', then
# one-hot encode; handle_unknown='ignore' keeps categories that were not seen
# during fit from raising at transform time.
categorical_features = ['Sex']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# NOTE(review): only Age, Fare and Sex are routed to a transformer. With
# ColumnTransformer's default remainder='drop', the other columns still present
# in X (Pclass, SibSp, Parch) never reach the classifier -- confirm this is
# intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# The classifier set here is only a placeholder: every entry of the search
# space below replaces the 'classifier' step with its own estimator.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier())])

# Create space of candidate learning algorithms and their hyperparameters.
# The logistic regression pins solver='liblinear' because it supports both the
# 'l1' and 'l2' penalties searched here; the default solver of recent
# scikit-learn releases ('lbfgs') rejects 'l1', which would make half of the
# logistic-regression candidates fail during the search.
search_space = [{'classifier': [LogisticRegression(solver='liblinear',
                                                   random_state=0)],
                 'classifier__penalty': ['l1', 'l2'],
                 'classifier__C': np.logspace(0, 4, 10)},
                {'classifier': [RandomForestClassifier(random_state=0)],
                 'classifier__n_estimators': range(10, 1000, 50),
                 'classifier__max_depth': range(1, 10, 1),
                }]

In [8]:
# Set up the grid search over every candidate in search_space, using 5 CV folds
grid = GridSearchCV(
    estimator=clf,
    param_grid=search_space,
    cv=5,
    verbose=0,
)

In [9]:
# Run the grid search on the training data
grid.fit(X, y)
# fit() returns the search object itself, so best_model is another name for grid
best_model = grid







In [10]:
# Tabulate the cross-validation results, one row per candidate, and display them
results = pd.DataFrame(data=grid.cv_results_)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier,param_classifier__C,param_classifier__penalty,param_classifier__max_depth,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.034588,0.007911,0.013597,0.000802,"LogisticRegression(C=10000.0, class_weight=Non...",1,l1,,,"{'classifier': LogisticRegression(C=10000.0, c...",0.793296,0.810056,0.764045,0.752809,0.790960,0.782267,0.020839,181
1,0.031190,0.002400,0.012195,0.000400,"LogisticRegression(C=10000.0, class_weight=Non...",1,l2,,,"{'classifier': LogisticRegression(C=10000.0, c...",0.793296,0.810056,0.764045,0.752809,0.790960,0.782267,0.020839,181
2,0.035988,0.006030,0.013197,0.001469,"LogisticRegression(C=10000.0, class_weight=Non...",2.78256,l1,,,"{'classifier': LogisticRegression(C=10000.0, c...",0.793296,0.810056,0.764045,0.752809,0.790960,0.782267,0.020839,181
3,0.036588,0.006799,0.014797,0.001470,"LogisticRegression(C=10000.0, class_weight=Non...",2.78256,l2,,,"{'classifier': LogisticRegression(C=10000.0, c...",0.793296,0.810056,0.764045,0.752809,0.790960,0.782267,0.020839,181
4,0.040186,0.004018,0.015797,0.003185,"LogisticRegression(C=10000.0, class_weight=Non...",7.74264,l1,,,"{'classifier': LogisticRegression(C=10000.0, c...",0.793296,0.810056,0.764045,0.752809,0.790960,0.782267,0.020839,181
5,0.032191,0.000978,0.014596,0.000800,"LogisticRegression(C=10000.0, class_weight=Non...",7.74264,l2,,,"{'classifier': LogisticRegression(C=10000.0, c...",0.793296,0.810056,0.764045,0.752809,0.790960,0.782267,0.020839,181
6,0.053184,0.007725,0.015594,0.002245,"LogisticRegression(C=10000.0, class_weight=Non...",21.5443,l1,,,"{'classifier': LogisticRegression(C=10000.0, c...",0.793296,0.810056,0.764045,0.752809,0.790960,0.782267,0.020839,181
7,0.033392,0.001021,0.013795,0.001166,"LogisticRegression(C=10000.0, class_weight=Non...",21.5443,l2,,,"{'classifier': LogisticRegression(C=10000.0, c...",0.793296,0.810056,0.764045,0.752809,0.790960,0.782267,0.020839,181
8,0.080572,0.011889,0.013197,0.000399,"LogisticRegression(C=10000.0, class_weight=Non...",59.9484,l1,,,"{'classifier': LogisticRegression(C=10000.0, c...",0.793296,0.810056,0.764045,0.752809,0.790960,0.782267,0.020839,181
9,0.030800,0.001720,0.013787,0.001475,"LogisticRegression(C=10000.0, class_weight=Non...",59.9484,l2,,,"{'classifier': LogisticRegression(C=10000.0, c...",0.793296,0.810056,0.764045,0.752809,0.790960,0.782267,0.020839,181


In [11]:
# Show the classifier step of the best pipeline found by the search
best_model.best_estimator_.named_steps['classifier']

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=7, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=610,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [12]:
# Predict the target for the test set with the refitted best pipeline
best_pipeline = best_model.best_estimator_
preds_test = best_pipeline.predict(X_test)

In [15]:
# Pair each test PassengerId with its predicted label and write the submission file
output = X_test_orig[['PassengerId']].assign(Survived=preds_test)
output.to_csv('submission.csv', index=False)

Le score obtenu sur Kaggle correspond à une exactitude (accuracy) de 75,12 %.