# Importation des librairies

In [1]:
import numpy as np
import pandas as pd

# Importation des datasets

In [2]:
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

# Analyse exploratoire

In [3]:
train_df.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


# Machine Learning


## Random Forest

In [4]:
def prepare_df(df_unprepared) :
    df = df_unprepared
    df["Sex"] = df["Sex"].map({"male": 0, "female": 1})
    df["Embarked"] = df["Embarked"].map({"S": 0, "C": 1,"Q":2})
    df["NameLength"] = df["Name"].str.len()
    df["FamilySize"] = df["SibSp"]+df["Parch"]+1
    df = df.drop(columns=["PassengerId", "Ticket","Name","Cabin"])
    return df

In [5]:
X = prepare_df(train_df)
X = X.drop(columns=["Survived"])
y = train_df["Survived"]

X_test = prepare_df(test_df)

X.head(20)


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,NameLength,FamilySize
0,3,0,22.0,1,0,7.25,0.0,23,2
1,1,1,38.0,1,0,71.2833,1.0,51,2
2,3,1,26.0,0,0,7.925,0.0,22,1
3,1,1,35.0,1,0,53.1,0.0,44,2
4,3,0,35.0,0,0,8.05,0.0,24,1
5,3,0,,0,0,8.4583,2.0,16,1
6,1,0,54.0,0,0,51.8625,0.0,23,1
7,3,0,2.0,3,1,21.075,0.0,30,5
8,3,1,27.0,0,2,11.1333,0.0,49,3
9,2,1,14.0,1,0,30.0708,1.0,35,2


### Préparation

In [6]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=10)

clf = clf.fit(X, y)

In [7]:
submission = clf.predict(X_test)

In [8]:
output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': submission})
output.to_csv('submission_rf.csv', index=False)

## CatBoost

### Préparation

In [9]:
def prepare_df(df_unprepared) :
    df = df_unprepared
    df["NameLength"] = df["Name"].str.len()
    df["FamilySize"] = df["SibSp"]+df["Parch"]+1
    df = df.drop(columns=[ "PassengerId"])
    return df

In [10]:
X = prepare_df(train_df)
X = X.drop(columns=["Survived"])
y = train_df["Survived"]

X_test = prepare_df(test_df)

X.head(20)

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,NameLength,FamilySize
0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0.0,23,2
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1.0,51,2
2,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0.0,22,1
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,0.0,44,2
4,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0.0,24,1
5,3,"Moran, Mr. James",0,,0,0,330877,8.4583,,2.0,16,1
6,1,"McCarthy, Mr. Timothy J",0,54.0,0,0,17463,51.8625,E46,0.0,23,1
7,3,"Palsson, Master. Gosta Leonard",0,2.0,3,1,349909,21.075,,0.0,30,5
8,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,27.0,0,2,347742,11.1333,,0.0,49,3
9,2,"Nasser, Mrs. Nicholas (Adele Achem)",1,14.0,1,0,237736,30.0708,,1.0,35,2


### Modèle

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

cat_features = ["Name", "Sex","Ticket", "Cabin", "Embarked"]

for col in cat_features:
    X[col] = X[col].fillna("Missing").astype(str)
    X_test[col] = X_test[col].fillna("Missing").astype(str)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, random_state=42
)

model = CatBoostClassifier(
    iterations=2000,
    depth=5,
    learning_rate=0.03,
    loss_function="Logloss",
    eval_metric="AUC",
    l2_leaf_reg=6,
    random_strength=10,
    bootstrap_type="Bernoulli",
    subsample=0.8,
    early_stopping_rounds=100,
    verbose=False
    )

model.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set=(X_val, y_val)
)

0:	test: 0.8075214	best: 0.8075214 (0)	total: 55.1ms	remaining: 1m 50s
100:	test: 0.8862988	best: 0.8865504 (98)	total: 3.31s	remaining: 1m 2s
200:	test: 0.8970736	best: 0.8970736 (198)	total: 6.39s	remaining: 57.2s
300:	test: 0.9018112	best: 0.9019789 (287)	total: 9.41s	remaining: 53.1s
400:	test: 0.9062133	best: 0.9068003 (387)	total: 14.3s	remaining: 57s
500:	test: 0.9090223	best: 0.9092739 (474)	total: 19.6s	remaining: 58.7s
600:	test: 0.9098608	best: 0.9117055 (516)	total: 24.4s	remaining: 56.8s
700:	test: 0.9080999	best: 0.9117055 (516)	total: 29.2s	remaining: 54.1s
800:	test: 0.9061714	best: 0.9117055 (516)	total: 33.8s	remaining: 50.6s
900:	test: 0.9030689	best: 0.9117055 (516)	total: 38.8s	remaining: 47.3s
1000:	test: 0.8998826	best: 0.9117055 (516)	total: 43.8s	remaining: 43.7s
1100:	test: 0.8967801	best: 0.9117055 (516)	total: 48.8s	remaining: 39.9s
1200:	test: 0.8956062	best: 0.9117055 (516)	total: 53.2s	remaining: 35.4s
1300:	test: 0.8947677	best: 0.9117055 (516)	total: 57

<catboost.core.CatBoostClassifier at 0x1baa2a065a0>

In [19]:
submission = model.predict(X_test)

In [20]:
output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': submission})
output.to_csv('submission_catboost.csv', index=False)