In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [28]:
# Load the labelled and unlabelled passenger tables from the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [29]:
# Stack the shared feature columns (Pclass through Embarked) of both tables so
# that cleaning and encoding are applied to train and test rows together.
# Train rows come first; the original row labels of each table are kept.
all_df = pd.concat([frame.loc[:, 'Pclass':'Embarked'] for frame in (train_df, test_df)])

In [30]:
# Inspect dtypes and non-null counts of the combined frame to see which
# columns contain missing values before any imputation.
all_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 10 columns):
Pclass      1309 non-null int64
Name        1309 non-null object
Sex         1309 non-null object
Age         1046 non-null float64
SibSp       1309 non-null int64
Parch       1309 non-null int64
Ticket      1309 non-null object
Fare        1308 non-null float64
Cabin       295 non-null object
Embarked    1307 non-null object
dtypes: float64(2), int64(3), object(5)
memory usage: 112.5+ KB


In [31]:
# Fill the gaps in the three partially populated feature columns in one pass:
# numeric columns get their mean, the categorical port gets its most frequent
# value. The statistics are computed over the combined train + test rows.
all_df = all_df.fillna({
    'Age': all_df['Age'].mean(),
    'Fare': all_df['Fare'].mean(),
    'Embarked': all_df['Embarked'].mode()[0],
})

In [32]:
# Re-check the non-null counts after imputation; Age, Fare and Embarked should
# now be fully populated.
all_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 10 columns):
Pclass      1309 non-null int64
Name        1309 non-null object
Sex         1309 non-null object
Age         1309 non-null float64
SibSp       1309 non-null int64
Parch       1309 non-null int64
Ticket      1309 non-null object
Fare        1309 non-null float64
Cabin       295 non-null object
Embarked    1309 non-null object
dtypes: float64(2), int64(3), object(5)
memory usage: 112.5+ KB


In [33]:
# Peek at the first rows before the categorical columns are encoded.
all_df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [34]:
# String-valued columns that are replaced by integer codes.
cat_features = ['Sex', 'Embarked']

# A fresh encoder is fitted per column, on train and test values together, so
# both splits share the same value -> code mapping.
for col in cat_features:
    lbl = LabelEncoder()
    raw_values = all_df[col].tolist()
    all_df[col] = lbl.fit_transform(raw_values)

In [35]:
# Confirm that Sex and Embarked now hold integer codes instead of strings.
all_df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,2
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0
2,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,2
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,2
4,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,2


In [36]:
# Discard the free-text columns; they are not turned into features here.
all_df = all_df.drop(['Name', 'Ticket', 'Cabin'], axis=1)

In [37]:
# Undo the earlier concatenation by position: the first len(train_df) rows are
# the labelled passengers, everything after belongs to the test table.
train = all_df.iloc[:len(train_df)]
test = all_df.iloc[len(train_df):]

In [40]:
# Verify the training slice: every remaining column should be numeric and
# fully populated.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass      891 non-null int64
Sex         891 non-null int64
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    891 non-null int64
dtypes: float64(2), int64(5)
memory usage: 55.7 KB


In [41]:
# Target labels come from the original training table; the passenger ids of the
# test table are kept aside for the submission file.
y, ID = train_df['Survived'], test_df['PassengerId']

In [43]:
# Carve a validation hold-out from the labelled rows. No test_size is passed,
# so scikit-learn's default proportion applies; the seed is fixed so the split
# is repeatable.
(X_train, X_test,
 y_train, y_test) = train_test_split(train, y, random_state=0)

In [46]:
import xgboost as xgb

In [51]:
# Booster settings: binary classifier with logistic output, scored by AUC.
params = dict(
    objective="binary:logistic",
    eval_metric="auc",
    eta=0.1,
    max_depth=6,
    subsample=1,
    colsample_bytree=1,
    silent=1,
)

# Wrap both halves of the hold-out split in xgboost's native container.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Boost for at most 100 rounds; the hold-out set (reported as 'test') is
# evaluated every round and training stops once its AUC has not improved for
# 10 consecutive rounds.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=[(dtest, 'test')],
    early_stopping_rounds=10,
)

[0]	test-auc:0.886905
Will train until test-auc hasn't improved in 10 rounds.
[1]	test-auc:0.89624
[2]	test-auc:0.893243
[3]	test-auc:0.889603
[4]	test-auc:0.892857
[5]	test-auc:0.886005
[6]	test-auc:0.890673
[7]	test-auc:0.894741
[8]	test-auc:0.889603
[9]	test-auc:0.888832
[10]	test-auc:0.889431
[11]	test-auc:0.89153
Stopping. Best iteration:
[1]	test-auc:0.89624



In [54]:
# Score the competition rows using only the trees up to the best
# early-stopping round. Older xgboost releases expose that limit through
# `best_ntree_limit` / `ntree_limit`; newer releases removed both in favour of
# `best_iteration` / `iteration_range`, so use whichever this install provides
# instead of failing on an unknown keyword or attribute.
dsubmit = xgb.DMatrix(test)
if hasattr(model, 'best_ntree_limit'):
    prediction = model.predict(dsubmit, ntree_limit=model.best_ntree_limit)
else:
    # iteration_range is half-open, hence +1 to include the best round itself.
    prediction = model.predict(dsubmit, iteration_range=(0, model.best_iteration + 1))
prediction

array([0.42588335, 0.4750208 , 0.42588335, 0.42588335, 0.5168644 ,
       0.42588335, 0.45609358, 0.46160665, 0.56230706, 0.42588335,
       0.42588335, 0.44456193, 0.5809824 , 0.42588335, 0.5809824 ,
       0.5809824 , 0.42588335, 0.43935266, 0.5168644 , 0.45609358,
       0.44066563, 0.56824934, 0.5809824 , 0.47461712, 0.5809824 ,
       0.42588335, 0.5809824 , 0.43935266, 0.44603732, 0.52094805,
       0.42588335, 0.46160665, 0.5       , 0.44124418, 0.4927498 ,
       0.43935266, 0.5168644 , 0.5168644 , 0.42588335, 0.4737735 ,
       0.43935266, 0.5063673 , 0.42588335, 0.5809824 , 0.5809824 ,
       0.42588335, 0.44603732, 0.42588335, 0.5809824 , 0.48098934,
       0.47461712, 0.43935266, 0.5809824 , 0.5809824 , 0.52094805,
       0.43798986, 0.42588335, 0.42588335, 0.42588335, 0.5809824 ,
       0.42588335, 0.42588335, 0.42588335, 0.56230706, 0.56824934,
       0.5809824 , 0.56230706, 0.44603732, 0.44603732, 0.5809824 ,
       0.56230706, 0.42588335, 0.5168644 , 0.44603732, 0.58098

In [55]:
# Convert predicted probabilities to hard labels: anything strictly below 0.5
# becomes 0, every other value becomes 1.
prediction = np.logical_not(prediction < 0.5).astype(int)
prediction

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,

In [56]:
# Pair each test passenger id with its predicted label and write the two-column
# file without the DataFrame index.
submisson = pd.DataFrame({'PassengerId': ID}).assign(Survived=prediction)
submisson.to_csv('submisson.csv', index=False)