In [2]:
import pandas as pd
import numpy as np

In [60]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV

data = pd.read_csv('train.csv').set_index('PassengerId')
test = pd.read_csv('test.csv')
new_data = data.copy()

# short feature preparation

data = data.drop(columns=['Name', 'Ticket', 'Cabin', 'Embarked'])
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})
data['Age'] = data['Age'].fillna(data['Age'].mean().round(0))
data['Age'] = pd.cut(data['Age'], bins=[0, 17, 25, 45, 65], labels=[3, 2, 1, 0])
data['Fare'] = pd.cut(data['Fare'], bins=[0, 20, 70, 100, 200], labels=[0, 1, 2, 3])

In [61]:
test = test.drop(columns=['Name', 'Ticket', 'Cabin', 'Embarked'])
test['Sex'] = test['Sex'].map({'male': 0, 'female': 1})
test['Age'] = test['Age'].fillna(test['Age'].mean().round(0))
test['Age'] = pd.cut(test['Age'], bins=[0, 17, 25, 45, 65], labels=[3, 2, 1, 0])
test['Fare'] = pd.cut(test['Fare'], bins=[0, 20, 70, 100, 200], labels=[0, 1, 2, 3])
new_test = test.copy().set_index('PassengerId')

In [55]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=['Survived'], axis=1), data['Survived'], test_size=0.2, random_state=42)

# hyperparameters tuning

rand_titanic = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), 
                           param_distributions={'criterion': ['gini', 'entropy'], 
                                                'splitter': ['best', 'random'],
                                                'min_samples_split': range(2, 20),
                                                'min_samples_leaf': range(1, 20)},
                           n_iter=100, 
                           cv=5, 
                           scoring='accuracy',
                           n_jobs=2,
                           verbose=1,
                           random_state=42
                          )

rand_titanic.fit(X_train, y_train)
rand_titanic.best_params_
rand_titanic.score(X_test, y_test)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


0.76355

In [63]:
test['Survived'] = rand_titanic.predict(new_test)
submission = test[['PassengerId', 'Survived']]
submission.to_csv('submission_dtc.csv', index=False)

submission

Unnamed: 0,PassengerId,Survived
0,100000,0
1,100001,1
2,100002,1
3,100003,0
4,100004,1
...,...,...
99995,199995,1
99996,199996,0
99997,199997,0
99998,199998,1
