In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble  import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
data_train = pd.read_csv('titanic/train.csv')
data_test = pd.read_csv('titanic/test.csv')

In [3]:
X_train = data_train.drop(['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'], axis=1)
y_train = data_train.Survived

In [4]:
X_train = X_train.fillna({'Age': X_train.Age.median()})

In [5]:
X_train['Sex'] = X_train['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

In [6]:
X_train['Embarked'] = X_train['Embarked'].fillna('S')

In [7]:
X_train['Embarked'] = X_train['Embarked'].map( {'S': 0, 'C': 1, 'Q': 3}).astype(int)

In [46]:
X_train, X_test, y_train, y_test =  train_test_split(X_train, y_train, test_size=0.25, random_state=20)

In [47]:
clf = RandomForestClassifier(n_estimators=15, max_depth=5)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=15,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [48]:
clf.score(X_train, y_train)


0.8646616541353384

In [49]:
clf.score(X_test, y_test)

0.8

In [50]:
parametrs = {'n_estimators': [10, 20, 30, 40, 50], 
             'max_depth': [1, 3, 5, 7, 9, 11],
            'min_samples_leaf': range(1, 8),
            'min_samples_split': range(2, 10, 2)}

In [51]:
grid_search_clf = GridSearchCV(clf, parametrs, cv=3, n_jobs=-1)

In [52]:
grid_search_clf.fit(X_train, y_train)



GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=5,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=15, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='warn'

In [53]:
grid_search_clf.best_params_

{'max_depth': 5,
 'min_samples_leaf': 1,
 'min_samples_split': 8,
 'n_estimators': 30}

In [54]:
best_clf = grid_search_clf.best_estimator_

In [55]:
best_clf.score(X_train, y_train)

0.849624060150376

In [56]:
best_clf.score(X_test, y_test)

0.8444444444444444

In [57]:
logreg = LogisticRegression()

In [58]:
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [59]:
logreg.score(X_train, y_train)

0.7819548872180451

In [60]:
logreg.score(X_test, y_test)

0.8666666666666667