# Titanic: Machine Learning from Disaster

In [4]:
# https://www.kaggle.com/c/titanic

In [5]:
import sklearn
import pandas as pd

In [6]:
# Загружаем данные из файлов
train = pd.read_csv('./homework/train.csv')
test = pd.read_csv('./homework/test.csv')

In [7]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Содержание:
1. Подготовка данных
2. Импорт библиотек моделей
3. Подбор параметров для Random Forest
4. Подбор параметров для Gradient Boosting
5. Подбор параметров для Decision Tree
6. Подбор параметров для SVM
7. Подбор параметров для Logistic Regression
8. Формирование матрицы предсказаний нескольких моделей
9. Объединение предсказаний by Logistic Regression
10. Формирование файла для отправки

## Предобработка данных


.
 


In [8]:
# Заполняем пропуски в данных медианными 
# значениями факторов на обучающей выборке
train_median = train.median()
train_imp = train.fillna(train_median)
test_imp = test.fillna(train_median)

In [9]:
# Бинаризуем категориальные признаки
CATEGORY_COL = ['Sex', 'Pclass', 'Embarked']
train_dummies = pd.get_dummies(train_imp, columns=CATEGORY_COL, drop_first=True)
test_dummies = pd.get_dummies(test_imp, columns=CATEGORY_COL, drop_first=True)

In [10]:
train_dummies.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,1,0,1,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,0,0,0,0,0
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,0,0,1,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,0,0,0,0,1
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,1,0,1,0,1


In [11]:
# Удаляем лишние столбцы
DROP_COL = ['PassengerId', 'Name', 'Ticket', 'Cabin']
TARGET_COL = 'Survived'
X_train = train_dummies.drop(DROP_COL + [TARGET_COL], axis=1)
y_train = train_dummies[TARGET_COL]
X_test = test_dummies.drop(DROP_COL, axis=1)

In [12]:
X_train.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
0,22.0,1,0,7.25,1,0,1,0,1
1,38.0,1,0,71.2833,0,0,0,0,0
2,26.0,0,0,7.925,0,0,1,0,1
3,35.0,1,0,53.1,0,0,0,0,1
4,35.0,0,0,8.05,1,0,1,0,1


## Предсказание моделей для стеккинга


.


In [13]:
import numpy as np
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer


from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression


def cross_val_predict_proba(estimator, X_train, y_train):
    kfold = KFold(n_splits=4, shuffle=True, random_state=None)
    return cross_val_predict(estimator, X_train, y_train, cv=kfold, method='predict_proba')

# TODO: подобрать гиперпараметры для ансамблей

kfold = KFold(n_splits=4, shuffle=True, random_state=None)

### Подбор параметров модели №1 для Random_forest

In [14]:
grid = {'n_estimators':[10,50,100],
       'max_features':[3,5,9],
       'max_depth':[2,6,10],
       'min_samples_leaf':[10,30,70,100],
        'warm_start':[0,1]
       }

gridSearchRandomForest = GridSearchCV(RandomForestClassifier(), grid, cv = kfold, scoring = 'accuracy')



In [15]:
param_finder_RFC = gridSearchRandomForest.fit(X_train, y_train)


In [16]:
param_finder_RFC.best_params_

{'max_depth': 6,
 'max_features': 5,
 'min_samples_leaf': 10,
 'n_estimators': 10,
 'warm_start': 0}

In [17]:
param_finder_RFC.cv_results_

{'mean_fit_time': array([ 0.02538443,  0.02255803,  0.10336322,  0.11996859,  0.19009942,
         0.1966064 ,  0.02187318,  0.02130675,  0.09817624,  0.09581733,
         0.19059676,  0.20353001,  0.02340204,  0.02188891,  0.10875982,
         0.1058777 ,  0.19878376,  0.18084979,  0.02045095,  0.020037  ,
         0.09518588,  0.09212554,  0.18863297,  0.1902402 ,  0.02059829,
         0.02069151,  0.09583628,  0.09514672,  0.19845915,  0.19166392,
         0.02222651,  0.01982647,  0.09795988,  0.09675747,  0.19960999,
         0.19428557,  0.02209318,  0.01953906,  0.09551013,  0.09867877,
         0.19277138,  0.19433242,  0.02175629,  0.02149296,  0.09848136,
         0.099819  ,  0.1949721 ,  0.20116967,  0.02392721,  0.0223105 ,
         0.10417557,  0.1038602 ,  0.19911522,  0.20780081,  0.02231812,
         0.02098185,  0.10613573,  0.10508555,  0.20053542,  0.20100701,
         0.02185535,  0.02319729,  0.105259  ,  0.09992474,  0.19813907,
         0.20224369,  0.02109504, 

In [18]:
rf_estimator = RandomForestClassifier(max_depth=6,
                                  max_features= 3,
                                  min_samples_leaf= 10,
                                  n_estimators= 50,
                                  warm_start= 0)
rfc_estimator_fitted = rf_estimator.fit(X_train, y_train) 

##### Single Random Forest Classifier @ kaggle - 0.78947

### Подбор параметров модели №2 для GradientBoosting

In [19]:
grid = {'n_estimators':[3,10,50,100],
       'learning_rate':[0.03,0.3,3],
       'max_depth':[2,6,10],
       'min_samples_leaf':[10,30,70,100]
       }

gridSearchGradientBoosting = GridSearchCV(GradientBoostingClassifier(), grid, cv = kfold, scoring = 'accuracy')



In [20]:
param_finder_GBC = gridSearchGradientBoosting.fit(X_train, y_train)


  * tree.value[:, 0, 0].take(terminal_regions, axis=0))
  np.sum(sample_weight * ((y * pred) - np.logaddexp(0.0, pred))))
  np.sum(sample_weight * ((y * pred) - np.logaddexp(0.0, pred))))
  * tree.value[:, 0, 0].take(terminal_regions, axis=0))
  np.sum(sample_weight * ((y * pred) - np.logaddexp(0.0, pred))))
  np.sum(sample_weight * ((y * pred) - np.logaddexp(0.0, pred))))
  * tree.value[:, 0, 0].take(terminal_regions, axis=0))
  np.sum(sample_weight * ((y * pred) - np.logaddexp(0.0, pred))))
  np.sum(sample_weight * ((y * pred) - np.logaddexp(0.0, pred))))


In [21]:
param_finder_GBC.best_params_

{'learning_rate': 0.03,
 'max_depth': 6,
 'min_samples_leaf': 10,
 'n_estimators': 100}

In [22]:
gb_estimator = GradientBoostingClassifier(n_estimators=10, max_depth=6, min_samples_leaf=30, learning_rate=0.3)


In [23]:
gb_estimator_fitted = gb_estimator.fit(X_train, y_train)

##### Single GradientBoostingClassifier @ kaggle - 0.77512

### Подбор параметров модели №3 для Decision Tree

In [24]:
grid = {'max_depth':list(range(2,10,1)),
       'min_samples_leaf':[10,30,70,100]}

gridSearchDecisionTree = GridSearchCV(DecisionTreeClassifier(), grid, cv = kfold, scoring = 'accuracy')



In [25]:
param_finder_DT = gridSearchDecisionTree.fit(X_train, y_train)


In [26]:
param_finder_DT.best_params_

{'max_depth': 9, 'min_samples_leaf': 10}

In [27]:
dt_estimator = DecisionTreeClassifier(max_depth=3, min_samples_leaf=10)

In [28]:
dt_estimator_fitted = dt_estimator.fit(X_train, y_train)

##### Single Decision Tree @ kaggle - 0.78469

### Подбор параметров модели №4  для SVM

In [29]:
grid = {'C':[0.03, 0.3, 0.9, 3, 6, 9]}

gridSearchSupportVector = GridSearchCV(LinearSVC(), grid, cv = kfold, scoring = 'accuracy')

In [30]:
param_finder_SVC = gridSearchSupportVector.fit(X_train, y_train)


In [31]:
param_finder_SVC.best_params_

{'C': 0.03}

In [32]:
scores = cross_val_score(LinearSVC(C=0.03),
                         X_train, y_train, groups=None,
                        scoring = make_scorer(accuracy_score),
                        cv=kfold)
scores

array([ 0.76233184,  0.78026906,  0.79820628,  0.83333333])

In [33]:
svc_estimator = LinearSVC(C=0.03)

In [34]:
svc_estimator_fitted = svc_estimator.fit(X_train, y_train)

##### Single LinearSVC @ kaggle - 0.75
Самый плохой показатель, в стекинг моделей в любом случае не включаем - отсутствует метод оценки вероятности для предсказания.

### Подбор параметров модели №5  для LinearRegression

In [35]:
grid = {'C':[0.03, 0.3, 0.9, 3, 6, 9]}

gridSearchLogisticRegression = GridSearchCV(LogisticRegression(), grid, cv = kfold, scoring = 'accuracy')

In [36]:
param_finder_LR = gridSearchLogisticRegression.fit(X_train, y_train)

In [37]:
param_finder_LR.best_params_

{'C': 0.3}

In [38]:
LR_estimator = LogisticRegression(C=6)
LR_estimator_fitted = LR_estimator.fit(X_train, y_train)

##### Single LogisticRegression @ kaggle - 0.76077
Достаточно низкий показатель, в любом случае идет в стеккинг

## Формирование матрицы предсказаний четырех моделей

In [39]:
# получаем предсказания вероятностей ансамблей на кросс-валидации для обучающей выборки
rf_train_pred = cross_val_predict_proba(rf_estimator, X_train, y_train)
gb_train_pred = cross_val_predict_proba(gb_estimator, X_train, y_train)
dt_train_pred = cross_val_predict_proba(dt_estimator, X_train, y_train)
lr_train_pred = cross_val_predict_proba(LR_estimator, X_train, y_train)

In [40]:
X_train_stack = np.stack([rf_train_pred[:,1], gb_train_pred[:,1], dt_train_pred[:,1],lr_train_pred[:,1]], axis=1)

# получаем предсказания ансамблей для тестовой выборки
rf_test_pred = rfc_estimator_fitted.predict_proba(X_test)
gb_test_pred = gb_estimator_fitted.predict_proba(X_test)
dt_test_pred = dt_estimator_fitted.predict_proba(X_test)
lr_test_pred = LR_estimator_fitted.predict_proba(X_test)


X_test_stack = np.stack([rf_test_pred[:,1], gb_test_pred[:,1],dt_test_pred[:,1],lr_test_pred[:,1]], axis=1)


## Объединяем предсказания ансамблей с помощью логистической регрессии

In [41]:
grid = {'C':[0.03, 0.3, 0.9, 3, 6, 9,15,50]}
gridSearchEnsemble = GridSearchCV(LogisticRegression(), grid, cv = kfold, scoring = 'accuracy')

In [42]:
gridSearchEnsemble.fit(X_train_stack, y_train)

GridSearchCV(cv=KFold(n_splits=4, random_state=None, shuffle=True),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.03, 0.3, 0.9, 3, 6, 9, 15, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [43]:
gridSearchEnsemble.best_params_

{'C': 0.3}

При таком значении модель переобучается. Взято значение 0.5 - дает наилучший результат

50 0.7799
0.5 0.78794
0.03 0.78469

In [47]:
# TODO: подобрать гиперпараметры LogisticRegression

logreg = LogisticRegression(C=0.7).fit(X_train_stack, y_train)
predicted = logreg.predict(X_test_stack)

## Формируем фалй для отправки

In [48]:
with open('submission_ensemble_3_models_regularization_10.txt', 'w') as out:
    out.write('PassengerId,Survived\n')
    for passenger, y in zip(test['PassengerId'], predicted):
        out.write('%s,%s\n' % (passenger, y))