In [1]:
# pandas
import pandas as pd
from pandas import Series,DataFrame

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
matplotlib.style.use('ggplot')

# remove warnings
import warnings
warnings.filterwarnings('ignore')

# Feature engineering

In [2]:
# Load the training and test sets into DataFrames
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# Preview the first 20 rows of the training set
train_data.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [4]:
# Same kind of preprocessing as in Predict_Survival.ipynb.
# PassengerId is removed from the training frame only; both frames lose Ticket.
train_data = train_data.drop(['PassengerId', 'Ticket'], axis=1)
test_data = test_data.drop(['Ticket'], axis=1)

In [5]:
# Extract the honorific title contained in a passenger name
def get_name_title(name):
    """Return the first known honorific found in ``name``.

    Each candidate title is searched for as the substring ``'<Title>.'``
    anywhere in ``name``. Candidates are tried in a fixed order and the
    first hit wins, so the order of ``known_titles`` is significant.

    Parameters
    ----------
    name : str
        Full passenger name, e.g. ``"Braund, Mr. Owen Harris"``.

    Returns
    -------
    str
        The matched title without its trailing dot, or the string
        ``'None'`` (not the ``None`` object) when no candidate matches.
    """
    known_titles = (
        'Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
        'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'Countess', 'Jonkheer',
    )
    for title in known_titles:
        if name.find(title + '.') != -1:
            return title
    return 'None'

# Add the Title column (derived from Name) and discard the raw Name column
train_data = (
    train_data
    .assign(Title=train_data['Name'].apply(get_name_title))
    .drop('Name', axis=1)
)

test_data = (
    test_data
    .assign(Title=test_data['Name'].apply(get_name_title))
    .drop('Name', axis=1)
)

In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
Title       891 non-null object
dtypes: float64(2), int64(4), object(4)
memory usage: 69.7+ KB


In [7]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
Title          418 non-null object
dtypes: float64(2), int64(4), object(4)
memory usage: 32.7+ KB


In [8]:
# Merge the training and test feature rows into a single frame.
# pd.concat replaces DataFrame.append, which was deprecated and then removed
# from pandas; ignore_index=True rebuilds a fresh 0..n-1 index, which is what
# the former reset_index() + drop('index') pair achieved.
combined_train_data = pd.concat(
    [train_data.drop('Survived', axis=1), test_data.drop('PassengerId', axis=1)],
    ignore_index=True
)
combined_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 9 columns):
Pclass      1309 non-null int64
Sex         1309 non-null object
Age         1046 non-null float64
SibSp       1309 non-null int64
Parch       1309 non-null int64
Fare        1308 non-null float64
Cabin       295 non-null object
Embarked    1307 non-null object
Title       1309 non-null object
dtypes: float64(2), int64(3), object(4)
memory usage: 92.1+ KB


In [9]:
combined_train_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title
0,3,male,22.0,1,0,7.25,,S,Mr
1,1,female,38.0,1,0,71.2833,C85,C,Mrs
2,3,female,26.0,0,0,7.925,,S,Miss
3,1,female,35.0,1,0,53.1,C123,S,Mrs
4,3,male,35.0,0,0,8.05,,S,Mr


In [10]:
# Processing the ages: per-(Sex, Pclass, Title) medians of the numeric features.
# The numeric columns are selected explicitly because recent pandas releases
# no longer silently drop string columns (Cabin, Embarked) in median() and
# raise instead.
grouped = combined_train_data.groupby(['Sex','Pclass','Title'])
grouped[['Age', 'SibSp', 'Parch', 'Fare']].median()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Age,SibSp,Parch,Fare
Sex,Pclass,Title,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
female,1,Countess,33.0,0,0,86.5
female,1,Dr,49.0,0,0,25.9292
female,1,Lady,48.0,1,0,39.6
female,1,Miss,30.0,0,0,108.65415
female,1,Mlle,24.0,0,0,59.4021
female,1,Mme,24.0,0,0,69.3
female,1,Mrs,45.0,1,0,78.2667
female,1,,39.0,0,0,108.9
female,2,Miss,20.0,0,0,20.25
female,2,Mrs,30.5,1,0,26.0


In [11]:
def fill_missed_age(row):
    """Look up the replacement age for a passenger row.

    The value is read from a fixed table keyed first by ``(Sex, Pclass)``
    and then by ``Title``.

    Parameters
    ----------
    row : mapping
        Anything indexable by the keys ``'Sex'``, ``'Pclass'`` and
        ``'Title'`` (a DataFrame row or a plain dict).

    Returns
    -------
    int or None
        The tabulated age, or ``None`` when the combination of sex, class
        and title is not present in the table.
    """
    ages_by_group = {
        ('female', 1): {
            'Countess': 33, 'Dr': 49, 'Lady': 48, 'Miss': 30,
            'Mlle': 24, 'Mme': 24, 'Mrs': 45, 'None': 39,
        },
        ('female', 2): {'Miss': 20, 'Ms': 28, 'Mrs': 31},
        ('female', 3): {'Miss': 18, 'Ms': 28, 'Mrs': 31},
        ('male', 1): {
            'Capt': 70, 'Col': 55, 'Don': 40, 'Dr': 47, 'Jonkheer': 38,
            'Major': 49, 'Master': 6, 'Mr': 42, 'Sir': 50,
        },
        ('male', 2): {'Dr': 39, 'Master': 2, 'Mr': 30, 'Rev': 42},
        ('male', 3): {'Master': 6, 'Mr': 26},
    }
    ages_by_title = ages_by_group.get((row['Sex'], row['Pclass']))
    if ages_by_title is None:
        # Unknown sex/class pair: nothing to look up.
        return None
    return ages_by_title.get(row['Title'])
            
def _age_or_imputed(row):
    """Return the recorded Age, or fill_missed_age(row) when Age is NaN."""
    if np.isnan(row['Age']):
        return fill_missed_age(row)
    return row['Age']

# Row-wise pass: only rows whose Age is NaN receive a looked-up value
combined_train_data['Age'] = combined_train_data.apply(_age_or_imputed, axis=1)

In [12]:
combined_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 9 columns):
Pclass      1309 non-null int64
Sex         1309 non-null object
Age         1309 non-null float64
SibSp       1309 non-null int64
Parch       1309 non-null int64
Fare        1308 non-null float64
Cabin       295 non-null object
Embarked    1307 non-null object
Title       1309 non-null object
dtypes: float64(2), int64(3), object(4)
memory usage: 92.1+ KB


In [13]:
# Fare is probably related to the passenger class (Pclass) and the port of
# embarkation (Embarked), so inspect the medians per (Pclass, Embarked, Sex).
grouped = combined_train_data.groupby(['Pclass', 'Embarked', 'Sex'])
# Select the numeric columns explicitly: recent pandas releases raise on the
# remaining string columns (Cabin, Title) instead of silently dropping them.
grouped_median_data = grouped[['Age', 'SibSp', 'Parch', 'Fare']].median()
grouped_median_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Age,SibSp,Parch,Fare
Pclass,Embarked,Sex,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,C,female,39.0,0,0,83.1583
1,C,male,42.0,0,0,62.66875
1,Q,female,35.0,1,0,90.0
1,Q,male,44.0,2,0,90.0
1,S,female,35.0,1,0,78.85
1,S,male,42.0,0,0,35.5
2,C,female,23.0,1,0,27.7208
2,C,male,30.0,0,0,15.0458
2,Q,female,25.0,0,0,12.35
2,Q,male,57.0,0,0,12.35


In [14]:
combined_train_data[combined_train_data.Fare.isnull()]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title
1043,3,male,60.5,0,0,,,S,Mr


In [15]:
# Fill the missing Fare value. The result is assigned back to the column
# instead of calling fillna(inplace=True) on the attribute-accessed Series:
# with copy-on-write pandas that Series is a temporary and the frame would
# stay unchanged.
# NOTE(review): 26 is a hard-coded constant; confirm it against
# grouped_median_data for the affected row (Pclass=3, Embarked='S', Sex='male').
combined_train_data['Fare'] = combined_train_data['Fare'].fillna(26)

In [16]:
# Fill the missing Embarked values with the constant 'S'
combined_train_data['Embarked'] = combined_train_data['Embarked'].fillna('S')

In [17]:
combined_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 9 columns):
Pclass      1309 non-null int64
Sex         1309 non-null object
Age         1309 non-null float64
SibSp       1309 non-null int64
Parch       1309 non-null int64
Fare        1309 non-null float64
Cabin       295 non-null object
Embarked    1309 non-null object
Title       1309 non-null object
dtypes: float64(2), int64(3), object(4)
memory usage: 92.1+ KB


In [18]:
combined_train_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title
0,3,male,22.0,1,0,7.25,,S,Mr
1,1,female,38.0,1,0,71.2833,C85,C,Mrs
2,3,female,26.0,0,0,7.925,,S,Miss
3,1,female,35.0,1,0,53.1,C123,S,Mrs
4,3,male,35.0,0,0,8.05,,S,Mr


In [19]:
# Process the Cabin column and fill in its missing values:
# missing entries become 'U' (for Unknown), then only the first letter of
# each cabin string is kept.
# The result is assigned back to the column rather than using
# fillna(inplace=True) on the attribute-accessed Series, which does not
# update the frame under copy-on-write pandas.
combined_train_data['Cabin'] = combined_train_data['Cabin'].fillna('U').str[0]

In [20]:
combined_train_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title
0,3,male,22.0,1,0,7.25,U,S,Mr
1,1,female,38.0,1,0,71.2833,C,C,Mrs
2,3,female,26.0,0,0,7.925,U,S,Miss
3,1,female,35.0,1,0,53.1,C,S,Mrs
4,3,male,35.0,0,0,8.05,U,S,Mr


In [21]:
# `Cabin` dummy encoding: one indicator column per cabin letter, replacing
# the original Cabin column. (The indicators were previously stored in a
# variable named `embarked_dummies`, which was misleading.)
cabin_dummies = pd.get_dummies(combined_train_data['Cabin'], prefix='Cabin')
combined_train_data = pd.concat([combined_train_data, cabin_dummies], axis=1)
combined_train_data = combined_train_data.drop('Cabin', axis=1)

In [22]:
# `Embarked` dummy encoding: append the indicator columns, then remove the
# original Embarked column
embarked_dummies = pd.get_dummies(combined_train_data['Embarked'], prefix='Embarked')
combined_train_data = (
    pd.concat([combined_train_data, embarked_dummies], axis=1)
    .drop('Embarked', axis=1)
)

In [23]:
# `Title` dummy encoding: append the indicator columns, then remove the
# original Title column
title_dummies = pd.get_dummies(combined_train_data['Title'], prefix='Title')
combined_train_data = (
    pd.concat([combined_train_data, title_dummies], axis=1)
    .drop('Title', axis=1)
)

In [24]:
# Encode Sex as an integer flag: 'male' -> 1, 'female' -> 0
combined_train_data['Sex'] = combined_train_data['Sex'].map({'male':1,'female':0})

In [25]:
# Pclass is only a class label, so it is dummy-encoded as well.
# `Pclass` dummy encoding: append the indicator columns, then remove the
# original Pclass column
pclass_dummies = pd.get_dummies(combined_train_data['Pclass'], prefix='Pclass')
combined_train_data = (
    pd.concat([combined_train_data, pclass_dummies], axis=1)
    .drop('Pclass', axis=1)
)

In [26]:
combined_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 38 columns):
Sex               1309 non-null int64
Age               1309 non-null float64
SibSp             1309 non-null int64
Parch             1309 non-null int64
Fare              1309 non-null float64
Cabin_A           1309 non-null uint8
Cabin_B           1309 non-null uint8
Cabin_C           1309 non-null uint8
Cabin_D           1309 non-null uint8
Cabin_E           1309 non-null uint8
Cabin_F           1309 non-null uint8
Cabin_G           1309 non-null uint8
Cabin_T           1309 non-null uint8
Cabin_U           1309 non-null uint8
Embarked_C        1309 non-null uint8
Embarked_Q        1309 non-null uint8
Embarked_S        1309 non-null uint8
Title_Capt        1309 non-null uint8
Title_Col         1309 non-null uint8
Title_Countess    1309 non-null uint8
Title_Don         1309 non-null uint8
Title_Dr          1309 non-null uint8
Title_Jonkheer    1309 non-null uint8
Title_Lady     

In [27]:
combined_train_data.shape

(1309, 38)

In [28]:
combined_train_data.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,...,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_None,Title_Rev,Title_Sir,Pclass_1,Pclass_2,Pclass_3
0,1,22.0,1,0,7.25,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1,0,38.0,1,0,71.2833,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
2,0,26.0,0,0,7.925,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,35.0,1,0,53.1,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
4,1,35.0,0,0,8.05,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [29]:
# feature scale: min-max rescale the two continuous columns, Age and Fare
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): the scaler is fitted on the merged train+test rows, so the
# test set's min/max values influence the training features — confirm this
# is intended.
scaler_data = MinMaxScaler().fit_transform(combined_train_data[['Age','Fare']])
# Column 0 of the result is Age and column 1 is Fare (same order as selected above)
combined_train_data['Age'] = scaler_data[:, 0]
combined_train_data['Fare'] = scaler_data[:, 1]
combined_train_data.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,...,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_None,Title_Rev,Title_Sir,Pclass_1,Pclass_2,Pclass_3
0,1,0.273456,1,0,0.014151,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1,0,0.473882,1,0,0.139136,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
2,0,0.323563,0,0,0.015469,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0.436302,1,0,0.103644,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
4,1,0.436302,0,0,0.015713,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [30]:
# Split the merged frame back into the training and test feature sets.
# `.ix` has been removed from pandas; positional `.iloc` slicing is used
# instead, with the boundary derived from the training frame rather than the
# hard-coded labels 890/891.
n_train = len(train_data)
train_X = combined_train_data.iloc[:n_train]
train_Y = train_data.Survived

test_X = combined_train_data.iloc[n_train:]

In [31]:
train_X.shape, train_Y.shape, test_X.shape

((891, 38), (891,), (418, 38))

# Modeling

In [32]:
# Fit an extra-trees ensemble (200 trees) on all training features
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
# NOTE(review): no random_state is passed, so the fitted importances can
# differ from run to run.
clf = ExtraTreesClassifier(n_estimators=200)
clf.fit(train_X, train_Y)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [33]:
# Tabulate each feature name next to the importance reported by the fitted `clf`
feature_importance = pd.DataFrame()
feature_importance['feature'] = train_X.columns
feature_importance['importance'] = clf.feature_importances_

# DataFrame.sort has been removed from pandas; sort_values is its replacement.
feature_importance.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
4,Fare,0.199379
1,Age,0.192346
29,Title_Mr,0.126298
0,Sex,0.102624
37,Pclass_3,0.048474
2,SibSp,0.0482
30,Title_Mrs,0.047298
26,Title_Miss,0.039127
13,Cabin_U,0.029331
3,Parch,0.029212


In [34]:
# Feature selection driven by the already-fitted `clf` (prefit=True) with an
# importance threshold of 0.001
selectModel = SelectFromModel(clf, prefit=True, threshold=0.001)
# Apply the same column selection to the training and the test features
train_X_new = selectModel.transform(train_X)
test_X_new = selectModel.transform(test_X)

In [35]:
train_X_new.shape, test_X_new.shape

((891, 25), (418, 25))

In [38]:
from sklearn.ensemble import RandomForestClassifier
# sklearn.cross_validation and sklearn.grid_search have been removed from
# scikit-learn; StratifiedKFold and GridSearchCV now live in
# sklearn.model_selection.
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Hyper-parameter selection for the random forest model
forest = RandomForestClassifier()

parameter_grid = {
                 'max_depth' : [3,4,5,6,7,8,9,10],
                 'n_estimators': [100,140,180,200,240,280,300],
                 'criterion': ['gini','entropy']
                 }

# The model_selection splitter is configured with the fold count only; the
# labels are provided later through GridSearchCV.fit(X, y).
cross_validation = StratifiedKFold(n_splits=5)

grid_search = GridSearchCV(forest,
                           param_grid=parameter_grid,
                           cv=cross_validation)

grid_search.fit(train_X_new, train_Y)

print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

Best score: 0.832772166105
Best parameters: {'n_estimators': 100, 'criterion': 'entropy', 'max_depth': 6}


In [39]:
# Hyper-parameter selection for the GradientBoostingClassifier model
from sklearn.ensemble import GradientBoostingClassifier

# Candidate values explored by the exhaustive grid search
parameter_grid = dict(
    learning_rate=[0.001, 0.003, 0.009, 0.025, 0.075, 0.1, 0.3, 0.6, 0.8, 1, 1.2],
    max_depth=[3, 4, 5, 6, 7, 8, 9, 10],
    n_estimators=[100, 140, 180, 200, 240, 280, 300],
    max_features=['sqrt', 'log2', None],
)

gbc = GradientBoostingClassifier()
grid_search = GridSearchCV(gbc, param_grid=parameter_grid, cv=cross_validation)
grid_search.fit(train_X_new, train_Y)

print('gbc Best score: {}'.format(grid_search.best_score_))
print('gbc Best parameters: {}'.format(grid_search.best_params_))

gbc Best score: 0.83950617284
gbc Best parameters: {'max_features': 'sqrt', 'n_estimators': 300, 'learning_rate': 0.1, 'max_depth': 4}


In [40]:
# Hyper-parameter selection for the XGBClassifier model
from xgboost import XGBClassifier

# Candidate values explored by the exhaustive grid search
parameter_grid = dict(
    learning_rate=[0.001, 0.003, 0.009, 0.025, 0.075, 0.1, 0.3, 0.6, 0.8, 1, 1.2],
    max_depth=[2, 3, 4, 5, 6, 7, 8, 9, 10],
    n_estimators=[100, 140, 180, 200, 240, 280, 300],
)

xgbc = XGBClassifier()
grid_search = GridSearchCV(xgbc, param_grid=parameter_grid, cv=cross_validation)
grid_search.fit(train_X_new, train_Y)

print('xgbc Best score: {}'.format(grid_search.best_score_))
print('xgbc Best parameters: {}'.format(grid_search.best_params_))

xgbc Best score: 0.842873176207
xgbc Best parameters: {'n_estimators': 200, 'learning_rate': 0.3, 'max_depth': 2}


In [41]:
# Hyper-parameter selection for the LogisticRegression model
from sklearn.linear_model import LogisticRegression

# Candidate values explored by the exhaustive grid search
parameter_grid = dict(
    C=[0.1, 0.2, 0.4, 0.8, 1, 1.5, 2, 4, 6, 8, 10],
    max_iter=[50, 80, 100, 120, 140, 160, 180, 200, 220, 240, 260, 280, 300],
)

logistic = LogisticRegression()
grid_search = GridSearchCV(logistic, param_grid=parameter_grid, cv=cross_validation)
grid_search.fit(train_X_new, train_Y)

print('logistic Best score: {}'.format(grid_search.best_score_))
print('logistic Best parameters: {}'.format(grid_search.best_params_))

logistic Best score: 0.829405162738
logistic Best parameters: {'C': 1.5, 'max_iter': 50}


In [42]:
# Hyper-parameter selection for the AdaBoostClassifier model
from sklearn.ensemble import AdaBoostClassifier

# Candidate values explored by the exhaustive grid search
parameter_grid = dict(
    learning_rate=[0.001, 0.003, 0.009, 0.025, 0.075, 0.1, 0.3, 0.6, 0.8, 1, 1.2],
    n_estimators=[50, 80, 100, 120, 140, 160, 180, 200, 220, 240, 260, 280, 300],
)

adaBoost = AdaBoostClassifier()
grid_search = GridSearchCV(adaBoost, param_grid=parameter_grid, cv=cross_validation)
grid_search.fit(train_X_new, train_Y)

print('adaBoost Best score: {}'.format(grid_search.best_score_))
print('adaBoost Best parameters: {}'.format(grid_search.best_params_))

adaBoost Best score: 0.822671156004
adaBoost Best parameters: {'n_estimators': 120, 'learning_rate': 0.3}


**GridSearch 获取的最佳模型参数如下：**
1. RandomForestClassifier
```
Best score: 0.832772166105
Best parameters: {'n_estimators': 100, 'criterion': 'entropy', 'max_depth': 6}
```

2. GradientBoostingClassifier
```
Best score: 0.83950617284
Best parameters: {'max_features': 'sqrt', 'n_estimators': 300, 'learning_rate': 0.1, 'max_depth': 4}
```

3. XGBClassifier
```
Best score: 0.842873176207
Best parameters: {'n_estimators': 200, 'learning_rate': 0.3, 'max_depth': 2}
```

4. LogisticRegression
```
Best score: 0.829405162738
Best parameters: {'C': 1.5, 'max_iter': 50}
```

5. AdaBoostClassifier
```
Best score: 0.822671156004
Best parameters: {'n_estimators': 120, 'learning_rate': 0.3}
```

In [43]:
# Estimators configured with the best parameters reported by the grid searches
forest = RandomForestClassifier(
    criterion='entropy',
    max_depth=6,
    n_estimators=100,
)
gradientBoosting = GradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=4,
    max_features='sqrt',
    n_estimators=300,
)
xgb = XGBClassifier(
    learning_rate=0.3,
    max_depth=2,
    n_estimators=200,
)
logistic = LogisticRegression(
    C=1.5,
    max_iter=50,
)
adaBoost = AdaBoostClassifier(
    learning_rate=0.3,
    n_estimators=120,
)

In [None]:
# ensemble and voting predict