In [1]:
# import libraries
%matplotlib inline
import os
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ML alogorithm
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder, LabelEncoder

from sklearn.base import BaseEstimator, TransformerMixin

### 数据预处理要做的事情

- 数值属性数据字和文本属性数据分别处理，分为两个pipeline独立处理后在做FeatureUnion
- 文本属性要先补缺，再做one-hot encoding 变为数值数据: __sklearn.preprocessing.LableEncoder__   
  补缺---> one-hot encoding

- 数值数据要补缺处理，解决NaN值问题：__用sklearn.preprocessing.Imputer来做__
- 全部变为数值数据后，处理scaling的问题，正规化(异常值不敏感，但取值范围不能限制在一个固定范围)或者minMax（0-1范围，但是异常值敏感！）:__用 StandardScaler__   
  imputer---> Scaler 
- __(上述两个pipeline合并)__
- full_pipeline = FeaturUnion(num_pipeline, text_pipline)  
- _输出完整的可直接用于训练的输入的数据集。_


In [2]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """
    return:values of the DataFrame because sklearn do not deal with DataFrame.
    -------------------------
    This  is useful for selecting the num and text attributes.
    Choose some specific attributes in the DataFrame.
    ------------
    """
    def __init__(self, attri_names):
        self.attri_names = attri_names
        
    # do nothing
    def fit(self, indataF, y=None):
        return self
    
    # return selected attributes values as arrays
    def transform(self, indataF):
        X_arrays = indataF[self.attri_names].values
        return X_arrays

In [3]:
#### Adding age_cat to the dataset
# (0-18 19-39 40-59 >60)
def ageClss(age):
    whois = 0 # ''
    if age <= 18:
        whois = 1 #'Child'
    elif age <= 39:
        whois = 2 #'Teenager'
    elif age <= 59:
        whois = 3 #'MiddleAge'
    elif age == 'NaN':
        whois = 0# 'NaN'
    else:
        whois = 5 #'elder'
    return whois

In [4]:
# 对dataFrame 进行操作： 输入dataFrame 输出加入新属性的DataFrame
#----------------------------------------------------------
class CombineAttributesAdder(BaseEstimator, TransformerMixin):
    """
    Transform an array into array plus new attributes values from adding two old attributes.
    attri_names: the old attributes used for addition.
                 you may add more hyperparameters such as the weight of addiotion.
    newAttriName: new attributes name
    -------------
    return: A dataFrame with new attributes.
    """
    def __init__(self, attri_names):
        self.attri_names = attri_names
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        #Xadder = dataSet[self.attri_names[0]].values
        #for att in self.attri_names[1:]:
        #     Xadder += dataSet[att].values
        Xadder = X[:,0] + X[:,1]        
        return np.c_[X, Xadder]

{'Capt': 1, 
   'Col': 2, 
   'Don': 1,
   'Dr': 7,
   'Jonkheer': 1,
   'Lady': 1,
   'Major': 2,
   'Master': 40,
         'Miss': 182,
         'Mlle': 2,
         'Mme': 1,
         'Mr': 517,
         'Mrs': 125,
         'Ms': 1,
         'Rev': 6, 牧师
         'Sir': 1,
         'the Countess': 1})女伯爵
    

In [5]:
def getPrebyreSplitStrip(name):
    m = re.split(r'\,|\.',name)
    if m is not None:
        prefix = m[1].strip()
    else:
        # 如果每天title就设为'Mr'
        prefix = 'Mr'
    # 做一个map 判断把相近的titlemap到一类里去！
    MrSet = ('Sir','Mr')
    MrsSet = ('Miss','Mrs','Lady', 'Mme' )
    MsSet = ('Miss','Mlle', 'Ms' )
    eliteSet = ('Don', 'Dr', 'Master') 
    noble = ('the Countess', 'Major', 'Col')
    
    if prefix in MrSet:
        prefix = 'Mr'
    elif prefix in MrsSet:
        prefix = 'Mrs'
    elif prefix in eliteSet:
        prefix = 'elite'
    elif prefix in noble:
        prefix = 'noble'
    else:
        prefix = 'others'
    
    return prefix

    
def newCabin(cabin):
    if cabin is not np.nan:
        newcabin = cabin[0]
    else:
        # 这里应该暂先设置为最大概率的Cabin
        newcabin = 'X'
    # 单独处理异常值
    if newcabin=='T': newcabin = 'X'
        
    return newcabin
# 处理Name属性的匹配问题，提取特征，wholeWord开关：取姓名的title还是取Cabin的首字母
class extractWordOfTextAttr(BaseEstimator, TransformerMixin):
    """
    提前Name里的title做粗粒化，提取Cabin的首字母
    增加新的特征：mother or not, child or not 在数值里面增加用age判断 ?
    
    """
    def __init__(self, titleOr1Letter=[0,3,4]):
        self.titleOr1Letter = titleOr1Letter

    def fit(self, X):
          return self
        
    def transform(self, X ,y=None):
    # extract title of Name , first letter of Cabin
        for ix_ in range(X.shape[0]):
            X[ix_, self.titleOr1Letter[0]] = getPrebyreSplitStrip(X[ix_, self.titleOr1Letter[0]])
            X[ix_, self.titleOr1Letter[1]] = newCabin(X[ix_, self.titleOr1Letter[1]])
            if X[ix_,self.titleOr1Letter[2]] is np.nan:
                X[ix_,self.titleOr1Letter[2]] = 'S'
        # ---------------------------------------------
        # 转换为数字编码，为后续的OneHot做准备
        encoder = LabelEncoder()
        
        X[:,self.titleOr1Letter[0]] = encoder.fit_transform(X[:, self.titleOr1Letter[0]])
        X[:,self.titleOr1Letter[1]] = encoder.fit_transform(X[:, self.titleOr1Letter[1]])
        # 把embarked也转换为数字
        X[:, self.titleOr1Letter[2]] = encoder.fit_transform(X[:, self.titleOr1Letter[2]])
        # ---------------------------------------------------
        return X
    
# checked !
class Sex2Gender(BaseEstimator, TransformerMixin):
    """
    cat_pipeline: transform the second('male','female') col of X to (0,1)
    """
    def __init__(self, col=2):
        self.col = col
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        for ix_ in range(X.shape[0]):
            X[ix_, self.col] = int(X[ix_, self.col]=='male')
        return X

In [66]:
# ------------ START ---------------------------------

In [6]:
dataSet0 = pd.read_csv('train.csv')

In [7]:
yLabels = dataSet0.copy()['Survived'].values
dataSet = dataSet0.copy()
dataSet.drop(['Survived','PassengerId', 'Ticket'], axis=1, inplace=True)
dataSet['age_cat'] = dataSet['Age'].map(ageClss)

In [8]:
dataSet.head(10)

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,age_cat
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,,S,2
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C,2
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,,S,2
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,C123,S,2
4,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,,S,2
5,3,"Moran, Mr. James",male,,0,0,8.4583,,Q,5
6,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,51.8625,E46,S,3
7,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,21.075,,S,1
8,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,11.1333,,S,2
9,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,30.0708,,C,1


In [10]:
CAT_ATTRI = ['Name', 'Pclass', 'Sex','Cabin', 'Embarked', 'age_cat']
NUM_ATTRI = ['SibSp', 'Parch', 'Fare','Age']
USELESS_ATTRI = ['Ticket']
add_attr = ['SibSp', 'Parch']
# 被丢掉一些无用属性
drop_attri_names = ['PassengerId']

In [11]:
dataSet0['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [11]:
dataSet0.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [12]:
dataSet0['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [15]:
dataSet0['Cabin'].map(newCabin).value_counts()

X    688
C     59
B     47
D     33
E     32
A     15
F     13
G      4
Name: Cabin, dtype: int64

>  注意到一个异常值 for Cabin： 'T': 1

In [15]:
from collections import Counter 
Counter([newCabin(aa) for aa in dataSet0['Cabin'].values])

Counter({'A': 15,
         'B': 47,
         'C': 59,
         'D': 33,
         'E': 32,
         'F': 13,
         'G': 4,
         'X': 688})

In [9]:
### This cell is used for test every class independently.
indataF = dataSet.copy()
sel = DataFrameSelector(CAT_ATTRI)
CAT = sel.fit_transform(indataF)
print(CAT)
extract = extractWordOfTextAttr()
cat = extract.fit_transform(CAT)
sex2g = Sex2Gender()
dd = sex2g.transform(cat)
print(dd)
#oneh = OneHotEncoder()
#ee = oneh.fit_transform(dd)
#Counter(dd[:,0])

[['Braund, Mr. Owen Harris' 3 'male' nan 'S' 2]
 ['Cumings, Mrs. John Bradley (Florence Briggs Thayer)' 1 'female' 'C85'
  'C' 2]
 ['Heikkinen, Miss. Laina' 3 'female' nan 'S' 2]
 ...
 ['Johnston, Miss. Catherine Helen "Carrie"' 3 'female' nan 'S' 5]
 ['Behr, Mr. Karl Howell' 1 'male' 'C148' 'C' 2]
 ['Dooley, Mr. Patrick' 3 'male' nan 'Q' 2]]
[[0 3 1 7 2 2]
 [1 1 0 2 0 2]
 [1 3 0 7 2 2]
 ...
 [1 3 0 7 2 5]
 [0 1 1 2 0 2]
 [0 3 1 7 1 2]]


The Features are:    
1.  PassengerId    
2.  Survived   
3.  Pclass   
4.  Name   
5.  Sex   
6.  Age   
7.  SibSp   
8.  Parch   
9.  Ticket   
10.  Fare   
11.  Cabin    
12.  Embarked   

> __这个数值特征的pipeline的流水线操作是： __  
>    1.告诉我要提取的列名，从数据集里提取指定的_数值_特征的值   
>    2.对所有的数值特征进行补缺操作，用中位数来补缺    
>    3.增加属性Relatives,用’孩子配偶‘数加’父母兄弟姐妹‘数，作为一个’亲人‘特征   
>    4.对数值数据做标准化处理    

In [50]:
# 输入一个DataFrame, 输出一个数组而非DataFrame
num_pipeline = Pipeline([('selector', DataFrameSelector(NUM_ATTRI)),
                        ('imputer', Imputer(strategy='median')), 
                         ('adder', CombineAttributesAdder(add_attr)),
                        # ("poly_feature", PolynomialFeatures(degree=2)),
                        ('std_scaler', StandardScaler()), 
                        ])
#####
# num_data = num_pipeline.fit_transform(dataSet)

> __这个pipeline的流水线操作是： __  
>    1.告诉我要提取的列名，从数据集里提取指定的类别特征的值   
>    2.对指定的列进行特征engineering，这里对Name提取它的Title，对Cabin提取首字母；并都转换为数字标签！   
>    3.把性别特征转换为（0，1）   
>    4.对现在的类别特征的数据进行OneHot编码，输出编码后的数据   

In [51]:
#先补全分类标签如果有缺失的话？或者说缺失当做单独的一类
# 输入一个DataFrame, 输出一个数组而非DataFrame
cat_pipeline = Pipeline([('selector', DataFrameSelector(CAT_ATTRI)),
                         # 提取'Name'里面的title，提取’Cabin‘的首字母；记得完成补全的操作！
                         # titleOr1Letter： 告诉transformer，也就是这里的extraWordofTextAttr,对那一列操作以及操作什么
                         #               :  [0,3]  意思是对第一列和第四列操作提取字符的变换器
                         # -----------------------------------------------------------
                         # 提取的步骤里面顺便把类别分好！变成数字编码的类别！！！！
                         #-------------------------------------------------------------
                         ('extractor', extractWordOfTextAttr(titleOr1Letter=[0,3,4])), 
                         ('sex2gen', Sex2Gender(col=2)), # 第二列性别变为数字,输入为告诉哪一列是性别
                         # 此时所有的类别变量都变为离散的数字类别，做one-hot 编码
                         ('one_hot_encode', OneHotEncoder())# OneHot 编码
                        ])
####
#cat_data = cat_pipeline.fit_transform(dataSet).toarray()

In [52]:
from sklearn.preprocessing import PolynomialFeatures

full_pipeline = FeatureUnion(transformer_list=[("num_pipeline",num_pipeline),
                                              ("cat_pipeline", cat_pipeline),
                                              ])

In [53]:
full_data = full_pipeline.fit_transform(dataSet)
Xtrain = full_data.toarray()
Xtrain

array([[ 0.43279337, -0.47367361, -0.50244517, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.43279337, -0.47367361,  0.78684529, ...,  1.        ,
         0.        ,  0.        ],
       [-0.4745452 , -0.47367361, -0.48885426, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 0.43279337,  2.00893337, -0.17626324, ...,  0.        ,
         0.        ,  1.        ],
       [-0.4745452 , -0.47367361, -0.04438104, ...,  1.        ,
         0.        ,  0.        ],
       [-0.4745452 , -0.47367361, -0.49237783, ...,  1.        ,
         0.        ,  0.        ]])

In [54]:
Xtrain.shape

(891, 30)

In [55]:
### 测试集，用于提交结果
testSet0 = pd.read_csv('test.csv')
testSet = testSet0.copy()
testSet.drop(['PassengerId', 'Ticket'], axis=1, inplace=True)
testSet['age_cat'] = testSet['Age'].map(ageClss)

In [14]:
testSet0['Cabin'].map(newCabin).value_counts()

X    327
C     35
B     18
D     13
E      9
F      8
A      7
G      1
Name: Cabin, dtype: int64

In [56]:
test_data = full_pipeline.transform(testSet).toarray()
test_data

array([[-0.4745452 , -0.47367361, -0.49078316, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.43279337, -0.47367361, -0.50747884, ...,  0.        ,
         1.        ,  0.        ],
       [-0.4745452 , -0.47367361, -0.45336687, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.4745452 , -0.47367361, -0.50244517, ...,  1.        ,
         0.        ,  0.        ],
       [-0.4745452 , -0.47367361, -0.48633742, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.43279337,  0.76762988, -0.19824428, ...,  0.        ,
         0.        ,  1.        ]])

In [57]:
Xtrain.shape, test_data.shape

((891, 30), (418, 30))

#### 训练模型 

In [33]:
from sklearn.linear_model import LinearRegression,SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate

from sklearn.model_selection import GridSearchCV

In [20]:
# 决策树
tree_clf = DecisionTreeClassifier()
tree_clf.fit(Xtrain, yLabels)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [76]:
# 随机森林
forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(Xtrain, yLabels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [46]:
# SVM
# lin_svc = LinearSVC(C=1, loss="hinge")
# lin_svc.fit(Xtrain, yLabels)

# svm_clf = SVC(kernel="poly", degree=5, coef0=1, C=5)  score:0.76
#svm_clf = SVC(kernel="poly", degree=5, coef0=1, C=1) # score:0.79425 2018/08/12我的目前最高分
svm_clf = SVC(C= 1.0, gamma= 0.05, kernel='rbf')
svm_clf.fit(Xtrain, yLabels)

SVC(C=1, cache_size=200, class_weight=None, coef0=1,
  decision_function_shape='ovr', degree=5, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [47]:
# test data
y_test_predict = pd.Series(svm_clf.predict(test_data))

result = pd.DataFrame({"PassengerId":testSet0["PassengerId"].values, 'Survived':y_test_predict})
result.to_csv("svm_clf_rbf2000.csv",index=0)

In [44]:
# Use GridSearch for the SVM
para_grid = [{'kernel':['rbf'], 'gamma':[0.01, 0.03, 0.05, 0.08], 'C':[ 0.5, 1.0, 3.0], 'degree':[1,3,5,6]}]
grid_search = GridSearchCV(svm_clf, para_grid, cv=8)

grid_search.fit(Xtrain, yLabels)

GridSearchCV(cv=8, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.05, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'kernel': ['rbf'], 'gamma': [0.01, 0.03, 0.05, 0.08], 'C': [0.5, 1.0, 3.0], 'degree': [1, 3, 5, 6]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [45]:
grid_search.best_params_, grid_search.best_score_,grid_search.grid_scores_



({'C': 1.0, 'degree': 1, 'gamma': 0.05, 'kernel': 'rbf'},
 0.8282828282828283,
 [mean: 0.80359, std: 0.01810, params: {'C': 0.5, 'degree': 1, 'gamma': 0.01, 'kernel': 'rbf'},
  mean: 0.82492, std: 0.02186, params: {'C': 0.5, 'degree': 1, 'gamma': 0.03, 'kernel': 'rbf'},
  mean: 0.82492, std: 0.02403, params: {'C': 0.5, 'degree': 1, 'gamma': 0.05, 'kernel': 'rbf'},
  mean: 0.82267, std: 0.02640, params: {'C': 0.5, 'degree': 1, 'gamma': 0.08, 'kernel': 'rbf'},
  mean: 0.80359, std: 0.01810, params: {'C': 0.5, 'degree': 3, 'gamma': 0.01, 'kernel': 'rbf'},
  mean: 0.82492, std: 0.02186, params: {'C': 0.5, 'degree': 3, 'gamma': 0.03, 'kernel': 'rbf'},
  mean: 0.82492, std: 0.02403, params: {'C': 0.5, 'degree': 3, 'gamma': 0.05, 'kernel': 'rbf'},
  mean: 0.82267, std: 0.02640, params: {'C': 0.5, 'degree': 3, 'gamma': 0.08, 'kernel': 'rbf'},
  mean: 0.80359, std: 0.01810, params: {'C': 0.5, 'degree': 5, 'gamma': 0.01, 'kernel': 'rbf'},
  mean: 0.82492, std: 0.02186, params: {'C': 0.5, 'degree

### Emsemble Learning:

#### a simply try !

In [24]:
# Ensemble Learning 
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC(kernel="poly", degree=5, coef0=1, C=1)
sgd_clf = SGDClassifier()

voting_clf = VotingClassifier(estimators=[('lr',log_clf),('rf', rnd_clf),('svc', svm_clf),('sgd',sgd_clf)], voting='hard')

voting_clf.fit(Xtrain, yLabels)



VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rf', RandomF...='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [25]:
# test data
y_test_predict = pd.Series(voting_clf.predict(test_data))

result = pd.DataFrame({"PassengerId":testSet0["PassengerId"].values, 'Survived':y_test_predict})
result.to_csv("emsemble_svm_rnd_log.csv",index=0)

  if diff:


#### Baging and Pasting

In [48]:
from sklearn.ensemble import BaggingClassifier

# 0.78947
# bag_clf = BaggingClassifier(RandomForestClassifier(), n_estimators=900, max_samples=0.5, bootstrap=True, n_jobs=-1)

# 0.78468
bag_clf = BaggingClassifier(RandomForestClassifier(), n_estimators=500, max_samples=0.5, bootstrap=True, n_jobs=-1)

bag_clf.fit(Xtrain, yLabels)

BaggingClassifier(base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=0.5, n_estimators=500, n_jobs=-1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [49]:
# submission
y_test_predict = pd.Series(bag_clf.predict(test_data))

result = pd.DataFrame({"PassengerId":testSet0["PassengerId"].values, 'Survived':y_test_predict})
result.to_csv("Forest_bagging_sample0.5_500esitimators.csv",index=0)

In [24]:
# Use GridSearch for the bagging
bag_clf1 = BaggingClassifier(RandomForestClassifier(), bootstrap=True, n_jobs=-1)
para_grid = [{'n_estimators':[500, 1000], 'max_samples':[0.5, 0.7, 0.8, 0.9]}]
grid_search = GridSearchCV(bag_clf1, para_grid)

grid_search.fit(Xtrain, yLabels)

GridSearchCV(cv=None, error_score='raise',
       estimator=BaggingClassifier(base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_spl..._estimators=10, n_jobs=-1, oob_score=False,
         random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [500, 1000], 'max_samples': [0.5, 0.7, 0.8, 0.9]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [25]:
grid_search.best_params_, grid_search.best_score_, grid_search.grid_scores_



({'max_samples': 0.7, 'n_estimators': 1000},
 0.8237934904601572,
 [mean: 0.82267, std: 0.01657, params: {'max_samples': 0.5, 'n_estimators': 500},
  mean: 0.82043, std: 0.01931, params: {'max_samples': 0.5, 'n_estimators': 1000},
  mean: 0.82155, std: 0.02250, params: {'max_samples': 0.7, 'n_estimators': 500},
  mean: 0.82379, std: 0.02063, params: {'max_samples': 0.7, 'n_estimators': 1000},
  mean: 0.82379, std: 0.02063, params: {'max_samples': 0.8, 'n_estimators': 500},
  mean: 0.82379, std: 0.02222, params: {'max_samples': 0.8, 'n_estimators': 1000},
  mean: 0.82155, std: 0.01803, params: {'max_samples': 0.9, 'n_estimators': 500},
  mean: 0.82155, std: 0.02076, params: {'max_samples': 0.9, 'n_estimators': 1000}])

### Using TensorFlow: Neural Network