In [32]:
import xgboost as xgb

In [33]:
# More a very good read on Ensembling (Stacking/Blending), please check out MLWave's brilliant Ensembling Guide
# http://mlwave.com/kaggle-ensembling-guide/

import pandas as pd
import numpy as np
import re as re
import xgboost as xgb
# Going to use these 4 base models for the stacking
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.model_selection import KFold

train = pd.read_csv("titanic_train.csv")
test = pd.read_csv("titanic_test.csv")
PassengerId = test['PassengerId']

#################################################################################################### 
#                                       PRE-PROCESSING                                             #
#################################################################################################### 

# This part essentially ripped from Sina's work as I'm too lazy

full_data = [train, test]
# Check distribution of PCLASS and number survived
print(train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean())
# Check distribution of Sexes and number survived
print(train[["Sex", "Survived"]].groupby(['Sex'], as_index=False).mean())
# Create new feature FamilySize as a combination of SibSp and Parch
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
# Create new feature IsAlone from FamilySize
for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
# Remove all NULLS in the Embarked column
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
# Remove all NULLS in the Fare column and create a new feature CategoricalFare
for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())
train['CategoricalFare'] = pd.qcut(train['Fare'], 4)
# Create a New feature CategoricalAge
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    dataset['Age'][np.isnan(dataset['Age'])] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)
train['CategoricalAge'] = pd.cut(train['Age'], 5)
# Define function to extract titles from passenger names
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""
# Create a new feature Title, containing the titles of passenger names
for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(get_title)
# Group all non-common titles into one single grouping "Rare"
for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
# Encoding our features
for dataset in full_data:
    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    
    # Mapping titles
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)
    
    # Mapping Embarked
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    
    # Mapping Fare
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] 						        = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] 							        = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
    
    # Mapping Age
    dataset.loc[ dataset['Age'] <= 16, 'Age'] 					       = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age']                           = 4

# Feature Selection
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
#drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp','Parch', 'FamilySize']
train = train.drop(drop_elements, axis = 1)
train = train.drop(['CategoricalAge', 'CategoricalFare'], axis = 1)
test  = test.drop(drop_elements, axis = 1)


   Pclass  Survived
0       1  0.629630
1       2  0.472826
2       3  0.242363
      Sex  Survived
0  female  0.742038
1    male  0.188908


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [34]:
#################################################################################################### 
#                                         STACKING                                                 #
#################################################################################################### 

ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 12 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(n_splits= NFOLDS, random_state=SEED)

# Write some Python helper functions that collects a lot of the SKlearn methods under one roof. 
# Totally ripped from Faron's Stacking starter ;)
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)


def get_oof(clf, x_train, y_train, x_test):
    clf.train(x_train,y_train)
    return pd.Series(clf.predict(x_train),name = "train" ), pd.Series(clf.predict(x_test),name = "test" )

    
# Assign the parameters for each of our 4 base models
rf_params = {
    'n_jobs': -1,
    'n_estimators': 575,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 3 
}
et_params = {
    'n_jobs': -1,
    'n_estimators':575,
    #'max_features': 0.5,
    'max_depth': 5,
    'min_samples_leaf': 3,
    'verbose': 3
}
ada_params = {
    'n_estimators': 575,
    'learning_rate' : 0.95
}

gb_params = {
    'n_estimators': 575,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 3,
    'verbose': 3
}
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
    }


# Create 4 objects that represent our 4 models
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)

# Create Numpy arrays of train, test and target ( Survived) dataframes to feed into our models
y_train = train['Survived'].ravel()
train = train.drop(['Survived'], axis=1)
x_train = train.values
x_test = test.values

# Create our OOF train and test predictions. These base results will be used as new features
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test)
rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test)
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test)
gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test)
svc_oof_train, svc_oof_test = get_oof(svc,x_train, y_train, x_test)

et_oof_train.name = 'ET_Pred'
rf_oof_train.name = 'RF_Pred'
ada_oof_train.name = 'ADA_Pred'
gb_oof_train.name= 'GB_pred'
svc_oof_train.name = 'SVC_Pred'

et_oof_test.name = 'ET_Pred'
rf_oof_test.name = 'RF_Pred'
ada_oof_test.name = 'ADA_Pred'
gb_oof_test.name= 'GB_pred'
svc_oof_test.name = 'SVC_Pred'

train = pd.concat(( train,et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train), axis=1)
test = pd.concat(( test,et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test), axis=1)
#print("{},{}".format(x_train.shape, x_test.shape))



building tree 1 of 575building tree 3 of 575
building tree 2 of 575

building tree 4 of 575
building tree 5 of 575building tree 6 of 575building tree 7 of 575


building tree 8 of 575
building tree 9 of 575building tree 10 of 575building tree 11 of 575


building tree 12 of 575building tree 13 of 575building tree 14 of 575


building tree 15 of 575
building tree 16 of 575building tree 17 of 575building tree 18 of 575


building tree 19 of 575
building tree 20 of 575
building tree 21 of 575
building tree 22 of 575
building tree 23 of 575
building tree 24 of 575
building tree 25 of 575building tree 26 of 575

building tree 27 of 575building tree 28 of 575

building tree 30 of 575building tree 29 of 575

building tree 31 of 575
building tree 32 of 575
building tree 33 of 575
building tree 34 of 575
building tree 35 of 575
building tree 36 of 575
building tree 37 of 575
building tree 38 of 575
building tree 39 of 575
building tree 40 of 575building tree 41 of 575

building tree 42 of 575
b

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:    0.1s



building tree 167 of 575building tree 168 of 575building tree 169 of 575building tree 170 of 575


building tree 171 of 575
building tree 172 of 575
building tree 173 of 575

building tree 174 of 575
building tree 175 of 575
building tree 176 of 575
building tree 177 of 575
building tree 178 of 575
building tree 179 of 575
building tree 180 of 575building tree 181 of 575

building tree 182 of 575
building tree 183 of 575
building tree 184 of 575
building tree 185 of 575
building tree 186 of 575
building tree 187 of 575
building tree 188 of 575
building tree 189 of 575
building tree 190 of 575
building tree 191 of 575
building tree 193 of 575building tree 192 of 575building tree 194 of 575building tree 198 of 575



building tree 195 of 575
building tree 196 of 575
building tree 201 of 575building tree 197 of 575

building tree 199 of 575
building tree 200 of 575building tree 203 of 575

building tree 202 of 575building tree 204 of 575

building tree 205 of 575building tree 206 of 575


[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:    0.3s



building tree 288 of 575building tree 289 of 575building tree 290 of 575

building tree 291 of 575
building tree 292 of 575

building tree 293 of 575building tree 294 of 575

building tree 295 of 575
building tree 296 of 575
building tree 297 of 575
building tree 298 of 575
building tree 299 of 575
building tree 300 of 575
building tree 301 of 575
building tree 302 of 575
building tree 303 of 575
building tree 304 of 575
building tree 305 of 575
building tree 306 of 575
building tree 307 of 575
building tree 308 of 575building tree 309 of 575

building tree 310 of 575
building tree 311 of 575
building tree 312 of 575
building tree 313 of 575
building tree 314 of 575
building tree 316 of 575building tree 315 of 575

building tree 317 of 575
building tree 318 of 575
building tree 319 of 575building tree 320 of 575

building tree 321 of 575building tree 322 of 575

building tree 324 of 575building tree 323 of 575

building tree 325 of 575building tree 326 of 575building tree 327 of 575



[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 575 out of 575 | elapsed:    0.7s finished
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 504 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 575 out of 575 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 504 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 575 out of 575 | elapsed:    0.1s finished


building tree 1 of 575building tree 4 of 575building tree 3 of 575building tree 2 of 575



building tree 6 of 575building tree 5 of 575building tree 7 of 575


building tree 8 of 575
building tree 9 of 575building tree 10 of 575building tree 11 of 575building tree 12 of 575



building tree 13 of 575building tree 14 of 575building tree 15 of 575building tree 16 of 575



building tree 17 of 575
building tree 18 of 575building tree 19 of 575building tree 20 of 575


building tree 21 of 575building tree 22 of 575building tree 23 of 575


building tree 24 of 575
building tree 25 of 575
building tree 26 of 575
building tree 27 of 575building tree 28 of 575

building tree 29 of 575
building tree 30 of 575building tree 31 of 575

building tree 32 of 575
building tree 33 of 575
building tree 34 of 575
building tree 35 of 575
building tree 36 of 575
building tree 37 of 575
building tree 38 of 575
building tree 39 of 575
building tree 40 of 575
building tree 41 of 575
building tree 42 of 575
b

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:    0.1s


building tree 144 of 575building tree 145 of 575
building tree 146 of 575

building tree 147 of 575
building tree 148 of 575
building tree 149 of 575
building tree 150 of 575building tree 151 of 575building tree 152 of 575


building tree 153 of 575
building tree 154 of 575
building tree 155 of 575
building tree 156 of 575
building tree 157 of 575
building tree 158 of 575
building tree 159 of 575
building tree 160 of 575
building tree 161 of 575
building tree 162 of 575
building tree 163 of 575
building tree 164 of 575
building tree 165 of 575
building tree 166 of 575
building tree 167 of 575
building tree 168 of 575building tree 169 of 575

building tree 170 of 575
building tree 171 of 575
building tree 172 of 575
building tree 173 of 575
building tree 174 of 575
building tree 175 of 575building tree 176 of 575

building tree 177 of 575building tree 178 of 575

building tree 179 of 575
building tree 180 of 575
building tree 181 of 575building tree 182 of 575

building tree 183 of 575b

[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:    0.3s




building tree 299 of 575building tree 300 of 575

building tree 301 of 575building tree 302 of 575

building tree 303 of 575building tree 304 of 575

building tree 306 of 575building tree 305 of 575

building tree 307 of 575
building tree 308 of 575building tree 309 of 575

building tree 310 of 575
building tree 311 of 575
building tree 312 of 575
building tree 313 of 575
building tree 314 of 575
building tree 315 of 575
building tree 316 of 575
building tree 317 of 575
building tree 318 of 575
building tree 319 of 575building tree 320 of 575

building tree 321 of 575
building tree 322 of 575
building tree 323 of 575building tree 324 of 575

building tree 325 of 575
building tree 326 of 575
building tree 327 of 575building tree 328 of 575

building tree 329 of 575
building tree 330 of 575
building tree 331 of 575
building tree 332 of 575
building tree 333 of 575
building tree 334 of 575
building tree 335 of 575building tree 336 of 575

building tree 337 of 575
building tree 338 of 57

[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 575 out of 575 | elapsed:    0.7s finished
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 504 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 575 out of 575 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 504 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 575 out of 575 | elapsed:    0.1s finished


      Iter       Train Loss   Remaining Time 
         1           1.2310            8.97s
         2           1.1500            4.48s
         3           1.0836            2.98s
         4           1.0286            2.23s
         5           0.9822            1.78s
         6           0.9431            2.96s
         7           0.9097            2.54s
         8           0.8809            2.22s
         9           0.8564            1.97s
        10           0.8358            1.77s
        11           0.8152            2.40s
        12           0.7984            2.20s
        13           0.7826            2.03s
        14           0.7697            1.88s
        15           0.7574            1.75s
        16           0.7469            2.18s
        17           0.7382            2.05s
        18           0.7294            1.93s
        19           0.7220            1.83s
        20           0.7150            2.17s
        21           0.7096            2.06s
        2

       204           0.5186            1.13s
       205           0.5185            1.12s
       206           0.5183            1.12s
       207           0.5181            1.12s
       208           0.5181            1.11s
       209           0.5178            1.11s
       210           0.5173            1.11s
       211           0.5171            1.10s
       212           0.5165            1.10s
       213           0.5161            1.10s
       214           0.5159            1.10s
       215           0.5154            1.09s
       216           0.5145            1.08s
       217           0.5142            1.08s
       218           0.5139            1.07s
       219           0.5137            1.08s
       220           0.5135            1.08s
       221           0.5130            1.07s
       222           0.5123            1.06s
       223           0.5121            1.08s
       224           0.5120            1.07s
       225           0.5119            1.06s
       226

       394           0.4787            0.56s
       395           0.4786            0.57s
       396           0.4786            0.56s
       397           0.4786            0.56s
       398           0.4784            0.56s
       399           0.4782            0.56s
       400           0.4782            0.55s
       401           0.4780            0.55s
       402           0.4777            0.55s
       403           0.4775            0.54s
       404           0.4774            0.54s
       405           0.4773            0.54s
       406           0.4772            0.54s
       407           0.4770            0.53s
       408           0.4767            0.53s
       409           0.4767            0.53s
       410           0.4766            0.52s
       411           0.4766            0.52s
       412           0.4765            0.52s
       413           0.4765            0.52s
       414           0.4765            0.51s
       415           0.4764            0.51s
       416

IndexError: axis 1 out of bounds [0, 1)

In [38]:
train = pd.concat(( train,et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train), axis=1)
test = pd.concat(( test,et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test), axis=1)

In [40]:
test.head()

Unnamed: 0,Pclass,Sex,Age,Parch,Fare,Embarked,FamilySize,IsAlone,Title,ET_Pred,RF_Pred,ADA_Pred,GB_pred,SVC_Pred
0,3,1,2,0,0,2,1,1,1,0,0,0,0,0
1,3,0,2,0,0,0,2,0,3,1,1,1,0,1
2,2,1,3,0,1,2,1,1,1,0,0,0,0,0
3,3,1,1,0,1,0,1,1,1,0,0,0,0,0
4,3,0,1,1,1,0,3,0,3,1,1,1,0,1


In [43]:
# Finally, we use an Xgboost classifier and feed it our oof train and test values as new features
gbm = xgb.XGBClassifier(learning_rate = 0.95,
 n_estimators= 5000,
 max_depth= 4,
 min_child_weight= 2,
 #gamma=1,
 gamma=1,                        
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread= -1,
 scale_pos_weight=1)
gbm.fit(train, y_train)
predictions = gbm.predict(test)


In [44]:

#################################################################################################### 
#                           PRODUCING SUBMISSION FILE                                              #
#################################################################################################### 
StackingSubmission = pd.DataFrame({ 'PassengerId': PassengerId,
                            'Survived': predictions })
StackingSubmission.to_csv("StackingSubmission.csv", index=False)