In [1]:
import os

import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier



In [2]:
def add_feature(X, name, values):
    """Return a copy of ``X`` with one additional column.

    The input frame is left untouched: the column is added to a copy, so
    calling this repeatedly on the same frame never accumulates columns on
    the caller's object.

    Args:
        X: pandas DataFrame to extend.
        name: Label of the column to add. If ``X`` already has a column with
            this label, it is replaced in the returned copy.
        values: Data for the new column; anything accepted by DataFrame
            column assignment (``frame[name] = values``).

    Returns:
        A new DataFrame holding the columns of ``X`` plus ``name``.
    """
    # Copy first: a plain `X_new = X` would only alias the caller's frame and
    # the assignment below would then modify it in place.
    X_new = X.copy()
    X_new[name] = values
    return X_new

In [3]:
class StackedEnsemble():
    """Sequential stacked ensemble of four classifiers.

    The stages run in a fixed order: L1-penalised logistic regression
    (``'lg'``), AdaBoost (``'ab'``), random forest (``'rf'``) and finally an
    XGBoost classifier. Each of the first three stages receives the current
    feature frame, and column 1 of its ``predict_proba`` output (the second
    class) is appended to the frame under the stage's name, so every later
    stage also sees the outputs of the earlier ones. The XGBoost model works
    on the fully augmented frame and supplies the final prediction.

    Inputs must be pandas DataFrames because the stacked outputs are added as
    named columns. The frames passed in are never modified; all augmentation
    happens on internal copies.

    NOTE(review): during ``fit`` each stage predicts on the very rows it was
    just fitted on, so the stacked columns seen in training are in-sample
    predictions. Consider out-of-fold predictions if this over-fits.

    Args:
        random_state: Optional seed forwarded to all four estimators to make
            runs reproducible. When ``None`` (the default) no seed is passed
            and each estimator keeps its own default behaviour.
    """

    def __init__(self, random_state=None):
        # Forward the seed only when one was given, so default construction
        # passes exactly the same arguments as an unseeded setup.
        seed_kw = {} if random_state is None else {'random_state': random_state}

        # The L1 penalty needs a solver that supports it; scikit-learn's
        # current default ('lbfgs') does not, so the solver is set explicitly.
        self.logistic_regr = LogisticRegression(
            C=0.1, penalty='l1', solver='liblinear', **seed_kw)
        self.adaboost = AdaBoostClassifier(
            n_estimators=1000, learning_rate=0.1, **seed_kw)
        # 'sqrt' is what the former 'auto' option meant for classifiers;
        # 'auto' itself is no longer accepted by current scikit-learn.
        self.random_forest = RandomForestClassifier(
            n_estimators=2000, criterion='entropy', max_features='sqrt',
            bootstrap=True, **seed_kw)
        self.xgb = xgb.XGBClassifier(
            max_depth=4, learning_rate=0.01, n_estimators=1000, **seed_kw)

    def _base_stages(self):
        """Return the (column name, model) pairs of the stacked stages, in order."""
        return (
            ('lg', self.logistic_regr),
            ('ab', self.adaboost),
            ('rf', self.random_forest),
        )

    def fit(self, X, y):
        """Fit all four stages in sequence.

        Args:
            X: Training features as a pandas DataFrame (left unmodified).
            y: Training targets, passed unchanged to every estimator's ``fit``.

        Returns:
            This instance, so calls can be chained.
        """
        # Work on a copy so the caller's frame does not gain 'lg'/'ab'/'rf'.
        X_aug = X.copy()
        for name, model in self._base_stages():
            model.fit(X_aug, y)
            # Column 1 of predict_proba becomes an input feature for every
            # subsequent stage.
            X_aug[name] = model.predict_proba(X_aug)[:, 1]

        self.xgb.fit(X_aug, y)
        return self

    def predict_proba(self, X_test):
        """Predict with the fitted ensemble.

        Args:
            X_test: Features as a pandas DataFrame with the same columns that
                were passed to ``fit`` (left unmodified).

        Returns:
            One-dimensional array holding column 1 of the final XGBoost
            model's ``predict_proba`` output, one value per row of ``X_test``.
        """
        # Rebuild the stacked columns in the same order as in fit, on a copy
        # so repeated calls with the same frame always see the same features.
        X_aug = X_test.copy()
        for name, model in self._base_stages():
            X_aug[name] = model.predict_proba(X_aug)[:, 1]

        return self.xgb.predict_proba(X_aug)[:, 1]

In [4]:
def to_file(filename, ids, preds):
    """Write predictions to a two-column CSV file.

    The file starts with the header line ``id,Y`` followed by one
    ``<id>,<prediction>`` line per pair. An existing file is overwritten.
    Missing parent directories of ``filename`` are created first, so the
    write cannot fail just because the output folder does not exist yet.

    Args:
        filename: Path of the file to write.
        ids: Iterable of row identifiers.
        preds: Iterable of predictions, paired with ``ids`` by position.
            Pairing stops at the shorter of the two iterables.
    """
    parent = os.path.dirname(filename)
    if parent:  # empty when the file goes into the current directory
        os.makedirs(parent, exist_ok=True)

    with open(filename, 'w') as f:
        f.write('id,Y\n')
        f.writelines('{},{}\n'.format(num, pred) for num, pred in zip(ids, preds))

In [5]:
# Load the training and test tables from CSV files in the working directory
# (training file first, then the test file).
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [6]:
# Fill missing values with the per-column means of the training data.
# The means are computed once, from the training frame before any filling,
# and the same values are applied to both frames, so the test set is imputed
# with training statistics only.
# NOTE(review): the means cover every column of `train`, including 'id' and
# 'Y'; if either of those can contain NaN, mean-filling them is probably not
# what is wanted - confirm against the data.
train_means = train.mean()
train = train.fillna(train_means)
test = test.fillna(train_means)

In [7]:
# Split the frames into model inputs and the columns kept aside:
# 'Y' is used as the training target, and the test 'id' values are kept to
# label the rows of the output file. Neither is used as a feature.
y_train = train['Y']
X_train = train.drop(columns=['id', 'Y'])

ids = test['id']
X_test = test.drop(columns=['id'])

In [8]:
# Drop the features flagged as highly correlated (the correlation analysis is
# not shown here). The same columns are removed from both frames so train and
# test keep identical feature sets.
correlated_features = ['F18', 'F3']
X_train = X_train.drop(columns=correlated_features)
X_test = X_test.drop(columns=correlated_features)

In [9]:
# Train the stacked ensemble on the prepared training features.
ens = StackedEnsemble()
ens.fit(X=X_train, y=y_train)

# Score the test rows and write one "id,prediction" line per row.
preds = ens.predict_proba(X_test=X_test)
to_file(filename='submissions/stacked_clean.csv', ids=ids, preds=preds)