In [4]:
import numpy as np 
import pandas as pd
import pandas_profiling

In [170]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [258]:
train = pd.read_csv("../data/train.csv", delimiter='\t')
test = pd.read_csv("../data/test.csv", delimiter='\t')

In [6]:
train.head(3)

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,336,337,338,339,340,341,342,343,344,345
0,0,1,1,0,0,0,0,0,0.090909,0,...,0.221395,0,1,0,0,0.222222,1,1,1,1
1,1,1,1,0,0,1,0,0,0.090909,0,...,0.241508,0,1,0,0,0.111111,1,1,1,0
2,2,0,1,0,0,1,0,0,0.090909,0,...,0.123067,0,1,0,0,0.444444,1,1,1,1


In [259]:
test.drop(["Unnamed: 0"],axis=1,inplace=True)
train.drop(["Unnamed: 0"], axis=1, inplace=True)

In [197]:
X_train, y_train = train.drop('0', axis=1), train['0']
X = test.drop('0', axis=1)

In [260]:
X = test.drop('0', axis=1)

Feature engineering - removal of rejected, highly-correlated features

In [17]:
prof_report = pandas_profiling.ProfileReport(train)

In [None]:
prof_report.to_file("pp_report.html")

In [18]:
rejected = prof_report.get_rejected_variables()

In [263]:
for i in rejected:
    train.drop(i, axis=1, inplace=True)
    test.drop(i, axis=1, inplace=True)

# Logistic regresssion

In [75]:
clf = LogisticRegression()

In [77]:
param_grid = {'C': [0.1, 0.01, 5, 10]}
kf = KFold(n_splits=8)

In [109]:
gs = GridSearchCV (
    estimator=clf,  
    param_grid=param_grid,  
    cv=kf,    
    scoring='roc_auc',
    verbose=3
)

In [112]:
X_train.drop(["Unnamed: 0"],axis=1,inplace=True)

In [113]:
gs.fit(
    X=X_train,
    y=y_train
)

Fitting 8 folds for each of 4 candidates, totalling 32 fits
[CV] C=0.1 ...........................................................
[CV] .................. C=0.1, score=0.7247150897572121, total=   1.0s
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s


[CV] .................. C=0.1, score=0.7183584643584026, total=   1.0s
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.1s remaining:    0.0s


[CV] .................. C=0.1, score=0.7228512253093704, total=   1.0s
[CV] C=0.1 ...........................................................
[CV] .................. C=0.1, score=0.7535099211223013, total=   1.0s
[CV] C=0.1 ...........................................................
[CV] .................. C=0.1, score=0.7126979331511054, total=   1.0s
[CV] C=0.1 ...........................................................
[CV] .................. C=0.1, score=0.7313501784903204, total=   0.9s
[CV] C=0.1 ...........................................................
[CV] .................. C=0.1, score=0.7265130336439904, total=   1.0s
[CV] C=0.1 ...........................................................
[CV] .................. C=0.1, score=0.7288913490370647, total=   1.0s
[CV] C=0.01 ..........................................................
[CV] ................. C=0.01, score=0.7188296410510422, total=   0.6s
[CV] C=0.01 ..........................................................
[CV] .

[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed:  1.1min finished


GridSearchCV(cv=KFold(n_splits=8, random_state=None, shuffle=False),
       error_score='raise',
       estimator=LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 0.01, 5, 10]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='roc_auc', verbose=3)

In [117]:
best_estimator = gs.best_estimator_

In [118]:
score = cross_val_score(
    estimator=best_estimator,
    X=X_train,
    y=y_train,
    scoring='roc_auc',
    cv=kf, 
).mean()

print('Log reg scoring: {:.4f}'.format(scores))

Log reg scoring: 0.7334


In [130]:
X = test.drop('0', axis=1)

In [133]:
X.drop('id', axis=1, inplace=True)

In [134]:
best_estimator.fit(X_train, y_train)
predictions = best_estimator.predict_proba(X)

Log Reg Public Leaderboard: 0.73669106

# Log Reg Bagging

In [141]:
log_reg = LogisticRegression(class_weight='balanced', C=2.0)
skf = StratifiedKFold(shuffle=True, random_state=42)

In [142]:
bag_clf = BaggingClassifier(base_estimator=log_reg, n_estimators=50,
                            oob_score=True, random_state=42)

In [143]:
parameters = {'max_features': [0.5, 0.7, 0.9, 1.], 'max_samples': [0.5, 0.7], "base_estimator__C": [0.001, 0.01, 1]}
r_grid_search = RandomizedSearchCV(bag_clf, parameters, scoring ='roc_auc', n_iter=10, cv=skf, random_state=1)
r_grid_search = r_grid_search.fit(X_train, y_train)
print(r_grid_search.best_score_)

0.725217922218654


In [124]:
best_bag = r_grid_search.best_estimator_

In [132]:
best_bag

BaggingClassifier(base_estimator=LogisticRegression(C=1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
         bootstrap=True, bootstrap_features=False, max_features=0.7,
         max_samples=0.7, n_estimators=50, n_jobs=1, oob_score=True,
         random_state=42, verbose=0, warm_start=False)

In [128]:
best_bag.fit(X_train, y_train)
predictions = best_bag.predict_proba(X)

Bagging Public Leaderboard: 0.73573301

# Random Forest

In [131]:
param_grid = {'criterion': ['gini', 'entropy'], 
             'max_depth': [5, 10, 20],
             'max_features': [0.8],
             'min_samples_leaf': [2, 5],
             'n_estimators': [50]}

In [104]:
clf = RandomForestClassifier(n_jobs=-1)
kf = KFold(n_splits=5)

In [105]:
gs = GridSearchCV (
    estimator=clf,  
    param_grid=param_grid,  
    cv=kf,  
    scoring='roc_auc',
    verbose=3
)

In [106]:
gs.fit(
    X=X_train,
    y=y_train
)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] criterion=gini, max_depth=5, max_features=0.8, min_samples_leaf=2, n_estimators=50 
[CV]  criterion=gini, max_depth=5, max_features=0.8, min_samples_leaf=2, n_estimators=50, score=0.7054431780906266, total=   4.7s
[CV] criterion=gini, max_depth=5, max_features=0.8, min_samples_leaf=2, n_estimators=50 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.9s remaining:    0.0s


[CV]  criterion=gini, max_depth=5, max_features=0.8, min_samples_leaf=2, n_estimators=50, score=0.7115731506281848, total=   4.7s
[CV] criterion=gini, max_depth=5, max_features=0.8, min_samples_leaf=2, n_estimators=50 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    9.8s remaining:    0.0s


[CV]  criterion=gini, max_depth=5, max_features=0.8, min_samples_leaf=2, n_estimators=50, score=0.6987974952043677, total=   6.5s
[CV] criterion=gini, max_depth=5, max_features=0.8, min_samples_leaf=2, n_estimators=50 
[CV]  criterion=gini, max_depth=5, max_features=0.8, min_samples_leaf=2, n_estimators=50, score=0.7060978797819649, total=   5.2s
[CV] criterion=gini, max_depth=5, max_features=0.8, min_samples_leaf=2, n_estimators=50 
[CV]  criterion=gini, max_depth=5, max_features=0.8, min_samples_leaf=2, n_estimators=50, score=0.7162025749055575, total=   5.9s
[CV] criterion=gini, max_depth=5, max_features=0.8, min_samples_leaf=5, n_estimators=50 
[CV]  criterion=gini, max_depth=5, max_features=0.8, min_samples_leaf=5, n_estimators=50, score=0.7056424114452845, total=   7.6s
[CV] criterion=gini, max_depth=5, max_features=0.8, min_samples_leaf=5, n_estimators=50 
[CV]  criterion=gini, max_depth=5, max_features=0.8, min_samples_leaf=5, n_estimators=50, score=0.7123969517143149, total=  

[CV]  criterion=entropy, max_depth=10, max_features=0.8, min_samples_leaf=5, n_estimators=50, score=0.7344580532410931, total=  10.7s


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  5.5min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'criterion': ['gini', 'entropy'], 'max_depth': [5, 10], 'max_features': [0.8], 'min_samples_leaf': [2, 5], 'n_estimators': [50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=3)

In [107]:
best_score = gs.best_score_
best_estimator = gs.best_estimator_

In [108]:
scores = cross_val_score(
    estimator=best_estimator,
    X=X_train, # ...
    y=y_train, # ...
    scoring='roc_auc', # ...
    cv=kf, # cross-validation strategy
).mean()

print('Random forest scoring: {:.4f}'.format(scores))

Random forest scoring: 0.7299


In [112]:
best_estimator.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=10, max_features=0.8, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [113]:
predictions = best_estimator.predict_proba(X)

Random Forest Public Leaderboard: 0.72819695

In [166]:
predictions

array([[0.85081421, 0.14918579],
       [0.59441196, 0.40558804],
       [0.71995866, 0.28004134],
       ...,
       [0.9003711 , 0.0996289 ],
       [0.89846322, 0.10153678],
       [0.87375263, 0.12624737]])

In [236]:
# Submission
test['id'] = test.index
submission = pd.DataFrame.from_dict({'_ID_': test['id']})
submission["_VAL_"]= [i[1] for i in predictions]
submission.to_csv("submission_b2.csv",index=False)

In [231]:
test.drop(['0', 'id'], axis=1, inplace=True)

In [232]:
test

Unnamed: 0,1,2,3,4,5,6,7,8,10,11,...,332,336,337,338,339,340,342,343,344,345
0,1,0,0,1,0,0,0.136364,0,1,1,...,1,0.192984,0,1,0,0,1,1,1,1
1,1,0,0,1,0,0,0.181818,0,1,1,...,0,0.195690,0,1,0,0,1,1,1,0
2,1,0,0,0,0,0,0.090909,0,1,1,...,1,0.192984,0,1,0,0,1,1,1,0
3,1,0,0,1,0,0,0.090909,0,1,1,...,0,0.195690,0,1,0,0,1,1,1,0
4,1,0,0,1,0,0,0.090909,0,1,1,...,0,0.289893,0,0,1,0,1,1,1,1
5,1,0,0,1,0,0,0.090909,0,1,1,...,0,0.206963,0,1,0,0,1,1,1,0
6,1,0,0,0,0,0,0.181818,0,1,1,...,1,0.450414,1,0,0,0,1,1,1,0
7,1,0,0,0,0,0,0.090909,0,1,1,...,1,0.450414,1,0,0,0,1,1,1,0
8,1,0,0,0,0,0,0.090909,0,1,1,...,1,0.109160,1,0,0,0,1,1,1,1
9,1,0,0,1,0,0,0.090909,0,1,1,...,0,0.134028,0,1,0,0,1,1,1,1


# Extra Trees

In [158]:
param_grid = {
    'n_estimators': [50],
    'criterion': ['gini'],
    'min_samples_leaf': [10, 15],
    'min_samples_split': [2, 5, 10]
}

In [159]:
xclf = ExtraTreesClassifier()

In [160]:
gs = GridSearchCV (
    estimator=xclf,  
    param_grid=param_grid,  
    cv=skf,    
    scoring='roc_auc',
    verbose=3
)

In [161]:
gs.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] criterion=gini, min_samples_leaf=10, min_samples_split=2, n_estimators=50 
[CV]  criterion=gini, min_samples_leaf=10, min_samples_split=2, n_estimators=50, score=0.7313573970930596, total=   2.1s
[CV] criterion=gini, min_samples_leaf=10, min_samples_split=2, n_estimators=50 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.3s remaining:    0.0s


[CV]  criterion=gini, min_samples_leaf=10, min_samples_split=2, n_estimators=50, score=0.7322316733487407, total=   2.0s
[CV] criterion=gini, min_samples_leaf=10, min_samples_split=2, n_estimators=50 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.5s remaining:    0.0s


[CV]  criterion=gini, min_samples_leaf=10, min_samples_split=2, n_estimators=50, score=0.7332088335607568, total=   2.0s
[CV] criterion=gini, min_samples_leaf=10, min_samples_split=5, n_estimators=50 
[CV]  criterion=gini, min_samples_leaf=10, min_samples_split=5, n_estimators=50, score=0.7321408248435527, total=   2.0s
[CV] criterion=gini, min_samples_leaf=10, min_samples_split=5, n_estimators=50 
[CV]  criterion=gini, min_samples_leaf=10, min_samples_split=5, n_estimators=50, score=0.7334693436251742, total=   2.0s
[CV] criterion=gini, min_samples_leaf=10, min_samples_split=5, n_estimators=50 
[CV]  criterion=gini, min_samples_leaf=10, min_samples_split=5, n_estimators=50, score=0.733218102191018, total=   2.0s
[CV] criterion=gini, min_samples_leaf=10, min_samples_split=10, n_estimators=50 
[CV]  criterion=gini, min_samples_leaf=10, min_samples_split=10, n_estimators=50, score=0.7318432016998265, total=   2.0s
[CV] criterion=gini, min_samples_leaf=10, min_samples_split=10, n_estimato

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:   38.3s finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=True),
       error_score='raise',
       estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [50], 'criterion': ['gini'], 'min_samples_leaf': [10, 15], 'min_samples_split': [2, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=3)

In [162]:
gs.best_score_

0.7329427478588668

In [163]:
best_estimator = gs.best_estimator_

In [164]:
best_estimator.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=10, min_samples_split=5,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [165]:
predictions = best_estimator.predict_proba(X)

ExtraTrees Public Leaderboard: 0.74413731

# Stacking

In [177]:
best_estimator

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=10, min_samples_split=5,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [172]:
log = LogisticRegression(С=2.0)
rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=10, max_features=0.8,
            min_samples_leaf=5, min_samples_split=2, n_estimators=50)
et = best_estimator
bag = best_bag

models = [log, rf, et, bag]
ens_model = Ridge()

In [180]:
log_predictions = []
rf_predictions = []
et_predictions = []
bag_predictions = []

In [201]:
for train, val in skf.split(X_train, y_train):
    rf.fit(X_train.iloc[train], y_train[train])
    et.fit(X_train.iloc[train], y_train[train])
    rf_predictions.append([y_train[val], rf.predict_proba(X_train.iloc[val])[:,1]])
    et_predictions.append([y_train[val], et.predict_proba(X_train.iloc[val])[:,1]])

# stacking
for i, fold in enumerate(skf.split(X_train, y_train)):
    train, val = fold[0], fold[1]
    X_train.iloc[val, -2] = rf_predictions[i][1]
    X_train.iloc[val, -1] = et_predictions[i][1]

In [228]:
score = cross_val_score(
    estimator=log,
    X=X_train,
    y=y_train,
    scoring='roc_auc',
    cv=kf, 
).mean()

print('Stacking validation scoring: {:.4f}'.format(scores))

Stacking validation scoring: 0.7357


In [235]:
log.fit(X_train, y_train)
predictions = log.predict_proba(X)

Stacking Public Leaderboard: 0.73719262