In [13]:
import pandas as pd
import numpy as np
import os
import time

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

import config


SEED=42

In [56]:
# read train, test
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

X_train, y_train = train.drop(columns=['TARGET']), train['TARGET']
X_test, y_test = test.drop(columns=['TARGET']), test['TARGET']

In [60]:
def train_and_predict(model, X_train, y_train, X_test, y_test):
    """
    Fit a model on X_train, y_train
    predicts on X_text, y_test 
    Calculate AUROC on predictions made on test data
    
    Outputs - AUROC score, time elapse for training and prediction    
    """
    start = time.time()
    model = model.fit(X_train, y_train)
    
    roc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:,1])
    cv = cross_val_score(model, X_train, y_train, scoring='roc_auc', n_jobs=-1, verbose=2, cv=5)
    
    end = time.time()
    time_elapsed = end - start
    
    return roc_score, time_elapsed, cv

models = [("LogisticRegression", LogisticRegression(random_state=SEED, max_iter=10000)),
         ("RandomForestClassifier", RandomForestClassifier(random_state=SEED)),
         ("GradientBoostingClassifier", GradientBoostingClassifier(random_state=SEED)),
         ("AdaBoostClassifier", AdaBoostClassifier(random_state=SEED)),
          ('KNeighbors', KNeighborsClassifier()),
         ("XGBClassifier",xgb.XGBClassifier(random_state=SEED))]

### Model Training

In [64]:
import time
results = {"Model":[],
          "AUCROC_score":[],
          "Time_in_sec":[],
          'cv_mean':[],
          'cv_std':[]}

for name, model in models:
    print(name)
    roc, time_, cv = train_and_predict(model, X_train, y_train, X_test, y_test)
    results["Model"].append(name)
    results["AUCROC_score"].append(roc)
    results["Time_in_sec"].append(time_)
    results['cv_mean'].append(cv.mean())
    results['cv_std'].append(cv.std())

    
results = pd.DataFrame.from_dict(results, orient='index').transpose()
results

LogisticRegression


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  2.4min remaining:  1.6min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.5min finished


RandomForestClassifier


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    6.0s remaining:    4.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.1s finished


GradientBoostingClassifier


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   27.2s remaining:   18.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   27.4s finished


AdaBoostClassifier


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    8.6s remaining:    5.7s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.4s finished


KNeighbors


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   57.7s remaining:   38.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.0min finished


XGBClassifier






[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   19.6s remaining:   13.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   19.7s finished


Unnamed: 0,Model,AUCROC_score,Time_in_sec,cv_mean,cv_std
0,LogisticRegression,0.656398,196.070127,0.640466,0.028124
1,RandomForestClassifier,0.634512,13.217502,0.610521,0.018577
2,GradientBoostingClassifier,0.737012,61.439194,0.763697,0.031072
3,AdaBoostClassifier,0.684497,19.411731,0.741957,0.024768
4,KNeighbors,0.507838,69.892156,0.504493,0.019561
5,XGBClassifier,0.683997,24.337503,0.753749,0.021391


## Hyperparameter tuning for GradientBoostingClassifier

In [68]:
params = {
    'learning_rate'    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    'n_estimators'     : [1, 2, 4, 8, 16, 32, 64, 100, 200],
    'max_depth'        : [3,4,5,6,8,10,12,15],
    'min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True),
    'min_samples_leaf' : np.linspace(0.1, 0.5, 5, endpoint=True),
}

model = GradientBoostingClassifier(random_state=SEED)
random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=5, scoring='roc_auc', n_jobs=-1, cv=5, verbose=5)
random_search.fit(train, y_train)
random_search.best_estimator_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


GradientBoostingClassifier(learning_rate=0.2, max_depth=5, min_samples_leaf=0.1,
                           min_samples_split=0.9, n_estimators=64,
                           random_state=42)

In [66]:
## score 0.80139
model = GradientBoostingClassifier(learning_rate=0.15, max_depth=12,
                           min_samples_leaf=0.2,
                           min_samples_split=0.7000000000000001,
                           n_estimators=16, random_state=42)
model.fit(train, y_train)
roc_score = roc_auc_score(y_test, model.predict_proba(test)[:,1])
roc_score

0.750184857849679

In [69]:
## score 0.80139
model = GradientBoostingClassifier(learning_rate=0.2, max_depth=5, min_samples_leaf=0.1,
                           min_samples_split=0.9, n_estimators=64,
                           random_state=42)
model.fit(train, y_train)
roc_score = roc_auc_score(y_test, model.predict_proba(test)[:,1])
roc_score

0.7501019400806935

## Submission

In [20]:
mailout_test = pd.read_csv('arvato-test.csv', sep=';')
mailout_test_clean = fe_pipeline(data=mailout_test, 
                    selected_columns=config.COLS_FINAL, 
                    remove_null=False,
                   imputer = config.MODE_DICT)

yhat = pd.DataFrame(model.predict_proba(mailout_test_clean)[:,1], index=mailout_test['LNR'], columns=['RESPONSE'])
yhat.head()
yhat.to_csv("submission.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['CAMEO_DEUG_2015'] = pd.to_numeric(data['CAMEO_DEUG_2015'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['CAMEO_INTL_2015'] = pd.to_numeric(data['CAMEO_INTL_2015'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returnin

Fixing cameo columns
Replacing value to NaN
12 columns were affected
Imputing null values
Encoding categorical columns
