In [13]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import time

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

import config

sns.set()
SEED=42

In [5]:
data = pd.read_csv('arvato-train.csv', sep=";")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42962 entries, 0 to 42961
Columns: 367 entries, LNR to ALTERSKATEGORIE_GROB
dtypes: float64(267), int64(94), object(6)
memory usage: 120.3+ MB


In [6]:
def fix_cameo_features(data):
    data['CAMEO_DEUG_2015'] = pd.to_numeric(data['CAMEO_DEUG_2015'], errors='coerce')
    data['CAMEO_INTL_2015'] = pd.to_numeric(data['CAMEO_INTL_2015'], errors='coerce')
    data['EINGEFUEGT_AM'] = pd.to_datetime(data['EINGEFUEGT_AM']).dt.year
    data['CAMEO_DEU_2015'] = data['CAMEO_DEU_2015'].apply(lambda x: x[-1] if isinstance(x, str) else x)
    return data

def _core_replace_NaN(series):
    """
    Replace negative and outliers values (according to the scale) to NaN. Takes a pd.Series as input
    Works only on one column
    """
    temp_unique = sorted(series.value_counts().index.tolist())
    series = np.where(series < 0, np.nan, series)
    if temp_unique[-1] - temp_unique[-2] > 1:
        series = np.where(series == temp_unique[-1], np.nan, series)
    return series

def replace2NaN(data):
    """
    Replace negative and outliers values for all columns in a Dataframe. Takes a pd.DataFrame as input
    it returns the same dataframe with a summary of the percentage of null for each column to see which one
    were affected.
    """
    data= data.copy()
    num_var = data.select_dtypes(include='number').columns.tolist()
    
    previous_state = data.isnull().mean()
#     print(previous_state)
    
    for var in num_var:
        data[var] = _core_replace_NaN(data[var])  
        
    after_state = data.isnull().mean()
    
    summary = pd.concat([previous_state.to_frame(), after_state.to_frame()], axis=1)
    summary.columns = ['previous', 'after']
    summary['difference'] = summary.diff(axis=1).after
    cols_affected = (summary['difference'] > 0).sum()
    
    print(f'{cols_affected} columns were affected')
    
    return data, summary

def remove_column_null(data, limit):
    non_null_columns = [x for x in data.columns if data[x].isnull().mean() <= limit]
    print(f'{len(non_null_columns)} out of {data.shape[1]} columns remains')
    return data[non_null_columns], non_null_columns


def remove_row_null(data, limit):
    mask = data.isnull().sum(axis=1) <= limit
    print(f'{mask.sum()} out of {data.shape[0]} rows remains')
    return data[mask]

def null_imputer(df, imputer):
    return df.fillna(imputer)

def one_hot_encoder(data):
    data = pd.concat([data, pd.get_dummies(data['OST_WEST_KZ'], prefix='OST_WEST_KZ_')], axis=1).drop(columns=['OST_WEST_KZ'])
    data = pd.concat([data, pd.get_dummies(data['CAMEO_DEU_2015'], prefix='CAMEO_DEU_2015_')], axis=1).drop(columns=['CAMEO_DEU_2015'])
    return data

In [7]:
def fe_pipeline(data, selected_columns, imputer, remove_null, limit_null_row=20, limit_null_column=0.2):
    
    data = data[selected_columns]
    print('Fixing cameo columns')
    data = fix_cameo_features(data)
    print('Replacing value to NaN')
    data, _ = replace2NaN(data)
    if remove_null:
        print(f'Removing columns with more than {limit_null_column}')
        data, _ = remove_column_null(data, limit_null_column)
        print(f'Removing rows with more than {limit_null_row}')
        data = remove_row_null(data, limit_null_row)
    print(f'Imputing null values')
    data = null_imputer(data, imputer)
    print('Encoding categorical columns')
    data = one_hot_encoder(data)
    return data
    

In [8]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=['RESPONSE']),
                                                    data['RESPONSE'],
                                                    test_size=0.2,
                                                    random_state=SEED
                                                   )


train = fe_pipeline(data=X_train, 
                    selected_columns=config.COLS_FINAL, 
                    remove_null=True,
                   limit_null_column=0.2,
                   limit_null_row=20,
                   imputer = config.MODE_DICT)

test = fe_pipeline(data=X_test, 
                    selected_columns=config.COLS_FINAL, 
                    remove_null=False,
                   imputer = config.MODE_DICT)

Fixing cameo columns
Replacing value to NaN


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['CAMEO_DEUG_2015'] = pd.to_numeric(data['CAMEO_DEUG_2015'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['CAMEO_INTL_2015'] = pd.to_numeric(data['CAMEO_INTL_2015'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['EINGEFUEGT_AM'] = pd.to_datetime(da

12 columns were affected
Removing columns with more than 0.2
271 out of 271 columns remains
Removing rows with more than 20
27773 out of 34369 rows remains
Imputing null values
Encoding categorical columns
Fixing cameo columns
Replacing value to NaN


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['CAMEO_DEUG_2015'] = pd.to_numeric(data['CAMEO_DEUG_2015'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['CAMEO_INTL_2015'] = pd.to_numeric(data['CAMEO_INTL_2015'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['EINGEFUEGT_AM'] = pd.to_datetime(da

11 columns were affected
Imputing null values
Encoding categorical columns


In [9]:
y_train = y_train.loc[train.index.tolist()]
y_train.shape

(27773,)

In [10]:
train = train.astype('float')
test = test.astype('float')

In [11]:
def train_and_predict(model, X_train, y_train, X_test, y_test):
    """
    Fit a model on X_train, y_train
    predicts on X_text, y_test 
    Calculate AUROC on predictions made on test data
    
    Outputs - AUROC score, time elapse for training and prediction    
    """
    start = time.time()
    model = model.fit(X_train, y_train)
    
    roc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:,1])
    
    end = time.time()
    time_elapsed = end - start
    
    return roc_score, time_elapsed

models = [("LogisticRegression", LogisticRegression(random_state=SEED, max_iter=10000)),
         ("RandomForestClassifier", RandomForestClassifier(random_state=SEED)),
         ("GradientBoostingClassifier", GradientBoostingClassifier(random_state=SEED)),
         ("AdaBoostClassifier", AdaBoostClassifier(random_state=SEED)),
          ('KNeighbors', KNeighborsClassifier()),
         ("XGBClassifier",xgb.XGBClassifier(random_state=SEED))]

## Without Scaling

In [44]:
import time
results = {"Model":[],
          "AUCROC_score":[],
          "Time_in_sec":[]}

for name, model in models:
    print(name)
    roc, time_ = train_and_predict(model, train, y_train, test, y_test)
    results["Model"].append(name)
    results["AUCROC_score"].append(roc)
    results["Time_in_sec"].append(time_)

    
results = pd.DataFrame.from_dict(results, orient='index').transpose()
results

LogisticRegression
RandomForestClassifier
GradientBoostingClassifier
AdaBoostClassifier
KNeighbors
XGBClassifier






Unnamed: 0,Model,AUCROC_score,Time_in_sec
0,LogisticRegression,0.656398,41.289523
1,RandomForestClassifier,0.634512,7.204932
2,GradientBoostingClassifier,0.737012,33.795716
3,AdaBoostClassifier,0.684497,8.62734
4,KNeighbors,0.507838,6.664183
5,XGBClassifier,0.683997,4.183281


### With Scaling

In [46]:
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train)
test_scaled = scaler.transform(test)

results = {"Model":[],
          "AUCROC_score":[],
          "Time_in_sec":[]}

for name, model in models:
    print(name)
    roc, time_ = train_and_predict(model, train_scaled, y_train, test_scaled, y_test)
    results["Model"].append(name)
    results["AUCROC_score"].append(roc)
    results["Time_in_sec"].append(time_)

    
results = pd.DataFrame.from_dict(results, orient='index').transpose()
results

LogisticRegression
RandomForestClassifier
GradientBoostingClassifier
AdaBoostClassifier
KNeighbors
XGBClassifier






Unnamed: 0,Model,AUCROC_score,Time_in_sec
0,LogisticRegression,0.645132,2.404022
1,RandomForestClassifier,0.63552,9.070027
2,GradientBoostingClassifier,0.737012,33.050811
3,AdaBoostClassifier,0.684497,8.879995
4,KNeighbors,0.526006,6.24712
5,XGBClassifier,0.683997,4.19005


In [60]:
cv = cross_val_score(best_model, train, y_train, scoring='roc_auc', n_jobs=-1, verbose=2, cv=5)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   35.3s remaining:   23.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   35.5s finished


In [61]:
cv.mean(), cv.std()

(0.7636967803776888, 0.031072362995644615)

### Hyperparameter tuning for GB

In [62]:
from sklearn.model_selection import RandomizedSearchCV

xgbclassifier = xgb.XGBClassifier(random_state=SEED)
params = {
    'learning_rate'    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    'max_depth'        : [3,4,5,6,8,10,12,15],
    'min_child_weight' : [1,3,5,7],
    'gamma'            : [0.0, 0.1, 0.2, 0.3, 0.4],
    'colsample_bytree' : [0.3, 0.4, 0.5, 0.7]
}

random_search = RandomizedSearchCV(xgbclassifier, param_distributions=params, n_iter=5, scoring='roc_auc', n_jobs=-1, cv=5, verbose=3)
random_search.fit(train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits






RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100,...
                                           reg_lambda=None,
                                           scale_pos_weight=None,
                                           subsample=No

In [71]:
random_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0.0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [65]:
random_search.best_score_

0.7834846512733591

In [72]:
best_xgb = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0.0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

cv = cross_val_score(best_xgb, train, y_train, scoring='roc_auc', n_jobs=-1, verbose=3, cv=10)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:   28.3s remaining:  1.1min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   32.4s remaining:   13.8s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   32.5s finished


In [54]:
best_xgb = xgb.XGBClassifier(random_state=SEED, scale_pos_weight=10)
best_xgb.fit(train, y_train)
roc_score = roc_auc_score(y_test, best_xgb.predict_proba(test)[:,1])
roc_score





0.6913132418701566

In [74]:
cv.mean(), cv.std()

(0.7760395599263137, 0.042949681523086564)

## Hyperparameter tuning for GradientBoostingClassifier

In [17]:
params = {
    'learning_rate'    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    'n_estimators'     : [1, 2, 4, 8, 16, 32, 64, 100, 200],
    'max_depth'        : [3,4,5,6,8,10,12,15],
    'min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True),
    'min_samples_leaf' : np.linspace(0.1, 0.5, 5, endpoint=True),
}

model = GradientBoostingClassifier(random_state=SEED)
random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=5, scoring='roc_auc', n_jobs=-1, cv=5, verbose=5)
random_search.fit(train, y_train)
random_search.best_estimator_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


GradientBoostingClassifier(max_depth=12, min_samples_leaf=0.1,
                           min_samples_split=0.8, n_estimators=64,
                           random_state=42)

In [None]:
# gridsearch = GridSearchCV(model, param_grid=params, scoring='roc_auc', n_jobs=-1, cv=0, verbose=10)
# gridsearch.fit(train, y_train)

In [22]:
## score 0.80139
model = GradientBoostingClassifier(learning_rate=0.15, max_depth=12,
                           min_samples_leaf=0.2,
                           min_samples_split=0.7000000000000001,
                           n_estimators=16, random_state=42)
model.fit(train, y_train)
roc_score = roc_auc_score(y_test, model.predict_proba(test)[:,1])
roc_score

0.750184857849679

In [26]:
from sklearn.metrics import recall_score

recall_score(y_test, model.predict(test))


0.0

In [91]:
## score 0.8009
model = GradientBoostingClassifier(learning_rate=0.2, min_samples_leaf=0.1,
                           min_samples_split=0.7000000000000001,
                           random_state=42)
model.fit(train, y_train)
roc_score = roc_auc_score(y_test, model.predict_proba(test)[:,1])
roc_score

0.7514813015553423

## SMOTE

In [35]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=SEED)
X_sm, y_sm = sm.fit_resample(train, y_train)

y_sm.value_counts(normalize=True)

0    0.5
1    0.5
Name: RESPONSE, dtype: float64

In [37]:

results = {"Model":[],
          "AUCROC_score":[],
          "Time_in_sec":[]}

for name, model in models:
    print(name)
    roc, time_ = train_and_predict(model, X_sm, y_sm, test, y_test)
    results["Model"].append(name)
    results["AUCROC_score"].append(roc)
    results["Time_in_sec"].append(time_)

    
results = pd.DataFrame.from_dict(results, orient='index').transpose()
results

LogisticRegression
RandomForestClassifier
GradientBoostingClassifier
AdaBoostClassifier
KNeighbors
XGBClassifier






Unnamed: 0,Model,AUCROC_score,Time_in_sec
0,LogisticRegression,0.621517,123.101335
1,RandomForestClassifier,0.64499,67.333851
2,GradientBoostingClassifier,0.693575,193.411212
3,AdaBoostClassifier,0.663011,41.426377
4,KNeighbors,0.553883,26.166415
5,XGBClassifier,0.665484,15.35159


In [44]:
model = GradientBoostingClassifier(learning_rate=0.2, min_samples_leaf=0.1,
                           min_samples_split=0.7000000000000001,
                           random_state=42)
model.fit(X_sm, y_sm)
roc_score = roc_auc_score(y_test, model.predict_proba(test)[:,1])
recall = recall_score(y_test, model.predict(test))
roc_score, recall

(0.7000596032433529, 0.0)

In [46]:
from sklearn.metrics import classification_report

In [49]:
classification_report(y_test, model.predict(test), output_dict=True)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'0': {'precision': 0.98591877109275,
  'recall': 1.0,
  'f1-score': 0.9929094638148257,
  'support': 8472},
 '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 121},
 'accuracy': 0.98591877109275,
 'macro avg': {'precision': 0.492959385546375,
  'recall': 0.5,
  'f1-score': 0.49645473190741285,
  'support': 8593},
 'weighted avg': {'precision': 0.9720358231930382,
  'recall': 0.98591877109275,
  'f1-score': 0.9789280783706742,
  'support': 8593}}

In [45]:
model.predict(test).sum()

0

## Submission

In [50]:
best_model = GradientBoostingClassifier(random_state=SEED)
best_model.fit(train, y_train)

GradientBoostingClassifier(random_state=42)

In [20]:
mailout_test = pd.read_csv('arvato-test.csv', sep=';')
mailout_test_clean = fe_pipeline(data=mailout_test, 
                    selected_columns=config.COLS_FINAL, 
                    remove_null=False,
                   imputer = config.MODE_DICT)

yhat = pd.DataFrame(model.predict_proba(mailout_test_clean)[:,1], index=mailout_test['LNR'], columns=['RESPONSE'])
yhat.head()
yhat.to_csv("submission.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['CAMEO_DEUG_2015'] = pd.to_numeric(data['CAMEO_DEUG_2015'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['CAMEO_INTL_2015'] = pd.to_numeric(data['CAMEO_INTL_2015'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returnin

Fixing cameo columns
Replacing value to NaN
12 columns were affected
Imputing null values
Encoding categorical columns
