In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import lightgbm as lgb
import xgboost as xgb
from math import sqrt

NFOLDS = 7
SEED = 0
NROWS = None

In [21]:
train = pd.read_csv('training.csv')
test = pd.read_csv('testing.csv')

In [22]:
x = train[['income', 'experience', 'current_job_years',
        'prof_prob', 'city_prob', 'state_prob', 'age_group']]
# 'current_house_years'
y = train['risk_flag']
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)
X_test = test[['income', 'experience', 'current_job_years',
        'prof_prob', 'city_prob', 'state_prob', 'age_group']]
x_train = x
y_train = y
ntrain = x.shape[0]
ntest = X_test.shape[0]

kf = StratifiedKFold(n_splits = NFOLDS, shuffle=True, random_state=SEED)

In [23]:
x_train.head(10)

Unnamed: 0,income,experience,current_job_years,prof_prob,city_prob,state_prob,age_group
0,-1.283145,3,3,0.111558,0.138374,0.182549,0.0
1,0.895457,10,9,0.148427,0.139597,0.127719,2.0
2,-0.349269,4,4,0.134167,0.150502,0.20062,4.0
3,0.437526,2,2,0.148427,0.484108,0.166249,2.0
4,0.268128,11,3,0.115794,0.137834,0.115029,2.0
5,0.666649,0,0,0.115794,0.107235,0.127719,4.0
6,-0.362068,14,8,0.112576,0.077519,0.115029,3.0
7,-1.143362,2,2,0.099278,0.093909,0.133433,1.0
8,0.892794,17,11,0.123635,0.164021,0.163918,0.0
9,1.378495,12,5,0.1312,0.091445,0.14958,0.0


In [24]:
class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]

class CatboostWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_seed'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]
        
class LightGBMWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['feature_fraction_seed'] = seed
        params['bagging_seed'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]


class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))


# def get_oof(clf):
#     oof_train = np.zeros((ntrain,))
#     print(oof_train.shape)
#     oof_test = np.zeros((ntest,))
#     print(oof_test.shape)
#     oof_test_skf = np.empty((NFOLDS, ntest))

#     for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):
#         x_tr = x_train.loc[x_train.index.intersection(train_index), :]
#         y_tr = y_train.loc[y_train.index.intersection(train_index)]
#         x_te = x_train.loc[x_train.index.intersection(test_index), :]

#         clf.train(x_tr, y_tr)

#         oof_train[test_index] = clf.predict(x_te)
#         oof_test_skf[i, :] = clf.predict(X_test)

#     oof_test[:] = oof_test_skf.mean(axis=0)
#     print(clf)
#     return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [25]:
et_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.5,
    'max_depth': 12,
    'min_samples_leaf': 2,
}


rf_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.2,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
#     'silent': 1,
    'eval_metric': 'auc',
    'subsample': 0.7,
    'learning_rate': 0.05,
    'objective': 'binary:logistic',
    'max_depth': 16,
    'num_parallel_tree': 2,
    'min_child_weight': 1,
    'nrounds': 500
}

catboost_params = {
    'iterations': 1000,
    'learning_rate': 0.5,
    'depth': 8,
    'l2_leaf_reg': 40,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.7,
    'scale_pos_weight': 5,
    'eval_metric': 'AUC',
    'od_type': 'Iter',
    'allow_writing_files': False
}

lightgbm_params = {
    'task': 'train', 
    'boosting_type': 'gbdt', 
    'objective': 'binary', 
    'metric': 'auc', 
    'learning_rate': 0.05, 
    'num_leaves': 32, 
    'num_iteration': 10000, 
    'verbose': 0 ,
    'colsample_bytree':.8, 
    'subsample':.9, 
    'max_depth':16, 
    'reg_alpha':.1, 
    'reg_lambda':.1, 
    'min_split_gain':.01, 
    'min_child_weight':1   
}

In [26]:
# %%time
# xg = XgbWrapper(seed=SEED, params=xgb_params)
# et = SklearnWrapper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
# rf = SklearnWrapper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
# cb = CatboostWrapper(clf= CatBoostClassifier, seed = SEED, params=catboost_params)
# lg = LightGBMWrapper(clf = LGBMClassifier, seed = SEED, params = lightgbm_params)


# et_oof_train, et_oof_test = get_oof(et)
# rf_oof_train, rf_oof_test = get_oof(rf)
# cb_oof_train, cb_oof_test = get_oof(cb)
# lg_oof_train, lg_oof_test = get_oof(lg)
# xg_oof_train, xg_oof_test = get_oof(xg)


# print("XG-CV: {}".format(roc_auc_score(y_train, xg_oof_train))) 
# print("ET-CV: {}".format(roc_auc_score(y_train, et_oof_train)))
# print("RF-CV: {}".format(roc_auc_score(y_train, rf_oof_train)))
# print("CB-CV: {}".format(roc_auc_score(y_train, cb_oof_train)))
# print("LG-CV: {}".format(roc_auc_score(y_train, lg_oof_train)))

# # x_train = np.concatenate((xg_oof_train, et_oof_train, rf_oof_train, cb_oof_train, lg_oof_train), axis=1)
# # x_test = np.concatenate((xg_oof_test, et_oof_test, rf_oof_test, cb_oof_test, lg_oof_test), axis=1)

# # x_train = np.concatenate((xg_oof_train, cb_oof_train, lg_oof_train, rf_oof_train), axis=1)
# # x_test = np.concatenate((xg_oof_test, cb_oof_test, lg_oof_test, rf_oof_test), axis=1)

# # x_train = np.concatenate((xg_oof_train, cb_oof_train, lg_oof_train, et_oof_train), axis=1)
# # x_test = np.concatenate((xg_oof_test, cb_oof_test, lg_oof_test, et_oof_test), axis=1)

# x_train = np.concatenate((xg_oof_train, cb_oof_train, lg_oof_train, et_oof_train, rf_oof_train), axis=1)
# x_test = np.concatenate((xg_oof_test, cb_oof_test, lg_oof_test, et_oof_test, rf_oof_test), axis=1)

# print("{},{}".format(x_train.shape, x_test.shape))

In [27]:
# x_train_new = x_train
# x_test_new = x_test

# x_train_bin = np.where(x_train_new>0.1, 1, 0)
# # x_train_bin[0:5]
# y_final_ = np.zeros(252000)
# co = 0
# for r in x_train_bin:
#     count = 0
#     for c in r:
#         if c == 1:
#             count += 1
#     if(count > 2):
#         y_final_[co] = 1
#     else:
#         y_final_[co] = 0
#     co += 1

# print('Training score before psuedo labels')
# print(roc_auc_score(y_train, y_final_))

In [28]:
######################## psuedo labels

# y_final = np.zeros(28000)
# co = 0
# for r in x_test_bin:
#     count = 0
#     for c in r:
#         if c == 1:
#             count += 1
#     if(count > 2):
#         y_final[co] = 1
#     else:
#         y_final[co] = 0
#     co += 1

# test['risk_flag'] = y_final

In [29]:
x_train

Unnamed: 0,income,experience,current_job_years,prof_prob,city_prob,state_prob,age_group
0,-1.283145,3,3,0.111558,0.138374,0.182549,0.0
1,0.895457,10,9,0.148427,0.139597,0.127719,2.0
2,-0.349269,4,4,0.134167,0.150502,0.200620,4.0
3,0.437526,2,2,0.148427,0.484108,0.166249,2.0
4,0.268128,11,3,0.115794,0.137834,0.115029,2.0
...,...,...,...,...,...,...,...
251995,1.097092,13,6,0.115465,0.088514,0.146967,2.0
251996,-0.748199,10,6,0.152113,0.138374,0.182549,0.0
251997,-0.164913,7,7,0.106999,0.094535,0.127719,2.0
251998,0.524618,0,0,0.115370,0.097919,0.131912,2.0


In [30]:
X_test

Unnamed: 0,income,experience,current_job_years,prof_prob,city_prob,state_prob,age_group
0,0.832425,19,4,0.144264,0.126478,0.146967,3.0
1,-1.314007,5,5,0.135789,0.121641,0.127719,0.0
2,1.356432,12,9,0.129514,0.078453,0.127719,3.0
3,-1.060588,9,3,0.121465,0.041293,0.127719,2.0
4,-1.731466,18,13,0.119604,0.173848,0.146967,0.0
...,...,...,...,...,...,...,...
27995,1.722668,13,5,0.115570,0.068729,0.131249,3.0
27996,-0.722422,9,9,0.134167,0.237569,0.182549,2.0
27997,1.071915,5,4,0.129514,0.040179,0.127719,0.0
27998,1.555451,13,13,0.153572,0.164935,0.153777,3.0


In [31]:
y_train

0         0
1         0
2         0
3         1
4         1
         ..
251995    0
251996    0
251997    0
251998    0
251999    0
Name: risk_flag, Length: 252000, dtype: int64

In [32]:
y_test = pd.read_csv('ensemble_2021-04-17 13:58:24.221825(0.88018).csv')['risk_flag']

In [33]:
y_test

0        0
1        0
2        0
3        0
4        0
        ..
27995    0
27996    0
27997    0
27998    1
27999    1
Name: risk_flag, Length: 28000, dtype: int64

In [34]:
f = [x_train, X_test]
x_tra = pd.concat(f, ignore_index=True)

In [35]:
y_tra = pd.concat([y_train, y_test], ignore_index=True)

In [36]:
x_tra

Unnamed: 0,income,experience,current_job_years,prof_prob,city_prob,state_prob,age_group
0,-1.283145,3,3,0.111558,0.138374,0.182549,0.0
1,0.895457,10,9,0.148427,0.139597,0.127719,2.0
2,-0.349269,4,4,0.134167,0.150502,0.200620,4.0
3,0.437526,2,2,0.148427,0.484108,0.166249,2.0
4,0.268128,11,3,0.115794,0.137834,0.115029,2.0
...,...,...,...,...,...,...,...
279995,1.722668,13,5,0.115570,0.068729,0.131249,3.0
279996,-0.722422,9,9,0.134167,0.237569,0.182549,2.0
279997,1.071915,5,4,0.129514,0.040179,0.127719,0.0
279998,1.555451,13,13,0.153572,0.164935,0.153777,3.0


In [37]:
X_test

Unnamed: 0,income,experience,current_job_years,prof_prob,city_prob,state_prob,age_group
0,0.832425,19,4,0.144264,0.126478,0.146967,3.0
1,-1.314007,5,5,0.135789,0.121641,0.127719,0.0
2,1.356432,12,9,0.129514,0.078453,0.127719,3.0
3,-1.060588,9,3,0.121465,0.041293,0.127719,2.0
4,-1.731466,18,13,0.119604,0.173848,0.146967,0.0
...,...,...,...,...,...,...,...
27995,1.722668,13,5,0.115570,0.068729,0.131249,3.0
27996,-0.722422,9,9,0.134167,0.237569,0.182549,2.0
27997,1.071915,5,4,0.129514,0.040179,0.127719,0.0
27998,1.555451,13,13,0.153572,0.164935,0.153777,3.0


In [38]:
ntrain = x_tra.shape[0]
def get_oof(clf):
    oof_train = np.zeros((ntrain,))
    print(oof_train.shape)
    oof_test = np.zeros((ntest,))
    print(oof_test.shape)
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_tra, y_tra)):
        x_tr = x_tra.loc[x_tra.index.intersection(train_index), :]
        y_tr = y_tra.loc[y_tra.index.intersection(train_index)]
        x_te = x_tra.loc[x_tra.index.intersection(test_index), :]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(X_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    print(clf)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [39]:
%%time
xg = XgbWrapper(seed=SEED, params=xgb_params)
et = SklearnWrapper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
cb = CatboostWrapper(clf= CatBoostClassifier, seed = SEED, params=catboost_params)
lg = LightGBMWrapper(clf = LGBMClassifier, seed = SEED, params = lightgbm_params)


et_oof_train, et_oof_test = get_oof(et)
rf_oof_train, rf_oof_test = get_oof(rf)
cb_oof_train, cb_oof_test = get_oof(cb)
lg_oof_train, lg_oof_test = get_oof(lg)
xg_oof_train, xg_oof_test = get_oof(xg)


(280000,)
(28000,)
<__main__.SklearnWrapper object at 0x7f1bfd3ed280>
(280000,)
(28000,)
<__main__.SklearnWrapper object at 0x7f1bfd3ed5e0>
(280000,)
(28000,)
0:	total: 144ms	remaining: 2m 23s
1:	total: 210ms	remaining: 1m 44s
2:	total: 265ms	remaining: 1m 28s
3:	total: 320ms	remaining: 1m 19s
4:	total: 385ms	remaining: 1m 16s
5:	total: 442ms	remaining: 1m 13s
6:	total: 496ms	remaining: 1m 10s
7:	total: 551ms	remaining: 1m 8s
8:	total: 611ms	remaining: 1m 7s
9:	total: 668ms	remaining: 1m 6s
10:	total: 725ms	remaining: 1m 5s
11:	total: 779ms	remaining: 1m 4s
12:	total: 846ms	remaining: 1m 4s
13:	total: 917ms	remaining: 1m 4s
14:	total: 984ms	remaining: 1m 4s
15:	total: 1.05s	remaining: 1m 4s
16:	total: 1.11s	remaining: 1m 4s
17:	total: 1.17s	remaining: 1m 3s
18:	total: 1.23s	remaining: 1m 3s
19:	total: 1.29s	remaining: 1m 3s
20:	total: 1.34s	remaining: 1m 2s
21:	total: 1.4s	remaining: 1m 2s
22:	total: 1.46s	remaining: 1m 1s
23:	total: 1.52s	remaining: 1m 1s
24:	total: 1.57s	remaining: 1



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.




You can set `force_col_wise=true` to remove the overhead.




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
<__main__.LightGBMWrapper object at 0x7f1bfd3edb80>
(280000,)
(28000,)
<__main__.XgbWrapper object at 0x7f1bfd3ed220>
CPU times: user 6h 19min 16s, sys: 3min 21s, total: 6h 22min 37s
Wall time: 53min 17s


In [40]:
print("XG-CV: {}".format(roc_auc_score(y_tra, xg_oof_train))) 
print("ET-CV: {}".format(roc_auc_score(y_tra, et_oof_train)))
print("RF-CV: {}".format(roc_auc_score(y_tra, rf_oof_train)))
print("CB-CV: {}".format(roc_auc_score(y_tra, cb_oof_train)))
print("LG-CV: {}".format(roc_auc_score(y_tra, lg_oof_train)))

x_train = np.concatenate((xg_oof_train, cb_oof_train, lg_oof_train, et_oof_train, rf_oof_train), axis=1)
x_test = np.concatenate((xg_oof_test, cb_oof_test, lg_oof_test, et_oof_test, rf_oof_test), axis=1)

print("{},{}".format(x_train.shape, x_test.shape))

XG-CV: 0.939669961354736
ET-CV: 0.8515776238566354
RF-CV: 0.8945986186525241
CB-CV: 0.9233739021991503
LG-CV: 0.9327680398523418
(280000, 5),(28000, 5)


In [41]:
x_train_new = x_train
x_test_new = x_test

In [42]:
x_train_bin = np.where(x_train_new>0.1, 1, 0)
# x_train_bin[0:5]
y_final_ = np.zeros(280000)
co = 0
for r in x_train_bin:
    count = 0
    for c in r:
        if c == 1:
            count += 1
    if(count > 2):
        y_final_[co] = 1
    else:
        y_final_[co] = 0
    co += 1
    
print(roc_auc_score(y_tra, y_final_))

0.8863090376398501


In [43]:
x_test_bin = np.where(x_test_new>0.1, 1, 0)
x_test_bin[0:5]

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0]])

In [44]:
y_final = np.zeros(28000)
co = 0
for r in x_test_bin:
    count = 0
    for c in r:
        if c == 1:
            count += 1
    if(count > 2):
        y_final[co] = 1
    else:
        y_final[co] = 0
    co += 1

In [45]:
test['risk_flag'] = y_final
print(test.risk_flag.value_counts())
test['risk_flag'] = test.risk_flag.astype(int)

0.0    20094
1.0     7906
Name: risk_flag, dtype: int64


In [46]:
import datetime
date = datetime.datetime.now()
test[['id', 'risk_flag']].to_csv(f'ensemble_{date}.csv', index=False)