In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import lightgbm as lgb
import xgboost as xgb
from math import sqrt

NFOLDS = 5
SEED = 0
NROWS = None

In [2]:
# train = pd.read_csv('cleaned_train.csv')
# test = pd.read_csv('cleaned_test.csv')
train = pd.read_csv('training.csv')
test = pd.read_csv('testing.csv')

In [3]:
x = train[['income', 'experience', 'current_job_years',
       'current_house_years', 'prof_prob', 'city_prob', 'state_prob', 'age_group']]
y = train['risk_flag']
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)
X_test = test[['income', 'experience', 'current_job_years',
       'current_house_years', 'prof_prob', 'city_prob', 'state_prob', 'age_group']]
x_train = x
y_train = y
ntrain = x.shape[0]
ntest = X_test.shape[0]

In [4]:
kf = StratifiedKFold(n_splits = NFOLDS, shuffle=True, random_state=SEED)

In [5]:
train.head(10)

Unnamed: 0,Id,income,age,experience,married,house_ownership,car_ownership,profession,city,state,current_job_years,current_house_years,risk_flag,prof_prob,city_prob,state_prob,age_group
0,1,-1.283145,23,3,0,0,0,33,Rewa,13,3,13,0,0.111558,0.138374,0.182549,0.0
1,2,0.895457,40,10,0,0,0,43,Parbhani,14,9,13,0,0.148427,0.139597,0.127719,2.0
2,3,-0.349269,66,4,1,0,0,47,Alappuzha,12,4,10,0,0.134167,0.150502,0.20062,4.0
3,4,0.437526,41,2,0,0,1,43,Bhubaneswar,17,2,12,1,0.148427,0.484108,0.166249,2.0
4,5,0.268128,47,11,0,0,0,11,Tiruchirappalli[10],22,3,14,1,0.115794,0.137834,0.115029,2.0
5,6,0.666649,64,0,0,0,0,11,Jalgaon,14,0,12,0,0.115794,0.107235,0.127719,4.0
6,7,-0.362068,58,14,1,0,0,31,Tiruppur,22,8,12,0,0.112576,0.077519,0.115029,3.0
7,8,-1.143362,33,2,0,0,0,20,Jamnagar,6,2,14,0,0.099278,0.093909,0.133433,1.0
8,9,0.892794,24,17,0,0,1,25,Kota[6],20,11,11,0,0.123635,0.164021,0.163918,0.0
9,10,1.378495,23,12,0,0,0,2,Karimnagar,23,5,13,0,0.1312,0.091445,0.14958,0.0


In [6]:
x_train.head(10)

Unnamed: 0,income,experience,current_job_years,current_house_years,prof_prob,city_prob,state_prob,age_group
0,-1.283145,3,3,13,0.111558,0.138374,0.182549,0.0
1,0.895457,10,9,13,0.148427,0.139597,0.127719,2.0
2,-0.349269,4,4,10,0.134167,0.150502,0.20062,4.0
3,0.437526,2,2,12,0.148427,0.484108,0.166249,2.0
4,0.268128,11,3,14,0.115794,0.137834,0.115029,2.0
5,0.666649,0,0,12,0.115794,0.107235,0.127719,4.0
6,-0.362068,14,8,12,0.112576,0.077519,0.115029,3.0
7,-1.143362,2,2,14,0.099278,0.093909,0.133433,1.0
8,0.892794,17,11,11,0.123635,0.164021,0.163918,0.0
9,1.378495,12,5,13,0.1312,0.091445,0.14958,0.0


In [7]:
class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]

class CatboostWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_seed'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]
        
class LightGBMWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['feature_fraction_seed'] = seed
        params['bagging_seed'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]


class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))


def get_oof(clf):
    oof_train = np.zeros((ntrain,))
    print(oof_train.shape)
    oof_test = np.zeros((ntest,))
    print(oof_test.shape)
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):
        x_tr = x_train.loc[x_train.index.intersection(train_index), :]
        y_tr = y_train.loc[y_train.index.intersection(train_index)]
        x_te = x_train.loc[x_train.index.intersection(test_index), :]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(X_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    print(clf)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [8]:
et_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.5,
    'max_depth': 12,
    'min_samples_leaf': 2,
}


rf_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.2,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
#     'silent': 1,
    'eval_metric': 'auc',
    'subsample': 0.7,
    'learning_rate': 0.05,
    'objective': 'binary:logistic',
    'max_depth': 16,
    'num_parallel_tree': 2,
    'min_child_weight': 1,
    'nrounds': 500
}

catboost_params = {
    'iterations': 1000,
    'learning_rate': 0.5,
    'depth': 8,
    'l2_leaf_reg': 40,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.7,
    'scale_pos_weight': 5,
    'eval_metric': 'AUC',
    'od_type': 'Iter',
    'allow_writing_files': False
}

lightgbm_params = {
    'task': 'train', 
    'boosting_type': 'gbdt', 
    'objective': 'binary', 
    'metric': 'auc', 
    'learning_rate': 0.05, 
    'num_leaves': 32, 
    'num_iteration': 10000, 
    'verbose': 0 ,
    'colsample_bytree':.8, 
    'subsample':.9, 
    'max_depth':16, 
    'reg_alpha':.1, 
    'reg_lambda':.1, 
    'min_split_gain':.01, 
    'min_child_weight':1   
}

In [10]:
%%time
xg = XgbWrapper(seed=SEED, params=xgb_params)
et = SklearnWrapper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
cb = CatboostWrapper(clf= CatBoostClassifier, seed = SEED, params=catboost_params)
lg = LightGBMWrapper(clf = LGBMClassifier, seed = SEED, params = lightgbm_params)


et_oof_train, et_oof_test = get_oof(et)
rf_oof_train, rf_oof_test = get_oof(rf)
cb_oof_train, cb_oof_test = get_oof(cb)
lg_oof_train, lg_oof_test = get_oof(lg)
xg_oof_train, xg_oof_test = get_oof(xg)


print("XG-CV: {}".format(roc_auc_score(y_train, xg_oof_train))) 
print("ET-CV: {}".format(roc_auc_score(y_train, et_oof_train)))
print("RF-CV: {}".format(roc_auc_score(y_train, rf_oof_train)))
print("CB-CV: {}".format(roc_auc_score(y_train, cb_oof_train)))
print("LG-CV: {}".format(roc_auc_score(y_train, lg_oof_train)))

# x_train = np.concatenate((xg_oof_train, et_oof_train, rf_oof_train, cb_oof_train, lg_oof_train), axis=1)
# x_test = np.concatenate((xg_oof_test, et_oof_test, rf_oof_test, cb_oof_test, lg_oof_test), axis=1)

# x_train = np.concatenate((xg_oof_train, cb_oof_train, lg_oof_train, rf_oof_train), axis=1)
# x_test = np.concatenate((xg_oof_test, cb_oof_test, lg_oof_test, rf_oof_test), axis=1)

# x_train = np.concatenate((xg_oof_train, cb_oof_train, lg_oof_train, et_oof_train), axis=1)
# x_test = np.concatenate((xg_oof_test, cb_oof_test, lg_oof_test, et_oof_test), axis=1)

x_train = np.concatenate((xg_oof_train, cb_oof_train, lg_oof_train, et_oof_train, rf_oof_train), axis=1)
x_test = np.concatenate((xg_oof_test, cb_oof_test, lg_oof_test, et_oof_test, rf_oof_test), axis=1)

print("{},{}".format(x_train.shape, x_test.shape))


(252000,)
(28000,)


AttributeError: 'numpy.ndarray' object has no attribute 'loc'

In [11]:
x_train_new = x_train
x_test_new = x_test

In [12]:
# logistic_regression = LogisticRegression()
# logistic_regression.fit(x_train_new,y_train)

# test['risk_flag'] = logistic_regression.predict_proba(x_test)[:,1]
# test.risk_flag = np.where(test.risk_flag>0.5, 1, 0)
# y_preds = logistic_regression.predict_proba(x_train_new)[:, 1]
# y_preds = np.where(y_preds > 0.5, 1, 0)
# print('Ensemble-score: {}'.format(roc_auc_score(y_train, y_preds)))

In [13]:
# test['risk_flag'] = logistic_regression.predict_proba(x_test_new)[:,1]
# test.risk_flag = np.where(test.risk_flag>0.4, 1, 0)
# y_train[0:20]

In [13]:
# x_train[0:20, :]

In [14]:
# test.risk_flag.value_counts()

In [15]:
# import datetime
# date = datetime.datetime.now()
# test[['id', 'risk_flag']].to_csv(f'ensemble_{date}.csv', index=False)

In [16]:
# XG-CV: 0.6292151536394991
# ET-CV: 0.8988427797247228
# RF-CV: 0.9010358955914791
# CB-CV: 0.7645611386218554
# LG-CV: 0.929272754074071

# XG-CV: 0.9330286217604723
# ET-CV: 0.8988427797247228
# RF-CV: 0.9010358955914791
# CB-CV: 0.8646520054530108
# LG-CV: 0.929272754074071

# XG-CV: 0.9331201026431076
# ET-CV: 0.8988302562218672
# RF-CV: 0.9010479446584012
# CB-CV: 0.9009028763100921
# LG-CV: 0.9292682558375024

In [21]:
x_train_bin = np.where(x_train_new>0.1, 1, 0)
# x_train_bin[0:5]
y_final_ = np.zeros(252000)
co = 0
for r in x_train_bin:
    count = 0
    for c in r:
        if c == 1:
            count += 1
    if(count > 2):
        y_final_[co] = 1
    else:
        y_final_[co] = 0
    co += 1

In [22]:
x_test_bin = np.where(x_test_new>0.1, 1, 0)
x_test_bin[0:5]

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0]])

In [23]:
y_final = np.zeros(28000)
co = 0
for r in x_test_bin:
    count = 0
    for c in r:
        if c == 1:
            count += 1
    if(count > 2):
        y_final[co] = 1
    else:
        y_final[co] = 0
    co += 1

In [24]:
test['risk'] = y_final
print(test.risk.value_counts())
test['risk'] = test.risk.astype(int)

0.0    20757
1.0     7243
Name: risk, dtype: int64


In [26]:
import datetime
date = datetime.datetime.now()
test[['id', 'risk']].to_csv(f'ensemble_{date}.csv', index=False)

In [25]:
print(roc_auc_score(y_train, y_final_))

0.8703814616606285
