In [12]:
import pandas as pd
import numpy as np

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import pickle

from sklearn.metrics import confusion_matrix,f1_score,roc_auc_score,roc_curve
import matplotlib.pyplot as plt
%matplotlib inline

# data

In [2]:
df = pd.read_csv('Processed_data/clean_data.csv',index_col=0)

In [3]:
df.head()

Unnamed: 0,claim_id,enrollee_id,provider_id,provider_status,hmo_id,care_id,qty,amount,approved_qty,approved_amount,hmo_approved,created_at,vetted_at,label,unit_price
0,10.0,89.0,1.0,1.0,1.0,586.0,6.0,1816.08,6.0,1816.08,1.0,2018-03-12 14:53:46,2018-05-21 10:05:30,0,302.68
1,11.0,89.0,1.0,1.0,1.0,586.0,6.0,1816.08,6.0,1816.08,1.0,2018-03-13 14:50:39,2018-05-21 10:07:19,0,302.68
2,13.0,74.0,1.0,1.0,1.0,434.0,5.0,115.0,5.0,115.0,1.0,2018-03-16 10:28:53,2018-05-21 10:09:30,0,23.0
3,13.0,74.0,1.0,1.0,1.0,1102.0,10.0,1265.0,10.0,1265.0,1.0,2018-03-16 10:28:53,2018-05-21 10:09:30,0,126.5
4,13.0,74.0,1.0,1.0,1.0,299.0,15.0,138.0,15.0,138.0,1.0,2018-03-16 10:28:53,2018-05-21 10:09:30,0,9.2


In [8]:
X = df[['enrollee_id','provider_id','provider_status','hmo_id','care_id','qty','unit_price']]
y = df['label']
Xtrain,Xtest,ytrain,ytest = train_test_split(X.values,y.values,test_size = 0.2,random_state=0)


In [5]:
def init_scores():
    
    global scores 
    scores = {}
    scores['f1_socre'] = []
    scores['precision'] = []
    scores['recall'] = []
    scores['FPR'] = []
    scores['specificity'] = []
    scores['roc_auc'] = []
    
    return scores


def evaluation(ytest,Xtest,cls):
    global scores
    scores = init_scores()
    ypred = cls.predict(Xtest)
    C = confusion_matrix(ytest,ypred)
    TN = C[0][0]
    FN = C[1][0]
    TP = C[1][1]
    FP = C[0][1]

    print('f1_socre: {:0.3f}'.format(f1_score(ytest,ypred)))
    scores['f1_socre'].append(f1_score(ytest,ypred))
    print('precision: {:0.3f}'.format(TP/(TP+FP)))
    scores['precision'].append(TP/(TP+FP))
    print('recall/sensitivity(true positive rate): {:0.3f}'.format(TP/(TP+FN)))
    scores['recall'].append(TP/(TP+FN))
    print('false positive rate (FPR): {:0.3f}'.format(1-(TN/(TN+FP)))) # 1 - specificity
    scores['FPR'].append(1-(TN/(TN+FP)))
    print('spcificity(true negative rate): {:0.3f}'.format(TN/(TN+FP)))
    scores['specificity'].append(TN/(TN+FP))
    print('ROC_AUC_score: {:0.3f}'.format(roc_auc_score(ytest,ypred)))
    scores['roc_auc'].append(roc_auc_score(ytest,ypred))
    
    
def cv(Xtrain,ytrain,model):
    global scores
    models = []
    kf = KFold(n_splits=4)
    print(model)
    n = 0
    for train_index, test_index in kf.split(Xtrain):
        print('cross_validate_run: {}'.format(n))
        Xtr, Xte = Xtrain[train_index], Xtrain[test_index]
        ytr, yte = ytrain[train_index], ytrain[test_index]
        cls = model.fit(Xtr, ytr) 
        models.append(cls)
        evaluation(yte,Xte,cls)
        n += 1
    
    print('\n mean scores +/- sd: \n')
    for k in scores:
        print('{} : {:0.3f} +/- {:0.3f}'.format(k, np.array(scores[k]).mean(),  np.array(scores[k]).std()))
        
    scores = init_scores()
    return models

# lightGBM no_upsampling

In [30]:
model_lgb = LGBMClassifier(boosting_type='gbdt', num_leaves=100, max_depth=15, learning_rate=0.1, \
                       n_estimators=300, subsample_for_bin=200000, objective=None, class_weight={1:100}, \
                       min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20, subsample=1.0, \
                       subsample_freq=0, colsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0, \
                       random_state=None, n_jobs=3, silent=True, importance_type='split')
models_lgb = cv(Xtrain,ytrain,model_lgb)

LGBMClassifier(boosting_type='gbdt', class_weight={1: 100},
               colsample_bytree=1.0, importance_type='split', learning_rate=0.1,
               max_depth=15, min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=300, n_jobs=3, num_leaves=100,
               objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)
cross_validate_run: 0
f1_socre: 0.195
precision: 0.111
recall/sensitivity(true positive rate): 0.825
false positive rate (FPR): 0.341
spcificity(true negative rate): 0.659
ROC_AUC_score: 0.742
cross_validate_run: 1
f1_socre: 0.200
precision: 0.114
recall/sensitivity(true positive rate): 0.823
false positive rate (FPR): 0.340
spcificity(true negative rate): 0.660
ROC_AUC_score: 0.742
cross_validate_run: 2
f1_socre: 0.193
precision: 0.109
recall/sensitivity(true positive rate): 0.823
false positive rate (FPR): 0.342
sp

In [22]:
model_lgb = LGBMClassifier(boosting_type='gbdt', num_leaves=100, max_depth=10, learning_rate=0.1, \
                       n_estimators=300, subsample_for_bin=200000, objective=None, class_weight={1:50}, \
                       min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20, subsample=1.0, \
                       subsample_freq=0, colsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0, \
                       random_state=None, n_jobs=3, silent=True, importance_type='split')
models_lgb = cv(Xtrain,ytrain,model_lgb)

LGBMClassifier(boosting_type='gbdt', class_weight={1: 50}, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=10,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=300, n_jobs=3, num_leaves=100, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
cross_validate_run: 0
f1_socre: 0.216
precision: 0.125
recall/sensitivity(true positive rate): 0.796
false positive rate (FPR): 0.288
spcificity(true negative rate): 0.712
ROC_AUC_score: 0.754
cross_validate_run: 1
f1_socre: 0.225
precision: 0.131
recall/sensitivity(true positive rate): 0.805
false positive rate (FPR): 0.285
spcificity(true negative rate): 0.715
ROC_AUC_score: 0.760
cross_validate_run: 2
f1_socre: 0.214
precision: 0.123
recall/sensitivity(true positive rate): 0.791
false positive rate (FPR): 0.286
spcificity(true ne

# xgboost-no_upsampling_with_scale

In [19]:

model_xgb = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=None, n_estimators=300, n_jobs=4,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=19, seed=None,
              silent=None, subsample=1, verbosity=1)

In [20]:
cv(Xtrain,ytrain,model_xgb)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=None, n_estimators=300, n_jobs=4,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=19, seed=None,
              silent=None, subsample=1, verbosity=1)
cross_validate_run: 0
f1_socre: 0.338
precision: 0.266
recall/sensitivity(true positive rate): 0.464
false positive rate (FPR): 0.066
spcificity(true negative rate): 0.934
ROC_AUC_score: 0.699
cross_validate_run: 1
f1_socre: 0.346
precision: 0.275
recall/sensitivity(true positive rate): 0.465
false positive rate (FPR): 0.065
spcificity(true negative rate): 0.935
ROC_AUC_score: 0.700
cross_validate_run: 2
f1_socre: 0.335
precision: 0.264
recall/sensitivity(true positive rate): 0.460
false positive rate (FPR): 0.065
spcifici

[XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=1, gamma=0,
               learning_rate=0.1, max_delta_step=0, max_depth=15,
               min_child_weight=1, missing=None, n_estimators=300, n_jobs=4,
               nthread=None, objective='binary:logistic', random_state=0,
               reg_alpha=0, reg_lambda=1, scale_pos_weight=19, seed=None,
               silent=None, subsample=1, verbosity=1),
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=1, gamma=0,
               learning_rate=0.1, max_delta_step=0, max_depth=15,
               min_child_weight=1, missing=None, n_estimators=300, n_jobs=4,
               nthread=None, objective='binary:logistic', random_state=0,
               reg_alpha=0, reg_lambda=1, scale_pos_weight=19, seed=None,
               silent=None, subsample=1, verbosity=1),
 XGBClassifier(base_score=0.5, booster='

# upsampling
## llightGBM

In [26]:
Xy = X.copy()
Xy['y'] = y

class0 = Xy.loc[Xy.y == 0]
class1 = Xy.loc[Xy.y == 1]
print(class0.shape,class1.shape)
class1 = class1.sample(n=class0.shape[0],replace=True,random_state=0)
Xy = pd.concat([class0,class1])
yup = Xy['y']
Xup = Xy.drop(columns=['y'])

# re-do train_test split of balanced data
Xuptrain,Xuptest,yuptrain,yuptest = train_test_split(Xup.values,yup.values,test_size = 0.2,random_state=42)

# lightgmb with class weight
model_lgb_up_sc = LGBMClassifier(boosting_type='gbdt', class_weight={1: 2}, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=15,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=300, n_jobs=3, num_leaves=100, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

models_up_sc = cv(Xuptrain,yuptrain,model_lgb_up_sc)

(268137, 8) (13914, 8)
LGBMClassifier(boosting_type='gbdt', class_weight={1: 2}, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=15,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=300, n_jobs=3, num_leaves=100, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
cross_validate_run: 0
f1_socre: 0.868
precision: 0.784
recall/sensitivity(true positive rate): 0.973
false positive rate (FPR): 0.268
spcificity(true negative rate): 0.732
ROC_AUC_score: 0.852
cross_validate_run: 1
f1_socre: 0.869
precision: 0.784
recall/sensitivity(true positive rate): 0.974
false positive rate (FPR): 0.270
spcificity(true negative rate): 0.730
ROC_AUC_score: 0.852
cross_validate_run: 2
f1_socre: 0.869
precision: 0.786
recall/sensitivity(true positive rate): 0.972
false positive rate (FPR): 0.

In [31]:
evaluation(yuptest,Xuptest,models_up_sc[3])

f1_socre: 0.868
precision: 0.783
recall/sensitivity(true positive rate): 0.973
false positive rate (FPR): 0.269
spcificity(true negative rate): 0.731
ROC_AUC_score: 0.852


In [33]:
# without class weight
model_lgb_up = LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=15,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=300, n_jobs=3, num_leaves=100, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
models_up = cv(Xuptrain,yuptrain,model_lgb_up)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=15,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=300, n_jobs=3, num_leaves=100, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
cross_validate_run: 0
f1_socre: 0.872
precision: 0.848
recall/sensitivity(true positive rate): 0.897
false positive rate (FPR): 0.161
spcificity(true negative rate): 0.839
ROC_AUC_score: 0.868
cross_validate_run: 1
f1_socre: 0.871
precision: 0.848
recall/sensitivity(true positive rate): 0.896
false positive rate (FPR): 0.162
spcificity(true negative rate): 0.838
ROC_AUC_score: 0.867
cross_validate_run: 2
f1_socre: 0.871
precision: 0.845
recall/sensitivity(true positive rate): 0.899
false positive rate (FPR): 0.165
spcificity(true negat

In [34]:
evaluation(yuptest,Xuptest,models_up[3])

f1_socre: 0.874
precision: 0.846
recall/sensitivity(true positive rate): 0.904
false positive rate (FPR): 0.164
spcificity(true negative rate): 0.836
ROC_AUC_score: 0.870


## xgboost

In [32]:
# without scale positive class
model_xgb_up = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=None, n_estimators=300, n_jobs=4,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
models_up_xgb = cv(Xuptrain,yuptrain,model_xgb_up)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=None, n_estimators=300, n_jobs=4,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
cross_validate_run: 0
f1_socre: 0.954
precision: 0.919
recall/sensitivity(true positive rate): 0.991
false positive rate (FPR): 0.087
spcificity(true negative rate): 0.913
ROC_AUC_score: 0.952
cross_validate_run: 1
f1_socre: 0.952
precision: 0.917
recall/sensitivity(true positive rate): 0.990
false positive rate (FPR): 0.091
spcificity(true negative rate): 0.909
ROC_AUC_score: 0.950
cross_validate_run: 2
f1_socre: 0.953
precision: 0.916
recall/sensitivity(true positive rate): 0.992
false positive rate (FPR): 0.090
spcificit

In [35]:
evaluation(yuptest,Xuptest,models_up_xgb[0])

f1_socre: 0.951
precision: 0.914
recall/sensitivity(true positive rate): 0.990
false positive rate (FPR): 0.092
spcificity(true negative rate): 0.908
ROC_AUC_score: 0.949


In [28]:
# with scale positive class
model_xgb_up_sc = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=None, n_estimators=300, n_jobs=4,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=2, seed=None,
              silent=None, subsample=1, verbosity=1)
models_up_xgb_sc = cv(Xuptrain,yuptrain,model_xgb_up)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=None, n_estimators=300, n_jobs=4,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=2, seed=None,
              silent=None, subsample=1, verbosity=1)
cross_validate_run: 0
f1_socre: 0.947
precision: 0.901
recall/sensitivity(true positive rate): 0.999
false positive rate (FPR): 0.110
spcificity(true negative rate): 0.890
ROC_AUC_score: 0.944
cross_validate_run: 1
f1_socre: 0.947
precision: 0.899
recall/sensitivity(true positive rate): 0.999
false positive rate (FPR): 0.112
spcificity(true negative rate): 0.888
ROC_AUC_score: 0.943
cross_validate_run: 2
f1_socre: 0.945
precision: 0.896
recall/sensitivity(true positive rate): 0.999
false positive rate (FPR): 0.116
spcificit

In [29]:
evaluation(yuptest,Xuptest,models_up_xgb_sc[0])

f1_socre: 0.944
precision: 0.896
recall/sensitivity(true positive rate): 0.998
false positive rate (FPR): 0.116
spcificity(true negative rate): 0.884
ROC_AUC_score: 0.941


In [38]:
evaluation(yuptest,Xuptest,models_up_xgb[0])

f1_socre: 0.951
precision: 0.914
recall/sensitivity(true positive rate): 0.990
false positive rate (FPR): 0.092
spcificity(true negative rate): 0.908
ROC_AUC_score: 0.949


In [39]:
pickle.dump(models_up_xgb[0],open("Models/xgb_up_tree15.dat", "wb"))

In [36]:
from sklearn.naive_bayes import ComplementNB
model1 = ComplementNB()
models_nb = cv(Xuptrain,yuptrain,model1)
evaluation(yuptest,Xuptest,models_nb[0])

ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)
cross_validate_run: 0
f1_socre: 0.450
precision: 0.551
recall/sensitivity(true positive rate): 0.380
false positive rate (FPR): 0.310
spcificity(true negative rate): 0.690
ROC_AUC_score: 0.535
cross_validate_run: 1
f1_socre: 0.457
precision: 0.559
recall/sensitivity(true positive rate): 0.387
false positive rate (FPR): 0.308
spcificity(true negative rate): 0.692
ROC_AUC_score: 0.540
cross_validate_run: 2
f1_socre: 0.451
precision: 0.550
recall/sensitivity(true positive rate): 0.383
false positive rate (FPR): 0.312
spcificity(true negative rate): 0.688
ROC_AUC_score: 0.535
cross_validate_run: 3
f1_socre: 0.453
precision: 0.554
recall/sensitivity(true positive rate): 0.383
false positive rate (FPR): 0.308
spcificity(true negative rate): 0.692
ROC_AUC_score: 0.538

 mean scores +/- sd: 

f1_socre : 0.453 +/- 0.000
precision : 0.554 +/- 0.000
recall : 0.383 +/- 0.000
FPR : 0.308 +/- 0.000
specificity : 0.692 +/- 0.000
ro