In [1]:
import pandas as pd
import numpy as np

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import xgboost

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.naive_bayes import ComplementNB
import pickle

from sklearn.metrics import confusion_matrix,f1_score,roc_auc_score,roc_curve
import matplotlib.pyplot as plt
%matplotlib inline

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


# data

In [15]:
df = pd.read_csv('Processed_data/full_clean_data.csv',index_col = 0)

In [16]:
df.head()

Unnamed: 0,claim_id,enrollee_id,provider_id,provider_status,hmo_id,care_id,qty,amount,approved_qty,approved_amount,hmo_approved,created_at,vetted_at,label,unit_price,id,diagnosis_id
0,10.0,89.0,1.0,1.0,1.0,586.0,6.0,1816.08,6.0,1816.08,1.0,2018-03-12 14:53:46,2018-05-21 10:05:30,0,302.68,15.0,9.0
1,11.0,89.0,1.0,1.0,1.0,586.0,6.0,1816.08,6.0,1816.08,1.0,2018-03-13 14:50:39,2018-05-21 10:07:19,0,302.68,16.0,15925.0
2,13.0,74.0,1.0,1.0,1.0,434.0,5.0,115.0,5.0,115.0,1.0,2018-03-16 10:28:53,2018-05-21 10:09:30,0,23.0,36.0,4342.0
3,13.0,74.0,1.0,1.0,1.0,1102.0,10.0,1265.0,10.0,1265.0,1.0,2018-03-16 10:28:53,2018-05-21 10:09:30,0,126.5,36.0,4342.0
4,13.0,74.0,1.0,1.0,1.0,299.0,15.0,138.0,15.0,138.0,1.0,2018-03-16 10:28:53,2018-05-21 10:09:30,0,9.2,36.0,4342.0


In [17]:
df.loc[df['label'] == 0]['amount'].sum(),df.loc[df['label'] == 1]['amount'].sum()

(2496865315.030001, 266294130.82000002)

In [18]:
df.loc[df['label']==1]['claim_id'].count(),df.loc[df['label']==0]['claim_id'].count()


(29389, 634234)

In [20]:
X = df[['diagnosis_id','provider_id','provider_status','hmo_id','care_id','qty','unit_price']]
y = df['label']

In [21]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 663623 entries, 0 to 663622
Data columns (total 7 columns):
diagnosis_id       663615 non-null float64
provider_id        663623 non-null float64
provider_status    663623 non-null float64
hmo_id             663623 non-null float64
care_id            663623 non-null float64
qty                663623 non-null float64
unit_price         663623 non-null float64
dtypes: float64(7)
memory usage: 40.5 MB


In [22]:
dropindex = X.loc[X.diagnosis_id.isnull()].index

In [23]:
X.drop(dropindex, axis=0, inplace = True)
y.drop(dropindex, axis=0, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [24]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 663615 entries, 0 to 663622
Data columns (total 7 columns):
diagnosis_id       663615 non-null float64
provider_id        663615 non-null float64
provider_status    663615 non-null float64
hmo_id             663615 non-null float64
care_id            663615 non-null float64
qty                663615 non-null float64
unit_price         663615 non-null float64
dtypes: float64(7)
memory usage: 40.5 MB


In [25]:
def init_scores():
     
    scores = {}
    scores['f1_socre'] = []
    scores['precision'] = []
    scores['recall'] = []
    scores['FPR'] = []
    scores['specificity'] = []
    scores['roc_auc'] = []
    
    return scores


def evaluation(ytest,Xtest,cls):

    scores = init_scores()
    ypred = cls.predict(Xtest)
    C = confusion_matrix(ytest,ypred)
    TN = C[0][0]
    FN = C[1][0]
    TP = C[1][1]
    FP = C[0][1]

    print('f1_socre: {:0.3f}'.format(f1_score(ytest,ypred)))
    scores['f1_socre'].append(f1_score(ytest,ypred))
    print('precision: {:0.3f}'.format(TP/(TP+FP)))
    scores['precision'].append(TP/(TP+FP))
    print('recall/sensitivity(true positive rate): {:0.3f}'.format(TP/(TP+FN)))
    scores['recall'].append(TP/(TP+FN))
    print('false positive rate (FPR): {:0.3f}'.format(1-(TN/(TN+FP)))) # 1 - specificity
    scores['FPR'].append(1-(TN/(TN+FP)))
    print('spcificity(true negative rate): {:0.3f}'.format(TN/(TN+FP)))
    scores['specificity'].append(TN/(TN+FP))
    print('ROC_AUC_score: {:0.3f}'.format(roc_auc_score(ytest,ypred)))
    scores['roc_auc'].append(roc_auc_score(ytest,ypred))
    
    return scores
      
def cv(Xtrain,ytrain,model):
    
    models = []
    kf = KFold(n_splits=4)
    print(model)
    n = 0
    for train_index, test_index in kf.split(Xtrain):
        print('cross_validate_run: {}'.format(n))
        Xtr, Xte = Xtrain[train_index], Xtrain[test_index]
        ytr, yte = ytrain[train_index], ytrain[test_index]
        cls = model.fit(Xtr, ytr) 
        models.append(cls)
        scores = evaluation(yte,Xte,cls)
        n += 1
        
    
    print('\n mean scores +/- sd: \n')
    for k in scores:
        print('{} : {:0.3f} +/- {:0.3f}'.format(k, np.array(scores[k]).mean(),  np.array(scores[k]).std()))

    return models, scores

In [26]:

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.2, random_state=42)


# 3 models upsampling training data and testing data

In [32]:

Xy = X.copy()
Xy['y'] = y

class0up = Xy.loc[Xy.y == 0]
class1up = Xy.loc[Xy.y == 1]

class1up = class1up.sample(n=class0up.shape[0],replace=True,random_state=0)
Xy = pd.concat([class0up,class1up])
yup = Xy['y']
Xup = Xy.drop(columns=['y'])
# re-do train_test split of balanced data
Xuptrain,Xuptest,yuptrain,yuptest = train_test_split(Xup.values,yup.values,test_size = 0.2,random_state=42)

In [33]:
# nb
print('nb_up_train:')
model_nb = ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)
models_nb_utt, scores_nb_utt = cv(Xuptrain,yuptrain, model_nb)

# lgb
print('lgb:')
model_lgb = LGBMClassifier(boosting_type='gbdt', class_weight={1: 1}, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=15,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=300, n_jobs=4, num_leaves=100, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
models_lgb_utt, scores_lgb_utt = cv(Xuptrain,yuptrain,model_lgb)

nb_up_train:
ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)
cross_validate_run: 0
f1_socre: 0.381
precision: 0.525
recall/sensitivity(true positive rate): 0.299
false positive rate (FPR): 0.269
spcificity(true negative rate): 0.731
ROC_AUC_score: 0.515
cross_validate_run: 1
f1_socre: 0.373
precision: 0.523
recall/sensitivity(true positive rate): 0.289
false positive rate (FPR): 0.264
spcificity(true negative rate): 0.736
ROC_AUC_score: 0.513
cross_validate_run: 2
f1_socre: 0.366
precision: 0.528
recall/sensitivity(true positive rate): 0.280
false positive rate (FPR): 0.250
spcificity(true negative rate): 0.750
ROC_AUC_score: 0.515
cross_validate_run: 3
f1_socre: 0.378
precision: 0.526
recall/sensitivity(true positive rate): 0.295
false positive rate (FPR): 0.266
spcificity(true negative rate): 0.734
ROC_AUC_score: 0.514

 mean scores +/- sd: 

f1_socre : 0.378 +/- 0.000
precision : 0.526 +/- 0.000
recall : 0.295 +/- 0.000
FPR : 0.266 +/- 0.000
specificity : 0.734

In [34]:
# xgb
print('xgb:')
model_xgb = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=None, n_estimators=300, n_jobs=4,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
models_xgb_utt, scores_xgb_utt = cv(Xuptrain,yuptrain,model_xgb)

xgb:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=None, n_estimators=300, n_jobs=4,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
cross_validate_run: 0
f1_socre: 0.942
precision: 0.911
recall/sensitivity(true positive rate): 0.975
false positive rate (FPR): 0.095
spcificity(true negative rate): 0.905
ROC_AUC_score: 0.940
cross_validate_run: 1
f1_socre: 0.942
precision: 0.911
recall/sensitivity(true positive rate): 0.975
false positive rate (FPR): 0.095
spcificity(true negative rate): 0.905
ROC_AUC_score: 0.940
cross_validate_run: 2
f1_socre: 0.942
precision: 0.912
recall/sensitivity(true positive rate): 0.974
false positive rate (FPR): 0.094
spci

In [35]:
# results without enrollee
test_scores_xgb_utt = evaluation(ytest.values,Xtest.values,models_xgb_utt[0])
test_scores_lgb_utt = evaluation(ytest.values,Xtest.values,models_lgb_utt[0])

f1_socre: 0.512
precision: 0.347
recall/sensitivity(true positive rate): 0.979
false positive rate (FPR): 0.086
spcificity(true negative rate): 0.914
ROC_AUC_score: 0.946
f1_socre: 0.335
precision: 0.206
recall/sensitivity(true positive rate): 0.898
false positive rate (FPR): 0.162
spcificity(true negative rate): 0.838
ROC_AUC_score: 0.868


In [58]:
# past results with all features
"""
test_scores_xgb_utt = evaluation(ytest.values,Xtest.values,models_xgb_utt[0])
test_scores_lgb_utt = evaluation(ytest.values,Xtest.values,models_lgb_utt[0])
"""


f1_socre: 0.696
precision: 0.534
recall/sensitivity(true positive rate): 0.998
false positive rate (FPR): 0.041
spcificity(true negative rate): 0.959
ROC_AUC_score: 0.979
f1_socre: 0.378
precision: 0.238
recall/sensitivity(true positive rate): 0.920
false positive rate (FPR): 0.138
spcificity(true negative rate): 0.862
ROC_AUC_score: 0.891


In [36]:
pickle.dump(models_xgb_utt[0],open("Models/upsample/xgb_noEnrollee.dat", "wb"))
pickle.dump(models_lgb_utt[0],open("Models/upsample/lgb_noEnrollee.dat", "wb"))