In [1]:
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import ComplementNB
import lightgbm
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import pickle

from sklearn.metrics import confusion_matrix,f1_score,roc_auc_score,roc_curve
import matplotlib.pyplot as plt

from tqdm import tqdm 

%matplotlib inline

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


# Feature Engineering

In [2]:
df = pd.read_csv('Processed_data/feature_engineered.csv',index_col=0)

In [3]:
df.head()

Unnamed: 0,claim_id,enrollee_id,enrollee_cum_claim_count,enrollee_cum_claim_amount,provider_id,hmo_id,care_id,diagnosis_id,qty,amount,unit_price,tariffs,create_m,vetted_m,label
0,10.0,89.0,1,1816.08,1.0,1.0,586.0,9.0,6.0,1816.08,302.68,302.68,3,5,0
1,11.0,89.0,2,3632.16,1.0,1.0,586.0,15925.0,6.0,1816.08,302.68,302.68,3,5,0
2,13.0,74.0,1,115.0,1.0,1.0,434.0,4342.0,5.0,115.0,23.0,23.0,3,5,0
3,13.0,74.0,2,1380.0,1.0,1.0,1102.0,4342.0,10.0,1265.0,126.5,126.5,3,5,0
4,13.0,74.0,3,1518.0,1.0,1.0,299.0,4342.0,15.0,138.0,9.2,9.2,3,5,0


In [4]:
df['tariffs'] = df['tariffs'].fillna(0)

In [5]:
df.dropna(inplace=True)

In [6]:
# training and testing data
y = df['label']
X = df.drop(columns=['claim_id','enrollee_id','vetted_m','label'])


In [7]:
X.head()

Unnamed: 0,enrollee_cum_claim_count,enrollee_cum_claim_amount,provider_id,hmo_id,care_id,diagnosis_id,qty,amount,unit_price,tariffs,create_m
0,1,1816.08,1.0,1.0,586.0,9.0,6.0,1816.08,302.68,302.68,3
1,2,3632.16,1.0,1.0,586.0,15925.0,6.0,1816.08,302.68,302.68,3
2,1,115.0,1.0,1.0,434.0,4342.0,5.0,115.0,23.0,23.0,3
3,2,1380.0,1.0,1.0,1102.0,4342.0,10.0,1265.0,126.5,126.5,3
4,3,1518.0,1.0,1.0,299.0,4342.0,15.0,138.0,9.2,9.2,3


In [8]:
columns = ['enrollee_cum_claim_count', 'enrollee_cum_claim_amount', 'provider_id',
       'care_id', 'diagnosis_id', 'qty', 'amount', 'unit_price',
       'tariffs', 'create_m']

In [9]:
# create feature correlation features
for i in tqdm(range(len(columns))):
    col1 = columns[i]
    for col2 in columns[i+1:]:
        new = []
        pairs = {}
        n = 1
        for x,y in zip(df[col1],df[col2]):
            if (x,y) not in pairs:
                pairs[(x,y)] = n
                n += 1
            new.append(pairs[(x,y)])
                
            
        X[col1+"&"+col2] = new

100%|██████████| 10/10 [00:40<00:00,  4.03s/it]


In [10]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 744861 entries, 0 to 744869
Data columns (total 56 columns):
enrollee_cum_claim_count                              744861 non-null int64
enrollee_cum_claim_amount                             744861 non-null float64
provider_id                                           744861 non-null float64
hmo_id                                                744861 non-null float64
care_id                                               744861 non-null float64
diagnosis_id                                          744861 non-null float64
qty                                                   744861 non-null float64
amount                                                744861 non-null float64
unit_price                                            744861 non-null float64
tariffs                                               744861 non-null float64
create_m                                              744861 non-null int64
enrollee_cum_claim_count&enrollee_c

In [11]:
X.head()

Unnamed: 0,enrollee_cum_claim_count,enrollee_cum_claim_amount,provider_id,hmo_id,care_id,diagnosis_id,qty,amount,unit_price,tariffs,...,qty&amount,qty&unit_price,qty&tariffs,qty&create_m,amount&unit_price,amount&tariffs,amount&create_m,unit_price&tariffs,unit_price&create_m,tariffs&create_m
0,1,1816.08,1.0,1.0,586.0,9.0,6.0,1816.08,302.68,302.68,...,1,1,1,1,1,1,1,1,1,1
1,2,3632.16,1.0,1.0,586.0,15925.0,6.0,1816.08,302.68,302.68,...,1,1,1,1,1,1,1,1,1,1
2,1,115.0,1.0,1.0,434.0,4342.0,5.0,115.0,23.0,23.0,...,2,2,2,2,2,2,2,2,2,2
3,2,1380.0,1.0,1.0,1102.0,4342.0,10.0,1265.0,126.5,126.5,...,3,3,3,3,3,3,3,3,3,3
4,3,1518.0,1.0,1.0,299.0,4342.0,15.0,138.0,9.2,9.2,...,4,4,4,4,4,4,4,4,4,4


# NB

In [13]:
y = df['label']

In [14]:
Xtrain,Xtest,ytrain,ytest = train_test_split(X,y,test_size=0.2, random_state=0)

In [15]:
def init_scores():
    
    scores = {}
    scores['f1_socre'] = []
    scores['precision'] = []
    scores['recall'] = []
    scores['FPR'] = []
    scores['specificity'] = []
    scores['roc_auc'] = []
    
    return scores


def evaluation(ytest,Xtest,cls):

    scores = init_scores()
    ypred = cls.predict(Xtest)
    C = confusion_matrix(ytest,ypred)
    TN = C[0][0]
    FN = C[1][0]
    TP = C[1][1]
    FP = C[0][1]

    print('f1_socre: {:0.3f}'.format(f1_score(ytest,ypred)))
    scores['f1_socre'].append(f1_score(ytest,ypred))
    print('precision: {:0.3f}'.format(TP/(TP+FP)))
    scores['precision'].append(TP/(TP+FP))
    print('recall/sensitivity(true positive rate): {:0.3f}'.format(TP/(TP+FN)))
    scores['recall'].append(TP/(TP+FN))
    print('false positive rate (FPR): {:0.3f}'.format(1-(TN/(TN+FP)))) # 1 - specificity
    scores['FPR'].append(1-(TN/(TN+FP)))
    print('spcificity(true negative rate): {:0.3f}'.format(TN/(TN+FP)))
    scores['specificity'].append(TN/(TN+FP))
    print('ROC_AUC_score: {:0.3f}'.format(roc_auc_score(ytest,ypred)))
    scores['roc_auc'].append(roc_auc_score(ytest,ypred))
    
    return scores
    
def cv(Xtrain,ytrain,model):

    models = []
    kf = KFold(n_splits=4)
    print(model)
    n = 0
    for train_index, test_index in kf.split(Xtrain):
        print('cross_validate_run: {}'.format(n))
        Xtr, Xte = Xtrain[train_index], Xtrain[test_index]
        ytr, yte = ytrain[train_index], ytrain[test_index]
        cls = model.fit(Xtr, ytr) 
        models.append(cls)
        scores = evaluation(yte,Xte,cls)
        n += 1
    
    print('\n mean scores +/- sd: \n')
    for k in scores:
        print('{} : {:0.3f} +/- {:0.3f}'.format(k, np.array(scores[k]).mean(),  np.array(scores[k]).std()))
        
    
    return models, scores

In [16]:
cls = ComplementNB()
models, scores = cv(Xtrain.values,ytrain.values,cls)

ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)
cross_validate_run: 0
f1_socre: 0.088
precision: 0.047
recall/sensitivity(true positive rate): 0.818
false positive rate (FPR): 0.751
spcificity(true negative rate): 0.249
ROC_AUC_score: 0.534
cross_validate_run: 1
f1_socre: 0.086
precision: 0.045
recall/sensitivity(true positive rate): 0.811
false positive rate (FPR): 0.749
spcificity(true negative rate): 0.251
ROC_AUC_score: 0.531
cross_validate_run: 2
f1_socre: 0.086
precision: 0.045
recall/sensitivity(true positive rate): 0.808
false positive rate (FPR): 0.750
spcificity(true negative rate): 0.250
ROC_AUC_score: 0.529
cross_validate_run: 3
f1_socre: 0.088
precision: 0.047
recall/sensitivity(true positive rate): 0.819
false positive rate (FPR): 0.751
spcificity(true negative rate): 0.249
ROC_AUC_score: 0.534

 mean scores +/- sd: 

f1_socre : 0.088 +/- 0.000
precision : 0.047 +/- 0.000
recall : 0.819 +/- 0.000
FPR : 0.751 +/- 0.000
specificity : 0.249 +/- 0.000
ro

In [17]:
roc_auc_score(ytest,cls.predict(Xtest))

0.5294378260703889

In [18]:
confusion_matrix(ytest,cls.predict(Xtest))

array([[ 35628, 106997],
       [  1212,   5136]])

In [19]:
confusion_matrix(ytrain,cls.predict(Xtrain))

array([[142599, 427918],
       [  4714,  20657]])

In [20]:
roc_auc_score(ytrain,cls.predict(Xtrain))

0.5320721449040254

In [21]:
f1_score(ytrain,cls.predict(Xtrain)),f1_score(ytest,cls.predict(Xtest))

(0.08717026834280699, 0.08669744515998346)

# LightGBM

In [22]:
a = [i for i in range(11,57)]
np.array(a)

array([11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
       28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
       45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56])

In [None]:
lgb = LGBMClassifier(categorical_features="2,3,4,5,11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, \
                     23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, \
                     43, 44,45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56", feature_fraction=0.8, 
                     boosting_type='gbdt',
               class_weight={1: 250}, colsample_bytree=0.8,bagging_fraction=0.6, bagging_freq=1,
               importance_type='split', learning_rate=0.1, max_depth=20,metric='roc_auc',
               min_child_samples=1, min_child_weight=1, min_split_gain=0.0,
               n_estimators=300, n_jobs=4, num_leaves=1000, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=1, silent=True,
               subsample=0.8, subsample_for_bin=200000, subsample_freq=1)
lgb.fit(Xtrain,ytrain)


In [None]:
evaluation(ytest,Xtest,lgb)

In [None]:
# plot importance
import lightgbm

def feature_importance(model,columns):
    # Plot importance for beset model: model_mse
    fig, ax = plt.subplots(figsize=(15,5))

    g1 = lightgbm.plot_importance(model,ax=ax,grid=False)

    oldy = g1.yaxis.get_majorticklabels()
    feature_dict = dict(zip(['Column_'+str(i) for i in range(len(columns))], columns))
    newy = [feature_dict[y._text] for y in oldy]
    g1.set_yticklabels(newy,rotation=0,fontsize=18)

    #fig.savefig('feature_importance.jpg')
    plt.title('Feature Importance',fontsize=26)
    plt.xlabel('Feature importance',fontsize = 24)
    plt.ylabel('Features', fontsize = 24)
    plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(15,20))

g = lightgbm.plot_importance(lgb,ax=ax,grid=False)

# Probability Solution

In [None]:
# tranform value of each feature according to their probability in two types of groups
def prob(col,df,labels): # A is the column value array
    counts = {}
    value = {} # normalized value in total
    total = {} # total claim items each feature pair
    dif = {} # diference between false claim (1) and legal claim (0)
    
    for i,label in zip(df[col],labels):
        if i in counts:
            if label:
                counts[i][1] += 1
            else:
                counts[i][0] += 1
        else:
            if label:
                counts[i] = {1:1,0:0}
            else:
                counts[i] = {1:0,0:1}
        total[i] = counts[i][1] + counts[i][0]
        dif[i] = counts[i][1] - counts[i][0]
        value[i] = dif[i]/total[i]
        
    return value

In [None]:
Rules = {}
for col in tqdm(Xtrain.columns):
    Rules[col] = prob(col,Xtrain,ytrain)

In [None]:
def convert_prob(df,Rules):
    
    col_prob = {}
    for col in tqdm(df.columns):
        col_prob[col] = []
        for x in df[col]:
            if x in Rules[col]:
                col_prob[col].append(Rules[col][x])
            else:
                col_prob[col].append(0)
    return col_prob

In [None]:
X_train = convert_prob(Xtrain,Rules)
X_test = convert_prob(Xtest,Rules)

In [None]:
from sklearn.linear_model import LogisticRegression

cls = LogisticRegression(class_weight='balanced')
cls.fit(Xtrain,ytrain)


In [None]:
confusion_matrix(ytest,cls.predict(X_test))

In [None]:
confusion_matrix(ytrain,cls.predict(X_train))