## Importing all libraries

In [2]:
# These libraries will be used for EDA and data manipulation
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams

# These libraries will be used for machine learning models
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import svm
import scipy.stats as stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

### Importing csv with new features and merging them to combine all features

In [6]:
pd.set_option('display.max_columns', 100) # shows all the columns

# loading all the datasets
features_0 = pd.read_csv('/Users/suborna/Github/Capstone_Project/Data/Combined_Provider.csv')
features_2 = pd.read_csv('/Users/suborna/Github/Capstone_Project/Data/new_features.csv')

# dropping because this column already exist in features_2, we don't need duplicates
features_1 = features_0.drop('PotentialFraud', axis = 1) 

# merging features_1 & features_2 to have the complete dataset of new features
All_Features = features_2.merge(features_1, on = 'Provider')

# setting provider as index for ML modeling
Train_set = features_2.merge(features_1, on = 'Provider').set_index('Provider')

# replacing categories with ordinal data for the Potential Fraud labels
Train_set['PotentialFraud'] = Train_set['PotentialFraud'].replace(['Yes', 'No'], [1,0])

# seperating the dataset into 2 frames: fraud & not fraud
## Potential Fraud dataset
fraud_df = All_Features.loc[All_Features.PotentialFraud == 'Yes']
Fraud_df = fraud_df.reset_index().rename(columns = {'index': 'Index'})

## Not Potential Fraud Dataset
nfraud_df = All_Features.loc[All_Features.PotentialFraud == 'No']
NFraud_df = nfraud_df.reset_index().rename(columns = {'index': 'Index'})


In [3]:
All_Features.shape # checking the count of rows and columns

(5410, 72)

In [4]:
# check the distribution of the potential fraud/not fraud labels (0 means not potential fraud, 1 means potential fraud)
np.round(Train_set.PotentialFraud.value_counts()/All_Features.shape[0], 2)

0    0.91
1    0.09
Name: PotentialFraud, dtype: float64

In [5]:
Train_set.sample(5) # randomly sampling 5 rows of data

Unnamed: 0_level_0,Male_Patient,Female_Patient,Age(26-40),Age(41-60),Age(61-80),Age(81-100),Race_White,Race_Black,Race_Hispanic,Race_Native,Hos_Stay_Total,Payment_Total,Claim_Len_Total,Alzheimer,Heartfailure,KidneyDisease,Cancer,ObstrPulmonary,Depression,Diabetes,IschemicHeart,Osteoporasis,rheumatoidarthritis,stroke,RenalDisease,State_Count,County_Count,DiagGroupCode,PotentialFraud,CDC_2449,CDC_25000,CDC_2720,CDC_2724,CDC_4011,CDC_4019,CDC_41401,CDC_42731,CDC_4280,CDC_496,CDC_53081,CDC_5990,CDC_V5861,CDC_V5869,CPC_2724,CPC_3722,CPC_3893,CPC_3995,CPC_4019,CPC_4516,CPC_66,CPC_8151,CPC_8154,CPC_9904,DGC_166,DGC_167,DGC_183,DGC_186,DGC_187,DGC_188,DGC_192,DGC_202,DGC_208,DGC_881,DGC_882,DGC_883,DGC_884,DGC_887,DGC_939,DGC_940,DGC_941,DGC_945
Provider,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1
PRV51490,23.0,33.0,0.0,8.0,33.0,15.0,39.0,4.0,8.0,5.0,235.0,493334.0,259,36,42,30,9,30,39,52,53,21,25,11,17,3,9,46.0,1,2,5,0.0,6,0.0,8,5.0,4,2,5.0,5.0,6.0,0.0,0.0,1,1,1,1,1,0,0,0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
PRV56077,2.0,5.0,0.0,2.0,3.0,2.0,7.0,0.0,0.0,0.0,0.0,2450.0,40,3,4,4,3,0,4,6,5,3,0,0,3,1,1,0.0,0,1,1,0.0,0,1.0,0,0.0,0,0,0.0,0.0,0.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PRV54947,4.0,20.0,1.0,2.0,14.0,7.0,19.0,4.0,0.0,1.0,0.0,7680.0,22,9,13,8,8,7,7,12,17,8,6,1,6,2,8,0.0,0,1,1,0.0,1,0.0,2,0.0,0,0,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PRV56871,5.0,6.0,0.0,0.0,8.0,3.0,9.0,2.0,0.0,0.0,0.0,4400.0,20,3,7,4,2,4,6,6,9,5,1,2,3,1,1,0.0,0,1,2,0.0,0,1.0,2,0.0,0,0,0.0,0.0,0.0,1.0,2.0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PRV57501,23.0,40.0,1.0,7.0,34.0,21.0,58.0,5.0,0.0,0.0,0.0,11970.0,17,22,31,19,13,14,26,46,45,15,22,2,11,4,12,0.0,0,4,2,3.0,3,1.0,5,0.0,4,2,0.0,0.0,0.0,1.0,4.0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Baseline model (Logistic Regression, No Penalty)

In [87]:
# define class weights
w = {0:9, 1:91}

# Create instance of Logistic Regression Class
lm = LogisticRegression(class_weight = 'balanced', solver = 'liblinear', random_state = 42)

In [90]:
# preparing dataset for train, test, split
x = Train_set.drop('PotentialFraud', axis = 1)
y = Train_set['PotentialFraud']

X = preprocessing.scale(x) # scaling the X variables 

# spliting the dataset into training set & testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, shuffle = True, test_size = 0.3)

# fitting the logistic regression model
lm.fit(X_train, y_train)

# making prediction with the model
y_pred = lm.predict(X_test)

In [91]:
lm.score(X_train, y_train) # checking the score of on the train dataset

0.9992078162133615

In [93]:
print('Confusion Matrix') # tells us how many predictions were correst as well as false positives and negatives
print(confusion_matrix(y_test, y_pred))
print('Classification Report') 
print(classification_report(y_test, y_pred))
print('Accuracy Score') # checking accuracy of the model
print(accuracy_score(y_test, y_pred))

Confusion Matrix
[[1443    1]
 [   4  175]]
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1444
           1       0.99      0.98      0.99       179

    accuracy                           1.00      1623
   macro avg       1.00      0.99      0.99      1623
weighted avg       1.00      1.00      1.00      1623

Accuracy Score
0.9969192852741836


In [95]:
Features = pd.DataFrame(np.column_stack(lm.coef_ ), index = x.columns, columns = ['Coefficients'])
Features.loc[Features.Coefficients > 0].sort_values(ascending = False, by = 'Coefficients')

Unnamed: 0,Coefficients
CDC_496,3.458879
CDC_5990,3.257098
CDC_53081,3.005338
CDC_41401,2.804348
Age(26-40),1.316515
State_Count,1.075006
CDC_4019,0.910069
DGC_202,0.733904
DGC_192,0.684615
Age(81-100),0.62735


# Logistic Regression (Lasso Penalty, Feature Selection)

In [135]:
x = Train_set.drop('PotentialFraud', axis = 1)
y = Train_set['PotentialFraud']

X = preprocessing.scale(x) # scaling the X variables
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, shuffle = True, test_size = 0.3)

In [138]:
w = {0:10, 1:90}
lm_lasso = LogisticRegression(solver = 'liblinear', penalty = 'l1', class_weight = 'balanced')
lm_lasso.fit(X_train, y_train)
y_pred = lm_lasso.predict(X_test)

In [137]:
print('Train Model Score')
print(lm_lasso.score(X_train, y_train))
print(' * '* 30)
print('Accuracy Score')
print(accuracy_score(y_test, y_pred))
print(' * ' * 30)
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))
print(' * ' * 30)
print('Classification Report') 
print(classification_report(y_test, y_pred))
print(' * ' * 30)

Train Model Score
0.9992078162133615
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Accuracy Score
0.9969192852741836
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Confusion Matrix
[[1443    1]
 [   4  175]]
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1444
           1       0.99      0.98      0.99       179

    accuracy                           1.00      1623
   macro avg       1.00      0.99      0.99      1623
weighted avg       1.00      1.00      1.00      1623

 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 


In [140]:
Features = pd.DataFrame(np.column_stack(lm_lasso.coef_ ), index = x.columns, columns = ['Coefficients'])
Features.loc[Features.Coefficients > 0].sort_values(ascending = False, by = 'Coefficients')

Unnamed: 0,Coefficients
CDC_41401,6.187871
CDC_496,5.065089
CDC_5990,5.058769
CDC_53081,4.375149
Age(26-40),3.034713
State_Count,1.308091
Age(81-100),1.283769
Female_Patient,0.98272
CDC_4019,0.775491
DGC_202,0.558643


In [24]:
# grid_params = {'C' : [0.001,0.01,0.1,1,10,100]}
# grid_search_lm = GridSearchCV(lm_lasso, grid_params, cv = 3, scoring = 'accuracy', n_jobs = -1)
# %time grid_search_lm.fit(X, y)

In [25]:
# grid_search_lm.best_params_

In [26]:
# grid_search_lm.best_score_

In [27]:
# y_Pred = grid_search_lm.predict(X_test)
# print(accuracy_score(y_test, y_Pred))

# Logistic Regression (Features Selected from Lasso Penalty)

In [166]:
x = Train_set[['CDC_41401', 'CDC_496', 'CDC_5990', 'CDC_53081', 'Age(26-40)',  'State_Count',  'Age(81-100)',  'Female_Patient', 'CDC_4019', 'DGC_202', 'DGC_192',
 'DGC_188', 'CPC_66', 'CPC_3995']]
               
y = Train_set['PotentialFraud']

X = preprocessing.scale(x) # scaling the X variables
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, shuffle = True, test_size = 0.4)

In [167]:
lm.fit(X_train, y_train)
y_pred = lm.predict(X_test)

print('Train Model Score')
print(lm.score(X_train, y_train))
print(' * '* 30)
print('Accuracy Score')
print(accuracy_score(y_test, y_pred))
print(' * ' * 30)
print(' * ' * 30)
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))
print(' * ' * 30)
print('Classification Report') 
print(classification_report(y_test, y_pred))

Train Model Score
0.9987677141096735
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Accuracy Score
0.9981515711645101
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Confusion Matrix
[[1934    0]
 [   4  226]]
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1934
           1       1.00      0.98      0.99       230

    accuracy                           1.00      2164
   macro avg       1.00      0.99      1.00      2164
weighted avg       1.00      1.00      1.00      2164



In [143]:
Features = pd.DataFrame(np.column_stack(lm.coef_ ), index = x.columns, columns = ['Coefficients'])
Features

Unnamed: 0,Coefficients
CDC_41401,5.511437
CDC_496,3.814458
CDC_5990,4.750317
CDC_53081,3.235203


# Random Forest with Lasso Feature Selection

In [44]:
x = Train_set[['CDC_41401', 'CDC_496', 'CDC_5990', 'CDC_53081', 'Age(26-40)',  'State_Count',  'Age(81-100)',  'Female_Patient', 'CDC_4019', 'DGC_202', 'DGC_192',
 'DGC_188', 'CPC_66', 'CPC_3995']]

y = Train_set['PotentialFraud']

X = preprocessing.scale(x) # scaling the X variables
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, shuffle = True, test_size = 0.3)

rf = RandomForestClassifier(class_weight='balanced')

params = {'n_estimators' : range(20, 50, 5), 'max_depth' : range(2, 6), 'max_features' : range(1,5), \
          'min_samples_leaf' : range(2,5), 'min_samples_split' : range(2,5)}

Grid_Search_rf = GridSearchCV(rf, params, cv = 3, scoring = 'accuracy', n_jobs = -1)
%time Grid_Search_rf.fit(X_train, y_train)

CPU times: user 19.2 s, sys: 1.37 s, total: 20.6 s
Wall time: 2min 39s


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight='balanced',
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                             

In [45]:
Grid_Search_rf.best_params_

{'max_depth': 4,
 'max_features': 4,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 20}

In [47]:
gs_rf = RandomForestClassifier(class_weight = 'balanced', max_depth = 4, max_features = 4, min_samples_leaf = 2, min_samples_split = 2, n_estimators = 20, random_state = 42)
gs_rf.fit(X_train, y_train)
y_pred = gs_rf.predict(X_test)

In [48]:
print('Train Model Score')
print(gs_rf.score(X_train, y_train))
print(' * '* 30)
print('Accuracy Score')
print(accuracy_score(y_test, y_pred))
print(' * ' * 30)
print(' * ' * 30)
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))
print(' * ' * 30)
print('Classification Report') 
print(classification_report(y_test, y_pred))

Train Model Score
0.9989437549511486
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Accuracy Score
0.9987677141096735
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Confusion Matrix
[[1444    0]
 [   2  177]]
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1444
           1       1.00      0.99      0.99       179

    accuracy                           1.00      1623
   macro avg       1.00      0.99      1.00      1623
weighted avg       1.00      1.00      1.00      1623



# Gradient Boosting with Lasso Feature Selection

In [8]:
from sklearn.ensemble import GradientBoostingClassifier

gbm = GradientBoostingClassifier() # creating an instance of the Class

x = Train_set[['CDC_41401', 'CDC_496', 'CDC_5990', 'CDC_53081', 'Age(26-40)',  'State_Count',  'Age(81-100)',  'Female_Patient', 'CDC_4019', 'DGC_202', 'DGC_192',
 'DGC_188', 'CPC_66', 'CPC_3995']]

y = Train_set['PotentialFraud']

X = preprocessing.scale(x) # scaling the X variables
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, shuffle = True, test_size = 0.3)

grid_para_gbm = [{
    "learning_rate": [0.01],
    "verbose": [1],
    "subsample": [0.7],
    "n_estimators": range(500, 2000, 500),
    "max_depth": range(2, 5),
    "max_features": range(2, 5),  #  sqrt(number of samples) 
    "min_impurity_decrease": np.linspace(0.01,0.05, 3), 
    "min_samples_split": range(2,5),
    "random_state": [42] }]

grid_search_gbm = GridSearchCV(gbm, grid_para_gbm, cv = 5, scoring = 'accuracy', n_jobs = -1)
%time grid_search_gbm.fit(X_train, y_train)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.5991           0.0198            3.45s
         2           0.5911           0.0170            3.34s
         3           0.5641           0.0165            3.44s
         4           0.5561           0.0148            3.36s
         5           0.5257           0.0143            3.32s
         6           0.5252           0.0134            3.31s
         7           0.5170           0.0116            3.33s
         8           0.5044           0.0103            3.34s
         9           0.4870           0.0107            3.30s
        10           0.4857           0.0095            3.31s
        20           0.4137           0.0063            2.94s
        30           0.3341           0.0057            2.89s
        40           0.2955           0.0043            2.78s
        50           0.2623           0.0032            2.70s
        60           0.2322           0.0028            2.65s
       

GridSearchCV(cv=5, error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_c...
             iid=

In [9]:
grid_search_gbm.best_params_

{'learning_rate': 0.01,
 'max_depth': 4,
 'max_features': 3,
 'min_impurity_decrease': 0.01,
 'min_samples_split': 2,
 'n_estimators': 1000,
 'random_state': 42,
 'subsample': 0.7,
 'verbose': 1}

In [16]:
gbm_cv = GradientBoostingClassifier(learning_rate = 0.01, max_depth = 4, max_features = 3, min_impurity_decrease = 0.01, min_samples_split = 2,
                                    n_estimators = 1000, random_state = 42, subsample = 0.7, verbose = 1)
gbm_grid = gbm_cv.fit(X_train, y_train)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.5470           0.0222            4.47s
         2           0.5544           0.0166            4.03s
         3           0.5484           0.0149            3.70s
         4           0.5230           0.0141            3.42s
         5           0.5156           0.0130            3.29s
         6           0.4752           0.0142            3.17s
         7           0.4966           0.0106            3.11s
         8           0.4819           0.0102            3.09s
         9           0.4612           0.0106            3.12s
        10           0.4557           0.0097            3.09s
        20           0.3774           0.0064            2.75s
        30           0.3101           0.0054            2.61s
        40           0.2771           0.0038            2.65s
        50           0.2454           0.0032            2.53s
        60           0.2153           0.0025            2.43s
       

In [17]:
y_pred = gbm_grid.predict(X_test)
print('Train Model Score')
print(gbm_grid.score(X_train, y_train))
print(' * '* 30)
print('Accuracy Score')
print(accuracy_score(y_test, y_pred))
print(' * ' * 30)
print(' * ' * 30)
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))
print(' * ' * 30)
print('Classification Report') 
print(classification_report(y_test, y_pred))

Train Model Score
0.9992078162133615
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Accuracy Score
0.9981515711645101
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Confusion Matrix
[[1444    0]
 [   3  176]]
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1444
           1       1.00      0.98      0.99       179

    accuracy                           1.00      1623
   macro avg       1.00      0.99      1.00      1623
weighted avg       1.00      1.00      1.00      1623



# Support Vector Machines

In [18]:
svm_model = svm.SVC(kernel='linear', C=1, class_weight = 'balanced', random_state = 42)

x = Train_set[['CDC_41401', 'CDC_496', 'CDC_5990', 'CDC_53081', 'Age(26-40)',  'State_Count',  'Age(81-100)',  'Female_Patient', 'CDC_4019', 'DGC_202', 'DGC_192',
 'DGC_188', 'CPC_66', 'CPC_3995']]

y = Train_set['PotentialFraud']

X = preprocessing.scale(x) # scaling the X variables
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, shuffle = True, test_size = 0.3)
svm_model.fit(X_test, y_test)
y_pred = svm_model.predict(X_test)

In [22]:
svm_index = svm_model.support_ # index of support vector
print('Index of support vector: ' + str(svm_index))

Index of support vector: [  57  205  230  290  342  352  367  370  392  409  515  585  607  637
  660  667  683  684  697  700  793  800  813  923  970  974 1093 1111
 1145 1231 1236 1268 1290 1338 1362 1393 1430 1443 1494 1514 1544 1604
  218  271  389  875 1466]


In [20]:
print('Train Model Score')
print(svm_model.score(X_train, y_train))
print(' * '* 30)
print('Accuracy Score')
print(accuracy_score(y_test, y_pred))
print(' * ' * 30)
print(' * ' * 30)
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))
print(' * ' * 30)
print('Classification Report') 
print(classification_report(y_test, y_pred))

Train Model Score
0.9981515711645101
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Accuracy Score
0.9993838570548367
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Confusion Matrix
[[1444    0]
 [   1  178]]
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1444
           1       1.00      0.99      1.00       179

    accuracy                           1.00      1623
   macro avg       1.00      1.00      1.00      1623
weighted avg       1.00      1.00      1.00      1623



In [21]:
Features = pd.DataFrame(np.column_stack(svm_model.coef_ ), index = x.columns, columns = ['Coefficients'])
Features.sort_values(ascending = False, by = 'Coefficients')

Unnamed: 0,Coefficients
CDC_53081,3.153058
CDC_5990,2.922397
CDC_41401,0.6486061
CDC_496,0.3593607
CDC_4019,0.0001338277
Female_Patient,0.0001206486
State_Count,2.047328e-05
CPC_66,6.661338e-16
DGC_202,1.110223e-16
DGC_188,1.110223e-16


# Log Regression (Random Forest Features from Anthony)

In [None]:
reduced_features = Train_set.drop(['RenalDisease', 'DGC_945', 'CPC_8151', 'DGC_939', 'DGC_187', 'DGC_167',
       'DGC_940', 'Cancer', 'CPC_4516', 'DGC_883', 'Race_Black', 'CPC_3722',
       'Race_White', 'Age(26-40)', 'Race_Hispanic', 'Race_Native', 'CPC_3893',
       'rheumatoidarthritis', 'State_Count', 'Depression', 'CPC_8154',
       'Age(81-100)', 'CPC_9904', 'DGC_166', 'DGC_183', 'DGC_186', 'DGC_188',
       'DGC_192', 'DGC_202', 'DGC_208', 'DGC_881', 'Age(61-80)', 'DGC_884',
       'DGC_887', 'DGC_941', 'Age(41-60)', 'DGC_882'], axis = 1)

# train/test/split
x = reduced_features.drop('PotentialFraud', axis = 1)
X = preprocessing.scale(x)
y = reduced_features['PotentialFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, shuffle = True, test_size = 0.3)

lm.fit(X_train, y_train)
y_pred = lm.predict(X_test)

print(lm.score(X_train, y_train))
lm.score(X_test, y_test)

print(np.round(lm.coef_ , 3))
print(' * ' * 30)
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))
print(' * ' * 30)
print('Classification Report') 
print(classification_report(y_test, y_pred))
print(' * ' * 30)
print('Accuracy Score')
print(accuracy_score(y_test, y_pred))

Features = pd.DataFrame(np.column_stack(lm.coef_ ), index = x.columns, columns = ['Coefficients'])
Features.loc[Features.Coefficients > 0].sort_values(ascending = False, by = 'Coefficients')

# Market-basket Analysis (MBA)

In [3]:
pd.set_option('display.max_colwidth', 500)

In [4]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [50]:
chronic_F = Fraud_df[['Alzheimer',
       'Heartfailure', 'KidneyDisease', 'Cancer', 'ObstrPulmonary',
       'Depression', 'Diabetes', 'IschemicHeart', 'Osteoporasis',
       'rheumatoidarthritis', 'stroke', 'RenalDisease']].copy()

chronic_F = chronic_F.astype(bool)

frequent_chronic_F = apriori(chronic_F, min_support=0.8, use_colnames=True)
MBA_chronic_F = association_rules(frequent_chronic_F, metric="lift", min_threshold=0.7)

# chronic_N = NFraud_df[['Alzheimer',
#        'Heartfailure', 'KidneyDisease', 'Cancer', 'ObstrPulmonary',
#        'Depression', 'Diabetes', 'IschemicHeart', 'Osteoporasis',
#        'rheumatoidarthritis', 'stroke', 'RenalDisease']].copy()

# chronic_N = chronic_N.astype(bool)

# frequent_chronic_N = apriori(chronic_N, min_support=0.8, use_colnames=True)
# MBA_chronic_N = association_rules(frequent_chronic_N, metric="lift", min_threshold=0.7)

In [52]:
MBA_chronic_F.loc[MBA_chronic_F.support > 0.95] # IschemicHeart, Heartfailure, Diabetes

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Heartfailure),(Alzheimer),1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,inf
1,(Alzheimer),(Heartfailure),1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,inf
2,(Alzheimer),(KidneyDisease),1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,inf
3,(KidneyDisease),(Alzheimer),1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,inf
4,(Alzheimer),(Cancer),1.000000,0.980237,0.980237,0.980237,1.000000,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...
523245,(Depression),"(ObstrPulmonary, IschemicHeart, Diabetes, Cancer, rheumatoidarthritis, Heartfailure, stroke, KidneyDisease, RenalDisease, Osteoporasis, Alzheimer)",0.998024,0.956522,0.956522,0.958416,1.001980,0.001890,1.045549
523246,(KidneyDisease),"(ObstrPulmonary, IschemicHeart, Diabetes, Cancer, rheumatoidarthritis, Heartfailure, stroke, Depression, RenalDisease, Osteoporasis, Alzheimer)",1.000000,0.956522,0.956522,0.956522,1.000000,0.000000,1.000000
523247,(RenalDisease),"(ObstrPulmonary, IschemicHeart, Diabetes, Cancer, rheumatoidarthritis, Heartfailure, stroke, Depression, KidneyDisease, Osteoporasis, Alzheimer)",0.992095,0.958498,0.956522,0.964143,1.005890,0.005601,1.157444
523248,(Osteoporasis),"(ObstrPulmonary, IschemicHeart, Diabetes, Cancer, rheumatoidarthritis, Heartfailure, stroke, Depression, KidneyDisease, RenalDisease, Alzheimer)",0.994071,0.956522,0.956522,0.962227,1.005964,0.005671,1.151030


In [10]:
# CDC_N = NFraud_df[['CDC_2449', 'CDC_25000', 'CDC_2720', 'CDC_2724', 'CDC_4011', 'CDC_4019',
#        'CDC_41401', 'CDC_42731', 'CDC_4280', 'CDC_496', 'CDC_53081',
#        'CDC_5990', 'CDC_V5861', 'CDC_V5869']].copy()

# CDC_N = CDC_N.astype(bool)

# frequent_CDC_N = apriori(CDC_N, min_support=0.8, use_colnames=True)
# MBA_CDC_N = association_rules(frequent_CDC_N, metric="lift", min_threshold=0.7)


CDC_F = Fraud_df[['CDC_2449', 'CDC_25000', 'CDC_2720', 'CDC_2724', 'CDC_4011', 'CDC_4019',
       'CDC_41401', 'CDC_42731', 'CDC_4280', 'CDC_496', 'CDC_53081',
       'CDC_5990', 'CDC_V5861', 'CDC_V5869']].copy()

CDC_F = CDC_F.astype(bool)

frequent_CDC_F = apriori(CDC_F, min_support=0.8, use_colnames=True)
MBA_CDC_F = association_rules(frequent_CDC_F, metric="lift", min_threshold=0.7)

In [16]:
MBA_CDC_N

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction


In [38]:
DGC_F = Fraud_df[['Provider', 'Male_Patient', 'Female_Patient', 'Age(26-40)',
       'Age(41-60)', 'Age(61-80)', 'Age(81-100)', 'Race_White', 'Race_Black',
       'Race_Hispanic', 'Race_Native', 'Hos_Stay_Total', 'Payment_Total',
       'Claim_Len_Total']]
DGC_F = DGC_F.astype(bool)

frequent_DGC_F = apriori(DGC_F, min_support=0.8, use_colnames=True)
MBA_DGC_F = association_rules(frequent_DGC_F, metric="lift", min_threshold=0.7)

# DGC_N = NFraud_df[['Provider', 'Male_Patient', 'Female_Patient', 'Age(26-40)',
#        'Age(41-60)', 'Age(61-80)', 'Age(81-100)', 'Race_White', 'Race_Black',
#        'Race_Hispanic', 'Race_Native', 'Hos_Stay_Total', 'Payment_Total',
#        'Claim_Len_Total', ]]
# DGC_N = DGC_N.astype(bool)

# frequent_DGC_N = apriori(DGC_N, min_support=0.8, use_colnames=True)
# # MBA_DGC_N = association_rules(frequent_DGC_N, metric="lift", min_threshold=0.7)

In [43]:
MBA_DGC_F.loc[ (MBA_DGC_F.confidence > 0.95) & (MBA_DGC_F.lift > 1.1) ]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction


In [17]:
Fraud_df.columns

Index(['Index', 'Provider', 'Male_Patient', 'Female_Patient', 'Age(26-40)',
       'Age(41-60)', 'Age(61-80)', 'Age(81-100)', 'Race_White', 'Race_Black',
       'Race_Hispanic', 'Race_Native', 'Hos_Stay_Total', 'Payment_Total',
       'Claim_Len_Total', 'Alzheimer', 'Heartfailure', 'KidneyDisease',
       'Cancer', 'ObstrPulmonary', 'Depression', 'Diabetes', 'IschemicHeart',
       'Osteoporasis', 'rheumatoidarthritis', 'stroke', 'RenalDisease',
       'State_Count', 'County_Count', 'DiagGroupCode', 'PotentialFraud',
       'CDC_2449', 'CDC_25000', 'CDC_2720', 'CDC_2724', 'CDC_4011', 'CDC_4019',
       'CDC_41401', 'CDC_42731', 'CDC_4280', 'CDC_496', 'CDC_53081',
       'CDC_5990', 'CDC_V5861', 'CDC_V5869', 'CPC_2724', 'CPC_3722',
       'CPC_3893', 'CPC_3995', 'CPC_4019', 'CPC_4516', 'CPC_66', 'CPC_8151',
       'CPC_8154', 'CPC_9904', 'DGC_166', 'DGC_167', 'DGC_183', 'DGC_186',
       'DGC_187', 'DGC_188', 'DGC_192', 'DGC_202', 'DGC_208', 'DGC_881',
       'DGC_882', 'DGC_883', 'DGC_

# EDA on New Features

In [191]:
Fraud_df[['CDC_41401', 'CDC_496', 'CDC_5990', 'CDC_53081', 'Age(26-40)',  'State_Count',  'Age(81-100)',  'Female_Patient', 'CDC_4019', 'DGC_202', 'DGC_192',
 'DGC_188', 'CPC_66', 'CPC_3995']].describe()

Unnamed: 0,CDC_41401,CDC_496,CDC_5990,CDC_53081,Age(26-40),State_Count,Age(81-100),Female_Patient,CDC_4019,DGC_202,DGC_192,DGC_188,CPC_66,CPC_3995
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,5.05336,4.41502,4.48419,4.626482,6.494071,5.379447,70.411067,139.179842,7.197628,0.187747,0.193676,0.1917,0.478261,0.44664
std,2.754496,2.713679,2.741819,2.681216,9.29095,5.430961,101.843041,198.716638,2.272625,0.473378,0.493562,0.508161,0.503966,0.497637
min,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,2.0,2.0,2.0,1.0,2.0,14.0,29.0,6.0,0.0,0.0,0.0,0.0,0.0
50%,5.0,4.0,5.0,5.0,3.0,4.0,34.0,67.5,8.0,0.0,0.0,0.0,0.0,0.0
75%,7.0,7.0,7.0,7.0,7.75,6.0,76.75,146.75,9.0,0.0,0.0,0.0,1.0,1.0
max,10.0,10.0,10.0,10.0,82.0,38.0,876.0,1641.0,10.0,2.0,4.0,3.0,2.0,1.0


In [193]:
Fraud_df[['CDC_41401', 'CDC_496', 'CDC_5990', 'CDC_53081', 'Age(26-40)',  'State_Count',  'Age(81-100)',  'Female_Patient', 'CDC_4019', 'DGC_202', 'DGC_192',
 'DGC_188', 'CPC_66', 'CPC_3995']].median()

CDC_41401          5.0
CDC_496            4.0
CDC_5990           5.0
CDC_53081          5.0
Age(26-40)         3.0
State_Count        4.0
Age(81-100)       34.0
Female_Patient    67.5
CDC_4019           8.0
DGC_202            0.0
DGC_192            0.0
DGC_188            0.0
CPC_66             0.0
CPC_3995           0.0
dtype: float64

In [192]:
NFraud_df[['CDC_41401', 'CDC_496', 'CDC_5990', 'CDC_53081', 'Age(26-40)',  'State_Count',  'Age(81-100)',  'Female_Patient', 'CDC_4019', 'DGC_202', 'DGC_192',
 'DGC_188', 'CPC_66', 'CPC_3995']].sample(500).describe()

Unnamed: 0,CDC_41401,CDC_496,CDC_5990,CDC_53081,Age(26-40),State_Count,Age(81-100),Female_Patient,CDC_4019,DGC_202,DGC_192,DGC_188,CPC_66,CPC_3995
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,0.0,0.0,0.0,0.0,1.358,2.328,15.298,30.974,3.384,0.0,0.0,0.0,0.072,0.066
std,0.0,0.0,0.0,0.0,2.516048,2.472376,25.950956,50.249331,2.658322,0.0,0.0,0.0,0.258747,0.248531
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,1.0,2.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,1.0,6.0,13.0,3.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,2.0,3.0,17.0,34.25,5.0,0.0,0.0,0.0,0.0,0.0
max,0.0,0.0,0.0,0.0,24.0,19.0,189.0,344.0,10.0,0.0,0.0,0.0,1.0,1.0


In [194]:
NFraud_df[['CDC_41401', 'CDC_496', 'CDC_5990', 'CDC_53081', 'Age(26-40)',  'State_Count',  'Age(81-100)',  'Female_Patient', 'CDC_4019', 'DGC_202', 'DGC_192',
 'DGC_188', 'CPC_66', 'CPC_3995']].median()

CDC_41401          0.0
CDC_496            0.0
CDC_5990           0.0
CDC_53081          0.0
Age(26-40)         0.0
State_Count        1.0
Age(81-100)        6.0
Female_Patient    13.0
CDC_4019           3.0
DGC_202            0.0
DGC_192            0.0
DGC_188            0.0
CPC_66             0.0
CPC_3995           0.0
dtype: float64

In [59]:
Fraud_CM = Fraud_df.drop('Index', axis = 1).corr().unstack().drop_duplicates().\
to_frame().reset_index().rename(columns = {'level_0' : 'Feature1', 'level_1' : 'Feature2', 0 : 'Corr_Coef'})
Fraud_CM.loc[Fraud_CM.Corr_Coef > 0.6].sort_values(ascending = False, by = 'Corr_Coef').head(15)

Unnamed: 0,Feature1,Feature2,Corr_Coef
0,Male_Patient,Male_Patient,1.0
914,Diabetes,IschemicHeart,0.999344
714,Heartfailure,IschemicHeart,0.998905
713,Heartfailure,Diabetes,0.99889
875,Depression,Diabetes,0.998363
670,Alzheimer,Diabetes,0.998023
876,Depression,IschemicHeart,0.998007
671,Alzheimer,IschemicHeart,0.997896
712,Heartfailure,Depression,0.997711
709,Heartfailure,KidneyDisease,0.997623


In [60]:
NFraud_CM = NFraud_df.drop('Index', axis = 1).corr().unstack().drop_duplicates().\
to_frame().reset_index().rename(columns = {'level_0' : 'Feature1', 'level_1' : 'Feature2', 0 : 'Corr_Coef'})
NFraud_CM.loc[NFraud_CM.Corr_Coef > 0.6].sort_values(ascending = False, by = 'Corr_Coef').head(15)

Unnamed: 0,Feature1,Feature2,Corr_Coef
0,Male_Patient,Male_Patient,1.0
914,Diabetes,IschemicHeart,0.997403
714,Heartfailure,IschemicHeart,0.996281
713,Heartfailure,Diabetes,0.995883
61,Female_Patient,Age(61-80),0.993114
875,Depression,Diabetes,0.992857
63,Female_Patient,Race_White,0.99281
876,Depression,IschemicHeart,0.992658
4,Male_Patient,Age(61-80),0.992476
755,KidneyDisease,Diabetes,0.99239
