## Importing Necessary Libraries

In [1]:
# These libraries will be used for EDA and data manipulation
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams

In [2]:
# These libraries will be used for machine learning models
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

## Importing csv with engineered features and merging them 

In [3]:
pd.set_option('display.max_columns', 100) # shows all the columns

# loading all the datasets
features_1 = pd.read_csv('/Users/suborna/Github/Capstone_Project/Data/Final_Provider1.csv') # Anthony
features_2 = pd.read_csv('/Users/suborna/Github/Capstone_Project/Data/Final_Provider2.csv') # Suborna

# merging features_1 & features_2 to have the complete dataset of new features
All_Features = features_1.merge(features_2, on = 'Provider')

# setting provider as index for ML modeling
Train_set = features_1.merge(features_2, on = 'Provider').set_index('Provider')

# replacing categories with ordinal data for the Potential Fraud labels
Train_set['PotentialFraud'] = Train_set['PotentialFraud'].replace(['Yes', 'No'], [1,0])

# seperating the dataset into 2 frames: fraud & not fraud
## Potential Fraud dataset
fraud_df = All_Features.loc[All_Features.PotentialFraud == 'Yes']
Fraud_df = fraud_df.reset_index().rename(columns = {'index': 'Index'})

## Not Potential Fraud Dataset
nfraud_df = All_Features.loc[All_Features.PotentialFraud == 'No']
NFraud_df = nfraud_df.reset_index().rename(columns = {'index': 'Index'})

In [4]:
Train_set.isnull().sum().sum()

0

In [5]:
All_Features.shape # checking the count of rows and columns

(5410, 60)

In [6]:
# check the distribution of the potential fraud/Nfraud labels (0 means not potential fraud, 1 means potential fraud)
np.round(Train_set.PotentialFraud.value_counts()/All_Features.shape[0], 2)

0    0.91
1    0.09
Name: PotentialFraud, dtype: float64

In [7]:
All_Features.sample(5) # randomly sampling 5 rows of data

Unnamed: 0,Provider,CDC_4019,CDC_25000,CDC_2724,CDC_V5869,CDC_42731,CDC_4011,CDC_V5861,CDC_2449,CDC_2720,CDC_4280,DGC_882,DGC_884,DGC_166,DGC_183,DGC_939,DGC_883,DGC_168,DGC_887,DGC_204,DGC_867,CPC_4019.0,CPC_2724.0,CPC_9904.0,CPC_8154.0,CPC_66.0,CPC_3893.0,CPC_3995.0,CPC_4516.0,CPC_3722.0,CPC_8151.0,Male_Patient,Female_Patient,Age(26-40),Age(41-60),Age(61-80),Age(81-100),Race_White,Race_Black,Race_Hispanic,Race_Native,Hos_Stay_Total,Payment_Total,Claim_Len_Total,Alzheimer,Heartfailure,KidneyDisease,Cancer,ObstrPulmonary,Depression,Diabetes,IschemicHeart,Osteoporasis,rheumatoidarthritis,stroke,RenalDisease,State_Count,County_Count,DiagGroupCode,PotentialFraud
5302,PRV57636,4,1,1,0,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,3.0,0.0,1.0,1.0,3.0,4.0,1.0,0.0,0.0,21.0,33340.0,21,5,2,2,0,2,2,3,3,1,2,0,1,1,2,5.0,No
4861,PRV57103,255,107,120,99,78,89,71,81,70,45,0,0,0,1,0,0,0,0,0,0,1,1,1,0,3,0,0,0,0,1,365.0,439.0,20.0,85.0,505.0,194.0,754.0,13.0,22.0,15.0,121.0,927942.0,2718,711,1180,775,279,588,848,1359,1483,662,714,172,316,18,76,29.0,Yes
4845,PRV57085,3,1,2,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1.0,4.0,0.0,1.0,3.0,1.0,5.0,0.0,0.0,0.0,23.0,47340.0,23,2,4,4,1,3,3,5,5,1,2,1,3,1,2,5.0,No
5275,PRV57601,9,5,4,3,0,1,0,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17.0,31.0,1.0,0.0,32.0,15.0,43.0,5.0,0.0,0.0,0.0,21090.0,104,21,37,23,8,18,20,42,48,14,18,4,10,1,6,0.0,No
3726,PRV55665,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,4.0,10068.0,4,0,1,0,1,0,1,1,1,1,1,0,0,1,1,1.0,No


In [8]:
# All_Features.to_csv('/Users/suborna/Github/Capstone_Project/Data/All_Features.csv', index = False)

# Baseline model (Logistic Regression, No Penalty)

In [70]:
# Create instance of Logistic Regression Class
lm = LogisticRegression(class_weight = 'balanced', solver = 'saga', random_state = 42, max_iter = 10000)

In [71]:
# preparing dataset for train, test, split
X = Train_set.drop('PotentialFraud', axis = 1)
y = Train_set['PotentialFraud']

# spliting the dataset into training set & testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, shuffle = True, test_size = 0.3)

X_train = preprocessing.scale(X_train) # scaling the X variables
X_test = preprocessing.scale(X_test) # scaling the X variables

# Grid Search 
params = {'C' : [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.5, 1]}
gs_lr = GridSearchCV(lm, params, cv = 3, scoring = 'accuracy')

%time gs_lr.fit(X_train, y_train)

CPU times: user 1min 8s, sys: 431 ms, total: 1min 8s
Wall time: 1min 6s


GridSearchCV(cv=3, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight='balanced',
                                          dual=False, fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=10000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=42, solver='saga',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.5, 1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [72]:
print('Best Params:', gs_lr.best_params_)
print('Best Score:', gs_lr.best_score_)

Best Params: {'C': 0.0001}
Best Score: 0.9078412821500557


In [73]:
# Using Grid Search Hyperparameters for Featue Selection with Lasso Penalty
lm.set_params(class_weight = 'balanced', solver = 'saga', random_state = 42, C = 0.0001)
lm.fit(X_train, y_train)
y_pred = lm.predict(X_test)

In [74]:
lm.get_params

<bound method BaseEstimator.get_params of LogisticRegression(C=0.0001, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=10000, multi_class='auto', n_jobs=None,
                   penalty='l2', random_state=42, solver='saga', tol=0.0001,
                   verbose=0, warm_start=False)>

In [75]:
print('Train Set Score')
print(lm.score(X_train, y_train))
print(' * '* 30)
print('Test Set Score')
print(lm.score(X_test, y_test))
print(' * '* 30)
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))
print(' * ' * 30)
print('Classification Report') 
print(classification_report(y_test, y_pred))

Train Set Score
0.9067863744388698
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Test Set Score
0.8995686999383857
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Confusion Matrix
[[1358  106]
 [  57  102]]
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Classification Report
              precision    recall  f1-score   support

           0       0.96      0.93      0.94      1464
           1       0.49      0.64      0.56       159

    accuracy                           0.90      1623
   macro avg       0.73      0.78      0.75      1623
weighted avg       0.91      0.90      0.91      1623



In [76]:
Feature_Importance1 = pd.DataFrame(np.column_stack(lm.coef_ ), index = X.columns, columns = ['Coefficients']).sort_values(ascending = False, by = 'Coefficients').reset_index()
Feature_Importance1.loc[Feature_Importance1.Coefficients > 0]

Unnamed: 0,index,Coefficients
0,DiagGroupCode,0.052595
1,Payment_Total,0.048645
2,Hos_Stay_Total,0.047924
3,CPC_3995.0,0.039842
4,CPC_4019.0,0.038947
5,CPC_2724.0,0.038821
6,CPC_66.0,0.038562
7,CPC_8154.0,0.036569
8,CPC_9904.0,0.035171
9,CPC_3893.0,0.033539


# Logistic Regression (Lasso Penalty for Feature Selection)

In [16]:
X = Train_set.drop('PotentialFraud', axis = 1)
y = Train_set['PotentialFraud']

# spliting the dataset into training set & testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, shuffle = True, test_size = 0.3)

X_train = preprocessing.scale(X_train) # scaling the X variables
X_test = preprocessing.scale(X_test) # scaling the X variables

In [17]:
# Create instance of Logistic Regression Class
lm = LogisticRegression(class_weight = 'balanced', solver = 'saga', random_state = 42, max_iter = 10000)

# Grid Search 
params = {'C' :  [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.5, 1], 'penalty': ['l1']}
gs_lr = GridSearchCV(lm, params, cv = 3, scoring = 'accuracy')

%time gs_lr.fit(X_train, y_train)

CPU times: user 2min 1s, sys: 1.64 s, total: 2min 3s
Wall time: 2min 17s


GridSearchCV(cv=3, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight='balanced',
                                          dual=False, fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=10000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=42, solver='saga',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.5, 1],
                         'penalty': ['l1']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [18]:
print('Best Params:', gs_lr.best_params_)
print('Best Score:', gs_lr.best_score_)

Best Params: {'C': 0.001, 'penalty': 'l1'}
Best Score: 0.934511194512098


In [19]:
# Using Grid Search Hyperparameters for Featue Selection with Lasso Penalty
lm.set_params(class_weight = 'balanced', solver = 'saga', random_state = 42, C = 0.001, penalty = 'l1')
lm.fit(X_train, y_train)
y_pred = lm.predict(X_test)

In [20]:
lm.get_params # checking the parameters 

<bound method BaseEstimator.get_params of LogisticRegression(C=0.001, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=10000, multi_class='auto', n_jobs=None,
                   penalty='l1', random_state=42, solver='saga', tol=0.0001,
                   verbose=0, warm_start=False)>

In [21]:
print('Train Set Score')
print(lm.score(X_train, y_train))
print(' * '* 30)
print('Test Set Score')
print(lm.score(X_test, y_test))
print(' * '* 30)
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))
print(' * ' * 30)
print('Classification Report') 
print(classification_report(y_test, y_pred))
print(' * ' * 30)

Train Set Score
0.9324003168735147
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Test Set Score
0.929143561306223
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Confusion Matrix
[[1419   45]
 [  70   89]]
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Classification Report
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1464
           1       0.66      0.56      0.61       159

    accuracy                           0.93      1623
   macro avg       0.81      0.76      0.78      1623
weighted avg       0.92      0.93      0.93      1623

 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 


In [22]:
Feature_Importance2 = pd.DataFrame(np.column_stack(lm.coef_ ), index = X.columns, columns = ['Coefficients']).\
sort_values(ascending = False, by = 'Coefficients').reset_index().rename(columns = {'index' : 'Features'})
Feature_Importance2

Unnamed: 0,Features,Coefficients
0,Payment_Total,0.324231
1,CDC_4019,0.0
2,Alzheimer,0.0
3,Female_Patient,0.0
4,Age(26-40),0.0
5,Age(41-60),0.0
6,Age(61-80),0.0
7,Age(81-100),0.0
8,Race_White,0.0
9,Race_Black,0.0


## Comapison of Lasso Penalty and Random Forest Features with coef > 0.0099

In [286]:
## Features from Lasso
lis1 = ['Payment_Total', 'Hos_Stay_Total', 'RenalDisease', 'stroke',  'CDC_42731', 'CPC_66.0', 'DGC_183', 'DGC_867', 'DGC_204', 'CPC_8151.0']


## Features from Random Forest 
lis2 = ['Hos_Stay_Total', 'DiagGroupCode', 'Payment_Total', 'Claim_Len_Total', 'CDC_4280', 'CDC_2449', 'CDC_42731', 'Race_White', 'CDC_4019',
 'Age(61-80)', 'County_Count', 'Male_Patient', 'ObstrPulmonary', 'Female_Patient', 'Heartfailure', 'Alzheimer', 'IschemicHeart', 'Age(41-60)', 'CDC_2724', 'CPC_8154.0',
 'RenalDisease', 'stroke', 'Age(81-100)', 'CDC_2720', 'KidneyDisease', 'CPC_4019.0', 'CPC_2724.0', 'CDC_4011', 'Cancer']

print('Only in Forest : \n', list(set(lis2) - set(lis1)))
print(' * '* 70)
print('Only in Lasso : \n', list(set(lis1) - set(lis2)))

Only in Forest : 
 ['Age(81-100)', 'CDC_2724', 'Male_Patient', 'CDC_2720', 'DiagGroupCode', 'Female_Patient', 'IschemicHeart', 'County_Count', 'Cancer', 'CPC_8154.0', 'Age(41-60)', 'KidneyDisease', 'CDC_4011', 'Alzheimer', 'CDC_4019', 'CPC_4019.0', 'Age(61-80)', 'Heartfailure', 'CDC_4280', 'CDC_2449', 'ObstrPulmonary', 'CPC_2724.0', 'Race_White', 'Claim_Len_Total']
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Only in Lasso : 
 ['DGC_183', 'DGC_867', 'DGC_204', 'CPC_66.0', 'CPC_8151.0']


# Logistic Regression (Lasso Features)

In [158]:
# Using Lasso Feature Selection and Grid Search Hypermarameters & running Log Regression
lm = LogisticRegression(class_weight = 'balanced', solver = 'saga', random_state = 42, max_iter = 10000)

X = Train_set[['Payment_Total']]
#  'Hos_Stay_Total',
#  'RenalDisease',
#  'CDC_42731',
#  'stroke',
#  'County_Count',
#  'CPC_66.0',
#  'DGC_183',
#  'DGC_867',
#  'Heartfailure',
#  'DGC_204',
#  'CDC_V5861',
#  'State_Count']]
               
y = Train_set['PotentialFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, shuffle = True, test_size = 0.3)

X_train = preprocessing.scale(X_train) # scaling the X variables
X_test = preprocessing.scale(X_test) # scaling the X variables

# Grid Search 
params = {'C' : [0.0001, 0.001, 0.01, 0.1, 0.12, 0.15, 0.18, 0.2, 0.25, 0.3, 1]}

gs_lr = GridSearchCV(lm, params, cv = 3, scoring = 'accuracy')

%time gs_lr.fit(X_train, y_train)

CPU times: user 359 ms, sys: 6.11 ms, total: 365 ms
Wall time: 366 ms


GridSearchCV(cv=3, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight='balanced',
                                          dual=False, fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=10000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=42, solver='saga',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 0.12, 0.15, 0.18, 0.2,
                               0.25, 0.3, 1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [159]:
print('Best Params:', gs_lr.best_params_)
print('Best Score:', gs_lr.best_score_)

Best Params: {'C': 0.0001}
Best Score: 0.934511194512098


In [160]:
# Using Grid Search Hyperparameters for Featue Selection with Lasso Penalty
lm.set_params(class_weight = 'balanced', solver = 'saga', random_state = 42, C = 0.0001)
lm.fit(X_train, y_train)
y_pred = lm.predict(X_test)
lm.get_params

<bound method BaseEstimator.get_params of LogisticRegression(C=0.0001, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=10000, multi_class='auto', n_jobs=None,
                   penalty='l2', random_state=42, solver='saga', tol=0.0001,
                   verbose=0, warm_start=False)>

In [161]:
lm.get_params

<bound method BaseEstimator.get_params of LogisticRegression(C=0.0001, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=10000, multi_class='auto', n_jobs=None,
                   penalty='l2', random_state=42, solver='saga', tol=0.0001,
                   verbose=0, warm_start=False)>

In [162]:
print('Train Set Score')
print(lm.score(X_train, y_train))
print(' * '* 30)
print('Test Set Score')
print(lm.score(X_test, y_test))
print(' * '* 30)
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))
print(' * ' * 30)
print('Classification Report') 
print(classification_report(y_test, y_pred))
print(' * ' * 30)

Train Set Score
0.9345128069712173
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Test Set Score
0.9328404189772027
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Confusion Matrix
[[1428   36]
 [  73   86]]
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Classification Report
              precision    recall  f1-score   support

           0       0.95      0.98      0.96      1464
           1       0.70      0.54      0.61       159

    accuracy                           0.93      1623
   macro avg       0.83      0.76      0.79      1623
weighted avg       0.93      0.93      0.93      1623

 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 


In [332]:
Feature_Importance3 = pd.DataFrame(np.column_stack(lm.coef_ ), index = X.columns, columns = ['Coefficients']).\
sort_values(ascending = False, by = 'Coefficients').reset_index().rename(columns = {'index' : 'Features'})
Feature_Importance3.loc[Feature_Importance3.Coefficients > 0.0099]

Unnamed: 0,Features,Coefficients
0,Payment_Total,0.139709


# Random Forest (Lasso Features)

In [25]:
rf = RandomForestClassifier(class_weight = 'balanced')

X = Train_set[['Payment_Total']]
#  'Hos_Stay_Total',
#  'RenalDisease',
#  'CDC_42731',
#  'stroke',
#  'County_Count',
#  'CPC_66.0',
#  'DGC_183',
#  'DGC_867',
#  'Heartfailure',
#  'DGC_204',
#  'CDC_V5861',
#  'State_Count']]

y = Train_set['PotentialFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, shuffle = True, test_size = 0.3)

X_train = preprocessing.scale(X_train) # scaling the X variables
X_test = preprocessing.scale(X_test) # scaling the X variables

###############################################

# Grid Search
params = {'n_estimators' : range(20, 41, 10), 'max_depth' : range(10, 41, 10), 'max_features' : range(2, 8), \
          'min_samples_leaf' : range(1, 4), 'min_samples_split' : range(3, 8)}

gs_rf = GridSearchCV(rf, params, cv = 3, scoring = 'accuracy')
%time gs_rf.fit(X_train, y_train)

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_feat

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_feat

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_feat

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_feat

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_feat

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_feat

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_feat

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_feat

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_feat

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_feat

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]

ValueError: max_features must be in (0, n_features]



KeyboardInterrupt: 

In [193]:
print('Best Params:', gs_rf.best_params_)
print('Best Score:', gs_rf.best_score_)

Best Params: {'max_depth': 30, 'max_features': 5, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 30}
Best Score: 0.9350402930494855


In [194]:
# Setting the optimized hyperparameters
rf.set_params( max_depth = 30, max_features = 5, min_samples_leaf = 1, min_samples_split = 5, n_estimators = 30 )
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [195]:
rf.get_params

<bound method BaseEstimator.get_params of RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=30, max_features=5,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=30,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)>

In [196]:
print('Train Set Score')
print(rf.score(X_train, y_train))
print(' * '* 30)
print('Test Set Score')
print(rf.score(X_test, y_test))
print(' * '* 30)
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))
print(' * ' * 30)
print('Classification Report') 
print(classification_report(y_test, y_pred))
print(' * ' * 30)

Train Set Score
0.996831264853446
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Test Set Score
0.9260628465804066
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Confusion Matrix
[[1425   39]
 [  81   78]]
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Classification Report
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1464
           1       0.67      0.49      0.57       159

    accuracy                           0.93      1623
   macro avg       0.81      0.73      0.76      1623
weighted avg       0.92      0.93      0.92      1623

 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 


In [182]:
Feature_Importance4 = pd.DataFrame(sorted(zip(X.columns, rf.feature_importances_), key=lambda t:t[1], reverse=True), columns = ['FeatureName',  'Importance'])
Feature_Importance4.loc[Feature_Importance4.Importance > 0.009]

Unnamed: 0,FeatureName,Importance
0,Payment_Total,0.398717
1,Hos_Stay_Total,0.233027
2,CDC_42731,0.145915
3,RenalDisease,0.071974
4,stroke,0.047924
5,Heartfailure,0.037354
6,County_Count,0.030877
7,State_Count,0.017366
8,CDC_V5861,0.012073


# Random Forest (All Features)

In [132]:
rf = RandomForestClassifier(class_weight = 'balanced')

X = Train_set.drop('PotentialFraud', axis = 1)

y = Train_set['PotentialFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, shuffle = True, test_size = 0.3)

X_train = preprocessing.scale(X_train) # scaling the X variables
X_test = preprocessing.scale(X_test) # scaling the X variables

###############################################

# Grid Search
params = {'n_estimators' : range(30, 50, 5), 'max_depth' : range(2, 8), 'max_features' : range(2,8), \
          'min_samples_leaf' : range(1,5), 'min_samples_split' : range(3,9)}

gs_rf = GridSearchCV(rf, params, cv = 3, scoring = 'accuracy')
%time gs_rf.fit(X_train, y_train)

In [27]:
print('Best Params:', gs_rf.best_params_)
print('Best Score:', gs_rf.best_score_)

Best Params: {'max_depth': 7, 'max_features': 7, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 45}
Best Score: 0.9231583711126419


In [133]:
# Setting the optimized hyperparameters
rf.set_params(n_estimators = 45, min_samples_split = 3, min_samples_leaf = 1, max_depth = 7, max_features = 7)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [134]:
rf.get_params

<bound method BaseEstimator.get_params of RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=7, max_features=7,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=3,
                       min_weight_fraction_leaf=0.0, n_estimators=45,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)>

In [135]:
print('Train Set Score')
print(rf.score(X_train, y_train))
print(' * '* 30)
print('Test Set Score')
print(rf.score(X_test, y_test))
print(' * '* 30)
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))
print(' * ' * 30)
print('Classification Report') 
print(classification_report(y_test, y_pred))
print(' * ' * 30)

Train Set Score
0.9358331132822815
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Test Set Score
0.9199014171287738
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Confusion Matrix
[[1383   81]
 [  49  110]]
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.94      0.96      1464
           1       0.58      0.69      0.63       159

    accuracy                           0.92      1623
   macro avg       0.77      0.82      0.79      1623
weighted avg       0.93      0.92      0.92      1623

 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 


In [138]:
Feature_Importance5 = pd.DataFrame(sorted(zip(X.columns, rf.feature_importances_), key=lambda t:t[1], reverse=True), columns = ['FeatureName',  'Importance'])
Feature_Importance5.loc[Feature_Importance5.Importance > 0.01]['FeatureName'].to_list()

['Payment_Total',
 'Hos_Stay_Total',
 'DiagGroupCode',
 'Claim_Len_Total',
 'CDC_2724',
 'CDC_42731',
 'CDC_25000',
 'CDC_4280',
 'CDC_2449',
 'CPC_4019.0',
 'Male_Patient',
 'CDC_4019',
 'stroke',
 'KidneyDisease',
 'Race_White',
 'County_Count',
 'CPC_2724.0',
 'IschemicHeart',
 'ObstrPulmonary',
 'Age(81-100)',
 'rheumatoidarthritis',
 'CPC_66.0',
 'Age(61-80)']

# Random Forest (Features from above with coef > 0.01)

In [139]:
rf = RandomForestClassifier(class_weight = 'balanced')

X = Train_set[['Payment_Total',
 'Hos_Stay_Total',
 'DiagGroupCode',
 'Claim_Len_Total',
 'CDC_2724',
 'CDC_42731',
 'CDC_25000',
 'CDC_4280',
 'CDC_2449',
 'CPC_4019.0',
 'Male_Patient',
 'CDC_4019',
 'stroke',
 'KidneyDisease',
 'Race_White',
 'County_Count',
 'CPC_2724.0',
 'IschemicHeart',
 'ObstrPulmonary',
 'Age(81-100)',
 'rheumatoidarthritis',
 'CPC_66.0',
 'Age(61-80)']]

y = Train_set['PotentialFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, shuffle = True, test_size = 0.3)

X_train = preprocessing.scale(X_train) # scaling the X variables
X_test = preprocessing.scale(X_test) # scaling the X variables

###############################################

# Grid Search
params = {'n_estimators' : range(30, 50, 5), 'max_depth' : range(2, 8), 'max_features' : range(2,8), \
          'min_samples_leaf' : range(1,5), 'min_samples_split' : range(3,9)}

gs_rf = GridSearchCV(rf, params, cv = 3, scoring = 'accuracy')
%time gs_rf.fit(X_train, y_train)

CPU times: user 20min 48s, sys: 8.64 s, total: 20min 57s
Wall time: 21min 5s


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight='balanced',
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                             

In [145]:
print('Best Params:', gs_rf.best_params_)
print('Best Score:', gs_rf.best_score_)

Best Params: {'max_depth': 7, 'max_features': 7, 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 30}
Best Score: 0.9228942401036614


In [148]:
# Setting the optimized hyperparameters
rf.set_params(n_estimators = 30, min_samples_split = 3, min_samples_leaf = 3, max_depth = 7, max_features = 7)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [149]:
rf.get_params

<bound method BaseEstimator.get_params of RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=7, max_features=7,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=3,
                       min_weight_fraction_leaf=0.0, n_estimators=30,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)>

In [150]:
print('Train Set Score')
print(rf.score(X_train, y_train))
print(' * '* 30)
print('Test Set Score')
print(rf.score(X_test, y_test))
print(' * '* 30)
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))
print(' * ' * 30)
print('Classification Report') 
print(classification_report(y_test, y_pred))
print(' * ' * 30)

Train Set Score
0.9339846844467916
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Test Set Score
0.9162045594577942
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Confusion Matrix
[[1370   94]
 [  42  117]]
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.94      0.95      1464
           1       0.55      0.74      0.63       159

    accuracy                           0.92      1623
   macro avg       0.76      0.84      0.79      1623
weighted avg       0.93      0.92      0.92      1623

 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 


In [157]:
Feature_Importance5_1 = pd.DataFrame(sorted(zip(X.columns, rf.feature_importances_), key=lambda t:t[1], reverse=True), columns = ['FeatureName',  'Importance'])
Feature_Importance5_1.loc[Feature_Importance5_1.Importance > 0.01]['FeatureName']

0       DiagGroupCode
1      Hos_Stay_Total
2            CPC_66.0
3       Payment_Total
4          CPC_4019.0
5          CPC_3893.0
6             DGC_204
7          CPC_8154.0
8            CDC_4280
9           CDC_42731
10    Claim_Len_Total
11         CPC_4516.0
12         CPC_2724.0
13            DGC_867
Name: FeatureName, dtype: object

# Gradient Boosting (Lasso Features)

In [198]:
gbm = GradientBoostingClassifier(random_state = 42) # creating an instance of the Class, GB does not need class weight

X = Train_set[['Payment_Total',
 'Hos_Stay_Total',
 'RenalDisease',
 'CDC_42731',
 'stroke',
 'County_Count',
 'CPC_66.0',
 'DGC_183',
 'DGC_867',
 'Heartfailure',
 'DGC_204',
 'CDC_V5861',
 'State_Count']]

y = Train_set['PotentialFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, shuffle = True, test_size = 0.3)

X_train = preprocessing.scale(X_train) # scaling the X variables
X_test = preprocessing.scale(X_test) # scaling the X variables

#########################################################

params = [{
    "learning_rate": [0.01],
    "n_estimators": range(500, 2000, 500),
    "max_depth": range(1, 5),
    "max_features": range(1, 6),  #  sqrt(number of samples) 
    "min_impurity_decrease": [0.01], 
    "min_samples_split": range(2, 6), # having 1 does not make sense, so min starts from 2
    "random_state": [42] }]

gs_gbm = GridSearchCV(gbm, params, cv = 3, scoring = 'accuracy')
%time gs_gbm.fit(X_train, y_train)

CPU times: user 19min 36s, sys: 7.2 s, total: 19min 44s
Wall time: 20min 13s


GridSearchCV(cv=3, error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_c...
                 

In [141]:
print('Best Params:', gs_gbm.best_params_)
print('Best Score:', gs_gbm.best_score_)

Best Params: {'learning_rate': 0.01, 'max_depth': 4, 'max_features': 4, 'min_impurity_decrease': 0.01, 'min_samples_split': 2, 'n_estimators': 400, 'random_state': 42}
Best Score: 0.9371537593810425


In [142]:
gbm.set_params(learning_rate = 0.01, max_depth = 3, max_features = 5, min_impurity_decrease = 0.01, \
               min_samples_split = 2, n_estimators = 500, random_state = 42)
gbm.fit(X_train, y_train)
y_pred = gbm.predict(X_test)

In [143]:
gbm.get_params

<bound method BaseEstimator.get_params of GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.01, loss='deviance', max_depth=3,
                           max_features=5, max_leaf_nodes=None,
                           min_impurity_decrease=0.01, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=500,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=42, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)>

In [144]:
print('Train Set Score')
print(gbm.score(X_train, y_train))
print(' * '* 30)
print('Test Set Score')
print(gbm.score(X_test, y_test))
print(' * '* 30)
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))
print(' * ' * 30)
print('Classification Report') 
print(classification_report(y_test, y_pred))

Train Set Score
0.9543174016371798
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Test Set Score
0.9322242760320394
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Confusion Matrix
[[1435   29]
 [  81   78]]
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Classification Report
              precision    recall  f1-score   support

           0       0.95      0.98      0.96      1464
           1       0.73      0.49      0.59       159

    accuracy                           0.93      1623
   macro avg       0.84      0.74      0.77      1623
weighted avg       0.93      0.93      0.93      1623



In [203]:
Feature_Importance6 = pd.DataFrame(sorted(zip(X.columns, gbm.feature_importances_), key=lambda t:t[1], reverse=True), columns = ['FeatureName',  'Importance'])
Feature_Importance6.loc[Feature_Importance6.Importance > 0.0099]

Unnamed: 0,FeatureName,Importance
0,Payment_Total,0.447989
1,Hos_Stay_Total,0.243133
2,CDC_42731,0.10167
3,RenalDisease,0.050891
4,stroke,0.044852
5,CPC_66.0,0.043999
6,Heartfailure,0.028575
7,State_Count,0.011767


# Gradient Boosting (All Features)

In [33]:
gbm = GradientBoostingClassifier(random_state = 42) # creating an instance of the Class, GB does not need class weight

X = Train_set.drop('PotentialFraud', axis = 1)

y = Train_set['PotentialFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, shuffle = True, test_size = 0.3)

X_train = preprocessing.scale(X_train) # scaling the X variables
X_test = preprocessing.scale(X_test) # scaling the X variables

#########################################################

params = [{
    "learning_rate": [0.01],
    "n_estimators": range(500, 2000, 500),
    "max_depth": range(2, 10, 2),
    "max_features": range(4, 10, 2),  #  sqrt(number of samples) 
    "min_impurity_decrease": [0.01], 
    "min_samples_split": range(2, 8, 2), # having 1 does not make sense, so min starts from 2
    "random_state": [42] }]

gs_gbm = GridSearchCV(gbm, params, cv = 3, scoring = 'accuracy')
%time gs_gbm.fit(X_train, y_train)

CPU times: user 19min 12s, sys: 15.2 s, total: 19min 27s
Wall time: 20min 28s


GridSearchCV(cv=3, error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_c...
                 

In [34]:
print('Best Params:', gs_gbm.best_params_)
print('Best Score:', gs_gbm.best_score_)

Best Params: {'learning_rate': 0.01, 'max_depth': 2, 'max_features': 4, 'min_impurity_decrease': 0.01, 'min_samples_split': 2, 'n_estimators': 1000, 'random_state': 42}
Best Score: 0.9368885827227786


In [35]:
gbm.set_params(learning_rate = 0.01, max_depth = 2, max_features = 4, min_impurity_decrease = 0.01, \
               min_samples_split = 2, n_estimators = 1000, random_state = 42)
gbm.fit(X_train, y_train)
y_pred = gbm.predict(X_test)

In [36]:
gbm.get_params

<bound method BaseEstimator.get_params of GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.01, loss='deviance', max_depth=2,
                           max_features=4, max_leaf_nodes=None,
                           min_impurity_decrease=0.01, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=1000,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=42, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)>

In [37]:
print('Train Set Score')
print(gbm.score(X_train, y_train))
print(' * '* 30)
print('Test Set Score')
print(gbm.score(X_test, y_test))
print(' * '* 30)
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))
print(' * ' * 30)
print('Classification Report') 
print(classification_report(y_test, y_pred))

Train Set Score
0.950884605228413
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Test Set Score
0.9260628465804066
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Confusion Matrix
[[1434   30]
 [  90   69]]
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Classification Report
              precision    recall  f1-score   support

           0       0.94      0.98      0.96      1464
           1       0.70      0.43      0.53       159

    accuracy                           0.93      1623
   macro avg       0.82      0.71      0.75      1623
weighted avg       0.92      0.93      0.92      1623



In [114]:
Feature_Importance7 = pd.DataFrame(sorted(zip(X.columns, gbm.feature_importances_), key=lambda t:t[1], reverse=True), columns = ['FeatureName',  'Importance'])
Feature_Importance7.loc[Feature_Importance7.Importance > 0.0]['FeatureName'].to_list()

['CPC_8151.0',
 'CPC_3995.0',
 'Hos_Stay_Total',
 'Payment_Total',
 'DiagGroupCode',
 'CPC_8154.0',
 'CPC_66.0',
 'CPC_9904.0',
 'CPC_2724.0',
 'CPC_3893.0',
 'CPC_4019.0',
 'DGC_204',
 'DGC_183',
 'Claim_Len_Total',
 'DGC_867',
 'CDC_42731',
 'CPC_4516.0',
 'CDC_4280']

# Gradient Boosting (Above Features coef > 0.0)

In [115]:
gbm = GradientBoostingClassifier(random_state = 42) # creating an instance of the Class, GB does not need class weight

X = Train_set[['CPC_8151.0',
 'CPC_3995.0',
 'Hos_Stay_Total',
 'Payment_Total',
 'DiagGroupCode',
 'CPC_8154.0',
 'CPC_66.0',
 'CPC_9904.0',
 'CPC_2724.0',
 'CPC_3893.0',
 'CPC_4019.0',
 'DGC_204',
 'DGC_183',
 'Claim_Len_Total',
 'DGC_867',
 'CDC_42731',
 'CPC_4516.0',
 'CDC_4280']]

y = Train_set['PotentialFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, shuffle = True, test_size = 0.3)

X_train = preprocessing.scale(X_train) # scaling the X variables
X_test = preprocessing.scale(X_test) # scaling the X variables

#########################################################

params = [{
    "learning_rate": [0.01],
    "n_estimators": range(100, 800, 300),
    "max_depth": range(3, 7),
    "max_features": range(4, 8),  #  sqrt(number of samples) 
    "min_impurity_decrease": [0.01], 
    "min_samples_split": range(2, 5), # having 1 does not make sense, so min starts from 2
    "random_state": [42] }]

gs_gbm = GridSearchCV(gbm, params, cv = 3, scoring = 'accuracy')
%time gs_gbm.fit(X_train, y_train)

CPU times: user 8min 56s, sys: 3.33 s, total: 8min 59s
Wall time: 9min 18s


GridSearchCV(cv=3, error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_c...
                 

In [117]:
print('Best Params:', gs_gbm.best_params_)
print('Best Score:', gs_gbm.best_score_)

Best Params: {'learning_rate': 0.01, 'max_depth': 4, 'max_features': 4, 'min_impurity_decrease': 0.01, 'min_samples_split': 2, 'n_estimators': 400, 'random_state': 42}
Best Score: 0.9371537593810425


In [118]:
gbm.set_params(learning_rate = 0.01, max_depth = 4, max_features = 4, min_impurity_decrease = 0.01, \
               min_samples_split = 2, n_estimators = 400, random_state = 42)

gbm.fit(X_train, y_train)
y_pred = gbm.predict(X_test)

In [119]:
print('Train Set Score')
print(gbm.score(X_train, y_train))
print(' * '* 30)
print('Test Set Score')
print(gbm.score(X_test, y_test))
print(' * '* 30)
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))
print(' * ' * 30)
print('Classification Report') 
print(classification_report(y_test, y_pred))

Train Set Score
0.9590705043570108
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Test Set Score
0.9322242760320394
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Confusion Matrix
[[1440   24]
 [  86   73]]
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Classification Report
              precision    recall  f1-score   support

           0       0.94      0.98      0.96      1464
           1       0.75      0.46      0.57       159

    accuracy                           0.93      1623
   macro avg       0.85      0.72      0.77      1623
weighted avg       0.92      0.93      0.92      1623



In [226]:
Feature_Importance7 = pd.DataFrame(sorted(zip(X.columns, gbm.feature_importances_), key=lambda t:t[1], reverse=True), columns = ['FeatureName',  'Importance'])
Feature_Importance7.loc[Feature_Importance7.Importance > 0.0099]['FeatureName']

0     Payment_Total
1    Hos_Stay_Total
2         CDC_42731
3      RenalDisease
4          CPC_66.0
5            stroke
6      Heartfailure
7       State_Count
Name: FeatureName, dtype: object

# Support Vector Machines (Lasso Features)

In [215]:
svc = SVC(kernel='linear', class_weight = 'balanced', random_state = 42)

X = Train_set[['Payment_Total',
 'Hos_Stay_Total',
 'RenalDisease',
 'CDC_42731',
 'stroke',
 'County_Count',
 'CPC_66.0',
 'DGC_183',
 'DGC_867',
 'Heartfailure',
 'DGC_204',
 'CDC_V5861',
 'State_Count']]

y = Train_set['PotentialFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, shuffle = True, test_size = 0.3)

X_train = preprocessing.scale(X_train) # scaling the X variables
X_test = preprocessing.scale(X_test) # scaling the X variables

##############################################################

grid_params = {'C': [ 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]}

gs_svc = GridSearchCV(svc, grid_params, cv = 3, scoring = 'accuracy')
%time gs_svc.fit(X_train, y_train)

CPU times: user 4.69 s, sys: 33.7 ms, total: 4.73 s
Wall time: 4.85 s


GridSearchCV(cv=3, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight='balanced', coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=False, random_state=42, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [216]:
print('Best Params:', gs_svc.best_params_)
print('Best Score:', gs_svc.best_score_)

Best Params: {'C': 0.3}
Best Score: 0.891999486377072


In [217]:
svc.set_params(kernel='linear', class_weight = 'balanced', random_state = 42, C = 0.3)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

In [85]:
svc_index = svc.support_ # index of support vector
print('Index of support vector: ' + str(svc_index))

Index of support vector: [   0   13   15 ... 3699 3700 3780]


In [219]:
print('Train Set Score')
print(svc.score(X_train, y_train))
print(' * '* 30)
print('Test Set Score')
print(gs_svc.score(X_test, y_test))
print(' * '* 30)
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))
print(' * ' * 30)
print('Classification Report') 
print(classification_report(y_test, y_pred))

Train Set Score
0.8941114338526538
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Test Set Score
0.893407270486753
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Confusion Matrix
[[1326  138]
 [  35  124]]
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.91      0.94      1464
           1       0.47      0.78      0.59       159

    accuracy                           0.89      1623
   macro avg       0.72      0.84      0.76      1623
weighted avg       0.93      0.89      0.90      1623



In [220]:
Feature_Importance8 = pd.DataFrame(np.column_stack(lm.coef_ ), index = X.columns, columns = ['Coefficients']).\
sort_values(ascending = False, by = 'Coefficients').reset_index()
Feature_Importance8.loc[Feature_Importance8.Coefficients > 0]

Unnamed: 0,index,Coefficients
0,Payment_Total,1.523955
1,Hos_Stay_Total,1.174641
2,RenalDisease,0.331702
3,CDC_42731,0.28398
4,stroke,0.248491
5,CPC_66.0,0.182121
6,DGC_183,0.130161
7,DGC_867,0.077975
8,DGC_204,0.028054


# Support Vector (All Features)

In [102]:
svc = SVC(kernel='linear', class_weight = 'balanced', random_state = 42)

X = Train_set.drop('PotentialFraud', axis = 1)

y = Train_set['PotentialFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, shuffle = True, test_size = 0.3)

X_train = preprocessing.scale(X_train) # scaling the X variables
X_test = preprocessing.scale(X_test) # scaling the X variables

##############################################################

grid_params = {'C': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]}

gs_svc = GridSearchCV(svc, grid_params, cv = 3, scoring = 'accuracy')
%time gs_svc.fit(X_train, y_train)

CPU times: user 15.7 s, sys: 48.8 ms, total: 15.8 s
Wall time: 15.8 s


GridSearchCV(cv=3, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight='balanced', coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=False, random_state=42, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5,
                               0.6, 0.7, 0.8, 0.9, 1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [103]:
print('Best Params:', gs_svc.best_params_)
print('Best Score:', gs_svc.best_score_)

Best Params: {'C': 0.0001}
Best Score: 0.9189312293196713


In [104]:
svc.set_params(kernel='linear', class_weight = 'balanced', random_state = 42, C = 0.0001)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

In [105]:
svc.get_params

<bound method BaseEstimator.get_params of SVC(C=0.0001, break_ties=False, cache_size=200, class_weight='balanced',
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma='scale',
    kernel='linear', max_iter=-1, probability=False, random_state=42,
    shrinking=True, tol=0.001, verbose=False)>

In [106]:
print('Train Set Score')
print(svc.score(X_train, y_train))
print(' * '* 30)
print('Test Set Score')
print(gs_svc.score(X_test, y_test))
print(' * '* 30)
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))
print(' * ' * 30)
print('Classification Report') 
print(classification_report(y_test, y_pred))

Train Set Score
0.9184050699762345
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Test Set Score
0.9143561306223044
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Confusion Matrix
[[1385   79]
 [  60   99]]
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Classification Report
              precision    recall  f1-score   support

           0       0.96      0.95      0.95      1464
           1       0.56      0.62      0.59       159

    accuracy                           0.91      1623
   macro avg       0.76      0.78      0.77      1623
weighted avg       0.92      0.91      0.92      1623



In [108]:
Feature_Importance9 = pd.DataFrame(np.column_stack(svc.coef_ ), index = X.columns, columns = ['Coefficients']).\
sort_values(ascending = False, by = 'Coefficients').reset_index()
Feature_Importance9.loc[Feature_Importance9.Coefficients > 0.03]

Unnamed: 0,index,Coefficients
0,DiagGroupCode,0.071391
1,Hos_Stay_Total,0.066165
2,Payment_Total,0.065082
3,CPC_66.0,0.056345
4,CPC_3995.0,0.054783
5,CPC_2724.0,0.05238
6,CPC_4019.0,0.052177
7,CPC_8154.0,0.051157
8,CPC_9904.0,0.046443
9,CPC_8151.0,0.044089


# Suppport Vectors (Above Features Top 18)

In [151]:
svc = SVC(kernel='linear', class_weight = 'balanced', random_state = 42)

X = Train_set[['DiagGroupCode',
 'Hos_Stay_Total',
 'Payment_Total',
 'CPC_66.0',
 'CPC_3995.0',
 'CPC_2724.0',
 'CPC_4019.0',
 'CPC_8154.0',
 'CPC_9904.0',
 'CPC_8151.0',
 'DGC_204',
 'CDC_4280',
 'DGC_867',
 'CPC_3893.0',
 'CDC_42731',
 'CPC_4516.0',
 'DGC_183',
 'Claim_Len_Total']]

y = Train_set['PotentialFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, shuffle = True, test_size = 0.3)

X_train = preprocessing.scale(X_train) # scaling the X variables
X_test = preprocessing.scale(X_test) # scaling the X variables

##############################################################

grid_params = {'C': [ 0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]}

gs_svc = GridSearchCV(svc, grid_params, cv = 3, scoring = 'accuracy')
%time gs_svc.fit(X_train, y_train)

CPU times: user 6.98 s, sys: 79.1 ms, total: 7.06 s
Wall time: 7.27 s


GridSearchCV(cv=3, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight='balanced', coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=False, random_state=42, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5,
                               0.6, 0.7, 0.8, 0.9, 1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [152]:
print('Best Params:', gs_svc.best_params_)
print('Best Score:', gs_svc.best_score_)

Best Params: {'C': 0.0001}
Best Score: 0.9318707209417201


In [153]:
svc.set_params(kernel='linear', class_weight = 'balanced', random_state = 42, C = 0.0001)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

In [154]:
print('Train Set Score')
print(svc.score(X_train, y_train))
print(' * '* 30)
print('Test Set Score')
print(gs_svc.score(X_test, y_test))
print(' * '* 30)
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))
print(' * ' * 30)
print('Classification Report') 
print(classification_report(y_test, y_pred))

Train Set Score
0.9313440718246633
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Test Set Score
0.9322242760320394
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Confusion Matrix
[[1421   43]
 [  67   92]]
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Classification Report
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1464
           1       0.68      0.58      0.63       159

    accuracy                           0.93      1623
   macro avg       0.82      0.77      0.79      1623
weighted avg       0.93      0.93      0.93      1623



In [156]:
Feature_Importance10 = pd.DataFrame(np.column_stack(svc.coef_ ), index = X.columns, columns = ['Coefficients']).\
sort_values(ascending = False, by = 'Coefficients').reset_index()
Feature_Importance10.loc[Feature_Importance10.Coefficients > 0.01]

Unnamed: 0,index,Coefficients
0,CDC_42731,0.093197
1,Claim_Len_Total,0.092754
2,CDC_4280,0.090046
3,Payment_Total,0.089594
4,DiagGroupCode,0.078254
5,Hos_Stay_Total,0.071867
6,CPC_3995.0,0.065245
7,CPC_66.0,0.061826
8,CPC_4019.0,0.057919
9,CPC_2724.0,0.057417


********************************************************

************************************************** 

************************************************** 

# Combined feature engineered test features

In [233]:
# features_1 = pd.read_csv('/Users/suborna/Github/Capstone_Project/Data/Test_New_Features1.csv') # Anthony
# features_2 = pd.read_csv('/Users/suborna/Github/Capstone_Project/Data/Test_New_Features2.csv') # Suborna

# features = features_1.merge(features_2, on = 'Provider')

# # features.to_csv('/Users/suborna/Github/Capstone_Project/Data/test_features.csv', index = False)

# Market-basket Analysis (MBA)

In [3]:
pd.set_option('display.max_colwidth', 500)

In [4]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [50]:
chronic_F = Fraud_df[['Alzheimer',
       'Heartfailure', 'KidneyDisease', 'Cancer', 'ObstrPulmonary',
       'Depression', 'Diabetes', 'IschemicHeart', 'Osteoporasis',
       'rheumatoidarthritis', 'stroke', 'RenalDisease']].copy()

chronic_F = chronic_F.astype(bool)

frequent_chronic_F = apriori(chronic_F, min_support=0.8, use_colnames=True)
MBA_chronic_F = association_rules(frequent_chronic_F, metric="lift", min_threshold=0.7)

# chronic_N = NFraud_df[['Alzheimer',
#        'Heartfailure', 'KidneyDisease', 'Cancer', 'ObstrPulmonary',
#        'Depression', 'Diabetes', 'IschemicHeart', 'Osteoporasis',
#        'rheumatoidarthritis', 'stroke', 'RenalDisease']].copy()

# chronic_N = chronic_N.astype(bool)

# frequent_chronic_N = apriori(chronic_N, min_support=0.8, use_colnames=True)
# MBA_chronic_N = association_rules(frequent_chronic_N, metric="lift", min_threshold=0.7)

In [66]:
MBA_chronic_F.loc[MBA_chronic_F.support > 0.95] # IschemicHeart, Heartfailure, Diabetes

In [167]:
# CDC_N = NFraud_df[['CDC_2449', 'CDC_25000', 'CDC_2720', 'CDC_2724', 'CDC_4011', 'CDC_4019',
#        'CDC_41401', 'CDC_42731', 'CDC_4280', 'CDC_496', 'CDC_53081',
#        'CDC_5990', 'CDC_V5861', 'CDC_V5869']].copy()

# CDC_N = CDC_N.astype(bool)

# frequent_CDC_N = apriori(CDC_N, min_support=0.8, use_colnames=True)
# MBA_CDC_N = association_rules(frequent_CDC_N, metric="lift", min_threshold=0.7)

# 'State_Count',  'Age(81-100)',  'Female_Patient', 'CDC_4019'
#        'Alzheimer', 'Heartfailure', 'KidneyDisease', 'Cancer', 'ObstrPulmonary','Depression', 'Diabetes', 'IschemicHeart', 'Osteoporasis', 'rheumatoidarthritis', 'stroke', 'RenalDisease' 
CDC_Chronic_F = Fraud_df[['CDC_41401', 'CDC_496', 'CDC_5990', 'CDC_53081', 'CDC_4019']].copy()

CDC_Chronic_F = CDC_Chronic_F.astype(bool)

frequent_CDC_F = apriori(CDC_Chronic_F, min_support=0.8, use_colnames=True)
MBA_CDC_F = association_rules(frequent_CDC_F, metric="lift", min_threshold=0.7)

In [168]:
MBA_CDC_F.loc[MBA_CDC_F.lift > 1.1]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction


In [38]:
DGC_F = Fraud_df[['Provider', 'Male_Patient', 'Female_Patient', 'Age(26-40)',
       'Age(41-60)', 'Age(61-80)', 'Age(81-100)', 'Race_White', 'Race_Black',
       'Race_Hispanic', 'Race_Native', 'Hos_Stay_Total', 'Payment_Total',
       'Claim_Len_Total']]

DGC_F = DGC_F.astype(bool)

frequent_DGC_F = apriori(DGC_F, min_support=0.8, use_colnames=True)
MBA_DGC_F = association_rules(frequent_DGC_F, metric="lift", min_threshold=0.7)

# DGC_N = NFraud_df[['Provider', 'Male_Patient', 'Female_Patient', 'Age(26-40)',
#        'Age(41-60)', 'Age(61-80)', 'Age(81-100)', 'Race_White', 'Race_Black',
#        'Race_Hispanic', 'Race_Native', 'Hos_Stay_Total', 'Payment_Total',
#        'Claim_Len_Total', ]]
# DGC_N = DGC_N.astype(bool)

# frequent_DGC_N = apriori(DGC_N, min_support=0.8, use_colnames=True)
# # MBA_DGC_N = association_rules(frequent_DGC_N, metric="lift", min_threshold=0.7)

In [43]:
MBA_DGC_F.loc[ (MBA_DGC_F.confidence > 0.95) & (MBA_DGC_F.lift > 1.1) ]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
