In [197]:
import pandas as pd
from tabulate import tabulate

# Create a DataFrame with the initial specified data
data_1 = {
    'Data': ['Main'], 'AUC': [0.7291], 'TPR': [0.7052], 'TNR': [0.7398], 'G-Mean' : [0.7223]
}

data_2 = {
    'Data': ['Main + features 1'], 'AUC': [0.7357], 'TPR': [0.7531], 'TNR': [0.6040], 'G-Mean' : [0.6744]
}
# Additional metrics to be added
data_3 = {
    'Data': ['Main + features 2'], 'AUC': [0.7898], 'TPR': [0.7884], 'TNR': [0.6459], 'G-Mean' : [0.7136]
}
data_4 = {
    'Data': ['All'], 'AUC': [ 0.7893], 'TPR': [0.7405], 'TNR': [0.6968], 'G-Mean' : [0.7183]
}

df = pd.DataFrame(data_1)
# Update DataFrame with new data
df = pd.concat([df, pd.DataFrame(data_2), pd.DataFrame(data_3), pd.DataFrame(data_4)]).reset_index(drop=True)

# Format numbers with commas for thousands and percentages/decimals appropriately
for col in ['AUC', 'TPR', 'TNR']:
    df[col] = df[col].apply(lambda x: f"{x:.4f}")

# Print the DataFrame using tabulate
print(tabulate(df, headers='keys', tablefmt='grid', showindex=False))


+-------------------+--------+--------+--------+----------+
| Data              |    AUC |    TPR |    TNR |   G-Mean |
| Main              | 0.7291 | 0.7052 | 0.7398 |   0.7223 |
+-------------------+--------+--------+--------+----------+
| Main + features 1 | 0.7357 | 0.7531 | 0.604  |   0.6744 |
+-------------------+--------+--------+--------+----------+
| Main + features 2 | 0.7898 | 0.7884 | 0.6459 |   0.7136 |
+-------------------+--------+--------+--------+----------+
| All               | 0.7893 | 0.7405 | 0.6968 |   0.7183 |
+-------------------+--------+--------+--------+----------+


In [76]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

def count_zero(df, column):
    print(df[df[column] == 0].shape, df[df[column] != 0].shape)


In [77]:
df_leie = pd.read_csv('LEIE_2024.csv', low_memory=False, usecols = ['NPI'])
df_leie['fraud'] = 1
df_leie = df_leie[df_leie['NPI'] != 0]
df_leie

Unnamed: 0,NPI,fraud
2,1972902351,1
6,1922348218,1
26,1942476080,1
30,1275600959,1
33,1891731758,1
...,...,...
79339,1578637385,1
79344,1174561708,1
79347,1538703194,1
79348,1881770485,1


In [168]:
features_main = [
        'Rndrng_NPI', 'Rndrng_Prvdr_Gndr', 'Rndrng_Prvdr_Type', 'Tot_Sbmtd_Chrg', 'Tot_Mdcr_Pymt_Amt', 
        'Bene_Avg_Age', 'Bene_Avg_Risk_Scre'
]
features_ben_1 = [
        'Bene_Age_LT_65_Cnt', 'Bene_Age_65_74_Cnt', 'Bene_Age_75_84_Cnt', 'Bene_Age_GT_84_Cnt',
        'Bene_Feml_Cnt', 'Bene_Male_Cnt', 'Bene_Race_Wht_Cnt', 'Bene_Race_Black_Cnt', 'Bene_Race_API_Cnt', 
        'Bene_Race_Hspnc_Cnt', 'Bene_Race_NatInd_Cnt', 'Bene_Race_Othr_Cnt',
]
features_ben_2 = [ 
        'Bene_CC_AF_Pct', 'Bene_CC_Alzhmr_Pct', 'Bene_CC_Asthma_Pct',
       'Bene_CC_Cncr_Pct', 'Bene_CC_CHF_Pct', 'Bene_CC_CKD_Pct',
       'Bene_CC_COPD_Pct', 'Bene_CC_Dprssn_Pct', 'Bene_CC_Dbts_Pct',
       'Bene_CC_Hyplpdma_Pct', 'Bene_CC_Hyprtnsn_Pct', 'Bene_CC_IHD_Pct',
       'Bene_CC_Opo_Pct', 'Bene_CC_RAOA_Pct', 'Bene_CC_Sz_Pct',
       'Bene_CC_Strok_Pct'
]
features_all = features_main + features_ben_1 + features_ben_2 

In [169]:
# df_prov = pd.read_csv('Medicare_Physician_Other_Practitioners_by_Provider_2013.csv', low_memory=False, usecols=features_all)

df_prov = pd.DataFrame(columns=features_all)

# List of years for which you have data
years = list(range(2021, 2012, -1)) #from 2021 to 2013

# Iterate through each year, loading the data and appending new NPIs
for year in years:
    file_path = f'Medicare_Physician_Other_Practitioners_by_Provider_{year}.csv'
    # Load data from the current year
    df_current = pd.read_csv(file_path, usecols=features_all, low_memory=False, encoding='ISO-8859-1')
    
    # Drop any rows in df_current that have NPIs already in df_prov
    df_current = df_current[~df_current['Rndrng_NPI'].isin(df_prov['Rndrng_NPI'])]
    
    # Append the new data to df_prov
    df_prov = pd.concat([df_prov, df_current], ignore_index=True)

# Check the result
print(df_prov.head())
print(f"Total unique NPIs in the final DataFrame: {df_prov['Rndrng_NPI'].nunique()}")

  df_prov = pd.concat([df_prov, df_current], ignore_index=True)


   Rndrng_NPI Rndrng_Prvdr_Gndr        Rndrng_Prvdr_Type  Tot_Sbmtd_Chrg  \
0  1003000126                 M        Internal Medicine       515976.55   
1  1003000134                 M                Pathology      1136431.94   
2  1003000142                 M           Anesthesiology       295950.73   
3  1003000423                 F  Obstetrics & Gynecology        21300.00   
4  1003000480                 M          General Surgery       180891.00   

   Tot_Mdcr_Pymt_Amt Bene_Avg_Age  Bene_Avg_Risk_Scre  Bene_Age_LT_65_Cnt  \
0          231289.23           78              1.8026                42.0   
1          203094.28           76              1.0785                88.0   
2           93430.64           68              1.4920                77.0   
3            6735.97           66              0.6362                 NaN   
4           28071.92           65              1.8233                39.0   

   Bene_Age_65_74_Cnt  Bene_Age_75_84_Cnt  ...  Bene_CC_COPD_Pct  \
0           

In [170]:
df_prov.rename(columns={'Rndrng_NPI': 'NPI'}, inplace=True)
df_prov.columns
df_prov.dtypes
for i in df_prov.columns:
    i

Index(['NPI', 'Rndrng_Prvdr_Gndr', 'Rndrng_Prvdr_Type', 'Tot_Sbmtd_Chrg',
       'Tot_Mdcr_Pymt_Amt', 'Bene_Avg_Age', 'Bene_Avg_Risk_Scre',
       'Bene_Age_LT_65_Cnt', 'Bene_Age_65_74_Cnt', 'Bene_Age_75_84_Cnt',
       'Bene_Age_GT_84_Cnt', 'Bene_Feml_Cnt', 'Bene_Male_Cnt',
       'Bene_Race_Wht_Cnt', 'Bene_Race_Black_Cnt', 'Bene_Race_API_Cnt',
       'Bene_Race_Hspnc_Cnt', 'Bene_Race_NatInd_Cnt', 'Bene_Race_Othr_Cnt',
       'Bene_CC_AF_Pct', 'Bene_CC_Alzhmr_Pct', 'Bene_CC_Asthma_Pct',
       'Bene_CC_Cncr_Pct', 'Bene_CC_CHF_Pct', 'Bene_CC_CKD_Pct',
       'Bene_CC_COPD_Pct', 'Bene_CC_Dprssn_Pct', 'Bene_CC_Dbts_Pct',
       'Bene_CC_Hyplpdma_Pct', 'Bene_CC_Hyprtnsn_Pct', 'Bene_CC_IHD_Pct',
       'Bene_CC_Opo_Pct', 'Bene_CC_RAOA_Pct', 'Bene_CC_Sz_Pct',
       'Bene_CC_Strok_Pct'],
      dtype='object')

NPI                      object
Rndrng_Prvdr_Gndr        object
Rndrng_Prvdr_Type        object
Tot_Sbmtd_Chrg          float64
Tot_Mdcr_Pymt_Amt       float64
Bene_Avg_Age             object
Bene_Avg_Risk_Scre      float64
Bene_Age_LT_65_Cnt      float64
Bene_Age_65_74_Cnt      float64
Bene_Age_75_84_Cnt      float64
Bene_Age_GT_84_Cnt      float64
Bene_Feml_Cnt           float64
Bene_Male_Cnt           float64
Bene_Race_Wht_Cnt       float64
Bene_Race_Black_Cnt     float64
Bene_Race_API_Cnt       float64
Bene_Race_Hspnc_Cnt     float64
Bene_Race_NatInd_Cnt    float64
Bene_Race_Othr_Cnt      float64
Bene_CC_AF_Pct          float64
Bene_CC_Alzhmr_Pct      float64
Bene_CC_Asthma_Pct      float64
Bene_CC_Cncr_Pct        float64
Bene_CC_CHF_Pct         float64
Bene_CC_CKD_Pct         float64
Bene_CC_COPD_Pct        float64
Bene_CC_Dprssn_Pct      float64
Bene_CC_Dbts_Pct        float64
Bene_CC_Hyplpdma_Pct    float64
Bene_CC_Hyprtnsn_Pct    float64
Bene_CC_IHD_Pct         float64
Bene_CC_

'NPI'

'Rndrng_Prvdr_Gndr'

'Rndrng_Prvdr_Type'

'Tot_Sbmtd_Chrg'

'Tot_Mdcr_Pymt_Amt'

'Bene_Avg_Age'

'Bene_Avg_Risk_Scre'

'Bene_Age_LT_65_Cnt'

'Bene_Age_65_74_Cnt'

'Bene_Age_75_84_Cnt'

'Bene_Age_GT_84_Cnt'

'Bene_Feml_Cnt'

'Bene_Male_Cnt'

'Bene_Race_Wht_Cnt'

'Bene_Race_Black_Cnt'

'Bene_Race_API_Cnt'

'Bene_Race_Hspnc_Cnt'

'Bene_Race_NatInd_Cnt'

'Bene_Race_Othr_Cnt'

'Bene_CC_AF_Pct'

'Bene_CC_Alzhmr_Pct'

'Bene_CC_Asthma_Pct'

'Bene_CC_Cncr_Pct'

'Bene_CC_CHF_Pct'

'Bene_CC_CKD_Pct'

'Bene_CC_COPD_Pct'

'Bene_CC_Dprssn_Pct'

'Bene_CC_Dbts_Pct'

'Bene_CC_Hyplpdma_Pct'

'Bene_CC_Hyprtnsn_Pct'

'Bene_CC_IHD_Pct'

'Bene_CC_Opo_Pct'

'Bene_CC_RAOA_Pct'

'Bene_CC_Sz_Pct'

'Bene_CC_Strok_Pct'

In [171]:
unique_npi_leie = set(df_leie['NPI'])
df_prov['fraud'] = 0
df_prov['fraud'] = df_prov['NPI'].isin(unique_npi_leie).astype(int)


In [172]:
df_prov[df_prov['fraud'] == 0]
len(df_prov[df_prov['fraud'] == 1]['NPI'].unique())

Unnamed: 0,NPI,Rndrng_Prvdr_Gndr,Rndrng_Prvdr_Type,Tot_Sbmtd_Chrg,Tot_Mdcr_Pymt_Amt,Bene_Avg_Age,Bene_Avg_Risk_Scre,Bene_Age_LT_65_Cnt,Bene_Age_65_74_Cnt,Bene_Age_75_84_Cnt,...,Bene_CC_Dprssn_Pct,Bene_CC_Dbts_Pct,Bene_CC_Hyplpdma_Pct,Bene_CC_Hyprtnsn_Pct,Bene_CC_IHD_Pct,Bene_CC_Opo_Pct,Bene_CC_RAOA_Pct,Bene_CC_Sz_Pct,Bene_CC_Strok_Pct,fraud
0,1003000126,M,Internal Medicine,515976.55,231289.23,78,1.8026,42.0,197.0,247.0,...,0.37,0.44,0.75,0.75,0.62,0.11,0.58,0.06,0.14,0
1,1003000134,M,Pathology,1136431.94,203094.28,76,1.0785,88.0,1396.0,1198.0,...,0.15,0.20,0.52,0.51,0.24,0.11,0.38,0.01,0.03,0
2,1003000142,M,Anesthesiology,295950.73,93430.64,68,1.4920,77.0,98.0,52.0,...,0.39,0.38,0.58,0.70,0.33,0.09,0.75,,0.06,0
3,1003000423,F,Obstetrics & Gynecology,21300.00,6735.97,66,0.6362,,47.0,,...,0.28,0.17,0.55,0.46,0.16,0.17,0.38,,,0
4,1003000480,M,General Surgery,180891.00,28071.92,65,1.8233,39.0,55.0,,...,0.32,0.40,0.48,0.65,0.31,0.12,0.59,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1640057,1992996797,M,Chiropractic,7664.00,3429.29,68,0.8558,,17.0,,...,,,0.52,0.33,,0.00,0.33,0.00,,0
1640058,1992996995,F,Occupational therapist,28836.00,14939.43,85,1.8190,,,,...,,,,0.73,,,0.50,,,0
1640059,1992997076,M,Anesthesiology,70856.00,10980.87,72,2.0383,,24.0,14.0,...,0.24,0.33,0.53,0.69,0.39,,0.47,,,0
1640060,1992997316,M,Family Practice,11605.00,2849.56,72,1.0128,,37.0,,...,,0.53,0.47,0.75,0.38,,0.53,,,0


2016

In [173]:
df_prov.isnull().sum()

NPI                           0
Rndrng_Prvdr_Gndr         84744
Rndrng_Prvdr_Type             0
Tot_Sbmtd_Chrg                0
Tot_Mdcr_Pymt_Amt             0
Bene_Avg_Age                  0
Bene_Avg_Risk_Scre            0
Bene_Age_LT_65_Cnt       801654
Bene_Age_65_74_Cnt       412413
Bene_Age_75_84_Cnt       606870
Bene_Age_GT_84_Cnt       822154
Bene_Feml_Cnt            322124
Bene_Male_Cnt            322124
Bene_Race_Wht_Cnt        661288
Bene_Race_Black_Cnt     1146414
Bene_Race_API_Cnt       1312250
Bene_Race_Hspnc_Cnt     1233908
Bene_Race_NatInd_Cnt    1097054
Bene_Race_Othr_Cnt      1311252
Bene_CC_AF_Pct           644380
Bene_CC_Alzhmr_Pct       650701
Bene_CC_Asthma_Pct       797533
Bene_CC_Cncr_Pct         667939
Bene_CC_CHF_Pct          543187
Bene_CC_CKD_Pct          387822
Bene_CC_COPD_Pct         616021
Bene_CC_Dprssn_Pct       384461
Bene_CC_Dbts_Pct         384700
Bene_CC_Hyplpdma_Pct     222061
Bene_CC_Hyprtnsn_Pct     171037
Bene_CC_IHD_Pct          383530
Bene_CC_

In [174]:
df_prov.dropna(subset=['Rndrng_Prvdr_Gndr'], inplace=True)
df_prov.isnull().sum()

NPI                           0
Rndrng_Prvdr_Gndr             0
Rndrng_Prvdr_Type             0
Tot_Sbmtd_Chrg                0
Tot_Mdcr_Pymt_Amt             0
Bene_Avg_Age                  0
Bene_Avg_Risk_Scre            0
Bene_Age_LT_65_Cnt       776620
Bene_Age_65_74_Cnt       400563
Bene_Age_75_84_Cnt       590588
Bene_Age_GT_84_Cnt       796790
Bene_Feml_Cnt            313355
Bene_Male_Cnt            313355
Bene_Race_Wht_Cnt        639695
Bene_Race_Black_Cnt     1099643
Bene_Race_API_Cnt       1253916
Bene_Race_Hspnc_Cnt     1183416
Bene_Race_NatInd_Cnt    1040853
Bene_Race_Othr_Cnt      1259699
Bene_CC_AF_Pct           620384
Bene_CC_Alzhmr_Pct       626241
Bene_CC_Asthma_Pct       766232
Bene_CC_Cncr_Pct         644059
Bene_CC_CHF_Pct          525299
Bene_CC_CKD_Pct          375720
Bene_CC_COPD_Pct         594418
Bene_CC_Dprssn_Pct       369594
Bene_CC_Dbts_Pct         373726
Bene_CC_Hyplpdma_Pct     215507
Bene_CC_Hyprtnsn_Pct     166527
Bene_CC_IHD_Pct          372588
Bene_CC_

In [175]:
exclude_columns = ['NPI', 'Rndrng_Prvdr_Gndr', 'Rndrng_Prvdr_Type', 'fraud']
median_values = df_prov.drop(columns=exclude_columns).median()

# Fill NaN values with medians in the selected columns
df_prov.fillna(median_values, inplace=True)

print(df_prov.isnull().sum())
df_prov.shape
df_prov[df_prov['fraud'] == 1]
len(df_prov['NPI'].unique())

NPI                     0
Rndrng_Prvdr_Gndr       0
Rndrng_Prvdr_Type       0
Tot_Sbmtd_Chrg          0
Tot_Mdcr_Pymt_Amt       0
Bene_Avg_Age            0
Bene_Avg_Risk_Scre      0
Bene_Age_LT_65_Cnt      0
Bene_Age_65_74_Cnt      0
Bene_Age_75_84_Cnt      0
Bene_Age_GT_84_Cnt      0
Bene_Feml_Cnt           0
Bene_Male_Cnt           0
Bene_Race_Wht_Cnt       0
Bene_Race_Black_Cnt     0
Bene_Race_API_Cnt       0
Bene_Race_Hspnc_Cnt     0
Bene_Race_NatInd_Cnt    0
Bene_Race_Othr_Cnt      0
Bene_CC_AF_Pct          0
Bene_CC_Alzhmr_Pct      0
Bene_CC_Asthma_Pct      0
Bene_CC_Cncr_Pct        0
Bene_CC_CHF_Pct         0
Bene_CC_CKD_Pct         0
Bene_CC_COPD_Pct        0
Bene_CC_Dprssn_Pct      0
Bene_CC_Dbts_Pct        0
Bene_CC_Hyplpdma_Pct    0
Bene_CC_Hyprtnsn_Pct    0
Bene_CC_IHD_Pct         0
Bene_CC_Opo_Pct         0
Bene_CC_RAOA_Pct        0
Bene_CC_Sz_Pct          0
Bene_CC_Strok_Pct       0
fraud                   0
dtype: int64


(1555318, 36)

Unnamed: 0,NPI,Rndrng_Prvdr_Gndr,Rndrng_Prvdr_Type,Tot_Sbmtd_Chrg,Tot_Mdcr_Pymt_Amt,Bene_Avg_Age,Bene_Avg_Risk_Scre,Bene_Age_LT_65_Cnt,Bene_Age_65_74_Cnt,Bene_Age_75_84_Cnt,...,Bene_CC_Dprssn_Pct,Bene_CC_Dbts_Pct,Bene_CC_Hyplpdma_Pct,Bene_CC_Hyprtnsn_Pct,Bene_CC_IHD_Pct,Bene_CC_Opo_Pct,Bene_CC_RAOA_Pct,Bene_CC_Sz_Pct,Bene_CC_Strok_Pct,fraud
4425,1003278615,F,Nurse Practitioner,99924.32,58166.69,81,2.7284,12.0,34.0,54.0,...,0.46,0.75,0.74,0.75,0.53,0.18,0.65,0.06,0.18,1
10924,1003926270,M,Physical Medicine and Rehabilitation,21677.68,12461.53,35,0.9738,31.0,70.0,61.0,...,0.30,0.35,0.64,0.73,0.00,0.11,0.47,0.01,0.00,1
11095,1003939471,M,Obstetrics & Gynecology,37052.35,23597.37,71,0.8549,31.0,83.0,54.0,...,0.20,0.24,0.59,0.64,0.28,0.08,0.48,0.01,0.06,1
25699,1023045606,M,Emergency Medicine,14847.00,3400.29,72,2.3741,31.0,70.0,61.0,...,0.30,0.35,0.58,0.75,0.38,0.11,0.47,0.00,0.06,1
27160,1023087293,M,Emergency Medicine,386911.00,41937.87,73,1.4829,45.0,100.0,70.0,...,0.31,0.37,0.51,0.75,0.44,0.09,0.51,0.05,0.05,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1639234,1972791820,M,Clinical Psychologist,14550.00,6370.72,66,1.2544,31.0,70.0,61.0,...,0.30,0.35,0.64,0.73,0.38,0.11,0.47,0.00,0.06,1
1639257,1972839447,M,General Practice,52592.64,27964.40,57,1.9736,79.0,70.0,61.0,...,0.42,0.54,0.48,0.75,0.66,0.11,0.75,0.11,0.06,1
1639598,1982881587,F,Chiropractic,59990.00,29352.74,59,2.0302,46.0,70.0,13.0,...,0.57,0.44,0.50,0.75,0.33,0.11,0.57,0.38,0.06,1
1639671,1992034250,F,General Practice,7748.50,3778.07,69,0.8083,31.0,25.0,61.0,...,0.30,0.26,0.57,0.74,0.24,0.11,0.30,0.00,0.00,1


1555318

In [176]:
numeric_columns = df_prov.columns.difference(['NPI', 'Rndrng_Prvdr_Gndr', 'Rndrng_Prvdr_Type', 'fraud'])

df_prov[numeric_columns] = (df_prov[numeric_columns] - df_prov[numeric_columns].min()) / (df_prov[numeric_columns].max() - df_prov[numeric_columns].min())


In [177]:
columns_to_encode = ['Rndrng_Prvdr_Type', 'Rndrng_Prvdr_Gndr']

# Perform one-hot encoding
df_prov = pd.get_dummies(df_prov, columns=columns_to_encode)

In [178]:
df_prov[df_prov['fraud'] == 1]
print(df_prov.isnull().sum())

Unnamed: 0,NPI,Tot_Sbmtd_Chrg,Tot_Mdcr_Pymt_Amt,Bene_Avg_Age,Bene_Avg_Risk_Scre,Bene_Age_LT_65_Cnt,Bene_Age_65_74_Cnt,Bene_Age_75_84_Cnt,Bene_Age_GT_84_Cnt,Bene_Feml_Cnt,...,Rndrng_Prvdr_Type_Thoracic Surgery,Rndrng_Prvdr_Type_Undefined Physician type,Rndrng_Prvdr_Type_Undersea and Hyperbaric Medicine,Rndrng_Prvdr_Type_Unknown Physician Specialty Code,Rndrng_Prvdr_Type_Unknown Supplier/Provider,Rndrng_Prvdr_Type_Unknown Supplier/Provider Specialty,Rndrng_Prvdr_Type_Urology,Rndrng_Prvdr_Type_Vascular Surgery,Rndrng_Prvdr_Gndr_F,Rndrng_Prvdr_Gndr_M
4425,1003278615,0.001607,0.002828,0.852273,0.203374,0.000561,0.000216,0.000498,0.002169,0.000784,...,False,False,False,False,False,False,False,False,True,False
10924,1003926270,0.000349,0.000606,0.329545,0.058146,0.001450,0.000445,0.000563,0.000846,0.000088,...,False,False,False,False,False,False,False,False,False,True
11095,1003939471,0.000596,0.001147,0.738636,0.048304,0.001450,0.000528,0.000498,0.000846,0.000638,...,False,False,False,False,False,False,False,False,False,True
25699,1023045606,0.000239,0.000165,0.750000,0.174048,0.001450,0.000445,0.000563,0.000846,0.000088,...,False,False,False,False,False,False,False,False,False,True
27160,1023087293,0.006224,0.002039,0.761364,0.100284,0.002105,0.000636,0.000646,0.001349,0.000907,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1639234,1972791820,0.000234,0.000310,0.681818,0.081371,0.001450,0.000445,0.000563,0.000000,0.000497,...,False,False,False,False,False,False,False,False,False,True
1639257,1972839447,0.000846,0.001359,0.579545,0.140899,0.003695,0.000445,0.000563,0.000846,0.000287,...,False,False,False,False,False,False,False,False,False,True
1639598,1982881587,0.000965,0.001427,0.602273,0.145584,0.002152,0.000445,0.000120,0.000846,0.000222,...,False,False,False,False,False,False,False,False,True,False
1639671,1992034250,0.000125,0.000184,0.715909,0.044447,0.001450,0.000159,0.000563,0.000846,0.000146,...,False,False,False,False,False,False,False,False,True,False


NPI                                                      0
Tot_Sbmtd_Chrg                                           0
Tot_Mdcr_Pymt_Amt                                        0
Bene_Avg_Age                                             0
Bene_Avg_Risk_Scre                                       0
                                                        ..
Rndrng_Prvdr_Type_Unknown Supplier/Provider Specialty    0
Rndrng_Prvdr_Type_Urology                                0
Rndrng_Prvdr_Type_Vascular Surgery                       0
Rndrng_Prvdr_Gndr_F                                      0
Rndrng_Prvdr_Gndr_M                                      0
Length: 155, dtype: int64


In [196]:
X = df_prov.drop(['fraud', 'NPI'], axis=1)
y = df_prov['fraud']

# Splitting data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Normalizing the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Building the neural network model
model = Sequential([
    Dense(32, input_dim=X_train_scaled.shape[1], activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Predict probabilities for the test set
predictions = model.predict(X_test_scaled)

# Evaluate the model
print("ROC AUC Score:", roc_auc_score(y_test, predictions.flatten()))

# Adjust threshold and compute binary outcomes
threshold = 0.001
predicted_classes = (predictions > threshold).astype(int)

# Calculate and display the confusion matrix
cm = confusion_matrix(y_test, predicted_classes)
TN, FP, FN, TP = cm.ravel()

# Calculate TPR and TNR
TPR = TP / (TP + FN)  # True Positive Rate
TNR = TN / (TN + FP)  # True Negative Rate

# Calculate the G-Mean
G_mean = np.sqrt(TPR * TNR)

# Display TPR, TNR, and G-Mean
print("True Positive Rate (TPR):", TPR)
print("True Negative Rate (TNR):", TNR)
print("Geometric Mean (G-Mean):", G_mean)

# Detailed classification report
print(classification_report(y_test, predicted_classes))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m34995/34995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 1ms/step - accuracy: 0.9980 - loss: 0.0165 - val_accuracy: 0.9987 - val_loss: 0.0094
Epoch 2/10
[1m34995/34995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 1ms/step - accuracy: 0.9988 - loss: 0.0096 - val_accuracy: 0.9987 - val_loss: 0.0096
Epoch 3/10
[1m34995/34995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 1ms/step - accuracy: 0.9987 - loss: 0.0096 - val_accuracy: 0.9987 - val_loss: 0.0097
Epoch 4/10
[1m34995/34995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 1ms/step - accuracy: 0.9987 - loss: 0.0097 - val_accuracy: 0.9987 - val_loss: 0.0093
Epoch 5/10
[1m34995/34995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 1ms/step - accuracy: 0.9988 - loss: 0.0095 - val_accuracy: 0.9987 - val_loss: 0.0101
Epoch 6/10
[1m34995/34995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 1ms/step - accuracy: 0.9987 - loss: 0.0097 - val_accuracy: 0.9987 - val_loss: 0.010

<keras.src.callbacks.history.History at 0x27c81cff390>

[1m9721/9721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 740us/step
ROC AUC Score: 0.7893747165388416
True Positive Rate (TPR): 0.7405541561712846
True Negative Rate (TNR): 0.6968329433122926
Geometric Mean (G-Mean): 0.7183610041803408
              precision    recall  f1-score   support

           0       1.00      0.70      0.82    310667
           1       0.00      0.74      0.01       397

    accuracy                           0.70    311064
   macro avg       0.50      0.72      0.41    311064
weighted avg       1.00      0.70      0.82    311064



In [120]:
# Adjust threshold and compute binary outcomes
threshold = 0.0022
predicted_classes = (predictions > threshold).astype(int)

# Calculate and display the confusion matrix
cm = confusion_matrix(y_test, predicted_classes)
TN, FP, FN, TP = cm.ravel()

# Calculate TPR and TNR
TPR = TP / (TP + FN)  # True Positive Rate
TNR = TN / (TN + FP)  # True Negative Rate

# Calculate the G-Mean
G_mean = np.sqrt(TPR * TNR)

# Display TPR, TNR, and G-Mean
print("True Positive Rate (TPR):", TPR)
print("True Negative Rate (TNR):", TNR)
print("Geometric Mean (G-Mean):", G_mean)

# Detailed classification report
print(classification_report(y_test, predicted_classes))

True Positive Rate (TPR): 0.7052896725440806
True Negative Rate (TNR): 0.7398822533452217
Geometric Mean (G-Mean): 0.7223789256221612
              precision    recall  f1-score   support

           0       1.00      0.74      0.85    310667
           1       0.00      0.71      0.01       397

    accuracy                           0.74    311064
   macro avg       0.50      0.72      0.43    311064
weighted avg       1.00      0.74      0.85    311064



In [65]:
for i in df_prov.columns:
    i

'NPI'

'Tot_Sbmtd_Chrg'

'Tot_Mdcr_Pymt_Amt'

'Bene_Avg_Age'

'Bene_Age_LT_65_Cnt'

'Bene_Age_65_74_Cnt'

'Bene_Age_75_84_Cnt'

'Bene_Age_GT_84_Cnt'

'Bene_Feml_Cnt'

'Bene_Male_Cnt'

'Bene_Race_Wht_Cnt'

'Bene_Race_NatInd_Cnt'

'Bene_Race_Othr_Cnt'

'Bene_CC_AF_Pct'

'Bene_CC_Alzhmr_Pct'

'Bene_CC_Asthma_Pct'

'Bene_CC_Cncr_Pct'

'Bene_CC_CHF_Pct'

'Bene_CC_CKD_Pct'

'Bene_CC_COPD_Pct'

'Bene_CC_Dprssn_Pct'

'Bene_CC_Dbts_Pct'

'Bene_CC_Hyplpdma_Pct'

'Bene_CC_Hyprtnsn_Pct'

'Bene_CC_IHD_Pct'

'Bene_CC_Opo_Pct'

'Bene_CC_RAOA_Pct'

'Bene_CC_Sz_Pct'

'Bene_CC_Strok_Pct'

'Bene_Avg_Risk_Scre'

'fraud'

'Rndrng_Prvdr_Type_Addiction Medicine'

'Rndrng_Prvdr_Type_All Other Suppliers'

'Rndrng_Prvdr_Type_Allergy/Immunology'

'Rndrng_Prvdr_Type_Ambulance Service Supplier'

'Rndrng_Prvdr_Type_Anesthesiologist Assistants'

'Rndrng_Prvdr_Type_Anesthesiology'

'Rndrng_Prvdr_Type_Audiologist (billing independently)'

'Rndrng_Prvdr_Type_CRNA'

'Rndrng_Prvdr_Type_Cardiac Electrophysiology'

'Rndrng_Prvdr_Type_Cardiac Surgery'

'Rndrng_Prvdr_Type_Cardiology'

'Rndrng_Prvdr_Type_Certified Clinical Nurse Specialist'

'Rndrng_Prvdr_Type_Certified Nurse Midwife'

'Rndrng_Prvdr_Type_Chiropractic'

'Rndrng_Prvdr_Type_Clinical Laboratory'

'Rndrng_Prvdr_Type_Clinical Psychologist'

'Rndrng_Prvdr_Type_Colorectal Surgery (formerly proctology)'

'Rndrng_Prvdr_Type_Critical Care (Intensivists)'

'Rndrng_Prvdr_Type_Dermatology'

'Rndrng_Prvdr_Type_Diagnostic Radiology'

'Rndrng_Prvdr_Type_Emergency Medicine'

'Rndrng_Prvdr_Type_Endocrinology'

'Rndrng_Prvdr_Type_Family Practice'

'Rndrng_Prvdr_Type_Gastroenterology'

'Rndrng_Prvdr_Type_General Practice'

'Rndrng_Prvdr_Type_General Surgery'

'Rndrng_Prvdr_Type_Geriatric Medicine'

'Rndrng_Prvdr_Type_Geriatric Psychiatry'

'Rndrng_Prvdr_Type_Gynecological/Oncology'

'Rndrng_Prvdr_Type_Hand Surgery'

'Rndrng_Prvdr_Type_Hematology'

'Rndrng_Prvdr_Type_Hematology/Oncology'

'Rndrng_Prvdr_Type_Hospice and Palliative Care'

'Rndrng_Prvdr_Type_Independent Diagnostic Testing Facility'

'Rndrng_Prvdr_Type_Infectious Disease'

'Rndrng_Prvdr_Type_Internal Medicine'

'Rndrng_Prvdr_Type_Interventional Pain Management'

'Rndrng_Prvdr_Type_Interventional Radiology'

'Rndrng_Prvdr_Type_Licensed Clinical Social Worker'

'Rndrng_Prvdr_Type_Mass Immunization Roster Biller'

'Rndrng_Prvdr_Type_Maxillofacial Surgery'

'Rndrng_Prvdr_Type_Medical Oncology'

'Rndrng_Prvdr_Type_Multispecialty Clinic/Group Practice'

'Rndrng_Prvdr_Type_Nephrology'

'Rndrng_Prvdr_Type_Neurology'

'Rndrng_Prvdr_Type_Neuropsychiatry'

'Rndrng_Prvdr_Type_Neurosurgery'

'Rndrng_Prvdr_Type_Nuclear Medicine'

'Rndrng_Prvdr_Type_Nurse Practitioner'

'Rndrng_Prvdr_Type_Obstetrics/Gynecology'

'Rndrng_Prvdr_Type_Occupational therapist'

'Rndrng_Prvdr_Type_Ophthalmology'

'Rndrng_Prvdr_Type_Optometry'

'Rndrng_Prvdr_Type_Oral Surgery (dentists only)'

'Rndrng_Prvdr_Type_Orthopedic Surgery'

'Rndrng_Prvdr_Type_Osteopathic Manipulative Medicine'

'Rndrng_Prvdr_Type_Otolaryngology'

'Rndrng_Prvdr_Type_Pain Management'

'Rndrng_Prvdr_Type_Pathology'

'Rndrng_Prvdr_Type_Pediatric Medicine'

'Rndrng_Prvdr_Type_Peripheral Vascular Disease'

'Rndrng_Prvdr_Type_Physical Medicine and Rehabilitation'

'Rndrng_Prvdr_Type_Physical Therapist'

'Rndrng_Prvdr_Type_Physician Assistant'

'Rndrng_Prvdr_Type_Plastic and Reconstructive Surgery'

'Rndrng_Prvdr_Type_Podiatry'

'Rndrng_Prvdr_Type_Portable X-ray'

'Rndrng_Prvdr_Type_Preventive Medicine'

'Rndrng_Prvdr_Type_Psychiatry'

'Rndrng_Prvdr_Type_Psychologist (billing independently)'

'Rndrng_Prvdr_Type_Pulmonary Disease'

'Rndrng_Prvdr_Type_Radiation Oncology'

'Rndrng_Prvdr_Type_Registered Dietician/Nutrition Professional'

'Rndrng_Prvdr_Type_Rheumatology'

'Rndrng_Prvdr_Type_Sleep Medicine'

'Rndrng_Prvdr_Type_Slide Preparation Facility'

'Rndrng_Prvdr_Type_Speech Language Pathologist'

'Rndrng_Prvdr_Type_Sports Medicine'

'Rndrng_Prvdr_Type_Surgical Oncology'

'Rndrng_Prvdr_Type_Thoracic Surgery'

'Rndrng_Prvdr_Type_Unknown Physician Specialty Code'

'Rndrng_Prvdr_Type_Unknown Supplier/Provider'

'Rndrng_Prvdr_Type_Urology'

'Rndrng_Prvdr_Type_Vascular Surgery'

'Rndrng_Prvdr_Gndr_F'

'Rndrng_Prvdr_Gndr_M'

In [164]:
keep_columns_1 = ['Tot_Sbmtd_Chrg', 'Tot_Mdcr_Pymt_Amt', 
                'Bene_Avg_Age', 'Bene_Avg_Risk_Scre']

# Finding columns that start with the specified prefixes to drop
keep_columns_2 = [col for col in df_prov.columns if col.startswith('Rndrng_Prvdr_Gndr_') or col.startswith('Rndrng_Prvdr_Type_')]

# Ensure the columns to drop does not include any of the keep columns

# Drop the identified columns and keep the explicitly listed ones
X = df_prov[keep_columns_1+ keep_columns_2]
y = df_prov['fraud']
X

Unnamed: 0,Tot_Sbmtd_Chrg,Tot_Mdcr_Pymt_Amt,Bene_Avg_Age,Bene_Avg_Risk_Scre,Rndrng_Prvdr_Type_Addiction Medicine,Rndrng_Prvdr_Type_Adult Congenital Heart Disease,Rndrng_Prvdr_Type_Advanced Heart Failure and Transplant Cardiology,Rndrng_Prvdr_Type_Allergy/ Immunology,Rndrng_Prvdr_Type_Allergy/Immunology,Rndrng_Prvdr_Type_Ambulance Service Provider,...,Rndrng_Prvdr_Type_Thoracic Surgery,Rndrng_Prvdr_Type_Undefined Physician type,Rndrng_Prvdr_Type_Undersea and Hyperbaric Medicine,Rndrng_Prvdr_Type_Unknown Physician Specialty Code,Rndrng_Prvdr_Type_Unknown Supplier/Provider,Rndrng_Prvdr_Type_Unknown Supplier/Provider Specialty,Rndrng_Prvdr_Type_Urology,Rndrng_Prvdr_Type_Vascular Surgery,Rndrng_Prvdr_Gndr_F,Rndrng_Prvdr_Gndr_M
0,0.008300,0.011243,0.818182,0.126745,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,0.018280,0.009873,0.795455,0.066812,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,0.004761,0.004542,0.704545,0.101037,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,0.000343,0.000327,0.681818,0.030203,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
4,0.002910,0.001365,0.670455,0.128459,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1640057,0.000123,0.000167,0.704545,0.048379,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1640058,0.000464,0.000726,0.897727,0.128103,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1640059,0.001140,0.000534,0.750000,0.146254,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1640060,0.000187,0.000139,0.750000,0.061374,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [132]:
keep_columns_1 = ['Tot_Sbmtd_Chrg', 'Tot_Mdcr_Pymt_Amt', 
                'Bene_Avg_Age', 'Bene_Avg_Risk_Scre']

# Finding columns that start with the specified prefixes to drop
keep_columns_2 = [col for col in df_prov.columns if col.startswith('Rndrng_Prvdr_Gndr_') or col.startswith('Rndrng_Prvdr_Type_')]

# Ensure the columns to drop does not include any of the keep columns

# Drop the identified columns and keep the explicitly listed ones
X = df_prov[keep_columns_1+ keep_columns_2]
y = df_prov['fraud']

# Splitting data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Normalizing the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Building the neural network model
model = Sequential([
    Dense(32, input_dim=X_train_scaled.shape[1], activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.1)

# Predict probabilities for the test set
predictions = model.predict(X_test_scaled)

# Evaluate the model
print("ROC AUC Score:", roc_auc_score(y_test, predictions.flatten()))

# Adjust threshold and compute binary outcomes
threshold = 0.001
predicted_classes = (predictions > threshold).astype(int)

# Calculate and display the confusion matrix
cm = confusion_matrix(y_test, predicted_classes)
TN, FP, FN, TP = cm.ravel()

# Calculate TPR and TNR
TPR = TP / (TP + FN)  # True Positive Rate
TNR = TN / (TN + FP)  # True Negative Rate

# Calculate the G-Mean
G_mean = np.sqrt(TPR * TNR)

# Display TPR, TNR, and G-Mean
print("True Positive Rate (TPR):", TPR)
print("True Negative Rate (TNR):", TNR)
print("Geometric Mean (G-Mean):", G_mean)

# Detailed classification report
print(classification_report(y_test, predicted_classes))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m34995/34995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 1ms/step - accuracy: 0.9972 - loss: 0.0188 - val_accuracy: 0.9987 - val_loss: 0.0098
Epoch 2/50
[1m34995/34995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 1ms/step - accuracy: 0.9987 - loss: 0.0103 - val_accuracy: 0.9987 - val_loss: 0.0101
Epoch 3/50
[1m34995/34995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 1ms/step - accuracy: 0.9987 - loss: 0.0104 - val_accuracy: 0.9987 - val_loss: 0.0104
Epoch 4/50
[1m34995/34995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 1ms/step - accuracy: 0.9987 - loss: 0.0100 - val_accuracy: 0.9987 - val_loss: 0.0106
Epoch 5/50
[1m34995/34995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 1ms/step - accuracy: 0.9988 - loss: 0.0099 - val_accuracy: 0.9987 - val_loss: 0.0099
Epoch 6/50
[1m34995/34995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 1ms/step - accuracy: 0.9987 - loss: 0.0108 - val_accuracy: 0.9987 - val_loss: 0.010

<keras.src.callbacks.history.History at 0x27ce5eaa810>

[1m9721/9721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 630us/step
ROC AUC Score: 0.7291468890300783
True Positive Rate (TPR): 0.7707808564231738
True Negative Rate (TNR): 0.5959113777774917
Geometric Mean (G-Mean): 0.6777293575725111
              precision    recall  f1-score   support

           0       1.00      0.60      0.75    310667
           1       0.00      0.77      0.00       397

    accuracy                           0.60    311064
   macro avg       0.50      0.68      0.38    311064
weighted avg       1.00      0.60      0.75    311064



In [137]:
# Adjust threshold and compute binary outcomes
threshold = 0.0017
predicted_classes = (predictions > threshold).astype(int)

# Calculate and display the confusion matrix
cm = confusion_matrix(y_test, predicted_classes)
TN, FP, FN, TP = cm.ravel()

# Calculate TPR and TNR
TPR = TP / (TP + FN)  # True Positive Rate
TNR = TN / (TN + FP)  # True Negative Rate

# Calculate the G-Mean
G_mean = np.sqrt(TPR * TNR)

# Display TPR, TNR, and G-Mean
print("True Positive Rate (TPR):", TPR)
print("True Negative Rate (TNR):", TNR)
print("Geometric Mean (G-Mean):", G_mean)

# Detailed classification report
print(classification_report(y_test, predicted_classes))

True Positive Rate (TPR): 0.6750629722921915
True Negative Rate (TNR): 0.7104713406959864
Geometric Mean (G-Mean): 0.692540897693884
              precision    recall  f1-score   support

           0       1.00      0.71      0.83    310667
           1       0.00      0.68      0.01       397

    accuracy                           0.71    311064
   macro avg       0.50      0.69      0.42    311064
weighted avg       1.00      0.71      0.83    311064



In [182]:

features_ben_1 = [
        'Bene_Age_LT_65_Cnt', 'Bene_Age_65_74_Cnt', 'Bene_Age_75_84_Cnt', 'Bene_Age_GT_84_Cnt',
        'Bene_Feml_Cnt', 'Bene_Male_Cnt', 'Bene_Race_Wht_Cnt', 'Bene_Race_Black_Cnt', 'Bene_Race_API_Cnt', 
        'Bene_Race_Hspnc_Cnt', 'Bene_Race_NatInd_Cnt', 'Bene_Race_Othr_Cnt',
]
keep_columns_1 = ['Tot_Sbmtd_Chrg', 'Tot_Mdcr_Pymt_Amt', 
                'Bene_Avg_Age', 'Bene_Avg_Risk_Scre']

# Finding columns that start with the specified prefixes to drop
keep_columns_2 = [col for col in df_prov.columns if col.startswith('Rndrng_Prvdr_Gndr_') or col.startswith('Rndrng_Prvdr_Type_')]

# Ensure the columns to drop does not include any of the keep columns

# Drop the identified columns and keep the explicitly listed ones
X = df_prov[keep_columns_1+ keep_columns_2 + features_ben_1]
y = df_prov['fraud']
X

Unnamed: 0,Tot_Sbmtd_Chrg,Tot_Mdcr_Pymt_Amt,Bene_Avg_Age,Bene_Avg_Risk_Scre,Rndrng_Prvdr_Type_Addiction Medicine,Rndrng_Prvdr_Type_Adult Congenital Heart Disease,Rndrng_Prvdr_Type_Advanced Heart Failure and Transplant Cardiology,Rndrng_Prvdr_Type_Allergy/ Immunology,Rndrng_Prvdr_Type_Allergy/Immunology,Rndrng_Prvdr_Type_Ambulance Service Provider,...,Bene_Age_75_84_Cnt,Bene_Age_GT_84_Cnt,Bene_Feml_Cnt,Bene_Male_Cnt,Bene_Race_Wht_Cnt,Bene_Race_Black_Cnt,Bene_Race_API_Cnt,Bene_Race_Hspnc_Cnt,Bene_Race_NatInd_Cnt,Bene_Race_Othr_Cnt
0,0.008300,0.011243,0.818182,0.126745,False,False,False,False,False,False,...,0.002278,0.004628,0.002211,0.002027,0.002428,0.003823,0.001406,0.000347,0.0,0.002313
1,0.018280,0.009873,0.795455,0.066812,False,False,False,False,False,False,...,0.011049,0.014122,0.009933,0.010875,0.015118,0.001290,0.001497,0.001164,0.0,0.010866
2,0.004761,0.004542,0.704545,0.101037,False,False,False,False,False,False,...,0.000480,0.000317,0.000860,0.000659,0.000885,0.002533,0.000000,0.000446,0.0,0.000771
3,0.000343,0.000327,0.681818,0.030203,False,False,False,False,False,False,...,0.000563,0.000846,0.000404,0.000000,0.000823,0.001151,0.000000,0.000446,0.0,0.000771
4,0.002910,0.001365,0.670455,0.128459,False,False,False,False,False,False,...,0.000563,0.000846,0.000374,0.000344,0.000425,0.001151,0.000000,0.000371,0.0,0.000771
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1640057,0.000123,0.000167,0.704545,0.048379,False,False,False,False,False,False,...,0.000563,0.000846,0.000099,0.000115,0.000823,0.001151,0.000000,0.000446,0.0,0.000771
1640058,0.000464,0.000726,0.897727,0.128103,False,False,False,False,False,False,...,0.000563,0.000846,0.000497,0.000444,0.000823,0.001151,0.000000,0.000446,0.0,0.000771
1640059,0.001140,0.000534,0.750000,0.146254,False,False,False,False,False,False,...,0.000129,0.000846,0.000205,0.000115,0.000207,0.001151,0.000000,0.000446,0.0,0.000771
1640060,0.000187,0.000139,0.750000,0.061374,False,False,False,False,False,False,...,0.000563,0.000846,0.000175,0.000215,0.000823,0.001151,0.001089,0.000000,0.0,0.000771


In [184]:
# List of columns you want to explicitly keep

features_ben_1 = [
        'Bene_Age_LT_65_Cnt', 'Bene_Age_65_74_Cnt', 'Bene_Age_75_84_Cnt', 'Bene_Age_GT_84_Cnt',
        'Bene_Feml_Cnt', 'Bene_Male_Cnt', 'Bene_Race_Wht_Cnt', 'Bene_Race_Black_Cnt', 'Bene_Race_API_Cnt', 
        'Bene_Race_Hspnc_Cnt', 'Bene_Race_NatInd_Cnt', 'Bene_Race_Othr_Cnt',
]
keep_columns_1 = ['Tot_Sbmtd_Chrg', 'Tot_Mdcr_Pymt_Amt', 
                'Bene_Avg_Age', 'Bene_Avg_Risk_Scre']

# Finding columns that start with the specified prefixes to drop
keep_columns_2 = [col for col in df_prov.columns if col.startswith('Rndrng_Prvdr_Gndr_') or col.startswith('Rndrng_Prvdr_Type_')]

# Ensure the columns to drop does not include any of the keep columns

# Drop the identified columns and keep the explicitly listed ones
X = df_prov[keep_columns_1+ keep_columns_2 + features_ben_1]
y = df_prov['fraud']

# Splitting data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Normalizing the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Building the neural network model
model = Sequential([
    Dense(32, input_dim=X_train_scaled.shape[1], activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Predict probabilities for the test set
predictions = model.predict(X_test_scaled)

# Evaluate the model
print("ROC AUC Score:", roc_auc_score(y_test, predictions.flatten()))

# Adjust threshold and compute binary outcomes
threshold = 0.001
predicted_classes = (predictions > threshold).astype(int)

# Calculate and display the confusion matrix
cm = confusion_matrix(y_test, predicted_classes)
TN, FP, FN, TP = cm.ravel()

# Calculate TPR and TNR
TPR = TP / (TP + FN)  # True Positive Rate
TNR = TN / (TN + FP)  # True Negative Rate

# Calculate the G-Mean
G_mean = np.sqrt(TPR * TNR)

# Display TPR, TNR, and G-Mean
print("True Positive Rate (TPR):", TPR)
print("True Negative Rate (TNR):", TNR)
print("Geometric Mean (G-Mean):", G_mean)

# Detailed classification report
print(classification_report(y_test, predicted_classes))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m34995/34995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 1ms/step - accuracy: 0.9978 - loss: 0.0164 - val_accuracy: 0.9987 - val_loss: 0.0096
Epoch 2/10
[1m34995/34995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 1ms/step - accuracy: 0.9987 - loss: 0.0100 - val_accuracy: 0.9987 - val_loss: 0.0096
Epoch 3/10
[1m34995/34995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 1ms/step - accuracy: 0.9987 - loss: 0.0099 - val_accuracy: 0.9987 - val_loss: 0.0096
Epoch 4/10
[1m34995/34995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 1ms/step - accuracy: 0.9987 - loss: 0.0099 - val_accuracy: 0.9987 - val_loss: 0.0097
Epoch 5/10
[1m34995/34995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 1ms/step - accuracy: 0.9987 - loss: 0.0098 - val_accuracy: 0.9987 - val_loss: 0.0096
Epoch 6/10
[1m34995/34995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 1ms/step - accuracy: 0.9987 - loss: 0.0100 - val_accuracy: 0.9987 - val_loss: 0.010

<keras.src.callbacks.history.History at 0x27cbe878990>

[1m9721/9721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 762us/step
ROC AUC Score: 0.735767668458275
True Positive Rate (TPR): 0.7531486146095718
True Negative Rate (TNR): 0.6040165192955802
Geometric Mean (G-Mean): 0.6744732794623979
              precision    recall  f1-score   support

           0       1.00      0.60      0.75    310667
           1       0.00      0.75      0.00       397

    accuracy                           0.60    311064
   macro avg       0.50      0.68      0.38    311064
weighted avg       1.00      0.60      0.75    311064



In [161]:
# Adjust threshold and compute binary outcomes
threshold = 0.0015
predicted_classes = (predictions > threshold).astype(int)

# Calculate and display the confusion matrix
cm = confusion_matrix(y_test, predicted_classes)
TN, FP, FN, TP = cm.ravel()

# Calculate TPR and TNR
TPR = TP / (TP + FN)  # True Positive Rate
TNR = TN / (TN + FP)  # True Negative Rate

# Calculate the G-Mean
G_mean = np.sqrt(TPR * TNR)

# Display TPR, TNR, and G-Mean
print("True Positive Rate (TPR):", TPR)
print("True Negative Rate (TNR):", TNR)
print("Geometric Mean (G-Mean):", G_mean)

# Detailed classification report
print(classification_report(y_test, predicted_classes))

True Positive Rate (TPR): 0.7329974811083123
True Negative Rate (TNR): 0.628747179455816
Geometric Mean (G-Mean): 0.678874140614495
              precision    recall  f1-score   support

           0       1.00      0.63      0.77    310667
           1       0.00      0.73      0.01       397

    accuracy                           0.63    311064
   macro avg       0.50      0.68      0.39    311064
weighted avg       1.00      0.63      0.77    311064



In [185]:
# List of columns you want to explicitly keep
features_ben_2 = [ 
        'Bene_CC_AF_Pct', 'Bene_CC_Alzhmr_Pct', 'Bene_CC_Asthma_Pct',
       'Bene_CC_Cncr_Pct', 'Bene_CC_CHF_Pct', 'Bene_CC_CKD_Pct',
       'Bene_CC_COPD_Pct', 'Bene_CC_Dprssn_Pct', 'Bene_CC_Dbts_Pct',
       'Bene_CC_Hyplpdma_Pct', 'Bene_CC_Hyprtnsn_Pct', 'Bene_CC_IHD_Pct',
       'Bene_CC_Opo_Pct', 'Bene_CC_RAOA_Pct', 'Bene_CC_Sz_Pct',
       'Bene_CC_Strok_Pct'
]
keep_columns_1 = ['Tot_Sbmtd_Chrg', 'Tot_Mdcr_Pymt_Amt', 
                'Bene_Avg_Age', 'Bene_Avg_Risk_Scre']

# Finding columns that start with the specified prefixes to drop
keep_columns_2 = [col for col in df_prov.columns if col.startswith('Rndrng_Prvdr_Gndr_') or col.startswith('Rndrng_Prvdr_Type_')]

# Ensure the columns to drop does not include any of the keep columns

# Drop the identified columns and keep the explicitly listed ones
X = df_prov[keep_columns_1+ keep_columns_2 + features_ben_2]
y = df_prov['fraud']
X

Unnamed: 0,Tot_Sbmtd_Chrg,Tot_Mdcr_Pymt_Amt,Bene_Avg_Age,Bene_Avg_Risk_Scre,Rndrng_Prvdr_Type_Addiction Medicine,Rndrng_Prvdr_Type_Adult Congenital Heart Disease,Rndrng_Prvdr_Type_Advanced Heart Failure and Transplant Cardiology,Rndrng_Prvdr_Type_Allergy/ Immunology,Rndrng_Prvdr_Type_Allergy/Immunology,Rndrng_Prvdr_Type_Ambulance Service Provider,...,Bene_CC_COPD_Pct,Bene_CC_Dprssn_Pct,Bene_CC_Dbts_Pct,Bene_CC_Hyplpdma_Pct,Bene_CC_Hyprtnsn_Pct,Bene_CC_IHD_Pct,Bene_CC_Opo_Pct,Bene_CC_RAOA_Pct,Bene_CC_Sz_Pct,Bene_CC_Strok_Pct
0,0.008300,0.011243,0.818182,0.126745,False,False,False,False,False,False,...,0.320000,0.493333,0.586667,1.000000,1.000000,0.826667,0.146667,0.773333,0.080000,0.186667
1,0.018280,0.009873,0.795455,0.066812,False,False,False,False,False,False,...,0.080000,0.200000,0.266667,0.693333,0.680000,0.320000,0.146667,0.506667,0.013333,0.040000
2,0.004761,0.004542,0.704545,0.101037,False,False,False,False,False,False,...,0.280000,0.520000,0.506667,0.773333,0.933333,0.440000,0.120000,1.000000,0.013333,0.080000
3,0.000343,0.000327,0.681818,0.030203,False,False,False,False,False,False,...,0.240000,0.373333,0.226667,0.733333,0.613333,0.213333,0.226667,0.506667,0.013333,0.080000
4,0.002910,0.001365,0.670455,0.128459,False,False,False,False,False,False,...,0.173333,0.426667,0.533333,0.640000,0.866667,0.413333,0.160000,0.786667,0.013333,0.080000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1640057,0.000123,0.000167,0.704545,0.048379,False,False,False,False,False,False,...,0.240000,0.400000,0.466667,0.693333,0.440000,0.506667,0.000000,0.440000,0.000000,0.080000
1640058,0.000464,0.000726,0.897727,0.128103,False,False,False,False,False,False,...,0.240000,0.400000,0.466667,0.853333,0.973333,0.506667,0.146667,0.666667,0.013333,0.080000
1640059,0.001140,0.000534,0.750000,0.146254,False,False,False,False,False,False,...,0.360000,0.320000,0.440000,0.706667,0.920000,0.520000,0.146667,0.626667,0.013333,0.080000
1640060,0.000187,0.000139,0.750000,0.061374,False,False,False,False,False,False,...,0.240000,0.400000,0.706667,0.626667,1.000000,0.506667,0.146667,0.706667,0.013333,0.080000


In [186]:
# List of columns you want to explicitly keep
features_ben_2 = [ 
        'Bene_CC_AF_Pct', 'Bene_CC_Alzhmr_Pct', 'Bene_CC_Asthma_Pct',
       'Bene_CC_Cncr_Pct', 'Bene_CC_CHF_Pct', 'Bene_CC_CKD_Pct',
       'Bene_CC_COPD_Pct', 'Bene_CC_Dprssn_Pct', 'Bene_CC_Dbts_Pct',
       'Bene_CC_Hyplpdma_Pct', 'Bene_CC_Hyprtnsn_Pct', 'Bene_CC_IHD_Pct',
       'Bene_CC_Opo_Pct', 'Bene_CC_RAOA_Pct', 'Bene_CC_Sz_Pct',
       'Bene_CC_Strok_Pct'
]
keep_columns_1 = ['Tot_Sbmtd_Chrg', 'Tot_Mdcr_Pymt_Amt', 
                'Bene_Avg_Age', 'Bene_Avg_Risk_Scre']

# Finding columns that start with the specified prefixes to drop
keep_columns_2 = [col for col in df_prov.columns if col.startswith('Rndrng_Prvdr_Gndr_') or col.startswith('Rndrng_Prvdr_Type_')]

# Ensure the columns to drop does not include any of the keep columns

# Drop the identified columns and keep the explicitly listed ones
X = df_prov[keep_columns_1+ keep_columns_2 + features_ben_2]
y = df_prov['fraud']

# Splitting data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Normalizing the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Building the neural network model
model = Sequential([
    Dense(32, input_dim=X_train_scaled.shape[1], activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Predict probabilities for the test set
predictions = model.predict(X_test_scaled)

# Evaluate the model
print("ROC AUC Score:", roc_auc_score(y_test, predictions.flatten()))

# Adjust threshold and compute binary outcomes
threshold = 0.001
predicted_classes = (predictions > threshold).astype(int)

# Calculate and display the confusion matrix
cm = confusion_matrix(y_test, predicted_classes)
TN, FP, FN, TP = cm.ravel()

# Calculate TPR and TNR
TPR = TP / (TP + FN)  # True Positive Rate
TNR = TN / (TN + FP)  # True Negative Rate

# Calculate the G-Mean
G_mean = np.sqrt(TPR * TNR)

# Display TPR, TNR, and G-Mean
print("True Positive Rate (TPR):", TPR)
print("True Negative Rate (TNR):", TNR)
print("Geometric Mean (G-Mean):", G_mean)

# Detailed classification report
print(classification_report(y_test, predicted_classes))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m34995/34995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 1ms/step - accuracy: 0.9975 - loss: 0.0174 - val_accuracy: 0.9987 - val_loss: 0.0096
Epoch 2/10
[1m34995/34995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 1ms/step - accuracy: 0.9987 - loss: 0.0101 - val_accuracy: 0.9987 - val_loss: 0.0099
Epoch 3/10
[1m34995/34995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 1ms/step - accuracy: 0.9988 - loss: 0.0095 - val_accuracy: 0.9987 - val_loss: 0.0092
Epoch 4/10
[1m34995/34995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 1ms/step - accuracy: 0.9987 - loss: 0.0099 - val_accuracy: 0.9987 - val_loss: 0.0096
Epoch 5/10
[1m34995/34995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 1ms/step - accuracy: 0.9987 - loss: 0.0103 - val_accuracy: 0.9987 - val_loss: 0.0094
Epoch 6/10
[1m34995/34995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 1ms/step - accuracy: 0.9987 - loss: 0.0098 - val_accuracy: 0.9987 - val_loss: 0.009

<keras.src.callbacks.history.History at 0x27cb7f81e50>

[1m9721/9721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 727us/step
ROC AUC Score: 0.7898592269972402
True Positive Rate (TPR): 0.7884130982367759
True Negative Rate (TNR): 0.6459875043052529
Geometric Mean (G-Mean): 0.7136560864250701
              precision    recall  f1-score   support

           0       1.00      0.65      0.78    310667
           1       0.00      0.79      0.01       397

    accuracy                           0.65    311064
   macro avg       0.50      0.72      0.40    311064
weighted avg       1.00      0.65      0.78    311064

