In [1]:
import numpy as np
import pandas as pd

In [2]:
#import seaborn as sns
#import matplotlib.pyplot as plt
#import matplotlib.gridspec as gridspec

#for data preprocessing
from sklearn.decomposition import PCA

#for modeling
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest

#filter warnings
import warnings
warnings.filterwarnings("ignore")

pd.set_option("display.max_rows",None)
pd.set_option("display.max_columns",None)

In [3]:
df = pd.read_csv('Healthcare_Providers.csv')
df.head()

Unnamed: 0,index,National Provider Identifier,Last Name/Organization Name of the Provider,First Name of the Provider,Middle Initial of the Provider,Credentials of the Provider,Gender of the Provider,Entity Type of the Provider,Street Address 1 of the Provider,Street Address 2 of the Provider,City of the Provider,Zip Code of the Provider,State Code of the Provider,Country Code of the Provider,Provider Type,Medicare Participation Indicator,Place of Service,HCPCS Code,HCPCS Description,HCPCS Drug Indicator,Number of Services,Number of Medicare Beneficiaries,Number of Distinct Medicare Beneficiary/Per Day Services,Average Medicare Allowed Amount,Average Submitted Charge Amount,Average Medicare Payment Amount,Average Medicare Standardized Amount
0,8774979,1891106191,UPADHYAYULA,SATYASREE,,M.D.,F,I,1402 S GRAND BLVD,FDT 14TH FLOOR,SAINT LOUIS,631041004.0,MO,US,Internal Medicine,Y,F,99223,"Initial hospital inpatient care, typically 70 ...",N,27,24,27,200.58777778,305.21111111,157.26222222,160.90888889
1,3354385,1346202256,JONES,WENDY,P,M.D.,F,I,2950 VILLAGE DR,,FAYETTEVILLE,283043815.0,NC,US,Obstetrics & Gynecology,Y,O,G0202,"Screening mammography, bilateral (2-view study...",N,175,175,175,123.73,548.8,118.83,135.31525714
2,3001884,1306820956,DUROCHER,RICHARD,W,DPM,M,I,20 WASHINGTON AVE,STE 212,NORTH HAVEN,64732343.0,CT,US,Podiatry,Y,O,99348,"Established patient home visit, typically 25 m...",N,32,13,32,90.65,155.0,64.4396875,60.5959375
3,7594822,1770523540,FULLARD,JASPER,,MD,M,I,5746 N BROADWAY ST,,KANSAS CITY,641183998.0,MO,US,Internal Medicine,Y,O,81002,"Urinalysis, manual test",N,20,18,20,3.5,5.0,3.43,3.43
4,746159,1073627758,PERROTTI,ANTHONY,E,DO,M,I,875 MILITARY TRL,SUITE 200,JUPITER,334585700.0,FL,US,Internal Medicine,Y,O,96372,Injection beneath the skin or into muscle for ...,N,33,24,31,26.52,40.0,19.539393939,19.057575758


In [4]:
df.columns

Index(['index', 'National Provider Identifier',
       'Last Name/Organization Name of the Provider',
       'First Name of the Provider', 'Middle Initial of the Provider',
       'Credentials of the Provider', 'Gender of the Provider',
       'Entity Type of the Provider', 'Street Address 1 of the Provider',
       'Street Address 2 of the Provider', 'City of the Provider',
       'Zip Code of the Provider', 'State Code of the Provider',
       'Country Code of the Provider', 'Provider Type',
       'Medicare Participation Indicator', 'Place of Service', 'HCPCS Code',
       'HCPCS Description', 'HCPCS Drug Indicator', 'Number of Services',
       'Number of Medicare Beneficiaries',
       'Number of Distinct Medicare Beneficiary/Per Day Services',
       'Average Medicare Allowed Amount', 'Average Submitted Charge Amount',
       'Average Medicare Payment Amount',
       'Average Medicare Standardized Amount'],
      dtype='object')

In [5]:
DropCols = ['index', 'National Provider Identifier',
       'Last Name/Organization Name of the Provider',
       'First Name of the Provider', 'Middle Initial of the Provider','Street Address 1 of the Provider',
       'Street Address 2 of the Provider','Zip Code of the Provider',"HCPCS Code"]

In [6]:
df = df.drop(DropCols, axis = 1)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 18 columns):
 #   Column                                                    Non-Null Count   Dtype 
---  ------                                                    --------------   ----- 
 0   Credentials of the Provider                               92791 non-null   object
 1   Gender of the Provider                                    95746 non-null   object
 2   Entity Type of the Provider                               100000 non-null  object
 3   City of the Provider                                      100000 non-null  object
 4   State Code of the Provider                                100000 non-null  object
 5   Country Code of the Provider                              100000 non-null  object
 6   Provider Type                                             100000 non-null  object
 7   Medicare Participation Indicator                          100000 non-null  object
 8   Place of Servic

In [8]:
df.isnull().sum()

Credentials of the Provider                                 7209
Gender of the Provider                                      4254
Entity Type of the Provider                                    0
City of the Provider                                           0
State Code of the Provider                                     0
Country Code of the Provider                                   0
Provider Type                                                  0
Medicare Participation Indicator                               0
Place of Service                                               0
HCPCS Description                                              0
HCPCS Drug Indicator                                           0
Number of Services                                             0
Number of Medicare Beneficiaries                               0
Number of Distinct Medicare Beneficiary/Per Day Services       0
Average Medicare Allowed Amount                                0
Average Submitted Charge 

In [9]:
#Since this is just a simple implementation, we will just drop the rows with missing values
df.dropna(inplace=True)
df.shape

(92791, 18)

In [10]:
# also as we can see in the case of amounts in the dataset, they are not in numeric format, so we will go ahead and
#convert them back to numeric form

def RemoveComma(x):
    return x.replace(",","")

df["Average Medicare Allowed Amount"] = pd.to_numeric(df["Average Medicare Allowed Amount"].apply(lambda x: RemoveComma(x)),
                                                             errors= "ignore")
df["Average Submitted Charge Amount"] = pd.to_numeric(df["Average Submitted Charge Amount"].apply(lambda x: RemoveComma(x)),
                                                       errors = "ignore")
df["Average Medicare Payment Amount"] = pd.to_numeric(df["Average Medicare Payment Amount"].apply(lambda x: RemoveComma(x)),
                                                       errors = "ignore")
df["Average Medicare Standardized Amount"] = pd.to_numeric(df["Average Medicare Standardized Amount"].apply(lambda x: RemoveComma(x)),
                                                             errors = "ignore")

In [11]:
df.dtypes

Credentials of the Provider                                  object
Gender of the Provider                                       object
Entity Type of the Provider                                  object
City of the Provider                                         object
State Code of the Provider                                   object
Country Code of the Provider                                 object
Provider Type                                                object
Medicare Participation Indicator                             object
Place of Service                                             object
HCPCS Description                                            object
HCPCS Drug Indicator                                         object
Number of Services                                           object
Number of Medicare Beneficiaries                             object
Number of Distinct Medicare Beneficiary/Per Day Services     object
Average Medicare Allowed Amount                 

In [14]:
import category_encoders as ce
from sklearn.preprocessing import StandardScaler

def RemoveComma(x):
    return x.replace(",","")

def Preprocessing(data):

   # Binary Encoding.

    
    BEcols = [var for var in data.columns if data[var].dtype == "O"]
    
    for col in BEcols:
        encoder = ce.BinaryEncoder(cols = [col])
        dfbin = encoder.fit_transform(data[col])
        data = pd.concat([data,dfbin], axis = 1)
        del data[col]

    #3. One-Hot-Encoding

#     data = pd.get_dummies(data,drop_first = True)
    
 
    #4. Standardization
 
    data_columns = data.columns
    std = StandardScaler()
    data = std.fit_transform(data)
    data = pd.DataFrame(data, columns = data_columns)
    
    return data


df = Preprocessing(df)

In [15]:
df.head()

Unnamed: 0,Average Medicare Allowed Amount,Average Submitted Charge Amount,Average Medicare Payment Amount,Average Medicare Standardized Amount,Credentials of the Provider_0,Credentials of the Provider_1,Credentials of the Provider_2,Credentials of the Provider_3,Credentials of the Provider_4,Credentials of the Provider_5,Credentials of the Provider_6,Credentials of the Provider_7,Credentials of the Provider_8,Credentials of the Provider_9,Credentials of the Provider_10,Gender of the Provider_0,Gender of the Provider_1,Entity Type of the Provider_0,City of the Provider_0,City of the Provider_1,City of the Provider_2,City of the Provider_3,City of the Provider_4,City of the Provider_5,City of the Provider_6,City of the Provider_7,City of the Provider_8,City of the Provider_9,City of the Provider_10,City of the Provider_11,City of the Provider_12,State Code of the Provider_0,State Code of the Provider_1,State Code of the Provider_2,State Code of the Provider_3,State Code of the Provider_4,State Code of the Provider_5,Country Code of the Provider_0,Country Code of the Provider_1,Country Code of the Provider_2,Provider Type_0,Provider Type_1,Provider Type_2,Provider Type_3,Provider Type_4,Provider Type_5,Provider Type_6,Medicare Participation Indicator_0,Medicare Participation Indicator_1,Place of Service_0,Place of Service_1,HCPCS Description_0,HCPCS Description_1,HCPCS Description_2,HCPCS Description_3,HCPCS Description_4,HCPCS Description_5,HCPCS Description_6,HCPCS Description_7,HCPCS Description_8,HCPCS Description_9,HCPCS Description_10,HCPCS Description_11,HCPCS Drug Indicator_0,HCPCS Drug Indicator_1,Number of Services_0,Number of Services_1,Number of Services_2,Number of Services_3,Number of Services_4,Number of Services_5,Number of Services_6,Number of Services_7,Number of Services_8,Number of Services_9,Number of Services_10,Number of Services_11,Number of Medicare Beneficiaries_0,Number of Medicare Beneficiaries_1,Number of Medicare Beneficiaries_2,Number of Medicare Beneficiaries_3,Number of Medicare Beneficiaries_4,Number of Medicare Beneficiaries_5,Number of Medicare Beneficiaries_6,Number of Medicare Beneficiaries_7,Number of Medicare Beneficiaries_8,Number of Medicare Beneficiaries_9,Number of Medicare Beneficiaries_10,Number of Distinct Medicare Beneficiary/Per Day Services_0,Number of Distinct Medicare Beneficiary/Per Day Services_1,Number of Distinct Medicare Beneficiary/Per Day Services_2,Number of Distinct Medicare Beneficiary/Per Day Services_3,Number of Distinct Medicare Beneficiary/Per Day Services_4,Number of Distinct Medicare Beneficiary/Per Day Services_5,Number of Distinct Medicare Beneficiary/Per Day Services_6,Number of Distinct Medicare Beneficiary/Per Day Services_7,Number of Distinct Medicare Beneficiary/Per Day Services_8,Number of Distinct Medicare Beneficiary/Per Day Services_9,Number of Distinct Medicare Beneficiary/Per Day Services_10
0,0.473422,-0.032874,0.492697,0.516248,-0.101651,-0.119712,-0.135006,-0.161371,-0.201311,-0.271448,-0.288132,-0.3767,-0.457324,-1.059087,0.480447,-1.533318,1.533318,0.0,-0.154818,-0.348866,-0.532319,-0.701467,-0.786385,-0.910775,-0.930293,-0.909964,-1.019337,-1.065936,-1.030545,-1.023478,0.982426,-0.429758,-0.77626,-0.815078,-1.238586,-0.916871,0.961626,-0.003283,-0.007341,0.005686,-0.069809,-0.492238,-0.487401,-0.720536,-1.133026,-0.971303,0.79086,-0.017984,0.017984,-1.246106,1.246106,-0.043094,-0.22436,-0.393947,-0.554265,-0.673047,-0.754852,-0.936438,-0.993351,-0.992709,-0.972142,-1.002557,1.124067,-0.238069,0.238069,-0.072486,-0.167217,-0.272659,-0.371872,-0.521434,-0.674095,-0.884681,-0.872396,-0.934433,-0.954407,-0.886288,0.953562,-0.019425,-0.142077,-0.245657,-0.424926,-0.535685,-0.785039,-0.876675,-0.845774,-0.908461,-1.104073,0.912281,-0.133634,-0.237005,-0.342968,-0.507932,-0.644334,-0.859837,-0.955149,-0.95831,-1.18324,-0.923321,0.822055
1,0.119504,0.237374,0.264684,0.36276,-0.101651,-0.119712,-0.135006,-0.161371,-0.201311,-0.271448,-0.288132,-0.3767,-0.457324,-1.059087,0.480447,-1.533318,1.533318,0.0,-0.154818,-0.348866,-0.532319,-0.701467,-0.786385,-0.910775,-0.930293,-0.909964,-1.019337,-1.065936,-1.030545,0.97706,-1.017888,-0.429758,-0.77626,-0.815078,-1.238586,1.090666,-1.039905,-0.003283,-0.007341,0.005686,-0.069809,-0.492238,-0.487401,-0.720536,-1.133026,1.029545,-1.264446,-0.017984,0.017984,0.8025,-0.8025,-0.043094,-0.22436,-0.393947,-0.554265,-0.673047,-0.754852,-0.936438,-0.993351,-0.992709,-0.972142,0.997449,-0.889627,-0.238069,0.238069,-0.072486,-0.167217,-0.272659,-0.371872,-0.521434,-0.674095,-0.884681,-0.872396,-0.934433,-0.954407,1.128301,-1.0487,-0.019425,-0.142077,-0.245657,-0.424926,-0.535685,-0.785039,-0.876675,-0.845774,-0.908461,0.905737,-1.096153,-0.133634,-0.237005,-0.342968,-0.507932,-0.644334,-0.859837,-0.955149,-0.95831,-1.18324,1.083046,-1.216463
2,-0.032825,-0.199525,-0.058005,-0.08534,-0.101651,-0.119712,-0.135006,-0.161371,-0.201311,-0.271448,-0.288132,-0.3767,-0.457324,0.94421,-2.081395,0.65218,-0.65218,0.0,-0.154818,-0.348866,-0.532319,-0.701467,-0.786385,-0.910775,-0.930293,-0.909964,-1.019337,-1.065936,-1.030545,0.97706,0.982426,-0.429758,-0.77626,-0.815078,-1.238586,1.090666,0.961626,-0.003283,-0.007341,0.005686,-0.069809,-0.492238,-0.487401,-0.720536,-1.133026,1.029545,0.79086,-0.017984,0.017984,0.8025,-0.8025,-0.043094,-0.22436,-0.393947,-0.554265,-0.673047,-0.754852,-0.936438,-0.993351,-0.992709,-0.972142,0.997449,1.124067,-0.238069,0.238069,-0.072486,-0.167217,-0.272659,-0.371872,-0.521434,-0.674095,-0.884681,-0.872396,-0.934433,-0.954407,1.128301,0.953562,-0.019425,-0.142077,-0.245657,-0.424926,-0.535685,-0.785039,-0.876675,-0.845774,-0.908461,0.905737,0.912281,-0.133634,-0.237005,-0.342968,-0.507932,-0.644334,-0.859837,-0.955149,-0.95831,-1.18324,1.083046,0.822055
3,-0.434137,-0.365941,-0.419967,-0.42817,-0.101651,-0.119712,-0.135006,-0.161371,-0.201311,-0.271448,-0.288132,-0.3767,-0.457324,0.94421,0.480447,0.65218,-0.65218,0.0,-0.154818,-0.348866,-0.532319,-0.701467,-0.786385,-0.910775,-0.930293,-0.909964,-1.019337,-1.065936,0.970361,-1.023478,-1.017888,-0.429758,-0.77626,-0.815078,-1.238586,-0.916871,0.961626,-0.003283,-0.007341,0.005686,-0.069809,-0.492238,-0.487401,-0.720536,-1.133026,-0.971303,0.79086,-0.017984,0.017984,0.8025,-0.8025,-0.043094,-0.22436,-0.393947,-0.554265,-0.673047,-0.754852,-0.936438,-0.993351,-0.992709,1.028657,-1.002557,-0.889627,-0.238069,0.238069,-0.072486,-0.167217,-0.272659,-0.371872,-0.521434,-0.674095,-0.884681,-0.872396,-0.934433,1.047771,-0.886288,-1.0487,-0.019425,-0.142077,-0.245657,-0.424926,-0.535685,-0.785039,-0.876675,-0.845774,1.100763,-1.104073,-1.096153,-0.133634,-0.237005,-0.342968,-0.507932,-0.644334,-0.859837,-0.955149,-0.95831,0.845137,-0.923321,-1.216463
4,-0.328133,-0.327111,-0.324392,-0.33445,-0.101651,-0.119712,-0.135006,-0.161371,-0.201311,-0.271448,-0.288132,-0.3767,2.186635,-1.059087,-2.081395,0.65218,-0.65218,0.0,-0.154818,-0.348866,-0.532319,-0.701467,-0.786385,-0.910775,-0.930293,-0.909964,-1.019337,-1.065936,0.970361,-1.023478,0.982426,-0.429758,-0.77626,-0.815078,0.807372,-0.916871,-1.039905,-0.003283,-0.007341,0.005686,-0.069809,-0.492238,-0.487401,-0.720536,-1.133026,-0.971303,0.79086,-0.017984,0.017984,0.8025,-0.8025,-0.043094,-0.22436,-0.393947,-0.554265,-0.673047,-0.754852,-0.936438,-0.993351,-0.992709,1.028657,-1.002557,1.124067,-0.238069,0.238069,-0.072486,-0.167217,-0.272659,-0.371872,-0.521434,-0.674095,-0.884681,-0.872396,-0.934433,1.047771,-0.886288,0.953562,-0.019425,-0.142077,-0.245657,-0.424926,-0.535685,-0.785039,-0.876675,-0.845774,-0.908461,-1.104073,0.912281,-0.133634,-0.237005,-0.342968,-0.507932,-0.644334,-0.859837,-0.955149,-0.95831,0.845137,-0.923321,0.822055


In [16]:
from sklearn.model_selection import train_test_split
train,test = train_test_split(df,test_size=0.2)
print(train.shape, test.shape)

(74232, 99) (18559, 99)


In [45]:
from sklearn.ensemble import IsolationForest

model = IsolationForest(n_estimators=300, max_samples='auto', 
                        contamination='auto', max_features=1.0, bootstrap=False, n_jobs=None, 
                        verbose=1, warm_start=False, random_state=2020)
model.fit(train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.6s finished


In [46]:
out = model.predict(train)
out

array([-1,  1,  1, ...,  1, -1, -1])

In [53]:
df_t = pd.DataFrame(out)
df_t.head()

Unnamed: 0,0
0,-1
1,1
2,1
3,-1
4,-1


In [54]:
df_t[0].value_counts()

-1    55002
 1    19230
Name: 0, dtype: int64

In [26]:
out[out == 1] = 0
out[out == -1] = 1

In [31]:
type(out)

numpy.ndarray

In [49]:
def potential_fraud(df, model):
    results = []
    out = model.predict(df)
    for ele in out:
        if ele == -1:
            results.append("Clean claim")
        else:
            results.append("Potential Fraudelent claim")
    return results
        
res = potential_fraud(test.head(10),model)

In [50]:
print(res)

['Potential Fraudelent claim', 'Clean claim', 'Clean claim', 'Potential Fraudelent claim', 'Potential Fraudelent claim', 'Clean claim', 'Potential Fraudelent claim', 'Potential Fraudelent claim', 'Clean claim', 'Potential Fraudelent claim']
