In [1]:
from pycaret.anomaly import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix,accuracy_score

In [2]:
data = pd.read_csv('./HealthCare/Train_Inpatientdata.csv',nrows=10000)
out_data = pd.read_csv("./HealthCare/Train_Outpatientdata.csv",nrows=10000)
labels = pd.read_csv('./HealthCare/Train.csv')
y = labels.merge(data)['PotentialFraud'].eq('Yes').mul(1)
out_y = labels.merge(out_data)['PotentialFraud'].eq('Yes').mul(1)


In [20]:
print("Ratio of Anomalies in Inpatient Data",(len(y[y==1])/len(y)))
print("Ratio of Anomalies in Outpatient Data",(len(out_y[out_y==1])/len(out_y)))

Ratio of Anomalies in Inpatient Data 0.571
Ratio of Anomalies in Outpatient Data 0.3505


In [3]:
in_ano_ratio = 0.571
out_ano_ratio = 0.3505

In [22]:
labels

Unnamed: 0,Provider,PotentialFraud
0,PRV51001,No
1,PRV51003,Yes
2,PRV51004,No
3,PRV51005,Yes
4,PRV51007,No
...,...,...
5405,PRV57759,No
5406,PRV57760,No
5407,PRV57761,No
5408,PRV57762,No


In [4]:
ano1 = setup(data = data)

Unnamed: 0,Description,Value
0,session_id,4813
1,Original Data,"(10000, 30)"
2,Missing Values,True
3,Numeric Features,5
4,Categorical Features,21
5,Ordinal Features,False
6,High Cardinality Features,False
7,High Cardinality Method,
8,Transformed Data,"(10000, 42337)"
9,CPU Jobs,-1


## Isolation Forest

In [24]:
## Tested Different Varitations of anomaly fraction
## Default 0.05 accuracy 63% but less anomalies were detected
## Actual 0.35  accuracy 52% around 1200 anomalies detected out of 10000

iforest = create_model('iforest',fraction=in_ano_ratio)

In [25]:
ifor_pred = predict_model(iforest,data=data)

In [26]:
y_pred = ifor_pred['Anomaly']

In [27]:
print("Isolation Forest Accuracy: ",accuracy_score(y,y_pred))
print(confusion_matrix(y,y_pred))

Isolation Forest Accuracy:  0.5156
[[1868 2422]
 [2422 3288]]


## KNN Anomaly Detection

In [28]:
knn = create_model('knn',fraction = in_ano_ratio)

In [29]:
knn_pred = predict_model(knn,data=data) 

In [30]:
knn_pred[knn_pred['Anomaly']==1]

Unnamed: 0,BeneID,ClaimID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,AdmissionDt,...,ClmDiagnosisCode_9,ClmDiagnosisCode_10,ClmProcedureCode_1,ClmProcedureCode_2,ClmProcedureCode_3,ClmProcedureCode_4,ClmProcedureCode_5,ClmProcedureCode_6,Anomaly,Anomaly_Score
0,BENE11001,CLM46614,2009-04-12,2009-04-18,PRV55912,26000,PHY390922,,,2009-04-12,...,5849,,,,,,,,1,6.782330
1,BENE11001,CLM66048,2009-08-31,2009-09-02,PRV55907,5000,PHY318495,PHY318495,,2009-08-31,...,,,7092.0,,,,,,1,262.875901
3,BENE11011,CLM38412,2009-02-14,2009-02-22,PRV52405,5000,PHY369659,PHY392961,PHY349768,2009-02-14,...,4019,,331.0,,,,,,1,6.928203
4,BENE11014,CLM63689,2009-08-13,2009-08-30,PRV56614,10000,PHY379376,PHY398258,,2009-08-13,...,20300,,3893.0,,,,,,1,6.633250
5,BENE11017,CLM70950,2009-10-06,2009-10-12,PRV54986,8000,PHY402711,PHY402711,PHY402711,2009-10-06,...,25002,,863.0,,,,,,1,258.089132
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9990,BENE47813,CLM54804,2009-06-09,2009-06-10,PRV55128,3000,PHY424816,PHY424816,PHY424816,2009-06-09,...,41401,,8872.0,,,,,,1,7.071068
9994,BENE47824,CLM48173,2009-04-23,2009-05-01,PRV53857,5000,PHY342421,PHY340277,,2009-04-23,...,25000,,4443.0,4019.0,496.0,2724.0,,,1,3519.820592
9997,BENE47834,CLM31789,2008-12-31,2009-01-02,PRV55396,8000,PHY382801,PHY382801,,2008-12-31,...,,,6021.0,,,,,,1,151.296799
9998,BENE47841,CLM53039,2009-05-28,2009-05-31,PRV51031,14000,PHY409001,,,2009-05-28,...,7993,,,,,,,,1,6.633250


In [31]:
print("KNN Accuracy: ",accuracy_score(y,knn_pred['Anomaly']))
print(confusion_matrix(y,knn_pred['Anomaly']))

KNN Accuracy:  0.5115
[[1927 2363]
 [2522 3188]]


## Angle Based Outlier Detection

In [32]:
abod_model = create_model('abod',fraction = in_ano_ratio)


In [33]:
abod_pred = predict_model(abod_model,data=data) 

In [34]:
abod_pred.head()

Unnamed: 0,BeneID,ClaimID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,AdmissionDt,...,ClmDiagnosisCode_9,ClmDiagnosisCode_10,ClmProcedureCode_1,ClmProcedureCode_2,ClmProcedureCode_3,ClmProcedureCode_4,ClmProcedureCode_5,ClmProcedureCode_6,Anomaly,Anomaly_Score
0,BENE11001,CLM46614,2009-04-12,2009-04-18,PRV55912,26000,PHY390922,,,2009-04-12,...,5849.0,,,,,,,,1,-1.240785e-07
1,BENE11001,CLM66048,2009-08-31,2009-09-02,PRV55907,5000,PHY318495,PHY318495,,2009-08-31,...,,,7092.0,,,,,,1,-1.290437e-12
2,BENE11001,CLM68358,2009-09-17,2009-09-20,PRV56046,5000,PHY372395,,PHY324689,2009-09-17,...,,,,,,,,,0,-5.298959e-06
3,BENE11011,CLM38412,2009-02-14,2009-02-22,PRV52405,5000,PHY369659,PHY392961,PHY349768,2009-02-14,...,4019.0,,331.0,,,,,,1,-5.145208e-08
4,BENE11014,CLM63689,2009-08-13,2009-08-30,PRV56614,10000,PHY379376,PHY398258,,2009-08-13,...,20300.0,,3893.0,,,,,,1,-1.044975e-06


In [35]:
print("ABOD Accuracy: ",accuracy_score(y,abod_pred['Anomaly']))
print(confusion_matrix(y,abod_pred['Anomaly']))

ABOD Accuracy:  0.5245
[[1615 2675]
 [2080 3630]]


## Cluster Based

In [36]:
cluster_model = create_model('cluster',fraction = in_ano_ratio)

In [37]:
cluster_pred = predict_model(cluster_model,data=data) 

In [38]:
print("Cluster Accuracy: ",accuracy_score(y,cluster_pred['Anomaly']))
print(confusion_matrix(y,cluster_pred['Anomaly']))

Cluster Accuracy:  0.504
[[1811 2479]
 [2481 3229]]


## Stochastic

In [5]:
stochastic_model = create_model('sos',fraction = in_ano_ratio)

In [6]:
sto_pred = predict_model(stochastic_model,data=data)

In [7]:
print("Stochastic Accuracy: ",accuracy_score(y,sto_pred['Anomaly']))
print(confusion_matrix(y,sto_pred['Anomaly']))

Stochastic Accuracy:  0.429
[[4290    0]
 [5710    0]]


## Minimum Covariance

In [None]:
mcd_model = create_model('mcd',fraction = in_ano_ratio)

IntProgress(value=0, description='Processing: ', max=3)

In [None]:
mcd_pred = predict_model(mcd_model,data=data)

In [None]:
print("Minumum Covariance Accuracy: ",accuracy_score(y,mcd_pred['Anomaly']))
print(confusion_matrix(y,mcd_pred['Anomaly']))

####  Anomaly Detection Deosn't seems to be a good tool for detcting anomalies in this dataset.
####  Either we need to change technique for detection of fraud, or we need to mould our data

# For OutPatient Data

In [None]:
ano1 = setup(data = out_data)

## Isolation Forest

In [None]:
iforest = create_model('iforest',fraction=out_ano_ratio)

In [None]:
ifor_pred = predict_model(iforest,data=out_data)
y_pred = ifor_pred['Anomaly']
print("Isolation Forest Accuracy: ",accuracy_score(y,y_pred))
print(confusion_matrix(y,y_pred))

## KNN Anomaly Detection

In [None]:
knn = create_model('knn',fraction=out_ano_ratio)
knn_pred = predict_model(knn,data=out_data) 
print("KNN Accuracy: ",accuracy_score(y,knn_pred['Anomaly']))
print(confusion_matrix(y,knn_pred['Anomaly']))

## Angle Based Outlier Detection


In [None]:
abod_model = create_model('abod',fraction=out_ano_ratio)
abod_pred = predict_model(abod_model,data=out_data) 
print("ABOD Accuracy: ",accuracy_score(y,abod_pred['Anomaly']))
print(confusion_matrix(y,abod_pred['Anomaly']))

## Cluster Based

In [None]:
cluster_model = create_model('cluster',fraction=out_ano_ratio)
cluster_pred = predict_model(cluster_model,data=out_data) 
print("Cluster Accuracy: ",accuracy_score(y,cluster_pred['Anomaly']))
print(confusion_matrix(y,cluster_pred['Anomaly']))

## Stochastic

In [None]:
stochastic_model = create_model('sos',fraction=out_ano_ratio)
sto_pred = predict_model(stochastic_model,data=out_data)
print("Stochastic Accuracy: ",accuracy_score(y,sto_pred['Anomaly']))
print(confusion_matrix(y,sto_pred['Anomaly']))

## Minimum Corvarince

In [None]:
mcd_model = create_model('mcd',fraction=out_ano_ratio)
mcd_pred = predict_model(mcd_model,data=out_data)
print("Minumum Covariance Accuracy: ",accuracy_score(y,mcd_pred['Anomaly']))
print(confusion_matrix(y,mcd_pred['Anomaly']))