In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
import os 
import pickle 
import joblib 
from joblib import dump, load
import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import confusion_matrix  
from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import MinMaxScaler  
from sklearn.metrics import recall_score, precision_score,f1_score, roc_curve,auc    
from collections import Counter 
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.model_selection import GridSearchCV 
from sklearn.linear_model import SGDClassifier 
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier 
from xgboost import XGBClassifier 
from sklearn.ensemble import RandomForestClassifier 
from lightgbm import LGBMClassifier 
from prettytable import PrettyTable 

In [2]:
# Load the four raw test CSVs and report their shapes.
# The data directory is named once so relocating the files is a one-line change.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/AAIC ASSIGNMENT/Case Study 1/data"

df_test_inpatient = pd.read_csv(DATA_DIR + "/Test_Inpatientdata-1542969243754.csv")
df_test_outpatient = pd.read_csv(DATA_DIR + "/Test_Outpatientdata-1542969243754.csv")
df_test_beneficiary = pd.read_csv(DATA_DIR + "/Test_Beneficiarydata-1542969243754.csv")
df_test_target = pd.read_csv(DATA_DIR + "/Test-1542969243754.csv")

print("*"*120)
print("The shape of Test Inpatient dataset  : ", df_test_inpatient.shape)
print("The shape of Test Outpatient dataset : ", df_test_outpatient.shape)
print("The shape of Test Beneficiary dataset: ", df_test_beneficiary.shape)
print("The shape of Test Target dataset     : ", df_test_target.shape)

************************************************************************************************************************
The shape of Test Inpatient dataset  :  (9551, 30)
The shape of Test Outpatient dataset :  (125841, 27)
The shape of Test Beneficiary dataset:  (63968, 25)
The shape of Test Target dataset     :  (1353, 1)


In [3]:
# Columns present in both claim tables, kept in the outpatient table's column order.
# These are the keys used to combine the two tables in the next cell.
cols = [name for name in df_test_outpatient.columns if name in df_test_inpatient.columns]

print(cols)

['BeneID', 'ClaimID', 'ClaimStartDt', 'ClaimEndDt', 'Provider', 'InscClaimAmtReimbursed', 'AttendingPhysician', 'OperatingPhysician', 'OtherPhysician', 'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3', 'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6', 'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9', 'ClmDiagnosisCode_10', 'ClmProcedureCode_1', 'ClmProcedureCode_2', 'ClmProcedureCode_3', 'ClmProcedureCode_4', 'ClmProcedureCode_5', 'ClmProcedureCode_6', 'DeductibleAmtPaid', 'ClmAdmitDiagnosisCode']


In [4]:
# Combine the TEST inpatient and outpatient claims with an outer merge keyed on
# every shared column (cols), so rows from either table are kept.
df_test = df_test_inpatient.merge(df_test_outpatient, how="outer", on=cols)

In [5]:
# Attach beneficiary attributes to the combined test claims (outer join on BeneID).
# Note: df_test is rebuilt from itself here, so run this cell once per fresh merge above.
df_test = df_test_beneficiary.merge(df_test, how="outer", on="BeneID")

In [6]:
# Join the provider list from the test target file onto the claims (outer join on Provider).
# The target frame is the left side, which is why Provider ends up as the first column.
df_test = df_test_target.merge(df_test, how="outer", on="Provider")

In [7]:
# Report the merged test frame's dimensions and its column index.
merged_shape = df_test.shape
merged_columns = df_test.columns

print("The Shape of Merge Test Dataset: ", merged_shape)
print("=" * 120)
print("The Columns of Merge Test Dataset: ", merged_columns)

The Shape of Merge Test Dataset:  (135392, 54)
The Columns of Merge Test Dataset:  Index(['Provider', 'BeneID', 'DOB', 'DOD', 'Gender', 'Race',
       'RenalDiseaseIndicator', 'State', 'County', 'NoOfMonths_PartACov',
       'NoOfMonths_PartBCov', 'ChronicCond_Alzheimer',
       'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
       'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary',
       'ChronicCond_Depression', 'ChronicCond_Diabetes',
       'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis',
       'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke',
       'IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt',
       'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt', 'ClaimID',
       'ClaimStartDt', 'ClaimEndDt', 'InscClaimAmtReimbursed',
       'AttendingPhysician', 'OperatingPhysician', 'OtherPhysician',
       'AdmissionDt', 'ClmAdmitDiagnosisCode', 'DeductibleAmtPaid',
       'DischargeDt', 'DiagnosisGroupCode', 'ClmDiagnosisCode_1',
       'ClmDiagnosisCo

In [8]:
# Preview the first five merged rows.
df_test.head(n=5)

Unnamed: 0,Provider,BeneID,DOB,DOD,Gender,Race,RenalDiseaseIndicator,State,County,NoOfMonths_PartACov,NoOfMonths_PartBCov,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,ClaimID,ClaimStartDt,ClaimEndDt,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,AdmissionDt,ClmAdmitDiagnosisCode,DeductibleAmtPaid,DischargeDt,DiagnosisGroupCode,ClmDiagnosisCode_1,ClmDiagnosisCode_2,ClmDiagnosisCode_3,ClmDiagnosisCode_4,ClmDiagnosisCode_5,ClmDiagnosisCode_6,ClmDiagnosisCode_7,ClmDiagnosisCode_8,ClmDiagnosisCode_9,ClmDiagnosisCode_10,ClmProcedureCode_1,ClmProcedureCode_2,ClmProcedureCode_3,ClmProcedureCode_4,ClmProcedureCode_5,ClmProcedureCode_6
0,PRV51002,BENE13342,1934-12-01,,2,1,0,1,410,12,12,2,2,2,2,2,2,1,2,2,2,2,0,0,1310,590,CLM129901,2009-01-10,2009-01-10,100,PHY418314,PHY418314,PHY413853,,78650.0,0.0,,,514,79092,V4501,,,,,,,,,,,,,
1,PRV51002,BENE13915,1921-02-01,,2,1,0,1,100,0,12,2,1,1,2,1,1,1,1,2,2,2,22000,3204,1020,910,CLM163936,2009-01-29,2009-01-29,40,PHY399539,PHY421140,,,78060.0,0.0,,,7840,V4502,2989,,,,,,,,,,,,,
2,PRV51002,BENE13915,1921-02-01,,2,1,0,1,100,0,12,2,1,1,2,1,1,1,1,2,2,2,22000,3204,1020,910,CLM348696,2009-05-09,2009-05-09,70,PHY352340,,,,41401.0,0.0,,,V4581,V173,,,,,,,,,,,,,,
3,PRV51002,BENE13915,1921-02-01,,2,1,0,1,100,0,12,2,1,1,2,1,1,1,1,2,2,2,22000,3204,1020,910,CLM500906,2009-08-01,2009-08-01,100,PHY366184,,,,,0.0,,,42789,,,,,,,,,,,,,,,
4,PRV51002,BENE14575,1961-01-01,,2,1,0,1,440,12,12,1,2,2,1,1,2,1,1,1,2,1,8000,13136,1520,730,CLM272820,2009-03-29,2009-03-29,400,PHY401762,,PHY330212,,1539.0,0.0,,,1532,,,,,,,,,,,,,,,


In [9]:
# Parse the birth / death date columns into datetimes so they can be subtracted later.
for date_col in ("DOB", "DOD"):
    df_test[date_col] = pd.to_datetime(df_test[date_col])

In [10]:
# Diagnosis codes that each get a 0/1 "Diagnosis_Code_<code>" feature in final_fun_1.
# The list mixes int and str entries on purpose-or-not; element types are preserved here
# because the flags are built with `column == code`.
# NOTE(review): if the ClmDiagnosisCode_* columns are read as strings, the int entries
# can never match — confirm the column dtype against the training pipeline.
Diag_Code = [
    4019, 25000, 2724, 'V5869', 4011,
    42731, 'V5861', 2720, 2449, 4280,
]

# Procedure codes that each get a 0/1 "Procedure_Code_<code>" feature.
# Kept as floats: str(code) is used in the column name (e.g. "Procedure_Code_4019.0").
Pro_Code = [
    4019.0, 9904.0, 2724.0, 8154.0, 66.0,
    3893.0, 3995.0, 4516.0, 3722.0, 8151.0,
]

In [11]:
def find_best_threshold(threshold, fpr, tpr):
    """Return the threshold whose ROC point maximises tpr * (1 - fpr).

    threshold, fpr and tpr are parallel arrays (the element-wise arithmetic below
    requires array inputs, e.g. the arrays returned by roc_curve). The best score and
    the chosen threshold (rounded to 3 decimals) are printed; the unrounded threshold
    is returned.
    """
    score = tpr * (1 - fpr)
    best = threshold[np.argmax(score)]
    print("the maximum value of tpr*(1-fpr)", max(score), "for threshold", np.round(best, 3))
    return best

def predict_with_best_t(proba, threshold):
    """Turn scores into hard 0/1 labels.

    Each value in proba that is greater than or equal to threshold becomes 1,
    everything else becomes 0. Returns a plain Python list in input order.
    """
    return [1 if score >= threshold else 0 for score in proba]

In [12]:
def final_fun_1(X):
  """Preprocess merged claim rows, build features, scale them and predict fraud.

  Parameters
  ----------
  X : pandas.DataFrame
      Merged beneficiary + claim rows containing the raw columns referenced below
      (DOB, DOD, the ClmDiagnosisCode_* / ClmProcedureCode_* columns, the annual
      amount columns, ...). The caller's frame is NOT modified: all work is done
      on a copy, so the function can be called repeatedly on the same input.

  Returns
  -------
  pandas.DataFrame
      The engineered and scaled feature columns plus a "PotentialFraud" column
      holding the output of model.predict.

  Notes
  -----
  * The scalers and the model are read from hard-coded files under the
    Post_training directory; the model is unpickled, so only point this at
    trusted files (pickle can execute arbitrary code on load).
  * Relies on the notebook-level Diag_Code / Pro_Code lists and joblib's load.
  * "PotentialFraud" is a prediction, not a ground-truth label.
  """
  post_training_dir = "/content/drive/MyDrive/Colab Notebooks/AAIC ASSIGNMENT/Case Study 1/Post_training/"

  # Work on a private copy so column writes and drops never leak into the caller's frame.
  X = X.copy()

  # ---------------------------------------------------------------- Data preprocessing
  # Age = (date of death - date of birth) in years; a missing date of death is
  # replaced by 2009-12-01 first.
  date_of_birth = pd.to_datetime(X["DOB"])
  date_of_death = pd.to_datetime(X["DOD"].fillna("2009-12-01"))
  X["Age_Of_Patients"] = ((date_of_death - date_of_birth).dt.days / 365).round()

  # RenalDiseaseIndicator: "Y" -> 1, anything else -> 0
  X["RenalDiseaseIndicator"] = np.where(X["RenalDiseaseIndicator"] == "Y", 1, 0)

  # ClmProcedureCode_4/5/6 are dropped as mostly null; NoOfMonths_PartACov/PartBCov
  # are dropped as near-duplicates of each other (rationale from the original notebook).
  X = X.drop(columns=["ClmProcedureCode_4", "ClmProcedureCode_5", "ClmProcedureCode_6",
                      "NoOfMonths_PartACov", "NoOfMonths_PartBCov"])

  # Missing codes / physician IDs become 0
  zero_filled = ["ClmAdmitDiagnosisCode", "DiagnosisGroupCode",
                 "ClmDiagnosisCode_1", "ClmDiagnosisCode_2",
                 "ClmDiagnosisCode_3", "ClmDiagnosisCode_4",
                 "ClmDiagnosisCode_5", "ClmDiagnosisCode_6",
                 "ClmDiagnosisCode_7", "ClmDiagnosisCode_8",
                 "ClmDiagnosisCode_9", "ClmDiagnosisCode_10",
                 "ClmProcedureCode_1", "ClmProcedureCode_2",
                 "ClmProcedureCode_3", "AttendingPhysician",
                 "OperatingPhysician", "OtherPhysician"]
  for col in zero_filled:
    X[col] = X[col].replace(np.nan, 0)

  # ---------------------------------------------------------------- Feature engineering
  # NOTE: new columns are appended in the same order as before (age, stay, claim
  # duration, amount, diagnosis flags, procedure flags) because the model receives
  # the frame as-is and may depend on column order.

  # Hospital stay in days = DischargeDt - AdmissionDt (NaN when either date is missing)
  discharge = pd.to_datetime(X["DischargeDt"])
  admission = pd.to_datetime(X["AdmissionDt"])
  X["Hospital_stay_Duration"] = (discharge - admission).dt.days.round()

  # Missing stay length and missing deductible are both treated as 0
  for col in ["Hospital_stay_Duration", "DeductibleAmtPaid"]:
    X[col] = X[col].replace(np.nan, 0)

  # Claim duration in days = ClaimEndDt - ClaimStartDt
  claim_start = pd.to_datetime(X["ClaimStartDt"])
  claim_end = pd.to_datetime(X["ClaimEndDt"])
  X["Claim_Duration"] = (claim_end - claim_start).dt.days.round()

  # Amount_left_reimburse = (IP + OP annual reimbursement) - (IP + OP annual deductible)
  annual_reimbursed = X["IPAnnualReimbursementAmt"] + X["OPAnnualReimbursementAmt"]
  annual_deductible = X["IPAnnualDeductibleAmt"] + X["OPAnnualDeductibleAmt"]
  X["Amount_left_reimburse"] = annual_reimbursed - annual_deductible

  X = X.drop(columns=["IPAnnualReimbursementAmt", "IPAnnualDeductibleAmt",
                      "OPAnnualReimbursementAmt", "OPAnnualDeductibleAmt"])

  # 0/1 flag per diagnosis code.
  # NOTE(review): the previous implementation looped over ClmDiagnosisCode_1..10 but
  # overwrote the flag on every pass, so only ClmDiagnosisCode_10 (plus the two
  # columns OR-ed in afterwards) ever contributed. That effective behaviour is kept
  # here on purpose: the pickled model was presumably trained on features built the
  # same way. To scan all ten columns, extend diag_scan_cols — but only together with
  # the training pipeline and a retrained model.
  diag_scan_cols = ["ClmDiagnosisCode_10", "DiagnosisGroupCode", "ClmAdmitDiagnosisCode"]
  for code in Diag_Code:
    hit = np.zeros(len(X), dtype=bool)
    for col in diag_scan_cols:
      hit = hit | (X[col] == code).values
    X["Diagnosis_Code_" + str(code)] = np.where(hit, 1, 0)

  # 0/1 flag per procedure code.
  # NOTE(review): same overwrite pattern as above — effectively only
  # ClmProcedureCode_3 was ever tested. Kept as-is for parity with the trained model.
  proc_scan_cols = ["ClmProcedureCode_3"]
  for code in Pro_Code:
    hit = np.zeros(len(X), dtype=bool)
    for col in proc_scan_cols:
      hit = hit | (X[col] == code).values
    X["Procedure_Code_" + str(code)] = np.where(hit, 1, 0)

  # Count-encode identifier-like columns: each value is replaced by how often it
  # occurs in this column of X.
  # NOTE(review): counts come from the rows passed in, not from stored training
  # counts, so a one-row call maps every ID to 1 — confirm this is intended.
  count_encoded = ["Provider", "BeneID", "AttendingPhysician", "OperatingPhysician",
                   "OtherPhysician", "ClmAdmitDiagnosisCode", "DiagnosisGroupCode"]
  for col in count_encoded:
    X[col] = X[col].map(X[col].value_counts().to_dict())

  # Raw date, ID and code columns are no longer needed once the features exist
  X = X.drop(columns=["ClaimStartDt", "ClaimEndDt", "ClaimID", "AdmissionDt", "DischargeDt",
                      "ClmDiagnosisCode_1", "ClmDiagnosisCode_2", "ClmDiagnosisCode_3",
                      "ClmDiagnosisCode_4", "ClmDiagnosisCode_5", "ClmDiagnosisCode_6",
                      "ClmDiagnosisCode_7", "ClmDiagnosisCode_8", "ClmDiagnosisCode_9",
                      "ClmDiagnosisCode_10",
                      "ClmProcedureCode_1", "ClmProcedureCode_2", "ClmProcedureCode_3",
                      "DOB", "DOD"])

  # ---------------------------------------------------------------- Scaling
  # One persisted scaler per column, fitted during training.
  scale_columns = ["Provider", "BeneID", "Race", "State", "County", "InscClaimAmtReimbursed",
                   "AttendingPhysician", "OperatingPhysician", "OtherPhysician",
                   "ClmAdmitDiagnosisCode", "DeductibleAmtPaid", "DiagnosisGroupCode",
                   "Age_Of_Patients", "Hospital_stay_Duration", "Claim_Duration",
                   "Amount_left_reimburse"]

  for col in scale_columns:
    scale = load(post_training_dir + 'scale_cols/' + col + '_std_scaler.bin')
    # presumably needed because the stored scaler predates the `clip` attribute that
    # newer scikit-learn reads in transform() — TODO confirm
    scale.clip = False
    X[col] = scale.transform(X[col].values.reshape(-1, 1))

  # ---------------------------------------------------------------- Prediction
  # `with` guarantees the model file handle is closed after unpickling.
  with open(post_training_dir + "model.pkl", "rb") as model_file:
    model = pickle.load(model_file)
  y_pred = model.predict(X)

  X["PotentialFraud"] = y_pred

  return X

In [13]:
# Smoke-test final_fun_1 on a single merged row (positional row 13).
# An independent copy is passed so that nothing done to the row inside final_fun_1
# can reach df_test or trip pandas' chained-assignment handling on a slice.
single_datapoint_X = final_fun_1(df_test.iloc[13:14].copy())
single_datapoint_X

Unnamed: 0,Provider,BeneID,Gender,Race,RenalDiseaseIndicator,State,County,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,ClmAdmitDiagnosisCode,DeductibleAmtPaid,DiagnosisGroupCode,Age_Of_Patients,Hospital_stay_Duration,Claim_Duration,Amount_left_reimburse,Diagnosis_Code_4019,Diagnosis_Code_25000,Diagnosis_Code_2724,Diagnosis_Code_V5869,Diagnosis_Code_4011,Diagnosis_Code_42731,Diagnosis_Code_V5861,Diagnosis_Code_2720,Diagnosis_Code_2449,Diagnosis_Code_4280,Procedure_Code_4019.0,Procedure_Code_9904.0,Procedure_Code_2724.0,Procedure_Code_8154.0,Procedure_Code_66.0,Procedure_Code_3893.0,Procedure_Code_3995.0,Procedure_Code_4516.0,Procedure_Code_3722.0,Procedure_Code_8151.0,PotentialFraud
13,0.0,0.0,2,0.0,0,0.0,0.16016,2,2,2,2,1,1,2,2,1,2,2,0.0008,0.0,0.0,0.0,0.0,0.0,0.0,0.716216,0.411111,0.411111,0.125949,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [14]:
def final_fun_2(X,Y):
  """Predict on X with the stored model and report recall and F1 against Y.

  Parameters
  ----------
  X : feature frame in the layout the pickled model expects.
  Y : labels to score the predictions against.

  Returns
  -------
  dict
      {"recall": <recall score>, "f1": <F1 score>} — the same values that are printed.

  Notes
  -----
  * The scores are only meaningful when Y holds ground-truth labels. If Y was itself
    produced by this same model on the same X, both scores are trivially 1.0.
  * The model is unpickled from a hard-coded path; pickle can execute arbitrary code
    on load, so only use trusted files.
  """
  model_path = "/content/drive/MyDrive/Colab Notebooks/AAIC ASSIGNMENT/Case Study 1/Post_training/model.pkl"

  # `with` guarantees the file handle is closed after unpickling.
  with open(model_path, "rb") as model_file:
    model = pickle.load(model_file)
  y_pred = model.predict(X)

  #recall_score
  recall_score_test = recall_score(Y, y_pred)

  #f1_score
  f1_score_test = f1_score(Y, y_pred)

  print("="*50)
  print("The Recall score of Test: ",recall_score_test)
  print("="*50)
  print("The F1_Score of Test: ",f1_score_test)
  print("="*50)

  return {"recall": recall_score_test, "f1": f1_score_test}

In [15]:
# Score a copy of df_test so the merged frame stays intact and this cell can be re-run.
test_scored = final_fun_1(df_test.copy())

# NOTE(review): "PotentialFraud" is the column final_fun_1 fills with model.predict
# output, and the merged test data has no ground-truth label column. Y is therefore
# the model's own predictions: any metric that compares this Y with predictions from
# the same model on the same X is trivially perfect and says nothing about model
# quality. Replace Y with real labels before interpreting scores.
Y = test_scored["PotentialFraud"]
X = test_scored.drop(columns=["PotentialFraud"])

In [16]:
# Run the stored model on the feature frame and print recall / F1 against Y.
d = final_fun_2(X=X, Y=Y)

The Recall score of Test:  1.0
The F1_Score of Test:  1.0
