In [106]:
# Name of the Kubeflow experiment that the pipeline run is created under.
EXPERIMENT_NAME = "frauddetection"
import getpass
import os
import random

import kfp
from kfp import dsl
from kfp.components import create_component_from_func
from kfp.components import InputPath, OutputPath

In [107]:
# Container image passed as `base_image` to every component built from the
# step functions in this notebook.
# NOTE(review): the image reference carries no tag or digest, so the resolved
# image may change between runs -- consider pinning it.
BASE_IMAGE = 'huanjason/scikit-learn'

In [108]:
# Download the training CSV files from the MinIO "dataset" bucket
def data_download(input_data:list, output_dir_path: OutputPath()):
    """Copy four CSV objects from the MinIO ``dataset`` bucket into ``output_dir_path``.

    Each object is parsed with pandas and re-written (without the index) under
    a fixed file name, so downstream steps can rely on those names.

    Args:
        input_data: Object names inside the ``dataset`` bucket. Positions 0-3
            are written to ``Train.csv``, ``Train_Inpatientdata.csv``,
            ``Train_Outpatientdata.csv`` and ``Train_Beneficiarydata.csv``
            respectively. Entries beyond the fourth are ignored.
        output_dir_path: Directory that receives the four CSV files; created
            if it does not exist.

    Raises:
        IndexError: If ``input_data`` has fewer than four entries.

    Imports live inside the function body -- presumably because the function is
    packaged as a standalone component with ``create_component_from_func``.
    """
    from minio import Minio
    import pandas as pd
    import os
    import urllib3

    # Connection settings can be overridden through the environment; the
    # defaults reproduce the values this step has always used.
    # NOTE(review): the default access/secret key are credentials committed to
    # source -- rotate them and inject real values via the environment/a secret.
    endpoint = os.environ.get("MINIO_ENDPOINT", "172.20.17.71:9000")
    access_key = os.environ.get("MINIO_ACCESS_KEY", "minio")
    secret_key = os.environ.get("MINIO_SECRET_KEY", "minio123")

    os.makedirs(output_dir_path, exist_ok=True)

    # Output file name for the object found at the same position of input_data.
    output_names = (
        "Train" + '.csv',
        "Train_Inpatientdata" + '.csv',
        "Train_Outpatientdata" + '.csv',
        "Train_Beneficiarydata" + '.csv',
    )

    minio_client = Minio(
        endpoint,
        access_key=access_key,
        secret_key=secret_key,
        secure=False,
        http_client=urllib3.ProxyManager(
            "http://" + endpoint + "/",
            timeout=urllib3.Timeout.DEFAULT_TIMEOUT,
            cert_reqs="CERT_REQUIRED",
            retries=urllib3.Retry(
                total=5,
                backoff_factor=0.2,
                status_forcelist=[500, 502, 503, 504],
            ),
        ),
    )

    for position, file_name in enumerate(output_names):
        response = minio_client.get_object("dataset", input_data[position])
        try:
            frame = pd.read_csv(response)
        finally:
            # Always hand the HTTP connection back to the pool, even when
            # parsing fails; otherwise every download leaks a connection.
            response.close()
            response.release_conn()
        frame.to_csv(os.path.join(output_dir_path, file_name), index = False)

In [109]:
def process_data(input_dir_path: InputPath(), output_dir_path: OutputPath()):
    """Merge the four raw tables and derive the feature table used for training.

    Reads ``Train.csv``, ``Train_Beneficiarydata.csv``, ``Train_Inpatientdata.csv``
    and ``Train_Outpatientdata.csv`` from ``input_dir_path``, joins them into a
    single frame, adds count / duration / amount / flag features, drops the raw
    identifier, date and code columns, and writes the result to
    ``final_data.csv`` inside ``output_dir_path``.

    Args:
        input_dir_path: Directory holding the four CSV files listed above.
        output_dir_path: Directory that receives ``final_data.csv``; created if
            it does not exist.

    Imports and helpers are kept inside the function body -- presumably because
    the function is packaged as a standalone component with
    ``create_component_from_func``.
    """
    import os
    import pandas as pd
    import numpy as np

    def _frequency_of(series):
        """Map each value to its number of occurrences in ``series`` (NaN stays NaN)."""
        return series.map(series.value_counts().to_dict())

    def _whole_days_between(start, end):
        """Whole days from ``start`` to ``end``; rows missing either date yield 0."""
        return (end - start).dt.days.fillna(0).astype('int64')

    def _zeros_to_mean(series):
        """Replace exact zeros with the series mean rounded to one decimal."""
        return series.replace({0: round(series.mean(), 1)})

    def _flag_code(frame, columns, code):
        """Return 1 for rows where any of ``columns`` equals ``code``, else 0."""
        hit = np.zeros(len(frame), dtype=bool)
        for column in columns:
            hit |= (frame[column] == code).values.astype(bool)
        return np.where(hit, 1, 0)

    os.makedirs(output_dir_path, exist_ok=True)
    file_path = os.path.join(output_dir_path, "final_data" + '.csv')

    # Load every table by its explicit file name. os.listdir() returns entries
    # in arbitrary order, so picking files by list position could silently
    # swap tables.
    train_target = pd.read_csv(os.path.join(input_dir_path, "Train.csv"))
    train_beneficiary = pd.read_csv(os.path.join(input_dir_path, "Train_Beneficiarydata.csv"))
    train_inpatient = pd.read_csv(os.path.join(input_dir_path, "Train_Inpatientdata.csv"))
    # Only the first 50,000 outpatient rows are used.
    train_outpatient = pd.read_csv(os.path.join(input_dir_path, "Train_Outpatientdata.csv")).iloc[:50000, :]

    # ------------------------------------------------------------------ merge
    # Stack inpatient and outpatient claims by outer-joining on every column
    # the two tables share, attach beneficiary attributes, cap the frame at
    # 100,000 rows, then attach the per-provider PotentialFraud label.
    shared_claim_cols = [col for col in train_outpatient.columns if col in train_inpatient.columns]
    final_data = pd.merge(train_inpatient, train_outpatient,
                          left_on = shared_claim_cols, right_on = shared_claim_cols, how = 'outer')
    final_data = pd.merge(final_data, train_beneficiary, how='inner', on='BeneID')
    final_data = final_data.iloc[:100000, :]
    final_data = pd.merge(final_data, train_target, how='outer', on='Provider')

    # Lower-case count columns are kept alongside the CamelCase ones created
    # further down so the output schema stays unchanged. Unlike
    # AttendingPhysicianCount, phycount keeps NaN for a missing physician.
    final_data['phycount'] = _frequency_of(final_data['AttendingPhysician'])
    final_data['procount'] = _frequency_of(final_data['Provider'])
    final_data['benecount'] = _frequency_of(final_data['BeneID'])

    final_data = final_data.drop(['ClmProcedureCode_5', 'ClmProcedureCode_6',
                                  'NoOfMonths_PartACov', 'NoOfMonths_PartBCov'], axis=1)

    # ------------------------------------------------------- missing codes
    diagnosis_cols = ['ClmDiagnosisCode_' + str(i) for i in range(1, 11)]
    procedure_cols = ['ClmProcedureCode_' + str(i) for i in range(1, 5)]
    code_cols = diagnosis_cols + procedure_cols + ['DiagnosisGroupCode', 'ClmAdmitDiagnosisCode']
    # A missing code becomes the placeholder 0; the features below treat 0 as
    # "no code present".
    final_data[code_cols] = final_data[code_cols].replace({np.nan: 0})

    # ------------------------------------------------------ binary encodings
    final_data['PotentialFraud'] = final_data['PotentialFraud'].map({'Yes':1,'No':0})
    final_data['RenalDiseaseIndicator'] = final_data['RenalDiseaseIndicator'].map({'Y':1,'0':0})

    chronic_cols = ['ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
                    'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
                    'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
                    'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
                    'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
                    'ChronicCond_stroke']
    # These columns (and Gender) are recoded from {1, 2} to {1, 0}.
    for column in chronic_cols + ['Gender']:
        final_data[column] = final_data[column].map({2:0,1:1})

    # ------------------------------------------------------- count features
    final_data['BeneCount'] = _frequency_of(final_data['BeneID'])
    final_data['ProviderCount'] = _frequency_of(final_data['Provider'])
    final_data['AttendingPhysicianCount'] = _frequency_of(final_data['AttendingPhysician']).fillna(0)

    # ---------------------------------------------------- duration features
    for column in ['AdmissionDt', 'DischargeDt', 'ClaimStartDt', 'ClaimEndDt']:
        final_data[column] = pd.to_datetime(final_data[column])

    # Day differences are computed on the timedelta directly instead of
    # round-tripping through strings; zero-day durations are then replaced by
    # the column mean.
    final_data['numOfDaysAdmitted'] = _zeros_to_mean(
        _whole_days_between(final_data['AdmissionDt'], final_data['DischargeDt']))
    final_data['numOfDaysForClaim'] = _zeros_to_mean(
        _whole_days_between(final_data['ClaimStartDt'], final_data['ClaimEndDt']))

    # ------------------------------------------------------ amount features
    annual_reimbursed = final_data['IPAnnualReimbursementAmt'] + final_data['OPAnnualReimbursementAmt']
    annual_deductible = final_data['IPAnnualDeductibleAmt'] + final_data['OPAnnualDeductibleAmt']
    final_data['ip_op_total_amount'] = annual_reimbursed - annual_deductible

    # -------------------------------------------------- per-row tally features
    # Plain "+" is used (rather than a NaN-skipping sum) so that a missing
    # indicator still propagates as NaN into the total.
    num_of_chronic = final_data['RenalDiseaseIndicator']
    for column in chronic_cols:
        num_of_chronic = num_of_chronic + final_data[column]
    final_data['num_of_chronic'] = num_of_chronic

    # Number of non-placeholder diagnosis/procedure codes on each row.
    final_data['num_of_diag_proc'] = [np.count_nonzero(row) for row in final_data[code_cols].values]

    # Number of physicians named on each row; rows naming none are set to the
    # most common count.
    physician_ids = final_data[['AttendingPhysician','OperatingPhysician','OtherPhysician']].fillna(0).values
    final_data['num_of_phy'] = [np.count_nonzero(row) for row in physician_ids]
    final_data['num_of_phy'] = final_data['num_of_phy'].replace({0: final_data['num_of_phy'].mode()[0]})

    # --------------------------------------------------------- code flags
    # One indicator column per listed code: 1 when the code appears in any of
    # the relevant code columns of the row.
    top10 = ['4019','25000','2724','V5869','4011','42731','V5861','2720','2449','4280']
    diagnosis_lookup_cols = diagnosis_cols + ['DiagnosisGroupCode', 'ClmAdmitDiagnosisCode']
    for code in top10:
        final_data['diagnosis_'+str(code)] = _flag_code(final_data, diagnosis_lookup_cols, code)

    top5 = [4019.0, 9904.0, 2724.0, 8154.0, 66.0]
    for code in top5:
        final_data['procedure_'+str(code)] = _flag_code(final_data, procedure_cols, code)

    # ---------------------------------------------------- code frequencies
    # The placeholder 0 is counted like any other value, so a row without a
    # code receives the number of rows that lack one.
    # NOTE(review): confirm whether the placeholder should map to 0 instead.
    for position, column in enumerate(diagnosis_cols, start=1):
        final_data['DiagnosisCode_' + str(position) + '_count'] = _frequency_of(final_data[column])
    final_data['ClmAdmitDiagnosisCode_count'] = _frequency_of(final_data['ClmAdmitDiagnosisCode'])
    final_data['DiagnosisGroupCode_count'] = _frequency_of(final_data['DiagnosisGroupCode'])

    final_data['DeductibleAmtPaid'] = final_data['DeductibleAmtPaid'].replace({np.nan:0})

    # ------------------------------------------------------------- output
    # Raw identifiers, dates and codes are dropped once their derived features
    # exist.
    final_data = final_data.drop(['BeneID', 'ClaimID', 'ClaimStartDt', 'ClaimEndDt', 'Provider','AttendingPhysician', 'OperatingPhysician',
       'OtherPhysician', 'AdmissionDt','ClmAdmitDiagnosisCode','DischargeDt', 'DiagnosisGroupCode',
       'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3',
       'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
       'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9',
       'ClmDiagnosisCode_10', 'ClmProcedureCode_1', 'ClmProcedureCode_2',
       'ClmProcedureCode_3', 'ClmProcedureCode_4','DOB', 'DOD'],axis=1)

    final_data.to_csv(file_path, index = False)


In [110]:
def logistic_regression(input_dir_path: InputPath(), output_dir_path: OutputPath()):
    """Train a logistic-regression classifier on ``final_data.csv`` and pickle it.

    Rows containing any NaN are discarded, the data is split 67/33 (stratified
    on ``PotentialFraud``), the listed numeric columns are min-max scaled, and
    the fitted model is written to ``lr_model.pkl`` in ``output_dir_path``.
    Accuracy on the held-out split is printed.

    Args:
        input_dir_path: Directory containing ``final_data.csv``.
        output_dir_path: Directory that receives ``lr_model.pkl``; created if
            it does not exist.

    Imports live inside the function body -- presumably because the function is
    packaged as a standalone component with ``create_component_from_func``.
    """
    import os
    import pickle
    import pandas as pd
    from sklearn import preprocessing
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    os.makedirs(output_dir_path, exist_ok=True)
    file_path = os.path.join(output_dir_path, "lr_model" + '.pkl')

    final_data = pd.read_csv(os.path.join(input_dir_path, 'final_data' + '.csv'))
    # drop=True: otherwise the old row index is inserted as an "index" column
    # and ends up being used as a model feature.
    final_data = final_data.dropna().reset_index(drop=True)

    Y = final_data['PotentialFraud']
    X = final_data.drop('PotentialFraud',axis=1)

    # Fixed seed so that repeated pipeline runs evaluate on the same split.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.33, stratify = Y, random_state=42)
    # Work on copies so the scaled values are not written into slices of X.
    X_train = X_train.copy()
    X_test = X_test.copy()

    min_max_scaler = preprocessing.MinMaxScaler()

    col_to_nor = ['InscClaimAmtReimbursed', 'DeductibleAmtPaid', 'Race',
       'State', 'County', 'IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt',
       'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt', 'BeneCount',
       'ProviderCount', 'AttendingPhysicianCount', 'numOfDaysAdmitted',
       'numOfDaysForClaim', 'ip_op_total_amount', 'num_of_chronic',
       'num_of_diag_proc', 'num_of_phy', 'DiagnosisCode_1_count', 'DiagnosisCode_2_count',
       'DiagnosisCode_3_count', 'DiagnosisCode_4_count',
       'DiagnosisCode_5_count', 'DiagnosisCode_6_count',
       'DiagnosisCode_7_count', 'DiagnosisCode_8_count',
       'DiagnosisCode_9_count', 'DiagnosisCode_10_count',
       'ClmAdmitDiagnosisCode_count', 'DiagnosisGroupCode_count']

    # Learn the scaling from the training split only and re-use it for the
    # test split; re-fitting on the test data would scale the two splits
    # differently and leak test statistics into the evaluation.
    X_train[col_to_nor] = min_max_scaler.fit_transform(X_train[col_to_nor])
    X_test[col_to_nor] = min_max_scaler.transform(X_test[col_to_nor])

    lr = LogisticRegression(C=5 , penalty='l2')
    lr.fit(X_train,Y_train)

    print('accuracy on test data:',lr.score(X_test,Y_test))

    # Context manager guarantees the file is flushed and closed.
    with open(file_path, 'wb') as model_file:
        pickle.dump(lr, model_file)
    


In [111]:
def decision_tree(input_dir_path: InputPath(),  output_dir_path: OutputPath()):
    """Train a decision-tree classifier on ``final_data.csv`` and pickle it.

    Rows containing any NaN are discarded, the data is split 67/33 (stratified
    on ``PotentialFraud``), the listed numeric columns are min-max scaled, and
    the fitted model is written to ``dt_model.pkl`` in ``output_dir_path``.
    Accuracy on the held-out split is printed.

    Args:
        input_dir_path: Directory containing ``final_data.csv``.
        output_dir_path: Directory that receives ``dt_model.pkl``; created if
            it does not exist.

    Imports live inside the function body -- presumably because the function is
    packaged as a standalone component with ``create_component_from_func``.
    """
    import os
    import pickle
    import pandas as pd
    from sklearn import preprocessing
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeClassifier

    os.makedirs(output_dir_path, exist_ok=True)
    file_path = os.path.join(output_dir_path, "dt_model" + '.pkl')

    final_data = pd.read_csv(os.path.join(input_dir_path, 'final_data' + '.csv'))
    # drop=True: otherwise the old row index is inserted as an "index" column
    # and ends up being used as a model feature.
    final_data = final_data.dropna().reset_index(drop=True)

    Y = final_data['PotentialFraud']
    X = final_data.drop('PotentialFraud',axis=1)

    # Fixed seed so that repeated pipeline runs evaluate on the same split.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.33, stratify = Y, random_state=42)
    # Work on copies so the scaled values are not written into slices of X.
    X_train = X_train.copy()
    X_test = X_test.copy()

    min_max_scaler = preprocessing.MinMaxScaler()

    col_to_nor = ['InscClaimAmtReimbursed', 'DeductibleAmtPaid', 'Race',
       'State', 'County', 'IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt',
       'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt', 'BeneCount',
       'ProviderCount', 'AttendingPhysicianCount', 'numOfDaysAdmitted',
       'numOfDaysForClaim', 'ip_op_total_amount', 'num_of_chronic',
       'num_of_diag_proc', 'num_of_phy', 'DiagnosisCode_1_count', 'DiagnosisCode_2_count',
       'DiagnosisCode_3_count', 'DiagnosisCode_4_count',
       'DiagnosisCode_5_count', 'DiagnosisCode_6_count',
       'DiagnosisCode_7_count', 'DiagnosisCode_8_count',
       'DiagnosisCode_9_count', 'DiagnosisCode_10_count',
       'ClmAdmitDiagnosisCode_count', 'DiagnosisGroupCode_count']

    # Learn the scaling from the training split only and re-use it for the
    # test split; re-fitting on the test data would scale the two splits
    # differently and leak test statistics into the evaluation.
    X_train[col_to_nor] = min_max_scaler.fit_transform(X_train[col_to_nor])
    X_test[col_to_nor] = min_max_scaler.transform(X_test[col_to_nor])

    params = {"max_depth":250,"min_samples_split":70,"criterion":'gini'}
    # random_state makes the fitted tree reproducible for a given split.
    dt = DecisionTreeClassifier(max_depth=params["max_depth"],
                                min_samples_split=params["min_samples_split"],
                                criterion=params["criterion"],
                                random_state=42)
    dt.fit(X_train,Y_train)

    print('accuracy on test data:',dt.score(X_test,Y_test))

    # Context manager guarantees the file is flushed and closed.
    with open(file_path, 'wb') as model_file:
        pickle.dump(dt, model_file)


In [112]:
# Wrap each step function as a pipeline component. All four share the same
# base image and the same extra pip packages.
def _to_component(step_func):
    """Build a component from ``step_func`` with the notebook-wide image and packages."""
    return create_component_from_func(
        step_func,
        base_image=BASE_IMAGE,
        packages_to_install=['minio', 'pandas'],
    )


download_data_opp = _to_component(data_download)
process_data_opp = _to_component(process_data)
lr_building_opp = _to_component(logistic_regression)
dt_building_opp = _to_component(decision_tree)

In [113]:
@kfp.dsl.pipeline(name='model-training-pipeline')
def model_pipeline(data=["Train.csv","Train_Inpatientdata.csv","Train_Outpatientdata.csv","Train_Beneficiarydata.csv"]):
    # Step 1: fetch the raw CSV objects named in `data`.
    fetch_step = download_data_opp(data)
    # Step 2: build the feature table from the downloaded directory.
    features_step = process_data_opp(input_dir=fetch_step.output)
    # Step 3: both trainers consume the same feature-table output; neither
    # depends on the other.
    for build_model in (lr_building_opp, dt_building_opp):
        build_model(input_dir=features_step.output)

In [116]:
# Compile the pipeline function into 'model-training-pipeline.yaml'.
kfp.compiler.Compiler().compile(model_pipeline, 'model-training-pipeline.yaml')

# Endpoint and namespace used when the client is created.
HOST = 'https://kubeflow-workos-slvr.anthem.com'
namespace = "harikrishna-kantipudi"

# The authservice session cookie is a credential and must not be stored in the
# notebook. It is taken from the KUBEFLOW_SESSION_COOKIE environment variable,
# falling back to an interactive prompt that does not echo the value.
session_cookie = (
    os.environ.get("KUBEFLOW_SESSION_COOKIE")
    or getpass.getpass("Kubeflow authservice_session cookie: ")
)

In [117]:
# Connect to the Kubeflow Pipelines API, authenticating with the session
# cookie and verifying TLS against the local CA bundle.
pipeline_endpoint = f"{HOST}/pipeline"
auth_cookie = f"authservice_session={session_cookie}"
client = kfp.Client(
    host=pipeline_endpoint,
    cookies=auth_cookie,
    namespace=namespace,
    ssl_ca_cert="./root.pem",
)

# Create the experiment in the namespace, then submit a run of the pipeline
# with its default arguments.
experiment = client.create_experiment(name=EXPERIMENT_NAME, namespace=namespace)
client.create_run_from_pipeline_func(
    pipeline_func=model_pipeline,
    arguments={},
    experiment_name=EXPERIMENT_NAME,
    namespace=namespace,
)

RunPipelineResult(run_id=ddc9fc33-e2a0-4361-9904-6913e59298be)