In [97]:
import pandas as pd
import numpy as np
import os  

In [98]:
def filepath(f):
    """
    Build the path of a file inside the sibling ``processed_data`` directory.

    The directory is resolved relative to the parent of the current working
    directory, so the result depends on where the kernel was started.

    Parameters:
    - f: file name (String) to look up inside ``processed_data``

    Returns the joined path as a String; the file is not checked for existence.
    """
    parent_dir = os.path.dirname(os.getcwd())
    return os.path.join(parent_dir, 'processed_data', f)

In [99]:
# Load the training and test datasets from the processed_data directory.
train_path = filepath("final_training_set.csv")
test_path = filepath("final_test_set.csv")
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

In [100]:
# Keep the claim identifiers aside before removing them from the features.
uid = train_data["ClaimID"]
# "Unnamed: 0" is presumably the index column written by an earlier to_csv
# call -- TODO confirm; it is dropped together with the identifier.
train_data = train_data.drop(columns=["ClaimID", "Unnamed: 0"])

In [101]:
# Number of distinct non-missing values in procedure_1 (1 means the column is constant).
train_data["procedure_1"].nunique(dropna=True)

1

In [102]:
# List every column name, one per line, to inspect the raw feature set.
for column_name in train_data.columns:
    print(column_name)

Provider
PotentialFraud
InscClaimAmtReimbursed
DeductibleAmtPaid
inpatient
Gender
Race
RenalDiseaseIndicator
ChronicCond_Alzheimer
ChronicCond_Heartfailure
ChronicCond_KidneyDisease
ChronicCond_Cancer
ChronicCond_ObstrPulmonary
ChronicCond_Depression
ChronicCond_Diabetes
ChronicCond_IschemicHeart
ChronicCond_Osteoporasis
ChronicCond_rheumatoidarthritis
ChronicCond_stroke
IPAnnualReimbursementAmt
IPAnnualDeductibleAmt
OPAnnualReimbursementAmt
OPAnnualDeductibleAmt
age
claim_duration
time_under_care
admitDiagInFinalDiagnosis
diagnosis_1
diagnosis_2
diagnosis_3
diagnosis_4
diagnosis_5
diagnosis_6
diagnosis_7
diagnosis_8
diagnosis_9
diagnosis_10
diagnosis_11
diagnosis_12
diagnosis_13
diagnosis_14
diagnosis_15
diagnosis_16
diagnosis_17
diagnosis_18
diagnosis_19
procedure_1
procedure_2
procedure_3
procedure_4
procedure_5
procedure_6
procedure_7
procedure_8
procedure_9
procedure_10
procedure_11
procedure_12
procedure_13
procedure_14
procedure_15
procedure_16
procedure_17
procedure_18


In [103]:
def minmax_encode(df, col):
    """
    Return a copy of the dataset with ``col`` replaced by its min-max scaled values.

    The scaled values, (x - min) / (max - min), are stored in a new column named
    ``col + "_minmax"`` and the original column is removed from the result.
    Missing values are ignored when computing the range and stay missing in
    the output. A column whose non-missing values are all equal has no range
    to divide by, so it is encoded as 0.0 rather than NaN.

    The input dataframe is left untouched; use the returned dataframe.

    Constraints:
    - col must be a String
    - df must be a Pandas Dataframe
    - df[col] must be numeric
    """
    values = df[col]
    minx = values.min()
    span = values.max() - minx

    if span == 0:
        # Constant column: x - min is 0 everywhere, so skip the 0/0 division.
        # Subtracting (instead of writing zeros) keeps NaN where the input was NaN.
        scaled = (values - minx).astype('float64')
    else:
        # Vectorised over the whole column; an all-missing column yields all NaN.
        scaled = (values - minx) / span

    new_colname = col + "_minmax"
    # drop() returns a new frame, so the caller's dataframe is never modified.
    return df.drop(columns=[col]).assign(**{new_colname: scaled})

In [104]:
def one_hot_encode(df, col):
    """
    Return the dataset with ``col`` replaced by its one-hot encoded columns.

    One indicator column per distinct value is produced by ``pd.get_dummies``,
    named ``<col>_<value>``, and appended after the remaining columns.
    The input dataframe itself is not modified.

    Constraints:
    - col must be a String
    - df must be a Pandas Dataframe
    - df[col] must be a Series that represents a categorical variable
    """
    dummies = pd.get_dummies(df[col], prefix = col)
    remaining = df.drop(columns = [col])
    return pd.concat([remaining, dummies], axis = 1)

def frequency_encode(df, col):
    """
    Returns the dataset including the frequency encoded column and excluding the original column

    Each value is replaced by the share of rows that carry it
    (occurrences / number of rows), stored in a new column named
    ``col + "_freq"``. Missing values are treated as one category of their
    own and receive the share of missing rows.

    Note: df is modified in place (the new column is added and the original
    column is dropped) and the same object is returned.

    Constraints:
    - col must be a String
    - df must be a Pandas Dataframe
    - df[col] must be a Series that represents a categorical variable with high cardinality
    """
    series = df[col]
    # Denominator is the number of rows, not the length of the column name.
    total = len(df)

    # value_counts() skips missing values, so they are counted separately.
    counts = series.value_counts()
    n_missing = series.isna().sum()

    # Every non-missing value is a key of `counts`; the only NaN left after the
    # lookup are the originally missing rows, which get the missing-row count.
    occurrences = series.map(counts).astype('float64').fillna(n_missing)

    new_colname = col + '_freq'
    df[new_colname] = occurrences / total
    df.drop(
        columns = [col],
        inplace = True
    )

    return df

In [105]:
# Decide, per column, which encoding the next step applies:
#   - constant / all-missing columns are dropped,
#   - two-valued columns are left for the boolean conversion,
#   - numeric columns are median-imputed and min-max scaled,
#   - remaining (categorical) columns are frequency encoded when they have
#     more than `unique_threshold` distinct values, one-hot encoded otherwise.
freq_encoded_cols = []
ohe_cols = []
num_cols = []
unique_threshold = 30
constant_cols = []

for col in train_data.columns:
    # nunique() ignores missing values; compute it once per column.
    n_unique = train_data[col].nunique()
    if n_unique <= 1:
        # 0 distinct values means the column is entirely missing; it carries
        # no more information than a constant one, so it is removed as well.
        print(col,"has been removed as it is constant")
        constant_cols.append(col)
    elif n_unique == 2: # Binary column
        continue
    elif pd.api.types.is_numeric_dtype(train_data[col]):
        # Any numeric dtype qualifies (not only int64/float64).
        # NOTE(review): integer-coded categoricals also land here because the
        # dtype test runs before the cardinality test -- confirm this is intended.
        train_data[col] = train_data[col].fillna(train_data[col].median())
        num_cols.append(col)
    elif n_unique > unique_threshold:
        freq_encoded_cols.append(col)
    else:
        # 2 < n_unique <= unique_threshold
        ohe_cols.append(col)

# Drop once, after the loop, instead of mutating the frame while iterating it.
train_data = train_data.drop(columns=constant_cols)
        


procedure_1 has been removed as it is constant
procedure_2 has been removed as it is constant
procedure_3 has been removed as it is constant


In [106]:
def _binary_to_bool(series):
    """Encode a two-valued column as booleans without collapsing it to a constant.

    A plain ``astype('bool')`` maps every non-zero number and every non-empty
    string to True, so a column coded e.g. 1/2 or "Yes"/"No" would become all
    True. Here True marks the larger of the two distinct values instead; for
    columns already coded 0/1 or False/True this matches ``astype('bool')`` on
    all non-missing rows. Missing values compare unequal and become False.
    """
    levels = series.dropna().unique()
    if len(levels) != 2:
        # Not actually two-valued: keep the plain truthiness cast.
        return series.astype('bool')
    try:
        high = max(levels)
    except TypeError:
        # Values of mixed, non-comparable types: order them by their text form.
        high = max(levels, key=str)
    return series == high


# Apply the encoding chosen for each column; columns in none of the lists are
# either uninformative (dropped) or binary (converted to booleans).
for col in train_data.columns:
    if col in num_cols:
        try:
            train_data = minmax_encode(train_data, col)
        except Exception as err:
            # Best effort: a column that cannot be scaled is left as it is, but
            # the reason is reported instead of being silently discarded.
            print(col, "could not be min-max scaled:", repr(err))
    elif col in ohe_cols:
        train_data = one_hot_encode(train_data, col)
    elif col in freq_encoded_cols:
        train_data = frequency_encode(train_data, col)
    elif train_data[col].nunique() <= 1:
        # Constant or entirely missing column: nothing to learn from it.
        train_data = train_data.drop(columns=[col])
    else:
        train_data[col] = _binary_to_bool(train_data[col])

In [107]:
# List the column names again, one per line, to check the result of the encoding step.
for column_name in train_data.columns:
    print(column_name)

PotentialFraud
inpatient
Gender
RenalDiseaseIndicator
ChronicCond_Alzheimer
ChronicCond_Heartfailure
ChronicCond_KidneyDisease
ChronicCond_Cancer
ChronicCond_ObstrPulmonary
ChronicCond_Depression
ChronicCond_Diabetes
ChronicCond_IschemicHeart
ChronicCond_Osteoporasis
ChronicCond_rheumatoidarthritis
ChronicCond_stroke
admitDiagInFinalDiagnosis
diagnosis_1
diagnosis_2
diagnosis_3
diagnosis_4
diagnosis_5
diagnosis_6
diagnosis_7
diagnosis_8
diagnosis_9
diagnosis_10
diagnosis_11
diagnosis_12
diagnosis_13
diagnosis_14
diagnosis_15
diagnosis_16
diagnosis_17
diagnosis_18
diagnosis_19
procedure_4
procedure_5
procedure_6
procedure_7
procedure_8
procedure_9
procedure_10
procedure_11
procedure_12
procedure_13
procedure_14
procedure_15
procedure_16
procedure_17
procedure_18
Provider_freq
InscClaimAmtReimbursed_minmax
DeductibleAmtPaid_minmax
Race_minmax
IPAnnualReimbursementAmt_minmax
IPAnnualDeductibleAmt_minmax
OPAnnualReimbursementAmt_minmax
OPAnnualDeductibleAmt_minmax
age_minmax
claim_duration

In [108]:
# Separate the target: pop() returns the "PotentialFraud" column and removes it
# from train_data in place, leaving only the feature columns behind.
y = train_data.pop("PotentialFraud")