# 2. Pre-processing for Claims data
This notebook will load the claims data, then split in years, do features transformation, merge data by Id as the preprocessing.

## Part a: Data import and split by years
First, load the claims data from a CSV file.

In [1]:
import pandas as pd

data_path = '../data/raw/Claims.csv'
claims_df = pd.read_csv(data_path)
claims_df

Unnamed: 0,MemberID,ProviderID,Vendor,PCP,Year,Specialty,PlaceSvc,PayDelay,LengthOfStay,DSFS,PrimaryConditionGroup,CharlsonIndex,ProcedureGroup,SupLOS
0,42286978,8013252.0,172193.0,37796.0,Y1,Surgery,Office,28,,8- 9 months,NEUMENT,0,MED,0
1,97903248,3316066.0,726296.0,5300.0,Y3,Internal,Office,50,,7- 8 months,NEUMENT,1-2,EM,0
2,2759427,2997752.0,140343.0,91972.0,Y3,Internal,Office,14,,0- 1 month,METAB3,0,EM,0
3,73570559,7053364.0,240043.0,70119.0,Y3,Laboratory,Independent Lab,24,,5- 6 months,METAB3,1-2,SCS,0
4,11837054,7557061.0,496247.0,68968.0,Y2,Surgery,Outpatient Hospital,27,,4- 5 months,FXDISLC,1-2,EM,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2668985,14932948,6641119.0,693028.0,22193.0,Y2,Obstetrics and Gynecology,Inpatient Hospital,58,,0- 1 month,GYNEC1,0,EM,0
2668986,31248189,6932712.0,223304.0,70748.0,Y3,Internal,Inpatient Hospital,23,,0- 1 month,GIBLEED,1-2,EM,0
2668987,43767339,1483429.0,35565.0,5278.0,Y3,Diagnostic Imaging,Office,122,,4- 5 months,ODaBNCA,0,SIS,0
2668988,96393713,7094351.0,347045.0,93075.0,Y3,Internal,Office,151,,1- 2 months,METAB3,1-2,EM,0


In [2]:
# Split data by year
claims_Y1_df = claims_df[claims_df['Year'] == 'Y1']
claims_Y2_df = claims_df[claims_df['Year'] == 'Y2']
claims_Y3_df = claims_df[claims_df['Year'] == 'Y3']
claims_Y3_df

Unnamed: 0,MemberID,ProviderID,Vendor,PCP,Year,Specialty,PlaceSvc,PayDelay,LengthOfStay,DSFS,PrimaryConditionGroup,CharlsonIndex,ProcedureGroup,SupLOS
1,97903248,3316066.0,726296.0,5300.0,Y3,Internal,Office,50,,7- 8 months,NEUMENT,1-2,EM,0
2,2759427,2997752.0,140343.0,91972.0,Y3,Internal,Office,14,,0- 1 month,METAB3,0,EM,0
3,73570559,7053364.0,240043.0,70119.0,Y3,Laboratory,Independent Lab,24,,5- 6 months,METAB3,1-2,SCS,0
5,45844561,1963488.0,4042.0,55823.0,Y3,Pediatrics,Office,25,,3- 4 months,NEUMENT,0,EM,0
9,72200595,6251259.0,791272.0,49465.0,Y3,Internal,Office,56,,7- 8 months,MISCHRT,1-2,SCS,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2668980,27364767,3394625.0,112451.0,39946.0,Y3,Internal,Outpatient Hospital,42,,0- 1 month,MSC2a3,0,SDS,0
2668982,95316156,794638.0,122401.0,46162.0,Y3,Internal,Office,22,,2- 3 months,INFEC4,0,EM,0
2668986,31248189,6932712.0,223304.0,70748.0,Y3,Internal,Inpatient Hospital,23,,0- 1 month,GIBLEED,1-2,EM,0
2668987,43767339,1483429.0,35565.0,5278.0,Y3,Diagnostic Imaging,Office,122,,4- 5 months,ODaBNCA,0,SIS,0


## Part b: Count Number of Claims for Each Member
We create the `ClaimsCount` column:

In [3]:
# Step 2: Count Number of Claims for Each Member
def calculate_claims_count(df):
    claims_count = df.groupby('MemberID').size().reset_index(name='ClaimsCount')
    return claims_count

# Calculate claims count for each year
claims_count_Y1 = calculate_claims_count(claims_Y1_df)
claims_count_Y2 = calculate_claims_count(claims_Y2_df)
claims_count_Y3 = calculate_claims_count(claims_Y3_df)
claims_count_Y1

Unnamed: 0,MemberID,ClaimsCount
0,210,8
1,3197,5
2,3889,13
3,4187,4
4,9063,4
...,...,...
76033,99995554,35
76034,99996214,1
76035,99997485,1
76036,99997895,14


## Part c: Count Unique Values for: `ProviderID`, `Vendor`, and `PCP`

In [4]:
# function for count unique values
def unique_values_count(df, feature):
    unique_count = df.groupby('MemberID')[feature].nunique().reset_index(name=f'Unique_{feature}')
    return unique_count

claims_Y1_providers = unique_values_count(claims_Y1_df, 'ProviderID')
claims_Y2_providers = unique_values_count(claims_Y2_df, 'ProviderID')
claims_Y3_providers = unique_values_count(claims_Y3_df, 'ProviderID')
claims_Y1_providers

Unnamed: 0,MemberID,Unique_ProviderID
0,210,4
1,3197,3
2,3889,7
3,4187,3
4,9063,2
...,...,...
76033,99995554,3
76034,99996214,1
76035,99997485,1
76036,99997895,5


In [5]:
claims_Y1_vendors = unique_values_count(claims_Y1_df, 'Vendor')
claims_Y2_vendors = unique_values_count(claims_Y2_df, 'Vendor')
claims_Y3_vendors = unique_values_count(claims_Y3_df, 'Vendor')
claims_Y1_vendors

Unnamed: 0,MemberID,Unique_Vendor
0,210,4
1,3197,3
2,3889,7
3,4187,3
4,9063,2
...,...,...
76033,99995554,3
76034,99996214,1
76035,99997485,1
76036,99997895,4


In [6]:
claims_Y1_pcps = unique_values_count(claims_Y1_df, 'PCP')
claims_Y2_pcps = unique_values_count(claims_Y2_df, 'PCP')
claims_Y3_pcps = unique_values_count(claims_Y3_df, 'PCP')
claims_Y1_pcps

Unnamed: 0,MemberID,Unique_PCP
0,210,2
1,3197,1
2,3889,1
3,4187,1
4,9063,1
...,...,...
76033,99995554,1
76034,99996214,1
76035,99997485,1
76036,99997895,1


## Part d: Do One-Hot Encoding for Categorical Features: 
`Specialty`, `PlaceSvc`, `PrimaryConditionGroup`, `CharlsonIndex`, `ProcedureGroup`, `DSFS`

In [7]:
# Step 4: One-Hot Encoding for Categorical Features
def one_hot_encode(df, column):
    one_hot_encoded = pd.get_dummies(df[column])
    one_hot_encoded['MemberID'] = df['MemberID']
    one_hot_agg = one_hot_encoded.groupby('MemberID').sum().reset_index()
    return one_hot_agg

In [8]:
# One-hot encoding of Specialty
claims_Y1_specialty = one_hot_encode(claims_Y1_df, 'Specialty')
claims_Y2_specialty = one_hot_encode(claims_Y2_df, 'Specialty')
claims_Y3_specialty = one_hot_encode(claims_Y3_df, 'Specialty')
claims_Y1_specialty

Unnamed: 0,MemberID,Anesthesiology,Diagnostic Imaging,Emergency,General Practice,Internal,Laboratory,Obstetrics and Gynecology,Other,Pathology,Pediatrics,Rehabilitation,Surgery
0,210,0,0,2,0,4,2,0,0,0,0,0,0
1,3197,0,0,2,0,0,0,0,0,0,3,0,0
2,3889,0,1,4,0,4,4,0,0,0,0,0,0
3,4187,0,1,0,0,2,0,0,1,0,0,0,0
4,9063,0,1,0,0,3,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
76033,99995554,0,0,0,2,26,7,0,0,0,0,0,0
76034,99996214,0,0,0,1,0,0,0,0,0,0,0,0
76035,99997485,0,0,0,1,0,0,0,0,0,0,0,0
76036,99997895,0,1,0,0,4,7,0,0,0,0,0,2


In [9]:
# One-hot encoding of PlaceSvc
claims_Y1_placesvc = one_hot_encode(claims_Y1_df, 'PlaceSvc')
claims_Y2_placesvc = one_hot_encode(claims_Y2_df, 'PlaceSvc')
claims_Y3_placesvc = one_hot_encode(claims_Y3_df, 'PlaceSvc')
claims_Y1_placesvc

Unnamed: 0,MemberID,Ambulance,Home,Independent Lab,Inpatient Hospital,Office,Other,Outpatient Hospital,Urgent Care
0,210,0,0,2,0,4,0,0,2
1,3197,0,0,0,0,3,0,0,2
2,3889,0,0,4,0,4,0,0,5
3,4187,0,0,0,0,4,0,0,0
4,9063,0,0,0,0,4,0,0,0
...,...,...,...,...,...,...,...,...,...
76033,99995554,0,0,7,0,28,0,0,0
76034,99996214,0,0,0,0,1,0,0,0
76035,99997485,0,0,0,0,1,0,0,0
76036,99997895,0,0,7,0,7,0,0,0


In [10]:
# One-hot encoding of PrimaryConditionGroup
claims_Y1_primary_cond = one_hot_encode(claims_Y1_df, 'PrimaryConditionGroup')
claims_Y2_primary_cond = one_hot_encode(claims_Y2_df, 'PrimaryConditionGroup')
claims_Y3_primary_cond = one_hot_encode(claims_Y3_df, 'PrimaryConditionGroup')
claims_Y1_primary_cond

Unnamed: 0,MemberID,AMI,APPCHOL,ARTHSPIN,CANCRA,CANCRB,CANCRM,CATAST,CHF,COPD,...,RENAL2,RENAL3,RESPR4,ROAMI,SEIZURE,SEPSIS,SKNAUT,STROKE,TRAUMA,UTI
0,210,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3197,0,0,0,0,0,0,0,0,0,...,0,0,4,0,0,0,0,0,0,0
2,3889,0,0,0,0,0,0,0,0,0,...,0,0,0,0,4,0,0,2,0,0
3,4187,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9063,0,0,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76033,99995554,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,29,0,0,0
76034,99996214,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
76035,99997485,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
76036,99997895,0,0,5,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [11]:
# One-hot encoding of CharlsonIndex
claims_Y1_CharlsonIndex = one_hot_encode(claims_Y1_df, 'CharlsonIndex')
claims_Y2_CharlsonIndex = one_hot_encode(claims_Y2_df, 'CharlsonIndex')
claims_Y3_CharlsonIndex = one_hot_encode(claims_Y3_df, 'CharlsonIndex')
claims_Y1_CharlsonIndex

Unnamed: 0,MemberID,0,1-2,3-4,5+
0,210,8,0,0,0
1,3197,5,0,0,0
2,3889,9,4,0,0
3,4187,4,0,0,0
4,9063,4,0,0,0
...,...,...,...,...,...
76033,99995554,35,0,0,0
76034,99996214,1,0,0,0
76035,99997485,1,0,0,0
76036,99997895,14,0,0,0


In [12]:
# One-hot encoding of ProcedureGroup
claims_Y1_ProcedureGroup = one_hot_encode(claims_Y1_df, 'ProcedureGroup')
claims_Y2_ProcedureGroup = one_hot_encode(claims_Y2_df, 'ProcedureGroup')
claims_Y3_ProcedureGroup = one_hot_encode(claims_Y3_df, 'ProcedureGroup')
claims_Y1_ProcedureGroup

Unnamed: 0,MemberID,ANES,EM,MED,PL,RAD,SAS,SCS,SDS,SEOA,SGS,SIS,SMCD,SMS,SNS,SO,SRS,SUS
0,210,0,1,4,1,0,0,1,1,0,0,0,0,0,0,0,0,0
1,3197,0,4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3889,0,5,0,4,1,0,1,2,0,0,0,0,0,0,0,0,0
3,4187,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,9063,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76033,99995554,0,4,24,3,0,0,4,0,0,0,0,0,0,0,0,0,0
76034,99996214,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
76035,99997485,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
76036,99997895,0,6,0,4,1,0,3,0,0,0,0,0,0,0,0,0,0


In [13]:
# One-hot encoding of DSFS
claims_Y1_DSFS = one_hot_encode(claims_Y1_df, 'DSFS')
claims_Y2_DSFS = one_hot_encode(claims_Y2_df, 'DSFS')
claims_Y3_DSFS = one_hot_encode(claims_Y3_df, 'DSFS')
claims_Y1_DSFS

Unnamed: 0,MemberID,0- 1 month,1- 2 months,10-11 months,11-12 months,2- 3 months,3- 4 months,4- 5 months,5- 6 months,6- 7 months,7- 8 months,8- 9 months,9-10 months
0,210,3,2,0,0,0,1,0,0,1,0,0,1
1,3197,1,1,0,2,0,0,0,1,0,0,0,0
2,3889,12,1,0,0,0,0,0,0,0,0,0,0
3,4187,1,1,0,0,1,0,0,0,0,0,1,0
4,9063,3,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
76033,99995554,1,0,0,0,2,0,1,6,7,7,4,7
76034,99996214,1,0,0,0,0,0,0,0,0,0,0,0
76035,99997485,1,0,0,0,0,0,0,0,0,0,0,0
76036,99997895,2,1,0,0,1,2,2,0,0,0,3,1


## Part e: Summing Numeric Feature: `PayDelay`

In [14]:
# Function to sum numeric feature, replace '162+' with 162, and convert to integer
def sum_numeric_feature(df, feature):
    df[feature] = df[feature].replace('162+', '162').astype(int)
    numeric_sum = df.groupby('MemberID')[feature].sum().reset_index()
    return numeric_sum

# Sum PayDelay for each member
claims_Y1_paydelay = sum_numeric_feature(claims_Y1_df, 'PayDelay')
claims_Y2_paydelay = sum_numeric_feature(claims_Y2_df, 'PayDelay')
claims_Y3_paydelay = sum_numeric_feature(claims_Y3_df, 'PayDelay')
claims_Y1_paydelay

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[feature] = df[feature].replace('162+', '162').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[feature] = df[feature].replace('162+', '162').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[feature] = df[feature].replace('162+', '162').astype(int)


Unnamed: 0,MemberID,PayDelay
0,210,720
1,3197,492
2,3889,919
3,4187,340
4,9063,241
...,...,...
76033,99995554,3899
76034,99996214,19
76035,99997485,130
76036,99997895,539


## Part f: Merge all Features by ID and Save in Years

In [15]:
# Merge function, used to merge each feature by year
def merge_data_by_year(claims_count, providers, vendors, pcps, paydelay, specialty, placesvc, 
                       primary_cond, charlson_index, procedure_group, dsfs):
    from functools import reduce
    data_frames = [claims_count, providers, vendors, pcps, paydelay, specialty, placesvc,
                   primary_cond, charlson_index, procedure_group, dsfs]
    merged_data = reduce(lambda left, right: pd.merge(left, right, on='MemberID', how='outer'), data_frames)
    return merged_data

# merge data by years
merged_Y1 = merge_data_by_year(claims_count_Y1, claims_Y1_providers, claims_Y1_vendors, claims_Y1_pcps, 
                               claims_Y1_paydelay, claims_Y1_specialty, claims_Y1_placesvc, claims_Y1_primary_cond, 
                               claims_Y1_CharlsonIndex, claims_Y1_ProcedureGroup, claims_Y1_DSFS)
merged_Y2 = merge_data_by_year(claims_count_Y2, claims_Y2_providers, claims_Y2_vendors, claims_Y2_pcps, 
                               claims_Y2_paydelay, claims_Y2_specialty, claims_Y2_placesvc, claims_Y2_primary_cond, 
                               claims_Y2_CharlsonIndex, claims_Y2_ProcedureGroup, claims_Y2_DSFS)
merged_Y3 = merge_data_by_year(claims_count_Y3, claims_Y3_providers, claims_Y3_vendors, claims_Y3_pcps, 
                               claims_Y3_paydelay, claims_Y3_specialty, claims_Y3_placesvc, claims_Y3_primary_cond, 
                               claims_Y3_CharlsonIndex, claims_Y3_ProcedureGroup, claims_Y3_DSFS)

In [16]:
merged_Y1

Unnamed: 0,MemberID,ClaimsCount,Unique_ProviderID,Unique_Vendor,Unique_PCP,PayDelay,Anesthesiology,Diagnostic Imaging,Emergency,General Practice,...,10-11 months,11-12 months,2- 3 months,3- 4 months,4- 5 months,5- 6 months,6- 7 months,7- 8 months,8- 9 months,9-10 months
0,210,8,4,4,2,720,0,0,2,0,...,0,0,0,1,0,0,1,0,0,1
1,3197,5,3,3,1,492,0,0,2,0,...,0,2,0,0,0,1,0,0,0,0
2,3889,13,7,7,1,919,0,1,4,0,...,0,0,0,0,0,0,0,0,0,0
3,4187,4,3,3,1,340,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0
4,9063,4,2,2,1,241,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76033,99995554,35,3,3,1,3899,0,0,0,2,...,0,0,2,0,1,6,7,7,4,7
76034,99996214,1,1,1,1,19,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
76035,99997485,1,1,1,1,130,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
76036,99997895,14,5,4,1,539,0,1,0,0,...,0,0,1,2,2,0,0,0,3,1


In [17]:
# Save the merged data for each year as a CSV file
merged_Y1.to_csv('../data/processed/processed_claim_Y1.csv', index=False)
merged_Y2.to_csv('../data/processed/processed_claim_Y2.csv', index=False)
merged_Y3.to_csv('../data/processed/processed_claim_Y3.csv', index=False)

In [18]:
# display the data after processing
new_members_df = pd.read_csv('../data/processed/processed_claim_Y1.csv')
new_members_df

Unnamed: 0,MemberID,ClaimsCount,Unique_ProviderID,Unique_Vendor,Unique_PCP,PayDelay,Anesthesiology,Diagnostic Imaging,Emergency,General Practice,...,10-11 months,11-12 months,2- 3 months,3- 4 months,4- 5 months,5- 6 months,6- 7 months,7- 8 months,8- 9 months,9-10 months
0,210,8,4,4,2,720,0,0,2,0,...,0,0,0,1,0,0,1,0,0,1
1,3197,5,3,3,1,492,0,0,2,0,...,0,2,0,0,0,1,0,0,0,0
2,3889,13,7,7,1,919,0,1,4,0,...,0,0,0,0,0,0,0,0,0,0
3,4187,4,3,3,1,340,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0
4,9063,4,2,2,1,241,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76033,99995554,35,3,3,1,3899,0,0,0,2,...,0,0,2,0,1,6,7,7,4,7
76034,99996214,1,1,1,1,19,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
76035,99997485,1,1,1,1,130,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
76036,99997895,14,5,4,1,539,0,1,0,0,...,0,0,1,2,2,0,0,0,3,1
