## Importing all the libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
from pylab import rcParams

In [2]:
pd.options.display.max_columns = 100 # show up to 100 columns before pandas truncates a wide frame

## Pre-processing Data

In [3]:
# Loading all the Test datasets
# DATA_DIR is the single place to edit when the CSV files live somewhere else;
# its default is the directory the notebook originally read from.
DATA_DIR = '/Users/suborna/Github/Capstone_Project/Data'

Test_Beneficiary = pd.read_csv(f'{DATA_DIR}/Test_Beneficiary.csv')
Test_Inpatient = pd.read_csv(f'{DATA_DIR}/Test_Inpatient.csv')
Test_Outpatient = pd.read_csv(f'{DATA_DIR}/Test_Outpatient.csv')
Fraud_label = pd.read_csv(f'{DATA_DIR}/Test.csv')

In [4]:
# Remove columns with high missingness or no relevance to the analysis
Test_Beneficiary = Test_Beneficiary.drop(columns = ['DOD'])
# ClmProcedureCode_6 is dropped from both claim tables because 100% of its values are NA
Test_Inpatient = Test_Inpatient.drop(columns = ['ClmProcedureCode_6'])
Test_Outpatient = Test_Outpatient.drop(columns = ['ClmProcedureCode_6'])

In [5]:
Test_Beneficiary.isna().sum() # number of missing values per beneficiary column

BeneID                             0
DOB                                0
Gender                             0
Race                               0
RenalDiseaseIndicator              0
State                              0
County                             0
NoOfMonths_PartACov                0
NoOfMonths_PartBCov                0
ChronicCond_Alzheimer              0
ChronicCond_Heartfailure           0
ChronicCond_KidneyDisease          0
ChronicCond_Cancer                 0
ChronicCond_ObstrPulmonary         0
ChronicCond_Depression             0
ChronicCond_Diabetes               0
ChronicCond_IschemicHeart          0
ChronicCond_Osteoporasis           0
ChronicCond_rheumatoidarthritis    0
ChronicCond_stroke                 0
IPAnnualReimbursementAmt           0
IPAnnualDeductibleAmt              0
OPAnnualReimbursementAmt           0
OPAnnualDeductibleAmt              0
dtype: int64

In [6]:
Test_Inpatient.isna().sum() # number of missing values per inpatient-claim column

BeneID                       0
ClaimID                      0
ClaimStartDt                 0
ClaimEndDt                   0
Provider                     0
InscClaimAmtReimbursed       0
AttendingPhysician          31
OperatingPhysician        3962
OtherPhysician            8538
AdmissionDt                  0
ClmAdmitDiagnosisCode        0
DeductibleAmtPaid          196
DischargeDt                  0
DiagnosisGroupCode           0
ClmDiagnosisCode_1           0
ClmDiagnosisCode_2          54
ClmDiagnosisCode_3         169
ClmDiagnosisCode_4         404
ClmDiagnosisCode_5         719
ClmDiagnosisCode_6        1197
ClmDiagnosisCode_7        1736
ClmDiagnosisCode_8        2360
ClmDiagnosisCode_9        3238
ClmDiagnosisCode_10       8664
ClmProcedureCode_1        4118
ClmProcedureCode_2        8297
ClmProcedureCode_3        9328
ClmProcedureCode_4        9522
ClmProcedureCode_5        9549
dtype: int64

In [7]:
Test_Outpatient.isna().sum() # number of missing values per outpatient-claim column

BeneID                         0
ClaimID                        0
ClaimStartDt                   0
ClaimEndDt                     0
Provider                       0
InscClaimAmtReimbursed         0
AttendingPhysician           316
OperatingPhysician        104237
OtherPhysician             78222
ClmDiagnosisCode_1          2578
ClmDiagnosisCode_2         47731
ClmDiagnosisCode_3         76575
ClmDiagnosisCode_4         95371
ClmDiagnosisCode_5        107875
ClmDiagnosisCode_6        114035
ClmDiagnosisCode_7        117871
ClmDiagnosisCode_8        120310
ClmDiagnosisCode_9        122278
ClmDiagnosisCode_10       125578
ClmProcedureCode_1        125807
ClmProcedureCode_2        125832
ClmProcedureCode_3        125839
ClmProcedureCode_4        125841
ClmProcedureCode_5        125841
DeductibleAmtPaid              0
ClmAdmitDiagnosisCode     100036
dtype: int64

In [8]:
# Fill 0 where a diagnosis / procedure code (or the leading amount / admit-diagnosis column) is missing.
# The code columns are shared by both claim tables, so they are listed once.
code_columns = ['ClmDiagnosisCode_1', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3',
                'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
                'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9',
                'ClmDiagnosisCode_10', 'ClmProcedureCode_1', 'ClmProcedureCode_2',
                'ClmProcedureCode_3', 'ClmProcedureCode_4', 'ClmProcedureCode_5']

inpatient_zero_fill = ['DeductibleAmtPaid'] + code_columns
outpatient_zero_fill = ['ClmAdmitDiagnosisCode'] + code_columns

Test_Inpatient[inpatient_zero_fill] = Test_Inpatient[inpatient_zero_fill].fillna(0)
Test_Outpatient[outpatient_zero_fill] = Test_Outpatient[outpatient_zero_fill].fillna(0)

# A missing physician is recorded as the string 'None' in both claim tables
physician_columns = ['AttendingPhysician', 'OperatingPhysician', 'OtherPhysician']

Test_Inpatient[physician_columns] = Test_Inpatient[physician_columns].fillna('None')
Test_Outpatient[physician_columns] = Test_Outpatient[physician_columns].fillna('None')

In [10]:
Test_Inpatient.isna().sum() # confirming that all missing values have been dealt with

BeneID                    0
ClaimID                   0
ClaimStartDt              0
ClaimEndDt                0
Provider                  0
InscClaimAmtReimbursed    0
AttendingPhysician        0
OperatingPhysician        0
OtherPhysician            0
ClmDiagnosisCode_1        0
ClmDiagnosisCode_2        0
ClmDiagnosisCode_3        0
ClmDiagnosisCode_4        0
ClmDiagnosisCode_5        0
ClmDiagnosisCode_6        0
ClmDiagnosisCode_7        0
ClmDiagnosisCode_8        0
ClmDiagnosisCode_9        0
ClmDiagnosisCode_10       0
ClmProcedureCode_1        0
ClmProcedureCode_2        0
ClmProcedureCode_3        0
ClmProcedureCode_4        0
ClmProcedureCode_5        0
DeductibleAmtPaid         0
ClmAdmitDiagnosisCode     0
dtype: int64

In [11]:
Test_Outpatient.isna().sum() # confirming that all missing values have been dealt with

BeneID                    0
ClaimID                   0
ClaimStartDt              0
ClaimEndDt                0
Provider                  0
InscClaimAmtReimbursed    0
AttendingPhysician        0
OperatingPhysician        0
OtherPhysician            0
ClmDiagnosisCode_1        0
ClmDiagnosisCode_2        0
ClmDiagnosisCode_3        0
ClmDiagnosisCode_4        0
ClmDiagnosisCode_5        0
ClmDiagnosisCode_6        0
ClmDiagnosisCode_7        0
ClmDiagnosisCode_8        0
ClmDiagnosisCode_9        0
ClmDiagnosisCode_10       0
ClmProcedureCode_1        0
ClmProcedureCode_2        0
ClmProcedureCode_3        0
ClmProcedureCode_4        0
ClmProcedureCode_5        0
DeductibleAmtPaid         0
ClmAdmitDiagnosisCode     0
dtype: int64

In [12]:
# Join each claim table to the beneficiary table on BeneID, then to Fraud_label on Provider.
# Both joins are inner joins, so rows without a match on the key are dropped.
Merged_Inpatient = (
    Test_Beneficiary
    .merge(Test_Inpatient, how = 'inner', on = 'BeneID')
    .merge(Fraud_label, how = 'inner', on = 'Provider')
)

Merged_Outpatient = (
    Test_Beneficiary
    .merge(Test_Outpatient, how = 'inner', on = 'BeneID')
    .merge(Fraud_label, how = 'inner', on = 'Provider')
)

In [14]:
# Columns in which the value 2 is recoded to 0 (so they end up holding 0/1)
replace_two = ['Gender', 'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
               'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression', 'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
               'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke']

# Apply the same recoding to both claim frames
for frame in (Merged_Inpatient, Merged_Outpatient):
    # RenalDiseaseIndicator: 'Y' (yes) becomes 1
    frame['RenalDiseaseIndicator'] = frame['RenalDiseaseIndicator'].replace('Y', 1)
    # Gender and chronic-condition columns: 2 becomes 0
    frame[replace_two] = frame[replace_two].replace(2, 0)

In [15]:
# Age with respect to 2009, using only the birth year (the text before the first '-' in DOB)
Merged_Inpatient['Age'] = 2009 - Merged_Inpatient['DOB'].str.split('-').str[0].astype('int')
Merged_Outpatient['Age'] = 2009 - Merged_Outpatient['DOB'].str.split('-').str[0].astype('int')

# DOB is no longer needed once Age has been derived from it
Merged_Inpatient = Merged_Inpatient.drop(columns = 'DOB')
Merged_Outpatient = Merged_Outpatient.drop(columns = 'DOB')

In [16]:
# Print the distinct values of every recoded column to confirm the recoding:
# inpatient frame first, then a row of asterisks, then the outpatient frame
for position, frame in enumerate((Merged_Inpatient, Merged_Outpatient)):
    if position > 0:
        print(' * ' * 20)
    for column in replace_two:
        print(column, frame[column].unique())

Gender [0 1]
ChronicCond_Alzheimer [0 1]
ChronicCond_Heartfailure [1 0]
ChronicCond_KidneyDisease [1 0]
ChronicCond_Cancer [0 1]
ChronicCond_ObstrPulmonary [1 0]
ChronicCond_Depression [1 0]
ChronicCond_Diabetes [0 1]
ChronicCond_IschemicHeart [1 0]
ChronicCond_Osteoporasis [0 1]
ChronicCond_rheumatoidarthritis [0 1]
ChronicCond_stroke [0 1]
 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 
Gender [1 0]
ChronicCond_Alzheimer [1 0]
ChronicCond_Heartfailure [0 1]
ChronicCond_KidneyDisease [1 0]
ChronicCond_Cancer [0 1]
ChronicCond_ObstrPulmonary [0 1]
ChronicCond_Depression [1 0]
ChronicCond_Diabetes [1 0]
ChronicCond_IschemicHeart [1 0]
ChronicCond_Osteoporasis [0 1]
ChronicCond_rheumatoidarthritis [1 0]
ChronicCond_stroke [1 0]


In [17]:
# Parse the date strings into datetimes (DOB is not parsed: it was dropped after Age was derived)
for date_column in ['ClaimStartDt', 'ClaimEndDt', 'AdmissionDt', 'DischargeDt']:
    Merged_Inpatient[date_column] = pd.to_datetime(Merged_Inpatient[date_column])

# For outpatient claims only the claim start / end dates are converted
for date_column in ['ClaimStartDt', 'ClaimEndDt']:
    Merged_Outpatient[date_column] = pd.to_datetime(Merged_Outpatient[date_column])

In [18]:
# Tag every row with its claim type so inpatient and outpatient claims can still be filtered after stacking
Merged_Inpatient['Type'] = 'Inpatient'
Merged_Outpatient['Type'] = 'Outpatient'

# Stack the two claim frames into one data frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each input keeps its own
# row labels, so the combined frame would contain duplicate index values and label-based
# selection or index-aligned assignment on it would be ambiguous.
Merged_InOut = pd.concat([Merged_Inpatient, Merged_Outpatient], axis = 0, ignore_index = True)
# Cast the indicator to int64 (it is summed per provider during feature engineering)
Merged_InOut['RenalDiseaseIndicator'] = Merged_InOut['RenalDiseaseIndicator'].astype('int64')

In [19]:
# Whole days between admission and discharge, and between claim start and claim end, for every claim row
Merged_InOut['Hospital_Stay'] = Merged_InOut['DischargeDt'].sub(Merged_InOut['AdmissionDt']).dt.days
Merged_InOut['Claim_Length'] = Merged_InOut['ClaimEndDt'].sub(Merged_InOut['ClaimStartDt']).dt.days

In [20]:
# defining a function to create age group label on the age column
def age_group(x):
    """Return the age-band label for age ``x``.

    Bands are checked in ascending order of their upper bound (inclusive).
    The first band also receives every age below 26 and the last band every
    age above 80, including ages over 100 and values that fail all comparisons.
    """
    upper_bounds = ((40, '26-40'), (60, '41-60'), (80, '61-80'))
    for upper, label in upper_bounds:
        if x <= upper:
            return label
    return '81-100'
    
# Label every row with its age band by mapping age_group over the Age column
Merged_InOut['Age_Group'] = Merged_InOut['Age'].map(age_group)

In [21]:
# The raw date columns are not needed for the analysis once Hospital_Stay and Claim_Length exist
Merged_InOut = Merged_InOut.drop(columns = ['ClaimStartDt', 'ClaimEndDt', 'AdmissionDt', 'DischargeDt'])

In [22]:
# Replace every remaining missing value in the merged frame with 0
Merged_InOut = Merged_InOut.fillna(value = 0)

In [23]:
# Total payment per claim = amount reimbursed by insurance + deductible amount paid
Merged_InOut['Total_Payment'] = Merged_InOut['InscClaimAmtReimbursed'].add(Merged_InOut['DeductibleAmtPaid'])

In [24]:
# Per-column count of missing values in the merged frame
Merged_InOut.isna().sum()

BeneID                             0
Gender                             0
Race                               0
RenalDiseaseIndicator              0
State                              0
County                             0
NoOfMonths_PartACov                0
NoOfMonths_PartBCov                0
ChronicCond_Alzheimer              0
ChronicCond_Heartfailure           0
ChronicCond_KidneyDisease          0
ChronicCond_Cancer                 0
ChronicCond_ObstrPulmonary         0
ChronicCond_Depression             0
ChronicCond_Diabetes               0
ChronicCond_IschemicHeart          0
ChronicCond_Osteoporasis           0
ChronicCond_rheumatoidarthritis    0
ChronicCond_stroke                 0
IPAnnualReimbursementAmt           0
IPAnnualDeductibleAmt              0
OPAnnualReimbursementAmt           0
OPAnnualDeductibleAmt              0
ClaimID                            0
Provider                           0
InscClaimAmtReimbursed             0
AttendingPhysician                 0
O

In [25]:
# Checking the data type of each column in the merged claim-level frame
Merged_InOut.dtypes

BeneID                              object
Gender                               int64
Race                                 int64
RenalDiseaseIndicator                int64
State                                int64
County                               int64
NoOfMonths_PartACov                  int64
NoOfMonths_PartBCov                  int64
ChronicCond_Alzheimer                int64
ChronicCond_Heartfailure             int64
ChronicCond_KidneyDisease            int64
ChronicCond_Cancer                   int64
ChronicCond_ObstrPulmonary           int64
ChronicCond_Depression               int64
ChronicCond_Diabetes                 int64
ChronicCond_IschemicHeart            int64
ChronicCond_Osteoporasis             int64
ChronicCond_rheumatoidarthritis      int64
ChronicCond_stroke                   int64
IPAnnualReimbursementAmt             int64
IPAnnualDeductibleAmt                int64
OPAnnualReimbursementAmt             int64
OPAnnualDeductibleAmt                int64
ClaimID    

## Feature Engineering

In [40]:
# One row per distinct provider; all provider-level features are joined onto this frame
Features_Engineered = pd.DataFrame({'Provider': Merged_InOut['Provider'].unique()})

In [35]:
Features_Engineered.shape # (number of rows, number of columns) of the provider-level frame

(1353, 1)

In [36]:
Features_Engineered.sample(n = 5) # 5 randomly chosen rows of the dataframe

Unnamed: 0,Provider
1110,PRV55282
183,PRV56203
1348,PRV56372
334,PRV55015
164,PRV56429


In [41]:
# Provider-level feature engineering.
# Every feature is computed per Provider from the claim-level Merged_InOut frame and then
# left-joined onto the provider list, so a provider with no rows for a feature gets NaN there.


def _distinct_per_provider(claims, column, feature_name):
    """Return a (Provider, feature_name) frame: number of distinct `column` values per provider in `claims`."""
    return claims.groupby('Provider')[column].nunique().rename(feature_name).reset_index()


def _total_per_provider(claims, column, feature_name):
    """Return a (Provider, feature_name) frame: sum of `column` over each provider's rows in `claims`."""
    return claims.groupby('Provider')[column].agg('sum').rename(feature_name).reset_index()


# New Columns: Male & Female Patient Count (distinct BeneID among rows with Gender == 1 / Gender == 0)
Male_Patient = _distinct_per_provider(Merged_InOut.loc[Merged_InOut.Gender == 1], 'BeneID', 'Male_Patient')
Female_Patient = _distinct_per_provider(Merged_InOut.loc[Merged_InOut.Gender == 0], 'BeneID', 'Female_Patient')

# New Columns: distinct patients in each Age_Group
Age_26 = _distinct_per_provider(Merged_InOut.loc[Merged_InOut.Age_Group == '26-40'], 'BeneID', 'Age(26-40)')
Age_41 = _distinct_per_provider(Merged_InOut.loc[Merged_InOut.Age_Group == '41-60'], 'BeneID', 'Age(41-60)')
Age_61 = _distinct_per_provider(Merged_InOut.loc[Merged_InOut.Age_Group == '61-80'], 'BeneID', 'Age(61-80)')
Age_81 = _distinct_per_provider(Merged_InOut.loc[Merged_InOut.Age_Group == '81-100'], 'BeneID', 'Age(81-100)')

# New Columns: distinct patients for Race codes 1, 2, 3 and 5
Race_White = _distinct_per_provider(Merged_InOut.loc[Merged_InOut.Race == 1], 'BeneID', 'Race_White')
Race_Black = _distinct_per_provider(Merged_InOut.loc[Merged_InOut.Race == 2], 'BeneID', 'Race_Black')
Race_Hispanic = _distinct_per_provider(Merged_InOut.loc[Merged_InOut.Race == 3], 'BeneID', 'Race_Hispanic')
Race_Native = _distinct_per_provider(Merged_InOut.loc[Merged_InOut.Race == 5], 'BeneID', 'Race_Native')

# New Columns: totals over all of a provider's claim rows
Hos_Stay = _total_per_provider(Merged_InOut, 'Hospital_Stay', 'Hos_Stay_Total')
Payment = _total_per_provider(Merged_InOut, 'Total_Payment', 'Payment_Total')
claim_duration = _total_per_provider(Merged_InOut, 'Claim_Length', 'Claim_Len_Total')

# New Columns: chronic-condition and renal-disease indicators.
# NOTE: these are sums of the indicator over every claim row of the provider, so a patient with
# several claims contributes once per claim (unlike the distinct-patient counts above).
Alzheimer = _total_per_provider(Merged_InOut, 'ChronicCond_Alzheimer', 'Alzheimer')
Heartfailure = _total_per_provider(Merged_InOut, 'ChronicCond_Heartfailure', 'Heartfailure')
KidneyDisease = _total_per_provider(Merged_InOut, 'ChronicCond_KidneyDisease', 'KidneyDisease')
Cancer = _total_per_provider(Merged_InOut, 'ChronicCond_Cancer', 'Cancer')
ObstrPulmonary = _total_per_provider(Merged_InOut, 'ChronicCond_ObstrPulmonary', 'ObstrPulmonary')
Depression = _total_per_provider(Merged_InOut, 'ChronicCond_Depression', 'Depression')
Diabetes = _total_per_provider(Merged_InOut, 'ChronicCond_Diabetes', 'Diabetes')
IschemicHeart = _total_per_provider(Merged_InOut, 'ChronicCond_IschemicHeart', 'IschemicHeart')
Osteoporasis = _total_per_provider(Merged_InOut, 'ChronicCond_Osteoporasis', 'Osteoporasis')
rheumatoidarthritis = _total_per_provider(Merged_InOut, 'ChronicCond_rheumatoidarthritis', 'rheumatoidarthritis')
stroke = _total_per_provider(Merged_InOut, 'ChronicCond_stroke', 'stroke')
renal = _total_per_provider(Merged_InOut, 'RenalDiseaseIndicator', 'RenalDisease')

# New Column: number of distinct states per provider
State_Count = _distinct_per_provider(Merged_InOut, 'State', 'State_Count')

# New Column: number of counties per provider, counted as distinct County values within each
# (Provider, State) pair and then added up across the provider's states
counties_per_state = Merged_InOut.groupby(['Provider', 'State'])['County'].nunique().rename('County_Count').reset_index()
County_Count = _total_per_provider(counties_per_state, 'County_Count', 'County_Count')

# New Column: number of distinct diagnosis group codes per provider, ignoring rows where the code is 0
DiagGroupCode = _distinct_per_provider(Merged_InOut.loc[Merged_InOut.DiagnosisGroupCode != 0],
                                       'DiagnosisGroupCode', 'DiagGroupCode')

# Adding Columns to New Dataframe, in this order
feature_frames = [
    Male_Patient, Female_Patient,
    Age_26, Age_41, Age_61, Age_81,
    Race_White, Race_Black, Race_Hispanic, Race_Native,
    Hos_Stay, Payment, claim_duration,
    Alzheimer, Heartfailure, KidneyDisease, Cancer, ObstrPulmonary, Depression,
    Diabetes, IschemicHeart, Osteoporasis, rheumatoidarthritis, stroke, renal,
    State_Count, County_Count, DiagGroupCode,
]

# Start from the bare provider list so that re-running this cell rebuilds the features instead of
# merging a second copy of every column (which pandas would rename with _x / _y suffixes).
provider_features = Features_Engineered[['Provider']]
for feature_frame in feature_frames:
    provider_features = provider_features.merge(right = feature_frame, on = 'Provider', how = 'left')
Features_Engineered = provider_features

In [47]:
Features_Engineered.sample(n = 5) # 5 randomly chosen rows

Unnamed: 0,Provider,Male_Patient,Female_Patient,Age(26-40),Age(41-60),Age(61-80),Age(81-100),Race_White,Race_Black,Race_Hispanic,Race_Native,Hos_Stay_Total,Payment_Total,Claim_Len_Total,Alzheimer,Heartfailure,KidneyDisease,Cancer,ObstrPulmonary,Depression,Diabetes,IschemicHeart,Osteoporasis,rheumatoidarthritis,stroke,RenalDisease,State_Count,County_Count,DiagGroupCode
1121,PRV54190,8.0,15.0,1.0,2.0,10.0,10.0,22.0,1.0,0.0,0.0,0.0,8750.0,31,9,11,14,2,5,8,17,20,6,10,2,5,1,6,0.0
370,PRV52447,4.0,8.0,0.0,0.0,7.0,5.0,8.0,4.0,0.0,0.0,76.0,115816.0,76,9,7,6,0,6,9,9,10,3,4,2,4,2,6,11.0
1085,PRV54485,7.0,8.0,0.0,3.0,8.0,4.0,15.0,0.0,0.0,0.0,0.0,10910.0,20,16,15,11,1,5,11,14,29,10,3,3,3,1,3,0.0
1251,PRV51714,5.0,5.0,1.0,0.0,6.0,3.0,8.0,1.0,0.0,1.0,0.0,2220.0,14,2,5,3,1,5,2,6,8,2,3,0,2,1,1,0.0
590,PRV56943,19.0,29.0,1.0,7.0,31.0,9.0,38.0,7.0,2.0,1.0,0.0,14110.0,137,23,26,19,4,19,17,34,34,16,13,2,6,1,1,0.0


In [45]:
# Replace every missing feature value with 0, then confirm nothing is left missing
Features_Engineered = Features_Engineered.fillna(value = 0)
Features_Engineered.isna().sum().sum() # total count of missing values across the whole frame

0

In [46]:
# Exporting the dataframe as csv 
# Features_Engineered.to_csv('/Users/suborna/Github/Capstone_Project/Data/Test_New_Features.csv', index = False)