In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
# Raise the column limit that DataFrame.info() consults to 200.
pd.options.display.max_info_columns = 200

In [2]:
# Parent of the current working directory; every data file below is addressed
# as gen_path / 'data' / <file name>.
gen_path = Path.cwd().parent

In [3]:
# Load the raw encounter table, then show its dimensions and column names.
diabetes = pd.read_csv(gen_path.joinpath('data', 'diabetic_data.csv'))

print(diabetes.shape)
print(diabetes.columns)

(101766, 50)
Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')


In [4]:

# Running list of columns to remove from `diabetes`; later cells append to it.
# These first four were ruled out in exploratory work (separate ipynb files).
drop = [
    'race',
    'weight',
    'medical_specialty',
    'max_glu_serum',
]

In [5]:
# Turn every '?' placeholder into a proper missing value.
diabetes = diabetes.replace('?', pd.NA)

In [6]:
# Number of missing values per column, restricted to columns that have any.
# ('?' placeholders were already converted to missing values.)
diabetes.isnull().sum().loc[lambda counts: counts > 0]

race                  2273
weight               98569
payer_code           40256
medical_specialty    49949
diag_1                  21
diag_2                 358
diag_3                1423
max_glu_serum        96420
A1Cresult            84748
dtype: int64

In [7]:
# Same columns as the count above, expressed as a percentage of all rows.
diabetes.isnull().sum().loc[lambda counts: counts > 0].div(diabetes.shape[0]).mul(100)

race                  2.233555
weight               96.858479
payer_code           39.557416
medical_specialty    49.082208
diag_1                0.020636
diag_2                0.351787
diag_3                1.398306
max_glu_serum        94.746772
A1Cresult            83.277322
dtype: float64

In [8]:
# 1 when the payer code is 'MP' or 'DM', 0 for any other value
# (a missing payer code also yields 0).
diabetes['MP_DM_payer_ind'] = diabetes['payer_code'].isin(['MP', 'DM']).astype(int)

In [9]:
# Queue the two sparsely populated columns for removal.
drop += ['payer_code', 'A1Cresult']

In [10]:

# Lookup: admission_type_id code -> text label.
admission_type_id = {
    1: 'Emergency',
    2: 'Urgent',
    3: 'Elective',
    4: 'Newborn',
    5: 'Not Available',
    6: 'NULL',
    7: 'Trauma Center',
    8: 'Not Mapped',
}

In [11]:

# Lookup: discharge_disposition_id code -> text label.
discharge_disposition_id = {
    1: 'Discharged to home',
    2: 'Discharged/transferred to another short term hospital',
    3: 'Discharged/transferred to SNF',
    4: 'Discharged/transferred to ICF',
    5: 'Discharged/transferred to another type of inpatient care institution',
    6: 'Discharged/transferred to home with home health service',
    7: 'Left AMA',
    8: 'Discharged/transferred to home under care of Home IV provider',
    9: 'Admitted as an inpatient to this hospital',
    10: 'Neonate discharged to another hospital for neonatal aftercare',
    11: 'Expired',
    12: 'Still patient or expected to return for outpatient services',
    13: 'Hospice / home',
    14: 'Hospice / medical facility',
    15: 'Discharged/transferred within this institution to Medicare approved swing bed',
    16: 'Discharged/transferred/referred another institution for outpatient services',
    17: 'Discharged/transferred/referred to this institution for outpatient services',
    18: 'NULL',
    19: 'Expired at home. Medicaid only, hospice',
    20: 'Expired in a medical facility. Medicaid only, hospice',
    21: 'Expired, place unknown. Medicaid only, hospice',
    22: 'Discharged/transferred to another rehab fac including rehab units of a hospital',
    23: 'Discharged/transferred to a long term care hospital',
    24: 'Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare',
    25: 'Not Mapped',
    26: 'Unknown/Invalid',
    30: 'Discharged/transferred to another Type of Health Care Institution not Defined Elsewhere',
    27: 'Discharged/transferred to a federal health care facility',
    28: 'Discharged/transferred/referred to a psychiatric hospital of psychiatric distinct part unit of a hospital',
    29: 'Discharged/transferred to a Critical Access Hospital (CAH)',
}

In [12]:

# Lookup: admission_source_id code -> text label (there is no entry for code 16).
admission_source_id = {
    1: 'Physician Referral',
    2: 'Clinic Referral',
    3: 'HMO Referral',
    4: 'Transfer from a hospital',
    5: 'Transfer from a Skilled Nursing Facility (SNF)',
    6: 'Transfer from another health care facility',
    7: 'Emergency Room',
    8: 'Court/Law Enforcement',
    9: 'Not Available',
    10: 'Transfer from critial access hospital',
    11: 'Normal Delivery',
    12: 'Premature Delivery',
    13: 'Sick Baby',
    14: 'Extramural Birth',
    15: 'Not Available',
    17: 'NULL',
    18: 'Transfer From Another Home Health Agency',
    19: 'Readmission to Same Home Health Agency',
    20: 'Not Mapped',
    21: 'Unknown/Invalid',
    22: 'Transfer from hospital inpt/same fac reslt in a sep claim',
    23: 'Born inside this hospital',
    24: 'Born outside this hospital',
    25: 'Transfer from Ambulatory Surgery Center',
    26: 'Transfer from Hospice',
}

In [13]:

# Subset of discharge dispositions labelled as expired or hospice.
death_disposition_id = {
    11: 'Expired',
    13: 'Hospice / home',
    14: 'Hospice / medical facility',
    19: 'Expired at home. Medicaid only, hospice',
    20: 'Expired in a medical facility. Medicaid only, hospice',
    21: 'Expired, place unknown. Medicaid only, hospice',
}

In [14]:

# 1 when the encounter ended in one of the expired/hospice dispositions, else 0.
# The ids come from the keys of death_disposition_id (11, 13, 14, 19, 20, 21)
# so the lookup and this flag cannot drift apart.
diabetes['expiration_ind'] = (
    diabetes['discharge_disposition_id']
    .isin(list(death_disposition_id))
    .astype('int')
)

In [15]:

# Translate the numeric admission/discharge codes into text labels via the
# lookup dictionaries; a code with no dictionary entry maps to NaN.
diabetes['admission_type'] = diabetes['admission_type_id'].map(admission_type_id)
diabetes['discharge_disposition'] = diabetes['discharge_disposition_id'].map(discharge_disposition_id)
diabetes['admission_source'] = diabetes['admission_source_id'].map(admission_source_id)

# The lookup dictionaries are deliberately left in scope: they are tiny, and
# deleting them made this cell raise NameError when executed a second time.

# Encounter-level 0/1 flags derived from the mapped text labels.

# Admission-type groups.
diabetes['admission_grp_1_ind'] = diabetes['admission_type'].isin([
    'NULL',
    'Emergency',
]).astype(int)
diabetes['admission_grp_2_ind'] = diabetes['admission_type'].isin([
    'Elective',
    'Not Mapped',
]).astype(int)

# Discharge-disposition groups.
diabetes['discharge_grp_1_ind'] = diabetes['discharge_disposition'].isin([
    'Discharged/transferred to a long term care hospital',
    'NULL',
    'Discharged to home',
]).astype(int)

diabetes['discharge_grp_2_ind'] = diabetes['discharge_disposition'].isin([
    'Left AMA',
    'Discharged/transferred to another type of inpatient care institution',
    'Discharged/transferred to SNF',
    'Discharged/transferred to home with home health service',
    'Discharged/transferred to another rehab fac including rehab units of a hospital',
]).astype(int)

# NOTE(review): despite its name, this flag is computed from admission_source,
# not admission_type.
diabetes['admission_type_ind'] = diabetes['admission_source'].isin([
    'Clinic Referral',
    'Transfer from a hospital',
    'Transfer from another health care facility',
]).astype(int)

# Per-patient totals of each encounter-level flag, repeated on every encounter
# row belonging to that patient.
for flag_col, total_col in [
    ('admission_grp_1_ind', 'mb_admission_grp_1_ct'),
    ('admission_grp_2_ind', 'mb_admission_grp_2_ct'),
    ('discharge_grp_1_ind', 'mb_discharge_grp_1_ct'),
    ('discharge_grp_2_ind', 'mb_discharge_grp_2_ct'),
    ('admission_type_ind', 'mb_admission_type_ct'),
]:
    diabetes[total_col] = diabetes.groupby('patient_nbr')[flag_col].transform('sum')

# Queue for removal: the raw id columns, their text labels, and the
# encounter-level flags that were only needed to build the per-patient totals.
drop.extend([
    'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
    'admission_type', 'discharge_disposition', 'admission_source',
    'admission_grp_1_ind', 'admission_grp_2_ind',
    'discharge_grp_1_ind', 'discharge_grp_2_ind',
    'admission_type_ind',
])


In [16]:

# Patient-level count of distinct diagnosis codes across diag_1, diag_2, diag_3.
# Stack the three diagnosis columns into a single 'value' column, then count
# the distinct codes per patient (nunique skips missing values by default).
distinct_diag_per_patient = (
    diabetes
    .melt(id_vars=['patient_nbr'], value_vars=['diag_1', 'diag_2', 'diag_3'])
    .groupby('patient_nbr')['value']
    .nunique()
)

# Repeat the per-patient count on each encounter row. Assigning through map
# (rather than merging) makes the cell safe to re-run: a second execution
# overwrites the column instead of producing _x/_y duplicates.
diabetes['distinct_diag_count'] = diabetes['patient_nbr'].map(distinct_diag_per_patient)

del distinct_diag_per_patient

In [17]:

# Load the diagnosis-code lookup and build a {diagnosis_cd: diagnosis} dict.
diag_lookup = pd.read_csv(gen_path / 'data' / "unique_diag_df_edit.csv")
diag_dict = dict(zip(diag_lookup['diagnosis_cd'], diag_lookup['diagnosis']))
del diag_lookup

# Attach the text description for each of the three diagnosis positions.
# A code that is not a key of diag_dict maps to NaN.
for position in (1, 2, 3):
    diabetes[f'diagnosis_{position}'] = diabetes[f'diag_{position}'].map(diag_dict)

In [18]:

# For each diagnosis position, the number of encounters sharing the row's code.
for diag_col in ('diag_1', 'diag_2', 'diag_3'):
    diabetes[f'{diag_col}_freq'] = diabetes.groupby(diag_col)[diag_col].transform('count')

In [19]:
# Major ordered drivers of readmit: one 0/1 flag per (diagnosis position, code),
# named '<position>_<code>_ind'.
driver_codes = {
    'diag_1': [
        '428',  # CHF NOS
        '491',  # SIMPLE CHR BRONCHITIS
        '493',  # EXTRINSIC ASTHMA NOS
    ],
    'diag_2': [
        '403',  # MAL HY KID W CR KID I-IV
        '707',  # DECUBITUS ULCER
        '585',  # CHRONIC RENAL FAILURE
        '491',  # SIMPLE CHR BRONCHITIS
    ],
    'diag_3': [
        '403',  # MAL HY KID W CR KID I-IV
        '585',  # CHRONIC RENAL FAILURE
        '707',  # DECUBITUS ULCER
    ],
}

for diag_col, codes in driver_codes.items():
    for code in codes:
        diabetes[f'{diag_col}_{code}_ind'] = diabetes[diag_col].eq(code).astype(int)

In [20]:
# 1 when the primary diagnosis code is in the list below, else 0.
diag_1_driver_codes = [
    '403',  # MAL HY KID W CR KID I-IV
    '787',  # NAUSEA WITH VOMITING
    '404',  # MAL HY HT/KD I-IV W/O HF
    '707',  # DECUBITUS ULCER
    '572',  # ABSCESS OF LIVER
    '730',  # AC OSTEOMYELITIS-UNSPEC
    'V58',  # RADIOTHERAPY ENCOUNTER
    '537',  # ACQ PYLORIC STENOSIS
    '443',  # RAYNAUD'S SYNDROME
    '292',  # DRUG WITHDRAWAL
    '496',  # CHR AIRWAY OBSTRUCT NEC
    '585',  # CHRONIC RENAL FAILURE
    '282',  # HEREDITARY SPHEROCYTOSIS
    '799',  # ASPHYXIA
    '284',  # CONGEN APLASTIC ANEMIA
    '567',  # PERITONITIS IN INFEC DIS
    '293',  # DELIRIUM D/T OTHER COND
    '924',  # CONTUSION OF THIGH
    '340',  # MULTIPLE SCLEROSIS
    '514',  # PULM CONGEST/HYPOSTASIS
    '485',  # BRONCHOPNEUMONIA ORG NOS
    '714',  # RHEUMATOID ARTHRITIS
    '277',  # CYSTIC FIBROS W/O ILEUS
    '150',  # MAL NEO CERVICAL ESOPHAG
    '135',  # SARCOIDOSIS
    '522',  # PULPITIS
]
diabetes['diag_1_driver_ind'] = diabetes['diag_1'].isin(diag_1_driver_codes).astype(int)


In [21]:
# 1 when the second diagnosis code is in the list below, else 0.
diag_2_driver_codes = [
    '571',  # ALCOHOLIC FATTY LIVER
    '404',  # MAL HY HT/KD I-IV W/O HF
    '536',  # ACHLORHYDRIA
    '202',  # NDLR LYM UNSP XTRNDL ORG
    '396',  # MITRAL/AORTIC STENOSIS
    '304',  # OPIOID DEPENDENCE-UNSPEC
    '444',  # ABD AORTIC EMBOLISM
    '581',  # NEPHROTIC SYN, PROLIFER
    '731',  # OSTEITIS DEFORMANS NOS
    'E94',  # ADV EFF ANALEPTICS
    '397',  # TRICUSPID VALVE DISEASE
    '595',  # ACUTE CYSTITIS
    '205',  # AC MYL LEUK WO ACHV RMSN
    '490',  # BRONCHITIS NOS
    '459',  # HEMORRHAGE NOS
    '189',  # MALIG NEOPL KIDNEY
    '154',  # MAL NEO RECTOSIGMOID JCT
    '332',  # PARALYSIS AGITANS
    'V49',  # DEFICIENCIES OF LIMBS
    '681',  # CELLULITIS, FINGER NOS
    '150',  # MAL NEO CERVICAL ESOPHAG
    '537',  # ACQ PYLORIC STENOSIS
    '094',  # TABES DORSALIS
]
diabetes['diag_2_driver_ind'] = diabetes['diag_2'].isin(diag_2_driver_codes).astype(int)



In [22]:
# 1 when the third diagnosis code is in the list below, else 0.
diag_3_driver_codes = [
    '682',  # CELLULITIS OF FACE
    '070',  # HEPATITIS A WITH COMA
    '536',  # ACHLORHYDRIA
    'V42',  # KIDNEY TRANSPLANT STATUS
    '443',  # RAYNAUD'S SYNDROME
    '304',  # OPIOID DEPENDENCE-UNSPEC
    '284',  # CONGEN APLASTIC ANEMIA
    '466',  # ACUTE BRONCHITIS
    '459',  # HEMORRHAGE NOS
    '581',  # NEPHROTIC SYN, PROLIFER
    '337',  # IDIOPATH AUTO NEUROPATHY
    '583',  # PROLIFERAT NEPHRITIS NOS
    '203',  # MULT MYE W/O ACHV RMSON
    'V46',  # DEPENDENCE ON ASPIRATOR
    '482',  # K. PNEUMONIAE PNEUMONIA
    'V49',  # DEFICIENCIES OF LIMBS
    '444',  # ABD AORTIC EMBOLISM
    '174',  # MALIG NEO NIPPLE
    '456',  # ESOPHAG VARICES W BLEED
    '519',  # TRACHEOSTOMY COMP NOS
    'E92',  # ACC-POWERED LAWN MOWER
    'V62',  # UNEMPLOYMENT
    '356',  # HERED PERIPH NEUROPATHY
    '711',  # PYOGEN ARTHRITIS-UNSPEC
    '053',  # HERPES ZOSTER MENINGITIS
]
diabetes['diag_3_driver_ind'] = diabetes['diag_3'].isin(diag_3_driver_codes).astype(int)


In [23]:
# Remove every column queued in `drop` so far, then discard each encounter that
# still has a missing value in any remaining column.
# NOTE(review): the remaining columns include the mapped diagnosis_1..3
# descriptions, so an encounter whose code has no entry in diag_dict is removed
# here as well. The row count falls from 101,766 to 79,186 — confirm intended.
diabetes = diabetes.drop(columns=drop).dropna(axis=0)

In [24]:
# Display the (rows, columns) of `diabetes` at this point in the notebook.
diabetes.shape

(79186, 68)

In [25]:

# Diagnosis codes that each get their own per-patient indicator columns below.
dx_list = [
    '428', '403', '707', '585', '491', '396',
    '440', '453', '571', '284', '304', '482',
    '150', '282', '332', '443', '719', '423',
    '281', '536', '368', '515', '595', '572',
    '681', '581', '537', '490', '583', 'V46',
    '519', '300', '567', 'E92', 'V49', '094',
    '514', '494', '042', '404', '346', '792',
    '398', '753', '577', '730', '444', '459',
    '790', '337', '397', '292', 'V42', '289',
]


In [26]:

# Working copy holding the patient/encounter keys and the three diagnosis codes,
# plus one placeholder column per code in dx_list. The placeholders start as
# missing values and are filled in by the next cell.
dx_events = (
    diabetes.loc[:, ['patient_nbr', 'encounter_id', 'diag_1', 'diag_2', 'diag_3']]
    .copy(deep=True)
    .assign(**{f'dx_{dx}_ind': pd.NA for dx in dx_list})
)

print(dx_events.shape)

(79186, 59)


In [27]:

# Fill each indicator: 1 when the code appears in any of the three diagnosis
# positions of the encounter, else 0.
diag_values = dx_events[['diag_1', 'diag_2', 'diag_3']]
for dx in dx_list:
    dx_events[f'dx_{dx}_ind'] = diag_values.eq(dx).any(axis=1).astype(int)
print('done')

done


In [28]:

# Per patient, take the max and the sum of every diagnosis indicator.
agg_spec = {f'dx_{dx}_ind': ['max', 'sum'] for dx in dx_list}
dx_aggregated = dx_events.groupby('patient_nbr').agg(agg_spec)

# Flatten the two-level column index into '<indicator>_<stat>' names.
dx_aggregated.columns = ['_'.join(level_pair) for level_pair in dx_aggregated.columns]

# Move patient_nbr from the index back into an ordinary column.
dx_aggregated = dx_aggregated.reset_index()

# Keep a CSV copy of the aggregate, then release the encounter-level frame.
dx_aggregated.to_csv(gen_path / 'data' / 'dx_aggregated_ck.csv', index=False)
del dx_events

In [29]:

# Attach the per-patient diagnosis aggregates to every encounter row; the shape
# is printed before and after so the change in columns is visible.
print(diabetes.shape)
diabetes = pd.merge(diabetes, dx_aggregated, how='left', on='patient_nbr')
print(diabetes.shape)
del dx_aggregated

(79186, 68)
(79186, 176)


In [30]:

# Encounter-level keyword flags over the three diagnosis descriptions.
# The descriptions are converted to text once and searched column-wise, which
# gives the same result as the previous per-row lambda without a Python-level
# loop over every row.
def _diag_mentions(text_frame, keyword):
    """Return a 0/1 Series flagging rows where any column contains `keyword`.

    The match is a case-sensitive, literal substring test (no regex).
    """
    hits = text_frame.apply(lambda col: col.str.contains(keyword, regex=False))
    return hits.any(axis=1).astype(int)


diag_text = diabetes[['diagnosis_1', 'diagnosis_2', 'diagnosis_3']].astype(str)

diabetes['alcohol_ind'] = _diag_mentions(diag_text, 'ALCOHOL')
diabetes['obesity_ind'] = _diag_mentions(diag_text, 'OBESITY')
diabetes['mh_ind'] = _diag_mentions(diag_text, 'MALIGNANT HYPERTENSION')

del diag_text

In [31]:

# Patient-level "ever flagged" indicators: the max of each encounter-level
# keyword flag across all of the patient's encounters.
for topic in ('alcohol', 'obesity', 'mh'):
    diabetes[f'{topic}_history_ind'] = (
        diabetes.groupby('patient_nbr')[f'{topic}_ind'].transform('max')
    )

# The encounter-level flags are no longer needed once the history is built.
drop.extend(['alcohol_ind', 'obesity_ind', 'mh_ind'])

In [32]:

# Add patient-level features, repeated on every encounter row of the patient.
# Number of distinct encounters and total time in hospital:
diabetes['encounter_ct'] = diabetes.groupby('patient_nbr')['encounter_id'].transform('nunique')
diabetes['mb_time_in_hospital'] = diabetes.groupby('patient_nbr')['time_in_hospital'].transform('sum')

# Per-patient sums of the encounter-level count columns, stored as 'mb_<col>_ct'.
encounter_count_cols = [
    'num_lab_procedures',
    'num_procedures',
    'num_medications',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
    'number_diagnoses',
]
for count_col in encounter_count_cols:
    diabetes[f'mb_{count_col}_ct'] = diabetes.groupby('patient_nbr')[count_col].transform('sum')

# The encounter-level originals are queued for removal.
drop.extend(encounter_count_cols)

In [33]:
# Medication columns whose value frequencies are tabulated in the next cell.
drugs = [
    'metformin',
    'repaglinide',
    'nateglinide',
    'chlorpropamide',
    'glimepiride',
    'acetohexamide',
    'glipizide',
    'glyburide',
    'tolbutamide',
    'pioglitazone',
    'rosiglitazone',
    'acarbose',
    'miglitol',
    'troglitazone',
    'tolazamide',
    'examide',
    'citoglipton',
    'insulin',
    'glyburide-metformin',
    'glipizide-metformin',
    'glimepiride-pioglitazone',
    'metformin-rosiglitazone',
    'metformin-pioglitazone',
]

In [34]:
# Build one long frequency table with a row per (feature, observed value) and
# the number of encounters showing that value; missing values are counted too.
drug_freq = pd.concat(
    [
        diabetes[feature]
        .value_counts(dropna=False)
        .rename_axis('observation')
        .reset_index(name='count')
        .assign(feature=feature)
        for feature in drugs
    ],
    ignore_index=True,
)[['feature', 'observation', 'count']]

# Persist the table for inspection outside the notebook.
drug_freq.to_csv(gen_path / 'data' / 'drugs.csv', index=False)

In [35]:
# Medication columns queued for removal.
drop.extend([
    'citoglipton',
    'examide',
    'acetohexamide',
    'glimepiride-pioglitazone',
    'metformin-pioglitazone',
    'metformin-rosiglitazone',
    'troglitazone',
    'glipizide-metformin',
    'tolbutamide',
    'miglitol',
    'tolazamide',
    'chlorpropamide',
    'acarbose',
    'nateglinide',
    'glyburide-metformin',
    'repaglinide',
])

In [36]:

# How many distinct patients have 1, 2, 3, ... encounters.
patients_by_encounter_ct = (
    diabetes.groupby('encounter_ct')['patient_nbr']
    .nunique()
    .reset_index()
    .sort_values('encounter_ct')
)
print(patients_by_encounter_ct)

    encounter_ct  patient_nbr
0              1        46472
1              2         8009
2              3         2491
3              4          954
4              5          431
5              6          192
6              7          108
7              8           45
8              9           31
9             10           11
10            11           19
11            12           10
12            13            1
13            14            6
14            15            2
15            16            1
16            17            4
17            18            2
18            19            1


In [37]:
# Write the full feature table to CSV (columns queued in `drop` after the
# earlier removal step are still present here), then summarise it.
diabetes.to_csv(gen_path / 'data' / 'p004_diabetes_MLprep.csv', index=False)
print(diabetes.shape)
# info() writes its report to stdout and returns None, so it is called directly;
# wrapping it in print() appended a stray "None" line to the output.
diabetes.info()

(79186, 191)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79186 entries, 0 to 79185
Data columns (total 191 columns):
 #    Column                    Non-Null Count  Dtype  
---   ------                    --------------  -----  
 0    encounter_id              79186 non-null  int64  
 1    patient_nbr               79186 non-null  int64  
 2    gender                    79186 non-null  object 
 3    age                       79186 non-null  object 
 4    time_in_hospital          79186 non-null  int64  
 5    num_lab_procedures        79186 non-null  int64  
 6    num_procedures            79186 non-null  int64  
 7    num_medications           79186 non-null  int64  
 8    number_outpatient         79186 non-null  int64  
 9    number_emergency          79186 non-null  int64  
 10   number_inpatient          79186 non-null  int64  
 11   diag_1                    79186 non-null  object 
 12   diag_2                    79186 non-null  object 
 13   diag_3                    79186

In [39]:
# Display the columns currently queued for removal.
# NOTE(review): this cell's execution count (39) precedes the In [38] cell that
# follows it, so the output shown may not reflect a top-to-bottom run.
drop

['race',
 'weight',
 'medical_specialty',
 'max_glu_serum',
 'payer_code',
 'A1Cresult',
 'admission_type_id',
 'discharge_disposition_id',
 'admission_source_id',
 'admission_type',
 'discharge_disposition',
 'admission_source',
 'admission_grp_1_ind',
 'admission_grp_2_ind',
 'discharge_grp_1_ind',
 'discharge_grp_2_ind',
 'admission_type_ind',
 'alcohol_ind',
 'obesity_ind',
 'mh_ind',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_emergency',
 'number_inpatient',
 'number_diagnoses',
 'citoglipton',
 'examide',
 'acetohexamide',
 'glimepiride-pioglitazone',
 'metformin-pioglitazone',
 'metformin-rosiglitazone',
 'troglitazone',
 'glipizide-metformin',
 'tolbutamide',
 'miglitol',
 'tolazamide',
 'chlorpropamide',
 'acarbose',
 'nateglinide',
 'glyburide-metformin',
 'repaglinide']

In [38]:

# Diagnosis-related columns to remove from the final modelling table. Names that
# are not columns of `diabetes` are filtered out before the drop is applied.
drop1 = [
    'diag_1', 'diag_2', 'diag_3', 'diagnosis_tuple_freq',
    'diag_1_freq', 'diag_2_freq', 'diag_3_freq',
    'diagnosis_1', 'diagnosis_2', 'diagnosis_3',
    'diag_1_sort', 'diag_2_sort', 'diag_3_sort',
    'diagnosis_1_sort', 'diagnosis_2_sort', 'diagnosis_3_sort',
    'diag_1_sort_freq', 'diag_2_sort_freq', 'diag_3_sort_freq',
]

# Append only the names that are not already queued.
drop += [col_name for col_name in drop1 if col_name not in drop]

In [39]:
# Keep only the queued names that are still columns of `diabetes` (some were
# already removed earlier, others never existed), then remove them.
drop = [col_name for col_name in drop if col_name in diabetes.columns]
diabetes = diabetes.drop(columns=drop)

# Persist the modelling table and summarise it.
diabetes.to_pickle(gen_path / 'data' / "p004_diabetes_MLprep.pkl")
print(diabetes.shape)
# info() writes its report to stdout and returns None, so it is called directly;
# wrapping it in print() appended a stray "None" line to the output.
diabetes.info()

(79186, 156)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79186 entries, 0 to 79185
Data columns (total 156 columns):
 #    Column                    Non-Null Count  Dtype 
---   ------                    --------------  ----- 
 0    encounter_id              79186 non-null  int64 
 1    patient_nbr               79186 non-null  int64 
 2    gender                    79186 non-null  object
 3    age                       79186 non-null  object
 4    time_in_hospital          79186 non-null  int64 
 5    metformin                 79186 non-null  object
 6    glimepiride               79186 non-null  object
 7    glipizide                 79186 non-null  object
 8    glyburide                 79186 non-null  object
 9    pioglitazone              79186 non-null  object
 10   rosiglitazone             79186 non-null  object
 11   insulin                   79186 non-null  object
 12   change                    79186 non-null  object
 13   diabetesMed               79186 non-null  obje