In [2]:
!pip install -q PyAthena

## Prepare Datasets for Predictor Training, Validation and Testing

#### TS

Import the modules that build the patient cohort and extract the demographics and lab-event data:

In [255]:
from dataproc.cohort import query_esbl_pts, remove_dups, observation_window
from dataproc.sampling import generate_samples
from dataproc.create_dataset import dataset_creation
from hyper_params import HyperParams

import numpy as np
import pandas as pd
from matplotlib import pyplot
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder

In [4]:
# Instantiate the project's hyper-parameter container; its
# `observation_window_hours` attribute is read below when the cohort window
# and the feature table are built.
params = HyperParams()

Patients cohort:

In [None]:
# Query the ESBL microbiology tests and de-duplicate the result in one step.
esbl_admits = remove_dups(query_esbl_pts())
# Attach the observation window; its length comes from the hyper-parameters.
esbl_admits_window = observation_window(
    esbl_admits,
    window_size=params.observation_window_hours,
)
# Keep only the admission id, index date and resistance label, then cache them.
label_columns = ['hadm_id', 'index_date', 'RESISTANT_YN']
pts_labels = esbl_admits_window[label_columns]
pts_labels.to_pickle('data/patient_labels.pkl')

Import cohort/labels data from the .pkl file:

In [None]:
# Reload the cached cohort labels so the cohort query above does not have to be
# re-run, then show the frame's dimensions and its first rows.
pts_labels = pd.read_pickle('data/patient_labels.pkl')
print(pts_labels.shape)
pts_labels.head()

Patient's features data:

In [15]:
# Build the feature table for the cohort admissions and attach the label column.
labels_by_admission = pts_labels[['hadm_id', 'RESISTANT_YN']]
features = dataset_creation(
    pts_labels['hadm_id'],
    params.observation_window_hours,
).merge(labels_by_admission, on='hadm_id')
# Cache the merged table so later sessions can load it from disk instead.
features.to_pickle('data/features.pkl')

Import features data from the .pkl file:

In [144]:
# Reload the cached feature table and list its column names.
features = pd.read_pickle('data/features.pkl')
print(features.columns.tolist())

['hadm_id', 'subject_id', '10378-8', '10535-3', '10839-9', '11555-0', '11556-8', '11557-6', '11558-4', '13362-9', '1644-4', '1742-6', '1751-7', '17849-1', '1798-8', '1863-0', '1920-8', '1959-6', '1963-8', '1968-7', '1971-1', '1975-2', '1988-5', '1994-3', '19991-9', '19994-3', '2000-8', '20077-4', '20112-9', '20564-1', '20569-0', '20570-8', '20578-1', '2069-3', '2075-0', '2078-4', '2085-9', '2090-9', '2093-3', '2143-6', '2157-6', '2160-0', '2161-8', '2170-9', '2276-4', '2284-8', '2339-0', '2345-7', '2498-4', '2500-7', '2532-0', '2601-3', '26498-6', '2692-2', '2695-5', '2777-1', '2823-3', '2828-2', '28541-1', '2947-0', '2951-2', '2955-3', '30089-7', '3016-3', '30226-5', '3040-3', '3094-0', '3095-7', '3151-8', '3173-2', '3255-7', '32693-4', '3297-9', '3349-8', '3376-1', '33762-6', '3377-9', '3390-2', '33914-3', '3397-7', '34728-6', '3773-9', '3879-4', '3967-7', '4023-8', '4073-3', '42662-7', '4542-7', '4544-3', '4548-4', '48065-7', '5642-4', '5767-9', '5769-5', '5770-3', '5778-6', '5787-7

In [24]:
# Lab (LOINC) columns = all columns except the identifier/admission columns
# below and the trailing eight columns of the feature table.
non_lab_columns = ['hadm_id', 'subject_id', 'admittime', 'admission_type']
candidate_columns = features.columns.drop(non_lab_columns)
loinc_codes = candidate_columns[:-8].tolist()
print(loinc_codes)

['10378-8', '10535-3', '10839-9', '11555-0', '11556-8', '11557-6', '11558-4', '13362-9', '1644-4', '1742-6', '1751-7', '17849-1', '1798-8', '1863-0', '1920-8', '1959-6', '1963-8', '1968-7', '1971-1', '1975-2', '1988-5', '1994-3', '19991-9', '19994-3', '2000-8', '20077-4', '20112-9', '20564-1', '20569-0', '20570-8', '20578-1', '2069-3', '2075-0', '2078-4', '2085-9', '2090-9', '2093-3', '2143-6', '2157-6', '2160-0', '2161-8', '2170-9', '2276-4', '2284-8', '2339-0', '2345-7', '2498-4', '2500-7', '2532-0', '2601-3', '26498-6', '2692-2', '2695-5', '2777-1', '2823-3', '2828-2', '28541-1', '2947-0', '2951-2', '2955-3', '30089-7', '3016-3', '30226-5', '3040-3', '3094-0', '3095-7', '3151-8', '3173-2', '3255-7', '32693-4', '3297-9', '3349-8', '3376-1', '33762-6', '3377-9', '3390-2', '33914-3', '3397-7', '34728-6', '3773-9', '3879-4', '3967-7', '4023-8', '4073-3', '42662-7', '4542-7', '4544-3', '4548-4', '48065-7', '5642-4', '5767-9', '5769-5', '5770-3', '5778-6', '5787-7', '5792-7', '5793-5', '5

In [25]:
# Summary statistics (pandas `describe`) of the lab-code columns of the feature table.
features_summary = features[loinc_codes].describe()

In [29]:
# Embedding the features
from dataproc.embeddings import loinc_values

# Raw result strings for every lab code; rows without a value are useless here.
loinc_vals = loinc_values(loinc_codes)
loinc_vals = loinc_vals.dropna(subset=['value'])
loinc_vals = loinc_vals.astype({'value': 'string', 'loinc_code': 'category'})

# Normalise censored / unit-suffixed results such as 'LESS THAN 5', '>GREATER THAN 10',
# '<0.3' or '12 NG/ML' so that the numeric part can be parsed.
#
# The markers are removed as whole tokens with anchored regular expressions.
# str.lstrip / str.rstrip must NOT be used for this: their argument is a *set of
# characters*, so lstrip('LESS THAN ') also eats the leading letters of unrelated
# values (e.g. 'NEG' -> 'G', 'HEMOLYSIS ...' -> 'MOLYSIS ...').
cleaned_values = loinc_vals['value']
cleaned_values = cleaned_values.str.replace(
    r'^\s*[<>]?\s*(?:LESS THAN|GREATER THAN)\s*', '', regex=True)
cleaned_values = cleaned_values.str.replace(r'\s*NG/ML\s*$', '', regex=True)
cleaned_values = cleaned_values.str.replace(r'^\s*[<>]+\s*', '', regex=True)
# Decimal comma -> decimal point.
cleaned_values = cleaned_values.str.replace(',', '.', regex=False)
loinc_vals['value'] = cleaned_values

# Free-text results that carry no measurement; drop those rows entirely.
# NOTE(review): 'MOLYSIS ...' is presumably the truncated form that the old
# character-set stripping produced from 'HEMOLYSIS ...'; both spellings are
# listed so the filter works either way - confirm against the raw data.
unusable_results = [
    'UNABLE TO ANALYZE',
    'HEMOLYSIS FALSELY DECREASES THIS RESULT',
    'MOLYSIS FALSELY DECREASES THIS RESULT',
    'COMPUTER NETWORK FAILURE. TEST NOT RESULTED.',
    'UNABLE TO DETERMINE',
    ':UNABLE TO DETERMINE',
    'UNABLE TO QUANTITATE',
    'UNABLE TO REPORT',
]
loinc_vals = loinc_vals.loc[~loinc_vals['value'].isin(unusable_results)]
# Text values are no longer mangled, so the number of distinct values per code
# can differ from earlier runs; re-check the numeric/categorical split below.

In [171]:
# Sort every lab code into one of three buckets based on its cleaned values:
#   numeric     - fewer than 5% of the values fail numeric parsing
#   categorical - more than 5% fail and there are fewer than 100 distinct values
#   weird       - everything else (mixed-type tests)
numeric, categorical, weird = [], [], []
for code in loinc_codes:
    # Values recorded for this code (selected once, reused for all three measures).
    code_values = loinc_vals.loc[loinc_vals['loinc_code'] == str(code), 'value']
    size = len(code_values)
    size_unique = len(code_values.unique())
    sum_na = pd.to_numeric(code_values, errors='coerce').isna().sum()
    na_share = sum_na / size
    if na_share < 0.05:
        bucket = numeric
    elif na_share > 0.05 and size_unique < 100:
        bucket = categorical
    else:
        bucket = weird
    bucket.append(code)

In [172]:
# Hand-curated exclusions from the automatic split above.
# Written as filters instead of list.remove() so the cell can be re-run safely:
# list.remove() raises ValueError once the code has already been removed.
excluded_numeric = {
    '26498-6',   # lab column that contains only 'inf' and 'Nan'
}
excluded_categorical = {
    '33914-3',   # lab column that contains phrase 'See comments'
    '13362-9',   # lab column that contains phrase 'Random'
}
numeric = [code for code in numeric if code not in excluded_numeric]
categorical = [code for code in categorical if code not in excluded_categorical]

In [173]:
# Report how many lab codes ended up in each bucket.
bucket_sizes = (
    ('All:', loinc_codes),
    ('Numeric: ', numeric),
    ('Categorical: ', categorical),
    ('Weird:', weird),
)
for label, codes in bucket_sizes:
    print(label, len(codes))

All: 144
Numeric:  94
Categorical:  36
Weird: 11


Summary statistics for numeric lab codes:

In [118]:
# One `describe()` summary per numeric lab code, computed on the cleaned values
# (entries that still fail numeric parsing are coerced to NaN first).
numeric_stats = [
    pd.to_numeric(
        loinc_vals.loc[loinc_vals['loinc_code'] == str(code), 'value'],
        errors='coerce',
    ).describe()
    for code in numeric
]
# Side-by-side table: one column per lab code, one row per statistic.
numeric_stats_df = pd.concat(numeric_stats, axis=1, keys=numeric)

In [119]:
# Display the per-code summary table (one column per numeric lab code).
numeric_stats_df

Unnamed: 0,10535-3,10839-9,11555-0,11556-8,11557-6,11558-4,1644-4,1742-6,1751-7,17849-1,...,763-3,772-4,777-3,785-6,786-4,787-2,788-0,789-8,804-5,9322-9
count,8358.0,11447.0,490527.0,490523.0,490504.0,530658.0,24986.0,219448.0,146690.0,10073.0,...,75587.0,16564.0,778241.0,747560.0,747921.0,747537.0,746408.0,747568.0,753127.0,18128.0
mean,1.129834,7.479856,-0.090604,136.737019,42.736738,7.379366,167.710318,129.960002,3.178038,2.534686,...,3.046492,13.766542,239.295564,30.220467,33.508648,90.264973,15.799282,3.510249,10.499787,3.920383
std,0.749934,16.189266,5.353903,92.114782,11.388981,0.087115,229.124373,536.810586,0.757973,1.775861,...,6.148193,59.786263,150.321123,2.574024,1.573992,6.987865,2.356128,0.656843,9.006713,2.365156
min,0.02,0.0,-414.0,0.0,0.0,0.0,1.0,0.0,0.9,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8
25%,0.6,0.3,-3.0,81.0,36.0,7.33,85.0,18.0,2.6,1.4,...,0.0,1.0,139.0,28.9,32.5,86.0,14.1,3.05,6.4,2.8
50%,1.0,0.4,0.0,109.0,41.0,7.39,124.0,32.0,3.1,2.1,...,0.0,3.0,215.0,30.3,33.6,90.0,15.3,3.44,9.1,3.5
75%,1.4,4.7,3.0,156.0,47.0,7.44,187.0,69.0,3.7,3.2,...,3.0,8.0,307.0,31.6,34.6,94.0,17.0,3.9,12.8,4.5
max,8.0,575.0,162.0,1914.0,247.0,7.99,12496.0,25460.0,6.9,22.1,...,94.0,2879.0,4504.0,48.9,45.8,147.0,36.4,32.97,846.7,65.4


In [318]:
# For now, ignore the mixed-type lab tests: drop every column listed in `weird`.
# errors='ignore' skips codes that are not columns of `features`.
dataset = features.drop(columns=weird, errors='ignore')

In [319]:
# (rows, columns) of the working dataset after dropping the mixed-type lab columns.
dataset.shape

(4328, 145)

### Data Preprocessing  and Embeddings:

In [320]:
# List the columns that remain in the working dataset.
print(dataset.columns.tolist())

['hadm_id', 'subject_id', '10378-8', '10535-3', '10839-9', '11555-0', '11556-8', '11557-6', '11558-4', '13362-9', '1644-4', '1742-6', '1751-7', '17849-1', '1798-8', '1863-0', '1920-8', '1959-6', '1963-8', '1968-7', '1971-1', '1975-2', '1988-5', '1994-3', '19991-9', '19994-3', '2000-8', '20077-4', '20112-9', '20564-1', '20569-0', '20570-8', '20578-1', '2069-3', '2075-0', '2078-4', '2085-9', '2090-9', '2093-3', '2143-6', '2157-6', '2160-0', '2161-8', '2170-9', '2276-4', '2284-8', '2339-0', '2345-7', '2498-4', '2500-7', '2532-0', '2601-3', '26498-6', '2692-2', '2695-5', '2777-1', '2823-3', '2828-2', '28541-1', '2947-0', '2951-2', '2955-3', '30089-7', '3016-3', '30226-5', '3040-3', '3094-0', '3095-7', '3151-8', '3173-2', '3255-7', '32693-4', '3349-8', '3376-1', '33762-6', '3377-9', '3390-2', '33914-3', '3397-7', '34728-6', '3773-9', '3879-4', '3967-7', '4073-3', '42662-7', '4542-7', '4544-3', '5767-9', '5769-5', '5770-3', '5778-6', '5792-7', '5794-3', '5797-6', '5799-2', '5802-4', '5803-2'

#### Clean lab numeric variables:

In [321]:
# Convert the numeric lab columns to numbers; unparseable entries become NaN.
# Applied column-wise (the default axis): pd.to_numeric is vectorised per column,
# whereas axis=1 would call it once per row, which is far slower for the same values.
dataset[numeric] = dataset[numeric].apply(pd.to_numeric, errors='coerce')

Since many lab values contain outliers, the median and interquartile range are used to standardize the numeric variables:   
- value = (value – median) / (p75 – p25)

In [322]:
def stanardize_numeric_values(df, list_of_clms, ref_df):
    """
    Standardize numeric columns with the median and interquartile range:

        value = (value - median) / (p75 - p25)

    Parameters
    ----------
    df : DataFrame holding the columns to standardize. It is modified IN PLACE
        (each listed column is overwritten) and also returned.
    list_of_clms : iterable of column names to standardize.
    ref_df : reference statistics indexed as ``ref_df[column][statistic]`` with
        the statistics '25%', '50%' and '75%' (e.g. concatenated ``describe()``
        output).

    Returns
    -------
    The same ``df`` object, with the listed columns standardized.

    Notes
    -----
    When the interquartile range of a column is zero the values are only
    centred (scale 1) instead of being divided by zero, which would turn the
    column into inf/NaN. This mirrors how scikit-learn's RobustScaler treats
    a zero scale.
    """
    for code in list_of_clms:
        median = ref_df[code]['50%']
        iqr = ref_df[code]['75%'] - ref_df[code]['25%']
        if iqr == 0:
            # Degenerate spread: centre only, avoid division by zero.
            iqr = 1.0
        df[code] = (df[code] - median) / iqr
    return df
    
    

In [323]:
# Standardize the numeric lab columns using the median/IQR from `numeric_stats_df`.
# NOTE: the function overwrites the columns of `dataset` in place, so running this
# cell a second time would standardize the already-standardized values again.
dataset = stanardize_numeric_values(dataset, numeric, numeric_stats_df)

Imputation of missing values using scikit-learn https://scikit-learn.org/stable/modules/impute.html#impute

In [324]:
from sklearn.impute import SimpleImputer

def replace_missing_val(df, list_of_clms, how='median'):
    """
    Impute missing values in the selected columns with scikit-learn's
    SimpleImputer.

    df           : DataFrame containing the columns to impute (not modified).
    list_of_clms : names of the columns to impute.
    how          : SimpleImputer strategy; defaults to 'median'.

    Returns whatever ``SimpleImputer.fit_transform`` produces for those
    columns (the imputed values, not a modified copy of ``df``).
    """
    imputer = SimpleImputer(strategy=how)
    return imputer.fit_transform(df[list_of_clms])


In [325]:
# Median-impute the (standardized) numeric lab columns. The imputed values are
# returned separately; `dataset` itself is not updated by this call.
numlabvars = replace_missing_val(dataset, numeric, how='median')

In [326]:
# Dimensions of the imputed numeric block (rows, numeric lab columns).
numlabvars.shape

(4328, 94)

#### Clean lab categorical variables:

In [327]:
# Harmonise spelling variants and bins of the categorical lab results.
# Each entry maps raw value -> canonical value for one lab column; values that
# are not listed (including NaN) are left untouched. np.nan marks raw values
# that should be treated as missing.
lab_value_recodes = {
    # Count bins. '0' and '5' are included so that the '0-2' and '3-5' bins
    # cover their whole range (previously a raw '0' or '5' would have stayed
    # behind as a separate one-hot level).
    '30089-7': {'0': '0-2', '<1': '0-2', '1': '0-2', '2': '0-2',
                '3': '3-5', '4': '3-5', '5': '3-5'},
    # NOTE(review): every other raw key here is upper-case; confirm whether the
    # raw spelling really is 'SlCloudy' (and not e.g. 'SLCLOUDY').
    '5767-9': {'CLEAR': 'Clear', 'SLHAZY': 'SlHazy', 'HAZY': 'Hazy',
               'SlCloudy': 'SlCldy', 'CLOUDY': 'Cloudy'},
    '5769-5': {'0': 'NEG', 'NOTDONE': 'NONE', 'LRG': 'MANY'},
    '5778-6': {'YELLOW': 'Yellow', 'YEL': 'Yellow',
               'STRAW': 'Straw',
               'AMBER': 'Amber', 'AMB': 'Amber',
               'RED': 'Red',
               'ORANGE': 'Orange',
               'DKAMB': 'DkAmb', 'DKAMBER': 'DkAmb',
               ' ': np.nan},
    '5797-6': {'>80': '80'},
    '5804-0': {'>300': '300', ' ': np.nan},
    '5818-0': {'.2': '0.2', '>8': '8', '>8.0': '8', '>12': '12',
               'NotDone': np.nan},
    '5822-2': {'0': 'NONE', 'N': 'NONE', 'NOTDONE': np.nan},
    '778-1': {'UNABLE TO ESTIMATE DUE TO PLATELET CLUMPS': 'NOTDETECTED'},
}

for lab_code, recodes in lab_value_recodes.items():
    # `recodes=recodes` binds the current mapping to the lambda.
    dataset[lab_code] = dataset[lab_code].map(
        lambda value, recodes=recodes: recodes.get(value, value))



In [328]:
# Show the level frequencies of every categorical lab variable.
for col in categorical:
    level_counts = dataset[col].value_counts()
    print('----------------------------------')
    print('Column name: ', col)
    print(level_counts)

----------------------------------
Column name:  10378-8
NORMAL        457
1+            141
OCCASIONAL    126
2+             45
3+              8
Name: 10378-8, dtype: int64
----------------------------------
Column name:  30089-7
0-2      88
3-5      19
6-10      3
11-20     1
21-50     1
Name: 30089-7, dtype: int64
----------------------------------
Column name:  30226-5
0-10        15
10-40       15
40-80       15
80-160       4
160-320      2
640-1280     1
>1280        1
Name: 30226-5, dtype: int64
----------------------------------
Column name:  3349-8
NEG    427
POS     10
Name: 3349-8, dtype: int64
----------------------------------
Column name:  3376-1
NEG    665
POS      4
Name: 3376-1, dtype: int64
----------------------------------
Column name:  3377-9
NEG    427
POS     11
Name: 3377-9, dtype: int64
----------------------------------
Column name:  3390-2
NEG    337
POS    102
Name: 3390-2, dtype: int64
----------------------------------
Column name:  3397-7
NEG    412
POS

In [329]:
# Missing results in the categorical lab columns become the explicit level 'UNKNOWN'.
dataset[categorical] = dataset[categorical].fillna('UNKNOWN')

In [330]:
# Preview the categorical lab columns after missing values were replaced by 'UNKNOWN'.
dataset[categorical].head()

Unnamed: 0,10378-8,30089-7,30226-5,3349-8,3376-1,3377-9,3390-2,3397-7,3773-9,3879-4,...,738-5,741-9,774-0,778-1,779-9,7790-9,7791-7,800-3,8246-1,8247-9
0,UNKNOWN,UNKNOWN,UNKNOWN,NEG,NEG,NEG,NEG,POS,NEG,POS,...,UNKNOWN,UNKNOWN,UNKNOWN,LOW,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN
1,2+,UNKNOWN,UNKNOWN,UNKNOWN,NEG,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,...,2+,2+,2+,LOW,2+,UNKNOWN,1+,1+,UNKNOWN,UNKNOWN
2,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,...,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN
3,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,...,1+,1+,UNKNOWN,UNKNOWN,1+,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN
4,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,...,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN


Use a one-hot encoder for the categorical lab features:

In [331]:
# Fit a one-hot encoder on the categorical lab columns and peek at the levels
# learned for the first four of them.
enc = OneHotEncoder().fit(dataset[categorical])
enc.categories_[:4]

[array(['1+', '2+', '3+', 'NORMAL', 'OCCASIONAL', 'UNKNOWN'], dtype=object),
 array(['0-2', '11-20', '21-50', '3-5', '6-10', 'UNKNOWN'], dtype=object),
 array(['0-10', '10-40', '160-320', '40-80', '640-1280', '80-160', '>1280',
        'UNKNOWN'], dtype=object),
 array(['NEG', 'POS', 'UNKNOWN'], dtype=object)]

In [332]:
# One-hot encode the categorical lab columns and convert the result with `.toarray()`.
# NOTE: `enc` is re-bound to the demographic encoder further down, so this cell
# must be run while `enc` still refers to the lab encoder fitted above.
onehotlabvars = enc.transform(dataset[categorical]).toarray()

In [333]:
# Dimensions of the one-hot encoded lab block (rows, indicator columns).
onehotlabvars.shape

(4328, 208)

#### Clean demographic static variables:

In [363]:
# Demographic / admission columns that are cleaned and one-hot encoded below.
staticvars = [
    'admission_type',
    'admission_location',
    'insurance',
    'language',
    'religion',
    'marital_status',
    'ethnicity',
    'gender',
]

In [364]:
# admission_location: unknown source is folded into the emergency-room level and
# the three transfer variants into a single 'TRANSFER FROM MED FACILITY' level.
admission_location_map = {
    '** INFO NOT AVAILABLE **': 'EMERGENCY ROOM ADMIT',
    'TRANSFER FROM SKILLED NUR': 'TRANSFER FROM MED FACILITY',
    'TRANSFER FROM OTHER HEALT': 'TRANSFER FROM MED FACILITY',
    'TRANSFER FROM HOSP/EXTRAM': 'TRANSFER FROM MED FACILITY',
}
dataset['admission_location'] = dataset['admission_location'].map(
    lambda value: admission_location_map.get(value, value))

# language: everything that is not 'ENGL' or 'SPAN' (missing values included)
# becomes 'OTHER'.
dataset['language'] = dataset['language'].where(
    dataset['language'].isin(['ENGL', 'SPAN']), 'OTHER')

# religion: keep the listed levels, send the rest to 'OTHER', then merge
# 'UNOBTAINABLE' into 'NOT SPECIFIED'.
kept_religions = ['CATHOLIC', 'NOT SPECIFIED', 'UNOBTAINABLE', 'PROTESTANT QUAKER', 'JEWISH']
religion = dataset['religion'].where(dataset['religion'].isin(kept_religions), 'OTHER')
dataset['religion'] = religion.mask(religion == 'UNOBTAINABLE', 'NOT SPECIFIED')

# ethnicity: collapse the detailed levels into broader groups; levels that are
# not listed are kept as they are.
ethnicity_groups = {
    'ASIAN': ['ASIAN - CHINESE',
              'ASIAN - ASIAN INDIAN',
              'ASIAN - VIETNAMESE',
              'ASIAN - OTHER',
              'ASIAN - FILIPINO',
              'ASIAN - CAMBODIAN'],
    'WHITE': ['WHITE - RUSSIAN',
              'WHITE - BRAZILIAN',
              'WHITE - OTHER EUROPEAN'],
    'BLACK/AFRICAN AMERICAN': ['BLACK/CAPE VERDEAN',
                               'BLACK/HAITIAN',
                               'BLACK/AFRICAN'],
    'HISPANIC OR LATINO': ['HISPANIC/LATINO - PUERTO RICAN',
                           'HISPANIC/LATINO - DOMINICAN',
                           'HISPANIC/LATINO - SALVADORAN',
                           'HISPANIC/LATINO - CUBAN',
                           'HISPANIC/LATINO - MEXICAN'],
    'OTHER': ['MULTI RACE ETHNICITY',
              'MIDDLE EASTERN',
              'PORTUGUESE',
              'AMERICAN INDIAN/ALASKA NATIVE',
              'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER',
              'AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE'],
    'UNKNOWN/NOT SPECIFIED': ['UNABLE TO OBTAIN',
                              'PATIENT DECLINED TO ANSWER'],
}
ethnicity_map = {
    raw_level: group
    for group, raw_levels in ethnicity_groups.items()
    for raw_level in raw_levels
}
dataset['ethnicity'] = dataset['ethnicity'].map(
    lambda value: ethnicity_map.get(value, value))

In [365]:
# Show the level frequencies of every demographic variable.
for col in staticvars:
    level_counts = dataset[col].value_counts()
    print('----------------------------------')
    print('Column name: ', col)
    print(level_counts)

----------------------------------
Column name:  admission_type
EMERGENCY    3682
ELECTIVE      445
URGENT        130
NEWBORN        71
Name: admission_type, dtype: int64
----------------------------------
Column name:  admission_location
EMERGENCY ROOM ADMIT          2088
TRANSFER FROM MED FACILITY     822
CLINIC REFERRAL/PREMATURE      802
PHYS REFERRAL/NORMAL DELI      616
Name: admission_location, dtype: int64
----------------------------------
Column name:  insurance
Medicare      2688
Private       1183
Medicaid       347
Government      89
Self Pay        21
Name: insurance, dtype: int64
----------------------------------
Column name:  language
ENGL     2269
OTHER    1963
SPAN       96
Name: language, dtype: int64
----------------------------------
Column name:  religion
CATHOLIC             1598
NOT SPECIFIED        1254
PROTESTANT QUAKER     589
OTHER                 477
JEWISH                410
Name: religion, dtype: int64
----------------------------------
Column name:  mar

#### Use one hot encoder for demographic features:

In [366]:
# Fit a one-hot encoder on the demographic columns and show the learned levels.
# NOTE: this re-binds `enc`, which earlier held the encoder fitted on the lab
# columns; the lab encoder is no longer reachable under that name afterwards.
# NOTE(review): the recorded output lists `nan` as a level of the sixth column
# (marital_status), i.e. missing values get their own indicator - confirm that
# this is intended.
enc = OneHotEncoder().fit(dataset[staticvars])
enc.categories_

[array(['ELECTIVE', 'EMERGENCY', 'NEWBORN', 'URGENT'], dtype=object),
 array(['CLINIC REFERRAL/PREMATURE', 'EMERGENCY ROOM ADMIT',
        'PHYS REFERRAL/NORMAL DELI', 'TRANSFER FROM MED FACILITY'],
       dtype=object),
 array(['Government', 'Medicaid', 'Medicare', 'Private', 'Self Pay'],
       dtype=object),
 array(['ENGL', 'OTHER', 'SPAN'], dtype=object),
 array(['CATHOLIC', 'JEWISH', 'NOT SPECIFIED', 'OTHER',
        'PROTESTANT QUAKER'], dtype=object),
 array(['DIVORCED', 'MARRIED', 'SEPARATED', 'SINGLE', 'UNKNOWN (DEFAULT)',
        'WIDOWED', nan], dtype=object),
 array(['ASIAN', 'BLACK/AFRICAN AMERICAN', 'HISPANIC OR LATINO', 'OTHER',
        'UNKNOWN/NOT SPECIFIED', 'WHITE'], dtype=object),
 array(['F', 'M'], dtype=object)]

In [367]:
# One-hot encode the demographic columns with the encoder fitted in the previous
# cell and convert the result with `.toarray()`.
onehotstaticvars = enc.transform(dataset[staticvars]).toarray()

In [368]:
# Dimensions of the one-hot encoded demographic block (rows, indicator columns).
onehotstaticvars.shape

(4328, 36)

In [369]:
# Display the encoded demographic matrix (NumPy abbreviates large arrays).
onehotstaticvars

array([[0., 1., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 1., 1., 0.],
       [1., 0., 0., ..., 1., 0., 1.],
       ...,
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 1., 0., 1.],
       [1., 0., 0., ..., 1., 0., 1.]])

#### Combine all features and construct the full dataset

In [381]:
# Response variable as a single-column 2-D array, ready to be stacked
# next to the feature blocks.
response = dataset['RESISTANT_YN'].to_numpy().reshape(-1, 1)
response.shape

(4328, 1)

In [382]:
# Stack the blocks column-wise: numeric labs, one-hot labs, one-hot demographics
# and finally the response, so the last column is the target variable.
feature_blocks = [numlabvars, onehotlabvars, onehotstaticvars, response]
fulldata = np.hstack(feature_blocks)
fulldata.shape

(4328, 339)

In [386]:
# Save the combined feature matrix (target in the last column) in NumPy's .npy format.
np.save('data/fulldata.npy', fulldata)

In [387]:
# Reload the saved feature matrix (the last column is the target variable).
fulldata = np.load('data/fulldata.npy')

(4328, 339)