In [260]:
import pandas as pd
import numpy as np
import json
from glob import glob
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, RobustScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold
from sklearn.datasets import make_multilabel_classification
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, confusion_matrix
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain, MultiOutputClassifier
from xgboost import XGBClassifier

import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)

rand_state = 42


In [261]:
import sklearn 
print(sklearn.__version__) 


1.4.1.post1


In [262]:
def load_and_combine_json_files(directory_path, search_pattern):
    # Use glob to find JSON files in the directory based on the search pattern
    pop_files = glob(directory_path + search_pattern)

    # Initialize an empty DataFrame to hold all the data
    combined_df = pd.DataFrame()

    # Loop through each JSON file
    for file in pop_files:
        # Determine the dataset type based on the file name
        if 'train' in file:
            dataset_type = 'train'
        elif 'test' in file:
            dataset_type = 'test'
        elif 'val' in file:
            dataset_type = 'validate'
        else:
            dataset_type = 'unknown'

        print('Loading data files...', file, dataset_type)
        # Load the JSON file into a DataFrame
        with open(file) as f:
            data = json.load(f)
        df = pd.DataFrame(data)

        # Add a new column to flag the dataset type
        df['dataset_type'] = dataset_type

        # Append the DataFrame to the combined DataFrame
        combined_df = pd.concat([combined_df, df], ignore_index=True)

    return combined_df

# Define parameters for the function to combine JSON files
directory_path = '/Users/leegary/Downloads/mimic_iv_multilabel__json_files__20240302/'
pop_files = 'pop_*.json'  # This pattern can be changed based on the files you're looking for
sample_files = 'sample_*.json'

# Load and combine the JSON files
pop_df = load_and_combine_json_files(directory_path, pop_files)
sample_df = load_and_combine_json_files(directory_path, sample_files)

print('Population Cols\n',pop_df.columns)
print('Sample Cols\n',pop_df.columns)


Loading data files... /Users/leegary/Downloads/mimic_iv_multilabel__json_files__20240302/pop_validation_set__chexpert.json validate
Loading data files... /Users/leegary/Downloads/mimic_iv_multilabel__json_files__20240302/pop_test_set__chexpert.json test
Loading data files... /Users/leegary/Downloads/mimic_iv_multilabel__json_files__20240302/pop_train_set__chexpert.json train
Loading data files... /Users/leegary/Downloads/mimic_iv_multilabel__json_files__20240302/sample__train_set__chexpert.json train
Loading data files... /Users/leegary/Downloads/mimic_iv_multilabel__json_files__20240302/sample__test_set__chexpert.json test
Loading data files... /Users/leegary/Downloads/mimic_iv_multilabel__json_files__20240302/sample__validation_set__chexpert.json validate
Population Cols
 Index(['patient_id', 'visit_id', 'study_id', 'temperature', 'heartrate',
       'resprate', 'o2sat', 'sbp', 'dbp', 'pain', 'acuity',
       'pathologies_number', 'pathologies_names', 'radiology_note',
       'discha

In [263]:
pop_df.head()

Unnamed: 0,patient_id,visit_id,study_id,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,pathologies_number,pathologies_names,radiology_note,discharge_note,chief_complaint,major_surgical_or_invasive_procedure,history_of_present_illness,past_medical_history,family_history,atelectasis,cardiomegaly,edema,lung_opacity,pleural_effusion,pneumonia,dataset_type
0,13110963.0,23553001,58949064.0,101.2,99.0,18.0,97.0,151.0,66.0,0,2.0,1.0,lung_opacity,FINAL REPORT\...,\nName: ___ Unit No: __...,fever,,"___ yo F with advanced dementia, found to be f...",# Dementia\n# Stable T-cell lymphoproliferativ...,The patient's sister had fibroids,0.0,0.0,0.0,1.0,0.0,0.0,validate
1,17918016.0,24868379,53178110.0,98.5,140.0,18.0,97.0,137.0,94.0,0,1.0,2.0,"cardiomegaly, pleural_effusion",FINAL REPORT\...,\nName: ___ Unit No: ___\n...,"shortness of breath, lower extremity edema, ch...",none.,"___ yo morbidly obese female with a h/o HLD, H...",GERD \nCholelythiasis \nOSTEOARTHRITIS \nOBE...,"No family history of early MI, arrhythmia, car...",0.0,1.0,0.0,0.0,1.0,0.0,validate
2,16856749.0,27527958,54094282.0,98.4,70.0,16.0,99.0,125.0,56.0,0,2.0,2.0,"edema, pleural_effusion",FINAL REPORT\...,\nName: ___ Unit No: ___...,Dyspnea,Right heart catheterization ___,History provided by wife b/c patient is hard o...,"1. CARDIAC RISK FACTORS: Diabetes Type II, Dys...","No family history of early MI, arrhythmia, car...",0.0,0.0,1.0,0.0,1.0,0.0,validate
3,17448207.0,22162530,52690784.0,97.9,150.0,18.0,98.0,128.0,101.0,0,1.0,0.0,no_finding,FINAL REPORT\...,\nName: ___ Unit No: _...,tachycardia,NONE,___ with a hisotry of atrial fibrillation s/p ...,Venous stasis \nA-fib s/p cardioversion in __...,Father died at ___ getting CABG \nTwin brothe...,0.0,0.0,0.0,0.0,0.0,0.0,validate
4,10692509.0,26210665,50447877.0,98.0,78.0,18.0,100.0,131.0,49.0,0,2.0,0.0,no_finding,FINAL REPORT\...,\nName: ___ Unit No: ___\n \n...,chest pain,,"Mr. ___ is a ___ w/ PMH of mild dementia, CAD ...","1. CARDIAC RISK FACTORS: (-)Diabetes, (+)Dysli...","No history of CAD, diabetes as far as he knows...",0.0,0.0,0.0,0.0,0.0,0.0,validate


In [264]:
# Count the records for each dataframe represented for validation, training, and test
pop_dataset_type_counts = pop_df['dataset_type'].value_counts()
pop_dataset_type_percentages = pop_df['dataset_type'].value_counts(normalize=True) * 100  # Get percentages

print('Pop dataset size ',pop_df.shape)
print("\nCounts for Pop Dataset:")
print(pop_dataset_type_counts)
print("\nPercentages for Pop Dataset:")
print(pop_dataset_type_percentages)

Pop dataset size  (14443, 27)

Counts for Pop Dataset:
dataset_type
train       9630
validate    2407
test        2406
Name: count, dtype: int64

Percentages for Pop Dataset:
dataset_type
train       66.675898
validate    16.665513
test        16.658589
Name: proportion, dtype: float64


In [265]:
sample_dataset_type_counts = sample_df['dataset_type'].value_counts()
sample_dataset_type_percentages = sample_df['dataset_type'].value_counts(normalize=True) * 100  # Get percentages

print('Sample dataset size ',sample_df.shape)
print("\nCounts for Sample Dataset:")
print(sample_dataset_type_counts)
print("\nPercentages for Sample Dataset:")
print(sample_dataset_type_percentages)

Sample dataset size  (1445, 27)

Counts for Sample Dataset:
dataset_type
train       963
test        241
validate    241
Name: count, dtype: int64

Percentages for Sample Dataset:
dataset_type
train       66.643599
test        16.678201
validate    16.678201
Name: proportion, dtype: float64


In [266]:
def preprocess_data(df):
    x_cols = ['temperature','heartrate','resprate','o2sat','sbp','dbp','pain','acuity','dataset_type'] 
    y_cols = ['atelectasis','cardiomegaly','edema','lung_opacity','pleural_effusion','pneumonia']
    analysis_cols = ['patient_id'] + x_cols + y_cols

    df_new = df[analysis_cols].copy(deep=True)
    df_new['acuity'] = df_new['acuity'].astype('int64')
    df_new['patient_id'] = df_new['patient_id'].astype('int64')    
    df_new = df_new.set_index('patient_id', inplace=False)

    y = df_new[y_cols].astype('int64')
    X = df_new.drop(y_cols, axis=1)

    print('X Cols: ', X.columns)
    print('Y Cols: ', y.columns)
    print('Rows, cols: ', X.shape)

    return df_new, X, y

sample_Xy, sample_X, sample_y = preprocess_data(sample_df)

X Cols:  Index(['temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp', 'pain',
       'acuity', 'dataset_type'],
      dtype='object')
Y Cols:  Index(['atelectasis', 'cardiomegaly', 'edema', 'lung_opacity',
       'pleural_effusion', 'pneumonia'],
      dtype='object')
Rows, cols:  (1445, 9)


In [267]:
# sample_y.iloc[:,:-1].astype('int64')

Split DF New 

In [268]:
sample_X.head()

Unnamed: 0_level_0,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,dataset_type
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10164996,98.4,80.0,20.0,98.0,134.0,76.0,5,3,train
14664256,98.3,75.0,16.0,98.0,147.0,86.0,2,2,train
11058391,98.4,160.0,18.0,100.0,138.0,106.0,0,1,train
12220452,98.8,68.0,16.0,100.0,173.0,70.0,10,3,train
17748848,98.4,88.0,16.0,96.0,157.0,88.0,2,2,train


## Combined Sample Data EDA

In [269]:
print(sample_Xy.info())
sample_Xy.head()

<class 'pandas.core.frame.DataFrame'>
Index: 1445 entries, 10164996 to 12202842
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   temperature       1445 non-null   float64
 1   heartrate         1445 non-null   float64
 2   resprate          1445 non-null   float64
 3   o2sat             1445 non-null   float64
 4   sbp               1445 non-null   float64
 5   dbp               1445 non-null   float64
 6   pain              1445 non-null   int64  
 7   acuity            1445 non-null   int64  
 8   dataset_type      1445 non-null   object 
 9   atelectasis       1445 non-null   float64
 10  cardiomegaly      1445 non-null   float64
 11  edema             1445 non-null   float64
 12  lung_opacity      1445 non-null   float64
 13  pleural_effusion  1445 non-null   float64
 14  pneumonia         1445 non-null   float64
dtypes: float64(12), int64(2), object(1)
memory usage: 180.6+ KB
None


Unnamed: 0_level_0,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,dataset_type,atelectasis,cardiomegaly,edema,lung_opacity,pleural_effusion,pneumonia
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
10164996,98.4,80.0,20.0,98.0,134.0,76.0,5,3,train,0.0,0.0,0.0,0.0,0.0,0.0
14664256,98.3,75.0,16.0,98.0,147.0,86.0,2,2,train,0.0,0.0,1.0,0.0,0.0,0.0
11058391,98.4,160.0,18.0,100.0,138.0,106.0,0,1,train,0.0,0.0,1.0,0.0,1.0,0.0
12220452,98.8,68.0,16.0,100.0,173.0,70.0,10,3,train,0.0,1.0,1.0,0.0,0.0,0.0
17748848,98.4,88.0,16.0,96.0,157.0,88.0,2,2,train,0.0,0.0,0.0,0.0,0.0,0.0


In [270]:
sample_Xy.isnull().sum()

temperature         0
heartrate           0
resprate            0
o2sat               0
sbp                 0
dbp                 0
pain                0
acuity              0
dataset_type        0
atelectasis         0
cardiomegaly        0
edema               0
lung_opacity        0
pleural_effusion    0
pneumonia           0
dtype: int64

In [271]:
sample_Xy.head()

Unnamed: 0_level_0,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,dataset_type,atelectasis,cardiomegaly,edema,lung_opacity,pleural_effusion,pneumonia
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
10164996,98.4,80.0,20.0,98.0,134.0,76.0,5,3,train,0.0,0.0,0.0,0.0,0.0,0.0
14664256,98.3,75.0,16.0,98.0,147.0,86.0,2,2,train,0.0,0.0,1.0,0.0,0.0,0.0
11058391,98.4,160.0,18.0,100.0,138.0,106.0,0,1,train,0.0,0.0,1.0,0.0,1.0,0.0
12220452,98.8,68.0,16.0,100.0,173.0,70.0,10,3,train,0.0,1.0,1.0,0.0,0.0,0.0
17748848,98.4,88.0,16.0,96.0,157.0,88.0,2,2,train,0.0,0.0,0.0,0.0,0.0,0.0


In [272]:
sample_X.head()

Unnamed: 0_level_0,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,dataset_type
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10164996,98.4,80.0,20.0,98.0,134.0,76.0,5,3,train
14664256,98.3,75.0,16.0,98.0,147.0,86.0,2,2,train
11058391,98.4,160.0,18.0,100.0,138.0,106.0,0,1,train
12220452,98.8,68.0,16.0,100.0,173.0,70.0,10,3,train
17748848,98.4,88.0,16.0,96.0,157.0,88.0,2,2,train


In [273]:
sample_y.head()

Unnamed: 0_level_0,atelectasis,cardiomegaly,edema,lung_opacity,pleural_effusion,pneumonia
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10164996,0,0,0,0,0,0
14664256,0,0,1,0,0,0
11058391,0,0,1,0,1,0
12220452,0,1,1,0,0,0
17748848,0,0,0,0,0,0


## Transpose of column unique ordinal values

In [274]:
# Step 1 & 2: Calculate and sort unique values for each column, store them in a dictionary
unique_values_sorted = {}
for column in sample_X[['pain','acuity']].columns:
    unique_values = np.sort(sample_X[column].unique())[::-1]  # Sort unique values in descending order
    unique_values_sorted[column] = unique_values

# Step 3: Create a new DataFrame from the dictionary with transposed columns
unique_values_df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in unique_values_sorted.items()]))

# Transpose the DataFrame so that each row now represents a column in the original DataFrame
unique_values_transposed = unique_values_df.T

unique_values_transposed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
pain,10.0,9.0,8.0,7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0
acuity,4.0,3.0,2.0,1.0,,,,,,,


### Datatypes 
- **Ratio:** `temperature`, `heartrate`,`resprate`, `o2sat`, `sbp` and `dbp`  
- **Ordinal:** `pain` and `acuity`  
- **Nominal:** `patient_id`, `atelectasis`, `cardiomegaly`,`edema`, `lung_opacity`, `pleural_effusion` and `pneumonia` 

Pipeline

In [275]:
# [x for x in sample_y['dataset_type'].unique()]
sample_df['dataset_type'].unique()

array(['train', 'test', 'validate'], dtype=object)

In [276]:
sample_df[sample_X.columns]

Unnamed: 0,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,dataset_type
0,98.4,80.0,20.0,98.0,134.0,76.0,5,3.0,train
1,98.3,75.0,16.0,98.0,147.0,86.0,2,2.0,train
2,98.4,160.0,18.0,100.0,138.0,106.0,0,1.0,train
3,98.8,68.0,16.0,100.0,173.0,70.0,10,3.0,train
4,98.4,88.0,16.0,96.0,157.0,88.0,2,2.0,train
...,...,...,...,...,...,...,...,...,...
1440,101.4,85.0,16.0,97.0,124.0,59.0,8,3.0,validate
1441,97.4,62.0,18.0,100.0,146.0,31.0,7,2.0,validate
1442,97.5,87.0,16.0,100.0,157.0,74.0,0,3.0,validate
1443,98.6,96.0,18.0,97.0,157.0,97.0,0,2.0,validate


In [277]:
train_X = sample_X[sample_X['dataset_type']=='train'].iloc[:,:-1]
val_X = sample_X[sample_X['dataset_type']=='validate'].iloc[:,:-1]
test_X = sample_X[sample_X['dataset_type']=='test'].iloc[:,:-1] # Test will not be used until late stage fusion

train_y = sample_y[sample_X['dataset_type']=='train'].iloc[:,:-1]
val_y = sample_y[sample_X['dataset_type']=='validate'].iloc[:,:-1]
test_y = sample_y[sample_X['dataset_type']=='test'].iloc[:,:-1] # Test will not be used until late stage fusion

In [278]:
print(train_X.shape)
train_X.head()

(963, 8)


Unnamed: 0_level_0,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10164996,98.4,80.0,20.0,98.0,134.0,76.0,5,3
14664256,98.3,75.0,16.0,98.0,147.0,86.0,2,2
11058391,98.4,160.0,18.0,100.0,138.0,106.0,0,1
12220452,98.8,68.0,16.0,100.0,173.0,70.0,10,3
17748848,98.4,88.0,16.0,96.0,157.0,88.0,2,2


In [279]:
print(train_y.shape)
train_y.head()

(963, 5)


Unnamed: 0_level_0,atelectasis,cardiomegaly,edema,lung_opacity,pleural_effusion
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10164996,0,0,0,0,0
14664256,0,0,1,0,0
11058391,0,0,1,0,1
12220452,0,1,1,0,0
17748848,0,0,0,0,0


In [280]:
train_y.value_counts(normalize=True) * 100

atelectasis  cardiomegaly  edema  lung_opacity  pleural_effusion
0            0             0      0             0                   57.736241
                                  1             0                   11.007269
1            0             0      0             0                    5.192108
0            1             0      0             0                    4.880582
             0             0      0             1                    3.634476
                           1      0             0                    2.803738
1            0             0      1             0                    2.699896
                                  0             1                    1.869159
0            1             1      0             0                    1.349948
             0             1      0             1                    1.349948
                           0      1             1                    1.349948
                           1      1             0                    1.038422

In [281]:
print(val_X.shape)
val_X.head()

(241, 8)


Unnamed: 0_level_0,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
16875792,97.9,72.0,16.0,100.0,118.0,81.0,4,2
18884046,99.4,99.0,18.0,96.0,130.0,71.0,0,2
17977549,100.8,89.0,20.0,96.0,129.0,80.0,4,2
13323009,97.0,76.0,14.0,99.0,105.0,66.0,0,3
13686218,97.9,60.0,18.0,95.0,238.0,75.0,3,2


In [282]:
print(val_y.shape)
val_y.head()

(241, 5)


Unnamed: 0_level_0,atelectasis,cardiomegaly,edema,lung_opacity,pleural_effusion
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
16875792,0,0,0,0,0
18884046,0,0,0,0,0
17977549,0,0,0,0,0
13323009,0,0,0,1,0
13686218,0,1,0,0,0


In [283]:
train_y.value_counts(normalize=True) * 100

atelectasis  cardiomegaly  edema  lung_opacity  pleural_effusion
0            0             0      0             0                   57.736241
                                  1             0                   11.007269
1            0             0      0             0                    5.192108
0            1             0      0             0                    4.880582
             0             0      0             1                    3.634476
                           1      0             0                    2.803738
1            0             0      1             0                    2.699896
                                  0             1                    1.869159
0            1             1      0             0                    1.349948
             0             1      0             1                    1.349948
                           0      1             1                    1.349948
                           1      1             0                    1.038422

In [284]:


# # Define types of features
# ordinal_cols = ['pain', 'acuity']
# ratio_cols = ['temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp']

# # Define the preprocessing steps
# ordinal_pp_steps = Pipeline([
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('encoder', OrdinalEncoder()),
#     ('scaler', RobustScaler(with_centering=False))
# ])
# ratio_pp_steps = Pipeline([
#     ('imputer', SimpleImputer(strategy='constant', fill_value=-1)),
#     ('scaler', RobustScaler(with_centering=False))
# ])

# # Define transformers
# transformers = [
#     ("ordinal", ordinal_pp_steps, ordinal_cols),
#     ('ratio', ratio_pp_steps, ratio_cols)
# ]
# preprocessor = ColumnTransformer(transformers=transformers)

# # Define the XGBClassifier
# xgb = XGBClassifier(
#     random_state=rand_state,
#     learning_rate=0.1,
#     max_depth=3,
#     n_estimators=1000,
#     subsample=0.5,
#     use_label_encoder=False,  # Avoids a warning
#     eval_metric='logloss'  # Adjusted for multi-output
# )

# # Final pipeline
# pipe = Pipeline([
#     ('preprocess', preprocessor),
#     ('estimator', MultiOutputClassifier(xgb, n_jobs=-1))
# ])

# # Fit the model
# pipe.fit(train_X, train_y)

# # Predict and evaluate
# y_pred = pipe.predict(test_X)
# print(f"Accuracy: {accuracy_score(test_y, y_pred)}")


In [285]:
# Define types of features
ordinal_cols = ['pain', 'acuity']
ratio_cols = ['temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp']

# Define the preprocessing steps
ordinal_pp_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder()),
    ('scaler', RobustScaler(with_centering=True))
])
ratio_pp_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=-1)),
    ('scaler', RobustScaler(with_centering=True))
])

# Define transformers
transformers = [
    ("ordinal", ordinal_pp_steps, ordinal_cols),
    ('ratio', ratio_pp_steps, ratio_cols)
]
preprocessor = ColumnTransformer(transformers=transformers)

# Define the XGBClassifier
xgb = XGBClassifier(
    random_state=rand_state,
    eval_metric='logloss'  # Adjusted for multi-output
)

# Define the MultiOutputClassifier
clf = MultiOutputClassifier(xgb)

# Final pipeline
pipe = Pipeline([
    ('preprocess', preprocessor), 
    ('estimator', clf)
])

# clf_pred = pipe.fit(train_X, train_y)
# predictions = clf_pred.predict(val_X)


# Define the parameter grid
param_grid = {
    'estimator__estimator__learning_rate': [0.01, 0.1, 0.3],  # Note the double `estimator__`
    'estimator__estimator__max_depth': [3, 4, 5],
}


skf = KFold(n_splits=5, shuffle=True, random_state=rand_state)

# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=rand_state)  

gs = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=skf, scoring='f1_weighted') 
grid_result = gs.fit(train_X, train_y)

# Best parameters
print(grid_result.best_params_)


{'estimator__estimator__learning_rate': 0.3, 'estimator__estimator__max_depth': 5}


In [286]:
results = pd.DataFrame(grid_result.cv_results_)
results[['param_estimator__estimator__learning_rate', 'param_estimator__estimator__max_depth', 'mean_test_score', 'std_test_score']]

Unnamed: 0,param_estimator__estimator__learning_rate,param_estimator__estimator__max_depth,mean_test_score,std_test_score
0,0.01,3,0.0,0.0
1,0.01,4,0.019778,0.024421
2,0.01,5,0.032335,0.023369
3,0.1,3,0.082771,0.016903
4,0.1,4,0.09046,0.029462
5,0.1,5,0.112639,0.017081
6,0.3,3,0.113678,0.036284
7,0.3,4,0.118668,0.020079
8,0.3,5,0.143403,0.032677


In [287]:
results[['param_estimator__estimator__learning_rate', 'param_estimator__estimator__max_depth', 'rank_test_score']]


Unnamed: 0,param_estimator__estimator__learning_rate,param_estimator__estimator__max_depth,rank_test_score
0,0.01,3,9
1,0.01,4,8
2,0.01,5,7
3,0.1,3,6
4,0.1,4,5
5,0.1,5,4
6,0.3,3,3
7,0.3,4,2
8,0.3,5,1


In [288]:
print("Best Index:", grid_result.best_index_)

Best Index: 8


In [289]:
print("Number of Splits:", grid_result.n_splits_)


Number of Splits: 5


In [290]:
print("Total Fit Time:", grid_result.refit_time_)


Total Fit Time: 0.36438608169555664


In [291]:
print("Best Score:", grid_result.best_score_)

Best Score: 0.14340307240963993


In [292]:
# grid_search = GridSearchCV(
#     estimator=pipe, 
#     param_grid=param_grid, 
#     scoring='roc_auc', 
#     cv=skf.split(train_X, train_y),  # split on training data only
#     return_train_score=False
# )

# grid_result = grid_search.fit(train_X, train_y)

best_model = grid_result.best_estimator_  # The model with the best hyperparameters

# Evaluate using your desired metrics
val_score = best_model.score(val_X, val_y)  
print(f"Validation Score: {val_score}")

Validation Score: 0.5145228215767634


In [293]:
n_labels = val_y.shape[1]  # Number of labels

for i in range(n_labels):
    # Predict for each label
    y_pred = xgb.predict(val_X)[:, i]
    y_true = np.array(val_y.iloc[:, i])
    
    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    
    # Display the confusion matrix
    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                  display_labels=["Did not default", "Defaulted"])
    
    disp.plot(values_format='d')
    plt.title(f'Label {i}')
    plt.show()

NotFittedError: need to call fit or load_model beforehand

In [None]:
n_estimators_list = [100, 1000, 3000]

# Setup default parameters for gridsearch
params = {
    'estimator__estimator__n_estimators': n_estimators_list
}

# Fit the model
# Run gridsearch to fine tune your hyperparameters
Grid=GridSearchCV(pipe,param_grid=params,
                eval_metric='aucpr',
                verbose=10, n_jobs=-1,
               ).fit(train_X,train_y)

In [None]:
# Predict and evaluate
y_pred = pipe.predict(val_X)
print(f"Accuracy: {accuracy_score(val_y, y_pred)}")


Replace the GridsearchCV with gridsearch with regular validation

Score on validation too

In [None]:
# Define types of features:
ordinal_cols = ['pain','acuity' ]
ratio_cols   = ['temperature', 'heartrate','resprate', 'o2sat', 'sbp', 'dbp']


# Define the preprocessing steps:
#  For missing values the lines below use one method each but you can use options
ordinal_pp_steps = Pipeline([('most_often',SimpleImputer(strategy='most_frequent')),
                           ('Ordinal',OrdinalEncoder()),
                           ('Robust',RobustScaler(with_centering=True))])  
ratio_pp_steps   = Pipeline([('missing=-1',SimpleImputer(strategy='constant', fill_value=-1)),
                           ('Robust',RobustScaler(with_centering=True))])


# Define transformers
t = [
    ("ordinal",ordinal_pp_steps,ordinal_cols), 
    ('ratio',ratio_pp_steps ,ratio_cols)
    ]
preprocessor = ColumnTransformer(transformers = t)

# Define estimator
xgb =  XGBClassifier(random_state= rand_state, 
                     learning_rate=0.1,
                     max_depth=3,
                     n_estimators=1000,
                     subsample=0.5,
                     validation_fraction=0.2,
                     n_iter_no_change=10,
                     max_features='log2',
                     verbose=1,
                     )

# Final pipeline
pipe = Pipeline([
    ('preprocess',preprocessor),
    ('estimator', xgb)
    ])

learning_rate = [0.01, 0.05, 0.1, 0.2]
max_depth_list = [2, 3, 5]
n_estimators_list = [100, 1000, 3000]

# Setup default parameters for gridsearch
params = {
    # 'estimator__learning_rate': learning_rate,
    # 'estimator__max_depth': max_depth_list,
    'estimator__n_estimators': n_estimators_list
}


# Run gridsearch to fine tune your hyperparameters
Grid=GridSearchCV(pipe,param_grid=params,
                scoring= ['neg_log_loss','roc_auc',],
                return_train_score=True,
                refit='roc_auc',
                verbose=10, n_jobs=-1,
               ).fit(X_train,y_train)



Model

In [None]:
# Cheack gridsearch results and decide on optimum hyperparameters

# updated estimator after fine tuning
uxg = XGBClassifier(random_state= ?,
                          n_estimators= ?final value?,  
                          etc.
                   }

# update pipeline
pipe = Pipeline([('preprocess',preprocessor), ('estimator', uxgb)])

# Fit final model
pipe.fit(X_train, y_train)

# Predict test data
test_prob = pipe.predict_proba(X_test)

Check results & compare