In [None]:
# %load_ext jupyter_black

Uncomment and run the cell below if running in Google Colab.  Make sure your secrets are configured in Colab and that this notebook has been granted access to them.

In [None]:
#!pip install boto3
#!pip install autogluon.tabular
#!pip install shap

# !git clone https://github.com/The-Taimaka-Project/health-predictions.git

#import sys
#sys.path.append('/content/health-predictions')


#import os
#from google.colab import userdata
#os.environ["TAIMAKA_DO_ACCESS_KEY"] = userdata.get('TAIMAKA_DO_ACCESS_KEY')
#os.environ["TAIMAKA_DO_SECRET_KEY"] = userdata.get('TAIMAKA_DO_SECRET_KEY')

In [None]:
# local environment, set up virtual environment
# python -m venv .venv
# . .venv/bin/activate
# then
# pip install -r requirements.txt
#
# or
#
# pip install jupyter
# pip install autogluon.tabular
# pip install lightgbm
# pip install xgboost
# pip install shap

# Move the working directory up two levels.
# Does nothing in Google Colab but necessary if running locally.
# NOTE: the path is relative, so re-running this cell moves up two more levels.
%cd ../..

If running locally, make sure you run the secrets environment-variable assignments before running the following cells.  I run the assignments from a separate .py file connected to the kernel running this notebook.


```
%env TAIMAKA_DO_ACCESS_KEY=your access key
%env TAIMAKA_DO_SECRET_KEY=your secret key
```



In [None]:
import os
from warnings import simplefilter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from autogluon.features.generators import AutoMLPipelineFeatureGenerator
from autogluon.tabular import TabularDataset, TabularPredictor

from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from taimaka_health_predictions.inference.util import (
    DetnReaderWriter,
    ag_feature_generator,
    drop_feature_columns,
    gbm_shap,
    lightgbm_train,
    select_features,
    split_detn_new_onset_medical_complication,
    strip_column_names,
)
from taimaka_health_predictions.utils.digitalocean import DigitalOceanStorage
from taimaka_health_predictions.utils.globals import ETL_DIR, MODEL_DIR, ADMIT_ONLY, NOT_ADMIT_ONLY, logger

In [None]:
# Create the DigitalOcean storage client used later to export the model.
# Run the secrets assignments first so the credential environment variables
# (TAIMAKA_DO_ACCESS_KEY / TAIMAKA_DO_SECRET_KEY) are set.
# NOTE(review): presumably DigitalOceanStorage reads them on construction — confirm.
do_storage = DigitalOceanStorage()

# get the data

In [None]:
# Read the modelling frame (`detn`) and the name of its label column (`label`)
# for the death model.
detn_reader = DetnReaderWriter()
detn, label = detn_reader.read_status_dead()

This notebook trains both strata of the death model:


1.   the event (death) occurs on the first visit: `detn_admit_only`
2.   the event occurs on the second or a later visit: `detn_filtered`

Comment/uncomment the train-test split assignments as appropriate, then run the GBM feature selection and AutoGluon training steps for whichever stratum is selected.



In [None]:
# Thresholds for the cohort filter below (all comparisons are strict "<").
LOS_CUTOFF = 11
MUAC_CUTOFF = 12.1
NULL_MUAC_LOS_CUTOFF = 4
DURATION_DAYS_CUTOFF = 101

# Label rate, label count and frame shape before filtering.
logger.info(f'rate{detn[label].mean()},count {detn[label].sum()},shape{detn.shape}')

# Rule 1: weekly_last_muac is missing, wk1_calc_los and duration_days are both
# below their cutoffs.
muac_missing_rule = (
    detn['weekly_last_muac'].isnull()
    & (detn['wk1_calc_los'] < NULL_MUAC_LOS_CUTOFF)
    & (detn['duration_days'] < DURATION_DAYS_CUTOFF)
)
# Rule 2: weekly_last_muac and wk1_calc_los are both below their cutoffs.
low_muac_rule = (detn['weekly_last_muac'] < MUAC_CUTOFF) & (
    detn['wk1_calc_los'] < LOS_CUTOFF
)

# Keep rows that satisfy either rule.
detn = detn[muac_missing_rule | low_muac_rule]

# Same summary after filtering.
logger.info(f'rate{detn[label].mean()},count {detn[label].sum()},shape{detn.shape}')


In [None]:
def drop_columns(detn_filtered):
  """Drop unwanted feature columns from a stratum frame and return the result.

  Prints the frame's shape before and after calling ``drop_feature_columns``
  so the effect of the drop is visible in the cell output.  Reads the
  notebook-level ``label`` variable.

  Args:
    detn_filtered: frame to reduce (either stratum).

  Returns:
    The frame returned by ``drop_feature_columns``.  Callers must use this
    return value; rebinding the parameter inside the function does not change
    the caller's variable.
  """
  print(detn_filtered.shape)
  columns_to_explicitly_delete = {
    'muac_diff_ratio',
    'muac',
    'household_adults',
    # NOTE(review): 'household_,slept' looks like a typo (stray comma inside
    # the name) — confirm the real column name before changing it.
    'household_,slept',
    'living_children',
    'resp_rate',
    'temperature',
    'weekly_avg_muac',
    # A comma was missing after 'weekly_last_wfh', which silently merged it
    # with 'wfa_trend' into the single name 'weekly_last_wfhwfa_trend'.
    'weekly_last_wfh',
    'wfa_trend',
    'hfa_trend',
    'cat1_complications_weekly',
    'admit_cat1_complications',
    'wk1_rainy_season_weekly',
    'lean_season_admit',
    'wfh_rsquared',
    'wfh_trend',
    'status',
    'status_date',
    'final_date',
    'wk1_calcdate_weekly',
    'wk2_calcdate_weekly',
    'wk3_calcdate_weekly',
  }
  columns_to_keep = {
    "b_referred_emergency",
    "b_wast_admit",
    "cg_age",
    "enr_age",
    "wk1_age",
    "wk1_b_wast",
  }

  detn_filtered = drop_feature_columns(
    detn_filtered,
    label,
    drop_muac=False,
    drop_weight=False,
    drop_height=False,
    columns_to_keep=columns_to_keep,
    columns_to_explicitly_delete=columns_to_explicitly_delete
  )
  print(detn_filtered.shape)
  return detn_filtered

In [None]:
# Stratum 1: the first of the four values returned by the split helper.
detn_admit_only, _, _, _ = split_detn_new_onset_medical_complication(detn, label)

# pids of the detn rows whose pid does not appear in the admit-only stratum
in_admit_only = detn["pid"].isin(detn_admit_only["pid"])
pid_not_in_admit = detn.loc[~in_admit_only, "pid"]

# Stratum 2: an independent copy of the detn rows carrying one of those pids
in_second_stratum = detn["pid"].isin(pid_not_in_admit)
detn_filtered = detn.loc[in_second_stratum].copy()


In [None]:
def _apply_drop_columns(frame):
    """Run drop_columns on `frame` and return the frame to keep using.

    Works whether drop_columns returns the reduced frame or returns None:
    a returned frame is used, otherwise the frame that was passed in is kept.
    """
    reduced = drop_columns(frame)
    return frame if reduced is None else reduced


detn_filtered = _apply_drop_columns(detn_filtered)
detn_admit_only = _apply_drop_columns(detn_admit_only)

# Admit-only stratum: remove every column whose name contains 'rsquared' or
# 'trend'.  Done as one non-mutating drop so the frame is rebound rather than
# modified in place.
detn_admit_only = detn_admit_only.drop(
    columns=[
        col
        for col in detn_admit_only.columns
        if 'rsquared' in col or 'trend' in col
    ]
)

# LightGBM iteration for feature selection

Point X and y to either detn_admit_only or detn_filtered, depending on which stratum you're training.  Just comment/uncomment the X and y assignment lines appropriately.

In [None]:
# Feature-selection split: features (X) are every column except the label (y).
# Stratum 1 (admit only) is active; swap to the commented pair for stratum 2.
X = detn_admit_only.drop(columns=label)
y = detn_admit_only[label]

# X = detn_filtered.drop(columns=label)
# y = detn_filtered[label]


# 80/20 hold-out with a fixed seed; adjust test_size and random_state as needed
gbm_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = gbm_split

# Shapes of X_train, X_test, y_train, y_test, in that order
print(*(part.shape for part in gbm_split))

In [None]:
# Run the project's ag_feature_generator helper on the split; it returns the
# transformed train and test feature frames used by the LightGBM steps below.
X_train_transformed, X_test_transformed = ag_feature_generator(X_train, X_test)

In [None]:
# Pass 1: train LightGBM on all transformed features.
gbm, f1_scored, aic, top_features = lightgbm_train(
    X_train_transformed, X_test_transformed, y_train, y_test
)
# Number of features the model was fitted on, and the score returned as f1_scored
print(len(gbm.feature_name_), f1_scored)

# Pass 2: retrain on only the top_features returned by pass 1.
# Note that gbm, f1_scored, aic and top_features are rebound here.
X_train_transformed_top = X_train_transformed[top_features].copy()
X_test_transformed_top = X_test_transformed[top_features].copy()
gbm, f1_scored, aic, top_features = lightgbm_train(
    X_train_transformed_top, X_test_transformed_top, y_train, y_test
)
print(len(gbm.feature_name_), f1_scored)

# Feature-count search starting from the pass-2 model.
# NOTE(review): the trailing 30, 0, -1 look like a descending range of feature
# counts (30 down to 1) — confirm against select_features.
best_gbm, best_features, results_df, best_aic, features = select_features(
    gbm, X_train_transformed_top, X_test_transformed_top, y_train, y_test, 30, 0, -1
)



In [None]:
# Values returned by select_features: best_aic, best_features and how many
# features that list holds.
print(best_aic, "\n", best_features, len(best_features))
# Cell output: the results table ordered by ascending AIC.
results_df.sort_values(by="AIC", ascending=True)
# Alternative ordering, by descending f1_score:
#results_df.sort_values(by="f1_score", ascending=False)

The most important part!  Set N_FEATURES to the number of features you want: maximize the F1 score while minimizing the number of features.

If you want to see what the 10 features selection would be you can run this cell:
```
print(features[10])
```
If you want to see what was removed from the 10-feature set to get the 9-feature set, you can run a cell like:
```
print(set(features[10]) - set(features[9]))
```





In [None]:
# Size of the feature set to carry forward into AutoGluon training.
N_FEATURES = 16
chosen_features = features[N_FEATURES]
print(N_FEATURES, chosen_features)

# Strip the column names with the project helper, then keep only the names
# that are actual columns of detn.
stripped_names = strip_column_names(chosen_features)
top_features = [name for name in stripped_names if name in detn.columns]

Try to get the columns to be independent of one another.  There should be few, if any, clustering bars on the right side of the second graph.

One technique to remove the bars is to combine the clustered features via the reduce_dimensionality method.  Make sure to modify the DetnReaderWriter read_new_onset_medical_complication() method to do this.  Then drop the columns that were combined in the drop_columns function in this notebook.


In [None]:
# SHAP diagnostics for the N_FEATURES-sized feature set, via the project's
# gbm_shap helper; use the resulting graphs to check feature independence.
# NOTE(review): cutoff=0.5 is presumably the classification threshold — confirm.
gbm_shap(features,N_FEATURES,X_train_transformed,X_test_transformed,X_test_transformed_top,y_train,y_test,cutoff=0.5)

# AutoGluon Training

Point X and y to either detn_admit_only or detn_filtered, depending on which stratum you're training.  Just comment/uncomment the X and y assignment lines appropriately.  (This MUST match what was done in the previous train-test split cell for GBM training.)

In [None]:
# AutoGluon split: only the selected top features (X) and the label (y).
# Stratum 1 (admit only) is active; swap to the commented pair for stratum 2.

X = detn_admit_only.loc[:, top_features]
y = detn_admit_only[label]

# X = detn_filtered[top_features]
# y = detn_filtered[label]

# 75/25 hold-out with a fixed seed; adjust test_size and random_state as needed
ag_split = train_test_split(X, y, test_size=0.25, random_state=43)
X_train, X_test, y_train, y_test = ag_split

# Shapes of X_train, X_test, y_train, y_test, in that order
print(*(part.shape for part in ag_split))

In [None]:
# Path handed to TabularPredictor for this label's models
AG_PATH = f"AutogluonModels/{label}"
# Re-attach the label column to the training features
train_data = TabularDataset(X_train.join(y_train))
# Fit with F1 as the evaluation metric, time_limit=300 and the
# "medium_quality" preset.
predictor = TabularPredictor(label=label, eval_metric="f1", path=AG_PATH).fit(
    train_data, time_limit=300, presets="medium_quality"
)
# Alternative run with a longer time limit and the "good_quality" preset:
# predictor = TabularPredictor(label=label,eval_metric='f1',path=AG_PATH).fit(train_data,time_limit=600,presets='good_quality')

## evaluate AG model on holdout (i.e., test) data

In [None]:
# Hold-out frame: test features with the label column re-attached
test_data2 = TabularDataset(X_test.join(y_test))

# calibrate_decision_threshold() only returns the tuned threshold; it has to be
# handed to set_decision_threshold() before predict()/evaluate() will use it.
calibrated_threshold = predictor.calibrate_decision_threshold()
predictor.set_decision_threshold(calibrated_threshold)

y_pred = predictor.predict(test_data2.drop(columns=[label]))
print(predictor.evaluate(test_data2, silent=True))
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)
print(f1_score(y_test, y_pred))
# 0.1.0 detn_filtered is 0.7999505630313477

## feature importance

In [None]:
# Feature importance on the hold-out data
autogluon_feature_importance = predictor.feature_importance(
    test_data2, subsample_size=1000, time_limit=400
)

# Normalise by the total importance: a running share ("cumsum") and each
# feature's own share ("importance_ratio").
importance_total = autogluon_feature_importance["importance"].sum()
autogluon_feature_importance["cumsum"] = (
    autogluon_feature_importance["importance"].cumsum() / importance_total
)
autogluon_feature_importance["importance_ratio"] = (
    autogluon_feature_importance["importance"] / importance_total
)
# A bare expression in the middle of a cell is never rendered, so show the
# table explicitly (display is provided by the IPython kernel).
display(autogluon_feature_importance[["cumsum", "importance_ratio"]])

fig, ax1 = plt.subplots(figsize=(10, 6))

# Horizontal bars on the primary (bottom) x-axis; only features with a
# positive importance are drawn.
autogluon_feature_importance_filtered = autogluon_feature_importance[
    autogluon_feature_importance["importance"] > 0
]
ax1.barh(
    autogluon_feature_importance_filtered.index,
    autogluon_feature_importance_filtered["importance_ratio"],
    label="Importance Ratio",
)
ax1.set_xlabel("Importance")
ax1.set_ylabel("Features")
ax1.set_title("Feature Importance with Cumulative Sum")
ax1.legend(loc="upper left")  # specify location for the first legend
ax1.grid(True, axis="x")  # gridlines only on the x-axis for the bar plot
ax1.invert_yaxis()

# twiny() adds a second x-axis (drawn on top) that shares the y-axis
ax2 = ax1.twiny()

# Cumulative share as a line on that secondary x-axis
ax2.plot(
    autogluon_feature_importance_filtered["cumsum"],
    autogluon_feature_importance_filtered.index,
    marker="o",
    linestyle="-",
    color="red",
    label="Cumulative Sum",
)
ax2.set_xlabel("Cumulative Sum")
ax2.legend(loc="upper right")  # specify location for the second legend

# Improve layout
fig.tight_layout()
plt.show()

## export the AG model

Comment/uncomment the path assignment depending on which stratum you're training.  This MUST match what the two train-test split cells were set to.

In [None]:
# Version string; used both in the metadata and in the storage path below
VERSION = "0.1.0"

# Metadata passed to to_autogluon_tarball alongside the predictor.
# "inputs" lists the feature names ordered by descending importance.
metadata = {
    "version": VERSION,
    "inputs": autogluon_feature_importance.sort_values(
        by="importance", ascending=False
    ).index.tolist(),
    "outputs": "chance of death",
    "description": (
        "Predicts chance of death"
    ),
    "feature_engineering": (
        "wfh_trend_z is the PCA dimensionalized reduction of normalized ['wfh_rsquared','wfh_trend'] the r-squared and the slope of the linear regression line of the wfh for the patient's history"
    ),
    "contact": "Brian Chaplin",
}

# The active path must match the stratum selected in both train-test split cells.
# use this for strata 1, admit only
path = f"{MODEL_DIR}{label}{ADMIT_ONLY}/{VERSION}/model.tar.gz"

# use this for strata 2, non-admit only (detn_filtered)
#path = f"{MODEL_DIR}{label}{NOT_ADMIT_ONLY}/{VERSION}/model.tar.gz"


# Export the trained predictor and its metadata to `path` via the storage client
do_storage.to_autogluon_tarball(predictor, model_metadata=metadata, path=path)