In [None]:
# %load_ext jupyter_black

Uncomment and run the cell below if you are running in Google Colab. Make sure your secrets are configured in Colab and that this notebook has been granted access to them.

In [None]:
# !git clone https://github.com/The-Taimaka-Project/health-predictions.git

# import sys
# sys.path.append('/content/health-predictions')

# !pip install boto3
# !pip install autogluon.tabular
# !pip install shap

# import os
# from google.colab import userdata
# os.environ["TAIMAKA_DO_ACCESS_KEY"] = userdata.get('TAIMAKA_DO_ACCESS_KEY')
# os.environ["TAIMAKA_DO_SECRET_KEY"] = userdata.get('TAIMAKA_DO_SECRET_KEY')

In [None]:
# local environment, set up virtual environment
# python -m venv .venv
# . .venv/bin/activate
# then
# pip install -r requirements.txt
#
# or
#
# pip install jupyter
# pip install autogluon.tabular
# pip install lightgbm
# pip install xgboost
# pip install shap


# Move the kernel's working directory up two levels from the notebook's folder.
# NOTE(review): presumably this lands on the repository root so that the
# `taimaka_health_predictions` imports below resolve — confirm.
# A relative `%cd` is not idempotent: re-running this cell moves up two MORE
# levels, so run it exactly once per kernel session.
%cd ../..

If you are running locally, make sure the secret environment variables are set before running the following cells. I set them from a separate `.py` file that is connected to the kernel running this notebook, using assignments like:


```
%env TAIMAKA_DO_ACCESS_KEY=your access key
%env TAIMAKA_DO_SECRET_KEY=your secret key
```



In [None]:
import os
from warnings import simplefilter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from autogluon.features.generators import AutoMLPipelineFeatureGenerator
from autogluon.tabular import TabularDataset, TabularPredictor

from sklearn.metrics import confusion_matrix, f1_score, average_precision_score, PrecisionRecallDisplay
from sklearn.model_selection import train_test_split
from taimaka_health_predictions.inference.util import (
    DetnReaderWriter,
    ag_feature_generator,
    drop_feature_columns,
    gbm_shap,
    lightgbm_train,
    select_features,
    split_detn_new_onset_medical_complication,
    strip_column_names,
)
from taimaka_health_predictions.utils.digitalocean import DigitalOceanStorage
from taimaka_health_predictions.utils.globals import ETL_DIR, MODEL_DIR, logger

In [None]:
# Storage client used at the end of the notebook to upload the trained model.
# Run the secrets assignments first so the environment variables holding your
# credentials are set before this object is created.
do_storage = DigitalOceanStorage()

# get the data

In [None]:
# Load the modelling frame for the "MUAC loss, 2 weeks consecutive" outcome.
# `detn` is used below as a pandas DataFrame and `label` as the name of its
# target column.
detn_reader = DetnReaderWriter()
detn, label = detn_reader.read_muac_loss_2_weeks_consecutive()

Drop the rows that are ineligible for the "MUAC loss, 2 consecutive weeks" outcome.

In [None]:
# Eligibility thresholds: a row is kept only when it is strictly below both.
LOS_CUTOFF = 12
MUAC_CUTOFF = 12.7

# Treat a missing length of stay as 0 so those rows pass the LOS filter below.
# Assign the result back rather than calling fillna(inplace=True) on the
# column selection: the chained in-place form emits a FutureWarning in
# pandas 2.x and does not update the frame once copy-on-write is enabled.
detn["wk1_calc_los"] = detn["wk1_calc_los"].fillna(0)

logger.info(f'event sum: {detn[label].sum()}, mean: {detn[label].mean()},shape: {detn.shape}')

# Keep only patients who are not discharged, whose last MUAC is below the
# cutoff, and whose length of stay is below the cutoff.
is_eligible = (
    (detn["wk1_b_discharged"] == 0)
    & (detn["weekly_last_muac"] < MUAC_CUTOFF)
    & (detn["wk1_calc_los"] < LOS_CUTOFF)
)
detn = detn[is_eligible].copy()

# Same summary after filtering, to show how many events/rows were removed.
logger.info(f'event sum: {detn[label].sum()}, mean: {detn[label].mean()},shape: {detn.shape}')



In [None]:
# Columns removed regardless of the drop_* switches below.
columns_to_explicitly_delete = {
    "household_adults",
    "household_slept",
    "living_children",
    "weekly_avg_muac",
    "weekly_last_wfh",
    "wk1_muac_diff_rate",
    "muac_diff_ratio_rate",
    "muac_diff_ratio",
}

# Columns that must be retained.
# NOTE(review): presumably these would otherwise be removed by the helper's
# own rules — confirm in drop_feature_columns.
columns_to_keep = {
    "b_referred_emergency",
    "b_wast_admit",
    "cg_age",
    "enr_age",
    "wk1_age",
    "wk1_b_wast",
}

detn_filtered = drop_feature_columns(
    detn,
    label,
    drop_muac=False,
    drop_weight=False,
    drop_height=False,
    columns_to_keep=columns_to_keep,
    columns_to_explicitly_delete=columns_to_explicitly_delete,
)

# Log the shape of the *filtered* frame next to the input's shape; logging
# only detn.shape here says nothing about what the filter removed.
logger.info(f"shape before column filtering: {detn.shape}, after: {detn_filtered.shape}")

# LightGBM iteration for feature selection

In [None]:
# Build the feature matrix from the column-filtered frame. Using `detn` here
# would ignore the result of drop_feature_columns and let the explicitly
# deleted columns back into feature selection.
# NOTE(review): assumes drop_feature_columns returns a DataFrame with the same
# rows as `detn` — the assertion below fails loudly if that is not the case.
X = detn_filtered.drop(columns=label, errors="ignore")
y = detn[label]
assert X.index.equals(y.index), "filtered features and label are not row-aligned"

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Produce the transformed train/test feature frames consumed by the LightGBM
# cells below.
# NOTE(review): presumably the generator is fit on X_train only and then
# applied to X_test — confirm in inference.util.ag_feature_generator.
X_train_transformed, X_test_transformed = ag_feature_generator(X_train, X_test)

In [None]:
# Pass 1: train on every transformed feature. Besides the fitted model and
# its scores, lightgbm_train hands back a `top_features` selection.
gbm, f1_scored, aic, top_features, average_precision = lightgbm_train(
    X_train_transformed,
    X_test_transformed,
    y_train,
    y_test,
)
print(len(gbm.feature_name_), f1_scored, average_precision)

# Pass 2: retrain on the pass-1 selection only and report the same numbers
# so the two passes can be compared.
X_train_transformed_top = X_train_transformed[top_features].copy()
X_test_transformed_top = X_test_transformed[top_features].copy()
gbm, f1_scored, aic, top_features, average_precision = lightgbm_train(
    X_train_transformed_top,
    X_test_transformed_top,
    y_train,
    y_test,
)
print(len(gbm.feature_name_), f1_scored, average_precision)

# Search over feature-set sizes. `features` is indexed by feature count in
# later cells and `results_df` holds the per-size scores.
# NOTE(review): the trailing 30, 0, -1 are passed positionally; presumably a
# descending size range — confirm against select_features' signature.
best_gbm, best_features, results_df, best_aic, features = select_features(
    gbm,
    X_train_transformed_top,
    X_test_transformed_top,
    y_train,
    y_test,
    30,
    0,
    -1,
)

This is the most important part! Set `N_FEATURES` to the number of features you want: maximize the F1 score while keeping the number of features as small as possible.

To see which features would be selected for a 10-feature model, run a cell like:
```
print(features[10])
```
To see which feature was removed from the 10-feature set to obtain the 9-feature set, run a cell like:
```
print(set(features[10]) - set(features[9]))
```





In [None]:
# Summarise the best feature set found by select_features.
n_best_features = len(best_features)
print(best_aic, "\n", best_features, n_best_features)

# Rank every candidate feature-set size by average precision, best first.
# (Sort by "AIC" ascending instead to rank by information criterion.)
results_ranked = results_df.sort_values(by="avg_precision", ascending=False)
results_ranked


In [None]:
# Number of features to carry into AutoGluon; chosen by hand from the
# results table above.
N_FEATURES = 14
selected_transformed = features[N_FEATURES]
print(N_FEATURES, selected_transformed)

# Map the selected names back to columns of the raw frame, keeping only
# those that actually exist in `detn`.
raw_columns = set(detn.columns)
top_features = [
    name for name in strip_column_names(selected_transformed) if name in raw_columns
]

Try to get the columns to be independent of one another. There should be few, if any, clustering bars on the right side of the second graph.

One technique for removing the bars is to combine the clustered features via the `reduce_dimensionality` method. Make sure to modify the `DetnReaderWriter.read_muac_loss_2_weeks_consecutive()` method to do this, then drop the columns that were combined by adding them to `columns_to_explicitly_delete` (passed to `drop_feature_columns`) in this notebook.


In [None]:
# SHAP diagnostics for the N_FEATURES-sized feature set chosen above.
gbm_shap(
    features,
    N_FEATURES,
    X_train_transformed,
    X_test_transformed,
    X_test_transformed_top,
    y_train,
    y_test,
    cutoff=0.5,
)

# AutoGluon Training

In [None]:
# Rebuild the modelling matrices from the raw frame, restricted to the
# features selected above, with the target column as y.
X = detn[top_features]
y = detn[label]

# Same 80/20 split settings (test_size, random_state) as the
# feature-selection split earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=43,
)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Directory where AutoGluon writes the trained models for this label.
AG_PATH = f"AutogluonModels/{label}"
train_data = TabularDataset(X_train.join(y_train))

# Sometimes a good-quality model generalizes better than a medium-quality
# one; it depends on the data. Flip this switch to compare.
MEDIUM_QUALITY_MODE = True

# Plain truth test instead of `== True`; each preset gets its own time
# budget (passed to fit() as time_limit).
if MEDIUM_QUALITY_MODE:
    preset, time_lim = "medium_quality", 300
else:
    preset, time_lim = "good_quality", 600

predictor = TabularPredictor(
    label=label,
    eval_metric="average_precision",
    path=AG_PATH,
).fit(train_data, time_limit=time_lim, presets=preset)


## evaluate AG model on holdout (i.e., test) data

In [None]:
test_data = TabularDataset(X_test.join(y_test))
test_features = test_data.drop(columns=[label])

# calibrate_decision_threshold() only *returns* the tuned threshold; it has
# no effect on predict() until it is passed to set_decision_threshold().
# The predictor's eval_metric (average_precision) does not depend on a
# threshold, so calibrate explicitly for F1, the thresholded metric that is
# reported below.
decision_threshold = predictor.calibrate_decision_threshold(metric="f1")
predictor.set_decision_threshold(decision_threshold)
print("decision threshold: ", decision_threshold)

y_pred = predictor.predict(test_features)
print(predictor.evaluate(test_data, silent=True))
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)
f1_scored = f1_score(y_test, y_pred)

# as_multiclass=False returns the positive-class probability directly, so
# this does not depend on the positive class being labelled exactly `1`.
y_pred_proba = predictor.predict_proba(test_features, as_multiclass=False)
avg_precision = average_precision_score(y_test, y_pred_proba)
print('f1: ',f1_scored, 'average precision: ', avg_precision)

# Plot the Precision-Recall curve. The handle is deliberately not named
# `display`, which would shadow IPython's display() for the rest of the
# session.
pr_display = PrecisionRecallDisplay.from_predictions(
    y_test, y_pred_proba, plot_chance_level=True
)
_ = pr_display.ax_.set_title(f"Precision-Recall Curve for {label} ")
plt.show()


## feature importance

In [None]:
# Feature importance measured on the holdout data.
autogluon_feature_importance = predictor.feature_importance(
    test_data, subsample_size=1000, time_limit=400
)

# Express each importance as a share of the total, plus the running share in
# the row order returned by feature_importance().
total_importance = autogluon_feature_importance["importance"].sum()
autogluon_feature_importance["cumsum"] = (
    autogluon_feature_importance["importance"].cumsum() / total_importance
)
autogluon_feature_importance["importance_ratio"] = (
    autogluon_feature_importance["importance"] / total_importance
)

# A bare expression in the middle of a cell is silently discarded, so print
# the table explicitly to make it appear in the output.
print(autogluon_feature_importance[["cumsum", "importance_ratio"]])

fig, ax1 = plt.subplots(figsize=(10, 6))

# Only features with a positive importance are plotted.
autogluon_feature_importance_filtered = autogluon_feature_importance[
    autogluon_feature_importance["importance"] > 0
]

# Horizontal bars on the primary (bottom) x-axis: one bar per feature.
ax1.barh(
    autogluon_feature_importance_filtered.index,
    autogluon_feature_importance_filtered["importance_ratio"],
    label="Importance Ratio",
)
ax1.set_xlabel("Importance")
ax1.set_ylabel("Features")
ax1.set_title("Feature Importance with Cumulative Sum")
ax1.legend(loc="upper left")  # specify location for the first legend
ax1.grid(True, axis="x")  # gridlines only on the x-axis for the bar plot
ax1.invert_yaxis()  # first row of the table at the top

# twiny() adds a second x-axis (drawn at the top) that shares the feature
# y-axis with the bars.
ax2 = ax1.twiny()

# Cumulative share as a line on the secondary (top) x-axis.
ax2.plot(
    autogluon_feature_importance_filtered["cumsum"],
    autogluon_feature_importance_filtered.index,
    marker="o",
    linestyle="-",
    color="red",
    label="Cumulative Sum",
)
ax2.set_xlabel("Cumulative Sum")
ax2.legend(loc="upper right")  # specify location for the second legend

# Improve layout
fig.tight_layout()
plt.show()

## export the AG model

In [None]:
# Version string used both in the metadata and in the upload path.
VERSION = "0.1.0"

# Model inputs, listed from most to least important.
ranked_inputs = autogluon_feature_importance.sort_values(
    by="importance", ascending=False
).index.tolist()

model_description = (
    f"Predicts chance of {label} given the latest 3 weeks of patient weekly (raw and processed) data plus their admission data."
)
feature_engineering_notes = (
    "wfh_diff_ratio_rate is the change in wfh per kg weight per day using the first and last wfh, wk1_muac_diff_weekly is the difference between most recent visit and prior visit muac"
)

# Metadata stored alongside the model: holdout scores, training settings,
# and a human-readable description of inputs and outputs.
metadata = {
    "f1": f1_scored,
    "average precision": avg_precision,
    "AG quality": preset,
    "AG time limit": time_lim,
    "version": VERSION,
    "inputs": ranked_inputs,
    "outputs": f"chance of {label}",
    "description": model_description,
    "feature_engineering": feature_engineering_notes,
    "contact": "Brian Chaplin",
}

# Destination of the tarball, namespaced by label and version.
path = f"{MODEL_DIR}{label}/{VERSION}/model.tar.gz"

do_storage.to_autogluon_tarball(predictor, model_metadata=metadata, path=path)