## Evaluating and Analyzing Trained XGBoost Models for Detecting Adverse Events

In this notebook, we analyze trained XGBoost models that detect adverse events for given patients based on their historical data.
Let's start by importing the packages needed.

In [1]:
import json
import os
import numpy as np
import pandas as pd
import shutil

from urllib.parse import urlparse

import boto3

import shap
import tarfile
import pickle

import matplotlib.pyplot as plt
#%matplotlib inline

pd.options.mode.chained_assignment = None

import xgboost as xgb

##User defined import
import utils
from metrics import compute_metrics

In [2]:
def copy_model_from_s3(s3_model_path, local_model_dir):
    """Copy model from s3 to local
    Args:
        s3_model_path(str): S3 path (s3://<bucket>/<key>) where the model gz is saved
        local_model_dir(str): Local directory to download into; created if missing
    Returns:
        Destination model path
    """
    client = boto3.client('s3')

    # Split s3://<bucket>/<key> into its parts; the key must not start with '/'
    parsed = urlparse(s3_model_path)
    bucket = parsed.netloc
    key = parsed.path.lstrip('/')

    # exist_ok avoids the check-then-create race of testing os.path.exists first
    os.makedirs(local_model_dir, exist_ok=True)

    # The local file keeps the last path component of the S3 path
    fname = os.path.basename(s3_model_path)
    output_path = os.path.join(local_model_dir, fname)

    client.download_file(bucket, key, output_path)

    return output_path
   

def load_model(gz_model_path): 
    """
    Loads xgboost trained model from disk
    Args:
        gz_model_path(str): Compressed Model path (a .tar.gz archive that
            contains a pickled model stored as 'xgboost-model')
    Returns:
        xgboost: Xgboost model object
    Note:
        After a successful load, the directory containing the archive is
        deleted together with everything in it.
    """
    model_dir = os.path.dirname(gz_model_path)
    model_path = os.path.join(model_dir, 'xgboost-model')

    # Context managers close the archive and the model file even if
    # extraction or unpickling raises.
    # NOTE(review): extractall trusts the member paths inside the archive;
    # only use this on archives from a trusted source.
    with tarfile.open(gz_model_path, "r:gz") as tar:
        tar.extractall(model_dir)

    #Load Model
    # NOTE(review): pickle.load can execute arbitrary code; only load model
    # files that come from a trusted source.
    with open(model_path, "rb") as model_file:
        model = pickle.load(model_file)

    #Remove the local copy of the model files
    shutil.rmtree(model_dir)

    return model


def get_labels_scores(df_preds_labels, target_names=None):
    """Split a predictions dataframe into label and score arrays.

    Args:
        df_preds_labels(pd.DataFrame): Dataframe of predictions & true labels
        target_names(list): List of target events. When omitted, columns
            ending in 'probs' are taken as scores and columns ending in
            neither 'probs' nor '_' are taken as labels.
    Returns:
        Tuple of labels(np.array), scores(np.array) and Event names(list)
    """
    if target_names is not None:
        label_names = target_names
        pred_names = [name + '_probs' for name in target_names]
    else:
        # Single pass over the columns, preserving their original order
        label_names, pred_names = [], []
        for col in df_preds_labels.columns.tolist():
            if col.endswith('probs'):
                pred_names.append(col)
            elif not col.endswith('_'):
                label_names.append(col)

    labels = df_preds_labels[label_names].values
    scores = df_preds_labels[pred_names].values
    return labels, scores, label_names


In [4]:
# --- What to evaluate ---
SPLIT = 'val'          # name of the preprocessed split file (without '.csv')
NUM_FEATURES = 100     # number of leading columns of the data used as features

# --- Evaluation data ---
PREPROCESSED_DATA_DIR = '/home/ec2-user/SageMaker/CMSAI/modeling/tes/data/final-global/ae/1000/preprocessed'
DATA_PATH = os.path.join(PREPROCESSED_DATA_DIR, '{}.csv'.format(SPLIT))

# --- Training outputs (results are stored per feature count) ---
TRAIN_DATA_DIR = '/home/ec2-user/SageMaker/CMSAI/modeling/tes/data/final-global/ae/1000/training/'
TRAIN_RESULTS_PATH = os.path.join(TRAIN_DATA_DIR, str(NUM_FEATURES), 'train_results.csv')
FINAL_RESULTS_DIR = os.path.join(TRAIN_DATA_DIR, str(NUM_FEATURES), 'final_results')

# --- Local scratch directory that models are downloaded into ---
# load_model deletes the directory containing the downloaded archive after loading.
MODEL_DIR = '/home/ec2-user/SageMaker/CMSAI/modeling/tes/data/final-global/ae/1000/model/'

With the values/paths needed to locate the data and the trained models defined above, we now load the training results.

In [5]:
# Load the per-class training summary; the preview below shows the columns
# class, num_features, val_auc and best_model_path (an S3 path).
df_results = pd.read_csv(TRAIN_RESULTS_PATH)
df_results.head()

Unnamed: 0,class,num_features,val_auc,best_model_path
0,d_5990,100,0.8034,s3://cmsai-mrk-amzn/CSVModelInputs/Tes/models/...
1,d_78605,100,0.816,s3://cmsai-mrk-amzn/CSVModelInputs/Tes/models/...
2,d_486,100,0.8641,s3://cmsai-mrk-amzn/CSVModelInputs/Tes/models/...
3,d_78650,100,0.7395,s3://cmsai-mrk-amzn/CSVModelInputs/Tes/models/...
4,d_78079,100,0.7415,s3://cmsai-mrk-amzn/CSVModelInputs/Tes/models/...


In [6]:
# df_vis = df_results.pivot(index='num_features', columns='class', values='val_auc')
# df_vis.plot()

In [7]:
#Get models having the best performance for each target variable
# The string alias 'max' is used instead of the Python builtin: newer pandas
# versions warn when a builtin is passed to transform.
idx = df_results.groupby('class')['val_auc'].transform('max') == df_results['val_auc']
# If several rows of one class tie on the best AUC, keep only the first so that
# each class maps to exactly one model (duplicate classes would later produce
# duplicate prediction column names).
df_best = df_results[idx].drop_duplicates(subset='class')
print(df_best.shape)
df_best.head()

(20, 4)


Unnamed: 0,class,num_features,val_auc,best_model_path
0,d_5990,100,0.8034,s3://cmsai-mrk-amzn/CSVModelInputs/Tes/models/...
1,d_78605,100,0.816,s3://cmsai-mrk-amzn/CSVModelInputs/Tes/models/...
2,d_486,100,0.8641,s3://cmsai-mrk-amzn/CSVModelInputs/Tes/models/...
3,d_78650,100,0.7395,s3://cmsai-mrk-amzn/CSVModelInputs/Tes/models/...
4,d_78079,100,0.7415,s3://cmsai-mrk-amzn/CSVModelInputs/Tes/models/...


In [8]:
# best_models = [['d_5990', 100, 0.7, 's3://cmsai-mrk-amzn/CSVModelInputs/Tes/models/ae/final/month-0/xgboost/2020-11-10-20-48-57/100/d_5990/output/sagemaker-xgboost-201110-2049-020-212dc74f/output/model.tar.gz'],
#                ['d_78605', 100, 0.6, 's3://cmsai-mrk-amzn/CSVModelInputs/Tes/models/ae/final/month-0/xgboost/2020-11-10-20-48-57/100/d_5990/output/sagemaker-xgboost-201110-2049-016-3e3ab8f4/output/model.tar.gz']]
# columns = ['class', 'num_features', 'val_auc', 'best_model_path']
# df_best = pd.DataFrame(best_models, columns=columns)
# print(df_best.shape)
# df_best.head()

In [9]:
# Load the preprocessed split selected by SPLIT. Later cells use the first
# NUM_FEATURES columns as model inputs and look up each target column by name.
df_data = pd.read_csv(DATA_PATH)
print(df_data.shape)
df_data.head()

(1474322, 320)


Unnamed: 0,h_99213,h_99214,h_36415,d_25000,p_D1E,d_4019,h_85025,h_80053,h_97110,d_4011,...,d_5789,d_78791,d_6826,d_78659,d_78907,d_7840,d_28860,d_4660,d_6829,d_00845
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
def get_model_predictions(row, df_data, local_model_dir):
    """Score df_data with the best model recorded in row.

    Args:
        row: Mapping (e.g. a pd.Series) with the keys 'best_model_path',
            'class' and 'num_features'.
        df_data(pd.DataFrame): Data to score. Its first `num_features`
            columns are used as features; the column named by `class`
            supplies the true labels.
        local_model_dir(str): Local directory the model is downloaded into.
    Returns:
        Tuple (preds, columns). preds is a list of three lists: the true
        labels, the model outputs thresholded at 0.5 (as 0/1 ints) and the
        raw model outputs. columns holds the matching column names:
        <class>, <class>_ and <class>_probs.
    """
    model_s3_path = row['best_model_path']
    target = row['class']
    num_features = row['num_features']

    # Fetch the archive from S3, then unpack and load the model
    local_archive = copy_model_from_s3(model_s3_path, local_model_dir)
    model = load_model(local_archive)

    feature_columns = df_data.columns.tolist()[:num_features]
    true_labels = df_data[target]

    # Features are passed as a bare array (no feature names), labels alongside
    dmatrix = xgb.DMatrix(df_data[feature_columns].values, label=true_labels.values)
    probs = model.predict(dmatrix)

    preds = [
        true_labels.tolist(),
        (probs >= 0.5).astype(int).tolist(),
        probs.tolist(),
    ]
    columns = [target, target + '_', target + '_probs']
    return preds, columns


def get_all_predictions(df_best_models, df_data, local_model_dir):
    """Collect the predictions of every best model into one dataframe.

    Each row of df_best_models is handed to get_model_predictions; the lists
    it returns become columns of the result, in row order.
    """
    all_preds, all_columns = [], []
    for position in range(len(df_best_models)):
        best_row = df_best_models.iloc[position]
        preds, columns = get_model_predictions(best_row, df_data, local_model_dir)
        all_preds.extend(preds)
        all_columns.extend(columns)

    # all_preds holds one list per output column, so transpose to rows x columns
    return pd.DataFrame(np.array(all_preds).T, columns=all_columns)

Evaluate for a sample model

In [None]:
# target = df_best.iloc[0, 0]
# num_features = df_best.iloc[0,1]
# best_model_path = df_best.iloc[0, 3]

# #Copy the best model from s3 to local
# output_path = copy_model_from_s3(best_model_path, MODEL_DIR)
# #Load the copied model
# model = load_model(output_path)
# #model.feature_names

# #Evaluate model on data
# feature_names = df_data.columns.tolist()[:num_features]
# auc = model.eval(xgb.DMatrix(df_data[feature_names].values, df_data[target].values))
# print('AUC: - {}'.format(auc))

In [None]:
# Downloads, loads and scores one model per row of df_best; MODEL_DIR is the
# local download directory.
df_preds = get_all_predictions(df_best, df_data, MODEL_DIR)

In [None]:
# Each target contributes three columns: <target>, <target>_ and <target>_probs.
print(df_preds.shape)
df_preds.head()

In [None]:
# Select label/score columns with the same explicit target list that is handed
# to compute_metrics, so column order and metric names are guaranteed to line up
# (instead of relying on suffix-based column detection).
target_names = df_best['class'].tolist()
np_labels, np_scores, _ = get_labels_scores(df_preds, target_names=target_names)
df_metrics = compute_metrics(np_labels, np_scores, target_names=target_names)

In [None]:
# Sanity check: labels and scores should have the same shape.
print('Labels Shape: {}, Scores Shape: {}'.format(np_labels.shape, np_scores.shape))
df_metrics.head()

In [None]:
#pd.DataFrame(df_metrics.mean()).T

In [None]:
# Compute the summary statistics from the per-target rows only. Dropping any
# existing summary rows first makes this cell safe to re-run: otherwise a second
# run would fold the earlier 'Min'/'Max'/'Average' rows into the new average.
summary_labels = ['Min', 'Max', 'Average']
per_target_metrics = df_metrics.drop(index=summary_labels, errors='ignore')

mn = per_target_metrics.min()
mx = per_target_metrics.max()
avg = per_target_metrics.mean()

df_metrics.loc['Min'] = mn
df_metrics.loc['Max'] = mx
df_metrics.loc['Average'] = avg

In [None]:
# The last rows include the 'Min', 'Max' and 'Average' summary rows.
df_metrics.tail()

In [None]:
feature_names = df_data.columns.tolist()[:NUM_FEATURES]
# exist_ok avoids the check-then-create race of testing os.path.exists first
os.makedirs(FINAL_RESULTS_DIR, exist_ok=True)

#Save the features used (one feature name per line)
features_list_path = os.path.join(FINAL_RESULTS_DIR, 'features.txt')
with open(features_list_path, 'w') as fp:
    fp.write('\n'.join(feature_names))

#Save the final metrics results
final_results_path = os.path.join(FINAL_RESULTS_DIR, SPLIT+'_metrics.csv')
df_metrics.to_csv(final_results_path)

## Explainability and Visualization using SHAP (SHapley Additive exPlanations)

*Source: https://github.com/slundberg/shap*

In [10]:
import warnings
warnings.filterwarnings("ignore")

import shap
import matplotlib.pyplot as plt
%matplotlib inline

# load JS visualization code to notebook
#shap.initjs()

In [None]:
print('Processing for {} data...'.format(SPLIT))
feature_names = df_data.columns.tolist()[:NUM_FEATURES]
X = df_data[feature_names]

#Create a new shap dir if not available
shap_dir = os.path.join(FINAL_RESULTS_DIR, 'shap_'+SPLIT)
os.makedirs(shap_dir, exist_ok=True)


def _save_shap_figure(file_name):
    """Save the active matplotlib figure into shap_dir, then close all figures.

    Closing after every save keeps figures from accumulating across targets.
    Returns the path that was written.
    """
    path = os.path.join(shap_dir, file_name)
    plt.savefig(path, bbox_inches='tight')
    plt.close("all")
    return path


num_rows = df_best.shape[0]
for i in range(num_rows):
    best_row = df_best.iloc[i]
    # Look fields up by column name (as get_model_predictions does) rather than
    # by position, so an extra or reordered column cannot silently shift them.
    target = best_row['class']
    num_features = int(best_row['num_features'])
    best_model_path = best_row['best_model_path']

    # True labels for this target (not used by the plots below)
    y = df_data[target]

    # Explain each model on the features it was trained with. The shared frame
    # is reused when the model's feature count matches NUM_FEATURES.
    if num_features == NUM_FEATURES:
        model_features, X_model = feature_names, X
    else:
        model_features = df_data.columns.tolist()[:num_features]
        X_model = df_data[model_features]

    print('Computing SHAP Results for Target={}...'.format(target))

    #Copy the best model from s3 to local
    output_path = copy_model_from_s3(best_model_path, MODEL_DIR)
    #Load the copied model
    model = load_model(output_path)
    
    # explain the model's predictions using SHAP
    # (same syntax works for LightGBM, CatBoost, scikit-learn and spark models)
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_model)
    
#     vis_path = os.path.join(shap_dir, target+'_shap_values.pkl')
#     with open(vis_path, 'wb') as fp:
#         pickle.dump(shap_values, fp)
        
    # visualize the first prediction's explanation (use matplotlib=True to avoid Javascript)
    shap.force_plot(explainer.expected_value, shap_values[0,:], X_model.iloc[0,:], matplotlib=True, show=False)
    vis_path = _save_shap_figure(target+'_per_patient_shap.png')
    
    # visualize the training set predictions
    #shap.force_plot(explainer.expected_value, shap_values, X) ## Out-of-memory Error
    
    # create a dependence plot to show the effect of a single feature across the whole dataset
    shap.dependence_plot(model_features[0], shap_values, X_model, show=False)
    vis_path = _save_shap_figure(target+'_per_feature_shap.png')
    
    # summarize the effects of all the features
    shap.summary_plot(shap_values, X_model, show=False)
    vis_path = _save_shap_figure(target+'_all_features_shap.png')
    
    #Compute the mean absolute value of the SHAP values for each feature to get a standard bar plot
    shap.summary_plot(shap_values, X_model, plot_type="bar", show=False)
    vis_path = _save_shap_figure(target+'_all_features_importance.png')
    
print('Shap Values and Visualizations Successfully Saved to {}!'.format(shap_dir))

Processing for val data...
Computing SHAP Results for Target=d_5990...
Computing SHAP Results for Target=d_78605...
Computing SHAP Results for Target=d_486...
Computing SHAP Results for Target=d_78650...
