# In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# For each target column in `groups`: fit a random forest on a 70/30 split,
# compute SHAP values on the held-out rows and save a summary plot as a PDF.

# Load the dataset (the CSV is read with GBK encoding).
tyg_shap = pd.read_csv("all_features.csv", sep=",", encoding='GBK')

# Target columns to loop over; each one is used in turn as `y`.
groups = ['group_30', 'group_90', 'group_180', 'group_1y']

# Feature matrix (X): every column except 'livetime', 'death' and the target
# columns themselves, so that no target is fed back in as a feature.
X = tyg_shap.drop(columns=['livetime', 'death'] + groups)

# Create the output directory up front; savefig() raises if it is missing.
output_dir = 'result'
os.makedirs(output_dir, exist_ok=True)

# The plot font does not depend on the group, so set it once.
plt.rcParams['font.family'] = 'Arial'

# Loop through each group
for group in groups:
    print(f"Processing {group}...")

    # Target variable (y) for the current group
    y = tyg_shap[group]

    # Split dataset into training (70%) and testing (30%) sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

    # Random forest with balanced class weights and limited depth.
    # random_state is fixed (like the split above) so reruns give the same
    # model and therefore the same plot.
    rf = RandomForestClassifier(n_jobs=10, class_weight='balanced', max_depth=5, random_state=1)
    rf.fit(X_train, y_train)

    # Explain the fitted forest directly with TreeExplainer. This replaces the
    # model-agnostic explainer that was wrapped around rf.predict (hard class
    # labels), which is slow and does not always expose shap_values().
    explainer = shap.TreeExplainer(rf)
    shap_values = explainer.shap_values(X_test)

    # For classifiers TreeExplainer returns one set of values per class:
    # a list of 2-D arrays in older shap releases, a single
    # (samples, features, classes) array in newer ones. Keep only the class
    # listed last in rf.classes_ so summary_plot draws a beeswarm.
    # NOTE(review): assumes a binary target whose larger label marks the
    # event of interest -- confirm against the data.
    class_idx = len(rf.classes_) - 1
    if isinstance(shap_values, list):
        shap_values = shap_values[class_idx]
    elif getattr(shap_values, 'ndim', 2) == 3:
        shap_values = shap_values[:, :, class_idx]

    # Plot SHAP summary plot for the test set; show=False keeps the figure
    # open so it can be written to disk below.
    plt.figure()
    shap.summary_plot(shap_values, X_test, show=False)

    # Save the SHAP plot as a PDF in the output directory
    output_filename = f'{output_dir}/shap_{group}.pdf'
    plt.savefig(output_filename, format='pdf')
    plt.close()  # Close the figure so figures do not accumulate across iterations

    print(f"Saved SHAP plot for {group} as {output_filename}")

print("All groups processed.")
