In [None]:
#The external pressure characteristics model in period t-1
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
import seaborn as sns
import shap

# Period t-1 sample: 16 predictors (corporate + external pressure) and the
# binary target column "f_GW".
excel_file = "E:\\Desk\\Data1.xlsx"
df = pd.read_excel(excel_file)

feature_names = ["ROA", "Growth", "Lev", "CR", "TAT", "Tbq", "IDT", "AC", "Size", "Owncon1", "Bdnum", "Nshrsms", "Nshrstt", "Govern", "HHI", "Media"]
features = df[feature_names]
labels = df["f_GW"]

# Hold out the test set BEFORE resampling or fitting any preprocessing
# statistic. Resampling first would place synthetic SMOTE points (interpolated
# from rows that end up in training) in the test set and let ENN prune hard
# test cases, which inflates every metric computed on X_test.
# The split is stratified so the untouched, imbalanced test set keeps the
# original class ratio.
X_train_raw, X_test_raw, Y_train_raw, Y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

# Balance the training portion only.
smoteenn = SMOTEENN(random_state=42)
X_resampled, Y_resampled = smoteenn.fit_resample(X_train_raw, Y_train_raw)

print("Class distribution after SMOTEENN:")
print(pd.Series(Y_resampled).value_counts())

# Winsorise each feature at the 1st/99th percentile. Bounds are estimated on
# the (resampled) training data and then applied unchanged to the test data.
quantile_low = 0.01
quantile_high = 0.99
X_test = X_test_raw.copy()  # copy so the column assignments below do not touch df
for feature in feature_names:
    lower_bound = X_resampled[feature].quantile(quantile_low)
    upper_bound = X_resampled[feature].quantile(quantile_high)
    X_resampled[feature] = np.clip(X_resampled[feature], lower_bound, upper_bound)
    X_test[feature] = np.clip(X_test[feature], lower_bound, upper_bound)

# Standardise with mean/std learned from the training data only.
scaler = StandardScaler()
X_resampled[feature_names] = scaler.fit_transform(X_resampled[feature_names])
X_test[feature_names] = scaler.transform(X_test[feature_names])

# Names consumed by the rest of the cell.
X_train, Y_train = X_resampled, Y_resampled

# Native XGBoost containers for the two partitions. The scikit-learn style
# estimator below is fitted on the DataFrames directly.
dtrain = xgb.DMatrix(data=X_train, label=Y_train)
dtest = xgb.DMatrix(data=X_test, label=Y_test)

# Search space: three candidate values for each of six hyper-parameters.
param_grid = dict(
    eta=[0.01, 0.02, 0.05],
    n_estimators=[100, 200, 300],
    max_depth=[4, 6, 8],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    min_child_weight=[1, 2, 3],
)

# Exhaustive 5-fold cross-validated search, scored by accuracy, on all cores.
grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss"),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, Y_train)

print(f"Best parameters from Grid Search: {grid_search.best_params_}")
print(f"Best accuracy from Grid Search: {grid_search.best_score_}")

# Best estimator selected by the grid search.
best_model = grid_search.best_estimator_

# Hard labels from the estimator and positive-class probabilities (column 1).
Y_pred = best_model.predict(X_test)
Y_prob = best_model.predict_proba(X_test)[:, 1]

# Explicit 0.5 cut-off on the probabilities; every metric below uses these labels.
Y_pred_05 = (Y_prob >= 0.5).astype(int)

accuracy_05 = accuracy_score(Y_test, Y_pred_05)
precision_05 = precision_score(Y_test, Y_pred_05)
recall_05 = recall_score(Y_test, Y_pred_05)
f1_05 = f1_score(Y_test, Y_pred_05)

threshold_metrics = (
    ("Accuracy", accuracy_05),
    ("Precision", precision_05),
    ("Recall", recall_05),
    ("F1 Score", f1_05),
)
for metric_label, metric_value in threshold_metrics:
    print(f"{metric_label} (Threshold = 0.5): {metric_value}")

# Confusion matrix rendered as an annotated heat map.
conf_matrix = confusion_matrix(Y_test, Y_pred_05)
class_labels = ["Negative", "Positive"]

plt.figure(figsize=(10, 7))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix (Threshold = 0.5)")
plt.show()

# ROC curve and its area, computed from the raw probabilities.
fpr, tpr, thresholds = roc_curve(Y_test, Y_prob)
roc_auc = auc(fpr, tpr)

print(f"AUC: {roc_auc}")

plt.figure()
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area={roc_auc:.2f})")
# Diagonal reference line (a classifier with no discrimination).
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.show()

# ---- XGBoost built-in feature importances ---------------------------------
feature_importances = best_model.feature_importances_

# Rank (name, importance) pairs from most to least important. The ranked names
# get their OWN variable: `feature_names` must stay in training-column order,
# because the SHAP arrays below are indexed by column position. Rebinding
# `feature_names` to the ranked order would attach the wrong label to every
# SHAP value and plot.
feature_importance_with_names = sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=True)
ranked_feature_names, importances = zip(*feature_importance_with_names)

print("Feature Importances:")
for feature, importance in feature_importance_with_names:
    print(f"{feature}: {importance:.4f}")

plt.figure(figsize=(10, 8))
sns.barplot(x=list(importances), y=list(ranked_feature_names), color='blue')
plt.title('Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()

# ---- SHAP explanations ----------------------------------------------------
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)

# Column labels taken from the matrix that was explained, so that label i
# always describes shap_values[:, i].
shap_feature_names = list(X_test.columns)

# Mean |SHAP| per feature (averaged over test rows), in column order.
shap_abs_mean = np.abs(shap_values).mean(0)

sorted_feature_importance_with_names = sorted(zip(shap_feature_names, shap_abs_mean), key=lambda x: x[1], reverse=True)
sorted_feature_names, sorted_shap_abs_mean = zip(*sorted_feature_importance_with_names)

shap.summary_plot(shap_values, X_test, feature_names=shap_feature_names, plot_type="dot")

plt.figure(figsize=(10, 8))
plt.title('Average Absolute SHAP Values')
plt.barh(sorted_feature_names, sorted_shap_abs_mean, color='skyblue')

# Print the numeric value at the end of each bar.
for index, value in enumerate(sorted_shap_abs_mean):
    plt.text(value, index, f"{value:.4f}", va='center', ha='left')

plt.gca().invert_yaxis()  # largest bar at the top
plt.xlabel('Average Absolute SHAP Value')
plt.ylabel('Feature')
plt.show()

# One dependence plot per feature, without interaction colouring.
for feature_name in shap_feature_names:
    shap.dependence_plot(feature_name, shap_values, X_test, feature_names=shap_feature_names, interaction_index=None)

# Pairwise interaction plots among the three features with the largest mean |SHAP|.
top3_features = np.argsort(shap_abs_mean)[::-1][:3]
top3_feature_names = [shap_feature_names[i] for i in top3_features]
for i, feature_name1 in enumerate(top3_feature_names):
    for feature_name2 in top3_feature_names[i+1:]:
        shap.dependence_plot(feature_name1, shap_values, X_test, feature_names=shap_feature_names, interaction_index=feature_name2)

# Force plots for the first correctly classified positive and negative test
# rows. Positions are row offsets into X_test / shap_values; a class with no
# correct prediction is skipped instead of raising IndexError.
y_true = np.asarray(Y_test)

correct_pred_1_hits = np.flatnonzero((y_true == 1) & (Y_pred_05 == 1))
if correct_pred_1_hits.size:
    correct_pred_1_index = correct_pred_1_hits[0]
    sample_1 = X_test.iloc[correct_pred_1_index]
    shap.force_plot(
        explainer.expected_value,
        np.round(shap_values[correct_pred_1_index], 3),
        np.round(sample_1, 3),
        feature_names=shap_feature_names,
        matplotlib=True
    )
else:
    print("No correctly predicted positive sample in the test set; skipping force plot.")

correct_pred_0_hits = np.flatnonzero((y_true == 0) & (Y_pred_05 == 0))
if correct_pred_0_hits.size:
    correct_pred_0_index = correct_pred_0_hits[0]
    sample_0 = X_test.iloc[correct_pred_0_index]
    shap.force_plot(
        explainer.expected_value,
        np.round(shap_values[correct_pred_0_index], 3),
        np.round(sample_0, 3),
        feature_names=shap_feature_names,
        matplotlib=True
    )
else:
    print("No correctly predicted negative sample in the test set; skipping force plot.")

In [None]:
#The corporate characteristics model in period t-1
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
import seaborn as sns
import shap

# Period t-1 sample: 13 corporate-characteristic predictors and the binary
# target column "f_GW".
excel_file = "E:\\Desk\\Data1.xlsx"
df = pd.read_excel(excel_file)

feature_names = ["ROA", "Growth", "Lev", "CR", "TAT", "Tbq", "IDT", "AC", "Size", "Owncon1", "Bdnum", "Nshrsms", "Nshrstt"]
features = df[feature_names]
labels = df["f_GW"]

# Hold out the test set BEFORE resampling or fitting any preprocessing
# statistic. Resampling first would place synthetic SMOTE points (interpolated
# from rows that end up in training) in the test set and let ENN prune hard
# test cases, which inflates every metric computed on X_test.
# The split is stratified so the untouched, imbalanced test set keeps the
# original class ratio.
X_train_raw, X_test_raw, Y_train_raw, Y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

# Balance the training portion only.
smoteenn = SMOTEENN(random_state=42)
X_resampled, Y_resampled = smoteenn.fit_resample(X_train_raw, Y_train_raw)

print("Class distribution after SMOTEENN:")
print(pd.Series(Y_resampled).value_counts())

# Winsorise each feature at the 1st/99th percentile. Bounds are estimated on
# the (resampled) training data and then applied unchanged to the test data.
quantile_low = 0.01
quantile_high = 0.99
X_test = X_test_raw.copy()  # copy so the column assignments below do not touch df
for feature in feature_names:
    lower_bound = X_resampled[feature].quantile(quantile_low)
    upper_bound = X_resampled[feature].quantile(quantile_high)
    X_resampled[feature] = np.clip(X_resampled[feature], lower_bound, upper_bound)
    X_test[feature] = np.clip(X_test[feature], lower_bound, upper_bound)

# Standardise with mean/std learned from the training data only.
scaler = StandardScaler()
X_resampled[feature_names] = scaler.fit_transform(X_resampled[feature_names])
X_test[feature_names] = scaler.transform(X_test[feature_names])

# Names consumed by the rest of the cell.
X_train, Y_train = X_resampled, Y_resampled

# Native XGBoost containers for the two partitions. The scikit-learn style
# estimator below is fitted on the DataFrames directly.
dtrain = xgb.DMatrix(data=X_train, label=Y_train)
dtest = xgb.DMatrix(data=X_test, label=Y_test)

# Search space: three candidate values for each of six hyper-parameters.
param_grid = dict(
    eta=[0.01, 0.02, 0.05],
    n_estimators=[100, 200, 300],
    max_depth=[4, 6, 8],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    min_child_weight=[1, 2, 3],
)

# Exhaustive 5-fold cross-validated search, scored by accuracy, on all cores.
grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss"),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, Y_train)

print(f"Best parameters from Grid Search: {grid_search.best_params_}")
print(f"Best accuracy from Grid Search: {grid_search.best_score_}")

# Best estimator selected by the grid search.
best_model = grid_search.best_estimator_

# Hard labels from the estimator and positive-class probabilities (column 1).
Y_pred = best_model.predict(X_test)
Y_prob = best_model.predict_proba(X_test)[:, 1]

# Explicit 0.5 cut-off on the probabilities; every metric below uses these labels.
Y_pred_05 = (Y_prob >= 0.5).astype(int)

accuracy_05 = accuracy_score(Y_test, Y_pred_05)
precision_05 = precision_score(Y_test, Y_pred_05)
recall_05 = recall_score(Y_test, Y_pred_05)
f1_05 = f1_score(Y_test, Y_pred_05)

threshold_metrics = (
    ("Accuracy", accuracy_05),
    ("Precision", precision_05),
    ("Recall", recall_05),
    ("F1 Score", f1_05),
)
for metric_label, metric_value in threshold_metrics:
    print(f"{metric_label} (Threshold = 0.5): {metric_value}")

# Confusion matrix rendered as an annotated heat map.
conf_matrix = confusion_matrix(Y_test, Y_pred_05)
class_labels = ["Negative", "Positive"]

plt.figure(figsize=(10, 7))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix (Threshold = 0.5)")
plt.show()

# ROC curve and its area, computed from the raw probabilities.
fpr, tpr, thresholds = roc_curve(Y_test, Y_prob)
roc_auc = auc(fpr, tpr)

print(f"AUC: {roc_auc}")

plt.figure()
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area={roc_auc:.2f})")
# Diagonal reference line (a classifier with no discrimination).
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.show()

# ---- XGBoost built-in feature importances ---------------------------------
feature_importances = best_model.feature_importances_

# Rank (name, importance) pairs from most to least important. The ranked names
# get their OWN variable: `feature_names` must stay in training-column order,
# because the SHAP arrays below are indexed by column position. Rebinding
# `feature_names` to the ranked order would attach the wrong label to every
# SHAP value and plot.
feature_importance_with_names = sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=True)
ranked_feature_names, importances = zip(*feature_importance_with_names)

print("Feature Importances:")
for feature, importance in feature_importance_with_names:
    print(f"{feature}: {importance:.4f}")

plt.figure(figsize=(10, 8))
sns.barplot(x=list(importances), y=list(ranked_feature_names), color='blue')
plt.title('Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()

# ---- SHAP explanations ----------------------------------------------------
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)

# Column labels taken from the matrix that was explained, so that label i
# always describes shap_values[:, i].
shap_feature_names = list(X_test.columns)

# Mean |SHAP| per feature (averaged over test rows), in column order.
shap_abs_mean = np.abs(shap_values).mean(0)

sorted_feature_importance_with_names = sorted(zip(shap_feature_names, shap_abs_mean), key=lambda x: x[1], reverse=True)
sorted_feature_names, sorted_shap_abs_mean = zip(*sorted_feature_importance_with_names)

shap.summary_plot(shap_values, X_test, feature_names=shap_feature_names, plot_type="dot")

plt.figure(figsize=(10, 8))
plt.title('Average Absolute SHAP Values')
plt.barh(sorted_feature_names, sorted_shap_abs_mean, color='skyblue')

# Print the numeric value at the end of each bar.
for index, value in enumerate(sorted_shap_abs_mean):
    plt.text(value, index, f"{value:.4f}", va='center', ha='left')

plt.gca().invert_yaxis()  # largest bar at the top
plt.xlabel('Average Absolute SHAP Value')
plt.ylabel('Feature')
plt.show()

# One dependence plot per feature, without interaction colouring.
for feature_name in shap_feature_names:
    shap.dependence_plot(feature_name, shap_values, X_test, feature_names=shap_feature_names, interaction_index=None)

# Pairwise interaction plots among the three features with the largest mean |SHAP|.
top3_features = np.argsort(shap_abs_mean)[::-1][:3]
top3_feature_names = [shap_feature_names[i] for i in top3_features]
for i, feature_name1 in enumerate(top3_feature_names):
    for feature_name2 in top3_feature_names[i+1:]:
        shap.dependence_plot(feature_name1, shap_values, X_test, feature_names=shap_feature_names, interaction_index=feature_name2)

# Force plots for the first correctly classified positive and negative test
# rows. Positions are row offsets into X_test / shap_values; a class with no
# correct prediction is skipped instead of raising IndexError.
y_true = np.asarray(Y_test)

correct_pred_1_hits = np.flatnonzero((y_true == 1) & (Y_pred_05 == 1))
if correct_pred_1_hits.size:
    correct_pred_1_index = correct_pred_1_hits[0]
    sample_1 = X_test.iloc[correct_pred_1_index]
    shap.force_plot(
        explainer.expected_value,
        np.round(shap_values[correct_pred_1_index], 3),
        np.round(sample_1, 3),
        feature_names=shap_feature_names,
        matplotlib=True
    )
else:
    print("No correctly predicted positive sample in the test set; skipping force plot.")

correct_pred_0_hits = np.flatnonzero((y_true == 0) & (Y_pred_05 == 0))
if correct_pred_0_hits.size:
    correct_pred_0_index = correct_pred_0_hits[0]
    sample_0 = X_test.iloc[correct_pred_0_index]
    shap.force_plot(
        explainer.expected_value,
        np.round(shap_values[correct_pred_0_index], 3),
        np.round(sample_0, 3),
        feature_names=shap_feature_names,
        matplotlib=True
    )
else:
    print("No correctly predicted negative sample in the test set; skipping force plot.")

In [None]:
#The external pressure characteristics model in period t-2
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
import seaborn as sns
import shap

# Period t-2 sample: 16 predictors (corporate + external pressure) and the
# binary target column "f2_GW".
excel_file = "E:\\Desk\\Data2.xlsx"
df = pd.read_excel(excel_file)

feature_names = ["ROA", "Growth", "Lev", "CR", "TAT", "Tbq", "IDT", "AC", "Size", "Owncon1", "Bdnum", "Nshrsms", "Nshrstt", "Govern", "HHI", "Media"]
features = df[feature_names]
labels = df["f2_GW"]

# Hold out the test set BEFORE resampling or fitting any preprocessing
# statistic. Resampling first would place synthetic SMOTE points (interpolated
# from rows that end up in training) in the test set and let ENN prune hard
# test cases, which inflates every metric computed on X_test.
# The split is stratified so the untouched, imbalanced test set keeps the
# original class ratio.
X_train_raw, X_test_raw, Y_train_raw, Y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

# Balance the training portion only.
smoteenn = SMOTEENN(random_state=42)
X_resampled, Y_resampled = smoteenn.fit_resample(X_train_raw, Y_train_raw)

print("Class distribution after SMOTEENN:")
print(pd.Series(Y_resampled).value_counts())

# Winsorise each feature at the 1st/99th percentile. Bounds are estimated on
# the (resampled) training data and then applied unchanged to the test data.
quantile_low = 0.01
quantile_high = 0.99
X_test = X_test_raw.copy()  # copy so the column assignments below do not touch df
for feature in feature_names:
    lower_bound = X_resampled[feature].quantile(quantile_low)
    upper_bound = X_resampled[feature].quantile(quantile_high)
    X_resampled[feature] = np.clip(X_resampled[feature], lower_bound, upper_bound)
    X_test[feature] = np.clip(X_test[feature], lower_bound, upper_bound)

# Standardise with mean/std learned from the training data only.
scaler = StandardScaler()
X_resampled[feature_names] = scaler.fit_transform(X_resampled[feature_names])
X_test[feature_names] = scaler.transform(X_test[feature_names])

# Names consumed by the rest of the cell.
X_train, Y_train = X_resampled, Y_resampled

# Native XGBoost containers for the two partitions. The scikit-learn style
# estimator below is fitted on the DataFrames directly.
dtrain = xgb.DMatrix(data=X_train, label=Y_train)
dtest = xgb.DMatrix(data=X_test, label=Y_test)

# Search space: three candidate values for each of six hyper-parameters.
param_grid = dict(
    eta=[0.01, 0.02, 0.05],
    n_estimators=[100, 200, 300],
    max_depth=[4, 6, 8],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    min_child_weight=[1, 2, 3],
)

# Exhaustive 5-fold cross-validated search, scored by accuracy, on all cores.
grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss"),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, Y_train)

print(f"Best parameters from Grid Search: {grid_search.best_params_}")
print(f"Best accuracy from Grid Search: {grid_search.best_score_}")

# Best estimator selected by the grid search.
best_model = grid_search.best_estimator_

# Hard labels from the estimator and positive-class probabilities (column 1).
Y_pred = best_model.predict(X_test)
Y_prob = best_model.predict_proba(X_test)[:, 1]

# Explicit 0.5 cut-off on the probabilities; every metric below uses these labels.
Y_pred_05 = (Y_prob >= 0.5).astype(int)

accuracy_05 = accuracy_score(Y_test, Y_pred_05)
precision_05 = precision_score(Y_test, Y_pred_05)
recall_05 = recall_score(Y_test, Y_pred_05)
f1_05 = f1_score(Y_test, Y_pred_05)

threshold_metrics = (
    ("Accuracy", accuracy_05),
    ("Precision", precision_05),
    ("Recall", recall_05),
    ("F1 Score", f1_05),
)
for metric_label, metric_value in threshold_metrics:
    print(f"{metric_label} (Threshold = 0.5): {metric_value}")

# Confusion matrix rendered as an annotated heat map.
conf_matrix = confusion_matrix(Y_test, Y_pred_05)
class_labels = ["Negative", "Positive"]

plt.figure(figsize=(10, 7))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix (Threshold = 0.5)")
plt.show()

# ROC curve and its area, computed from the raw probabilities.
fpr, tpr, thresholds = roc_curve(Y_test, Y_prob)
roc_auc = auc(fpr, tpr)

print(f"AUC: {roc_auc}")

plt.figure()
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area={roc_auc:.2f})")
# Diagonal reference line (a classifier with no discrimination).
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.show()

# ---- XGBoost built-in feature importances ---------------------------------
feature_importances = best_model.feature_importances_

# Rank (name, importance) pairs from most to least important. The ranked names
# get their OWN variable: `feature_names` must stay in training-column order,
# because the SHAP arrays below are indexed by column position. Rebinding
# `feature_names` to the ranked order would attach the wrong label to every
# SHAP value and plot.
feature_importance_with_names = sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=True)
ranked_feature_names, importances = zip(*feature_importance_with_names)

print("Feature Importances:")
for feature, importance in feature_importance_with_names:
    print(f"{feature}: {importance:.4f}")

plt.figure(figsize=(10, 8))
sns.barplot(x=list(importances), y=list(ranked_feature_names), color='blue')
plt.title('Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()

# ---- SHAP explanations ----------------------------------------------------
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)

# Column labels taken from the matrix that was explained, so that label i
# always describes shap_values[:, i].
shap_feature_names = list(X_test.columns)

# Mean |SHAP| per feature (averaged over test rows), in column order.
shap_abs_mean = np.abs(shap_values).mean(0)

sorted_feature_importance_with_names = sorted(zip(shap_feature_names, shap_abs_mean), key=lambda x: x[1], reverse=True)
sorted_feature_names, sorted_shap_abs_mean = zip(*sorted_feature_importance_with_names)

shap.summary_plot(shap_values, X_test, feature_names=shap_feature_names, plot_type="dot")

plt.figure(figsize=(10, 8))
plt.title('Average Absolute SHAP Values')
plt.barh(sorted_feature_names, sorted_shap_abs_mean, color='skyblue')

# Print the numeric value at the end of each bar.
for index, value in enumerate(sorted_shap_abs_mean):
    plt.text(value, index, f"{value:.4f}", va='center', ha='left')

plt.gca().invert_yaxis()  # largest bar at the top
plt.xlabel('Average Absolute SHAP Value')
plt.ylabel('Feature')
plt.show()

# One dependence plot per feature, without interaction colouring.
for feature_name in shap_feature_names:
    shap.dependence_plot(feature_name, shap_values, X_test, feature_names=shap_feature_names, interaction_index=None)

# Pairwise interaction plots among the three features with the largest mean |SHAP|.
top3_features = np.argsort(shap_abs_mean)[::-1][:3]
top3_feature_names = [shap_feature_names[i] for i in top3_features]
for i, feature_name1 in enumerate(top3_feature_names):
    for feature_name2 in top3_feature_names[i+1:]:
        shap.dependence_plot(feature_name1, shap_values, X_test, feature_names=shap_feature_names, interaction_index=feature_name2)

# Force plots for the first correctly classified positive and negative test
# rows. Positions are row offsets into X_test / shap_values; a class with no
# correct prediction is skipped instead of raising IndexError.
y_true = np.asarray(Y_test)

correct_pred_1_hits = np.flatnonzero((y_true == 1) & (Y_pred_05 == 1))
if correct_pred_1_hits.size:
    correct_pred_1_index = correct_pred_1_hits[0]
    sample_1 = X_test.iloc[correct_pred_1_index]
    shap.force_plot(
        explainer.expected_value,
        np.round(shap_values[correct_pred_1_index], 3),
        np.round(sample_1, 3),
        feature_names=shap_feature_names,
        matplotlib=True
    )
else:
    print("No correctly predicted positive sample in the test set; skipping force plot.")

correct_pred_0_hits = np.flatnonzero((y_true == 0) & (Y_pred_05 == 0))
if correct_pred_0_hits.size:
    correct_pred_0_index = correct_pred_0_hits[0]
    sample_0 = X_test.iloc[correct_pred_0_index]
    shap.force_plot(
        explainer.expected_value,
        np.round(shap_values[correct_pred_0_index], 3),
        np.round(sample_0, 3),
        feature_names=shap_feature_names,
        matplotlib=True
    )
else:
    print("No correctly predicted negative sample in the test set; skipping force plot.")

In [None]:
#The corporate characteristics model in period t-2
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
import seaborn as sns
import shap

# Period t-2 sample: 13 corporate-characteristic predictors and the binary
# target column "f2_GW".
excel_file = "E:\\Desk\\Data2.xlsx"
df = pd.read_excel(excel_file)

feature_names = ["ROA", "Growth", "Lev", "CR", "TAT", "Tbq", "IDT", "AC", "Size", "Owncon1", "Bdnum", "Nshrsms", "Nshrstt"]
features = df[feature_names]
labels = df["f2_GW"]

# Hold out the test set BEFORE resampling or fitting any preprocessing
# statistic. Resampling first would place synthetic SMOTE points (interpolated
# from rows that end up in training) in the test set and let ENN prune hard
# test cases, which inflates every metric computed on X_test.
# The split is stratified so the untouched, imbalanced test set keeps the
# original class ratio.
X_train_raw, X_test_raw, Y_train_raw, Y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

# Balance the training portion only.
smoteenn = SMOTEENN(random_state=42)
X_resampled, Y_resampled = smoteenn.fit_resample(X_train_raw, Y_train_raw)

print("Class distribution after SMOTEENN:")
print(pd.Series(Y_resampled).value_counts())

# Winsorise each feature at the 1st/99th percentile. Bounds are estimated on
# the (resampled) training data and then applied unchanged to the test data.
quantile_low = 0.01
quantile_high = 0.99
X_test = X_test_raw.copy()  # copy so the column assignments below do not touch df
for feature in feature_names:
    lower_bound = X_resampled[feature].quantile(quantile_low)
    upper_bound = X_resampled[feature].quantile(quantile_high)
    X_resampled[feature] = np.clip(X_resampled[feature], lower_bound, upper_bound)
    X_test[feature] = np.clip(X_test[feature], lower_bound, upper_bound)

# Standardise with mean/std learned from the training data only.
scaler = StandardScaler()
X_resampled[feature_names] = scaler.fit_transform(X_resampled[feature_names])
X_test[feature_names] = scaler.transform(X_test[feature_names])

# Names consumed by the rest of the cell.
X_train, Y_train = X_resampled, Y_resampled

# Native XGBoost containers for the two partitions. The scikit-learn style
# estimator below is fitted on the DataFrames directly.
dtrain = xgb.DMatrix(data=X_train, label=Y_train)
dtest = xgb.DMatrix(data=X_test, label=Y_test)

# Search space: three candidate values for each of six hyper-parameters.
param_grid = dict(
    eta=[0.01, 0.02, 0.05],
    n_estimators=[100, 200, 300],
    max_depth=[4, 6, 8],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    min_child_weight=[1, 2, 3],
)

# Exhaustive 5-fold cross-validated search, scored by accuracy, on all cores.
grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss"),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, Y_train)

print(f"Best parameters from Grid Search: {grid_search.best_params_}")
print(f"Best accuracy from Grid Search: {grid_search.best_score_}")

# Best estimator selected by the grid search.
best_model = grid_search.best_estimator_

# Hard labels from the estimator and positive-class probabilities (column 1).
Y_pred = best_model.predict(X_test)
Y_prob = best_model.predict_proba(X_test)[:, 1]

# Explicit 0.5 cut-off on the probabilities; every metric below uses these labels.
Y_pred_05 = (Y_prob >= 0.5).astype(int)

accuracy_05 = accuracy_score(Y_test, Y_pred_05)
precision_05 = precision_score(Y_test, Y_pred_05)
recall_05 = recall_score(Y_test, Y_pred_05)
f1_05 = f1_score(Y_test, Y_pred_05)

threshold_metrics = (
    ("Accuracy", accuracy_05),
    ("Precision", precision_05),
    ("Recall", recall_05),
    ("F1 Score", f1_05),
)
for metric_label, metric_value in threshold_metrics:
    print(f"{metric_label} (Threshold = 0.5): {metric_value}")

# Confusion matrix rendered as an annotated heat map.
conf_matrix = confusion_matrix(Y_test, Y_pred_05)
class_labels = ["Negative", "Positive"]

plt.figure(figsize=(10, 7))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix (Threshold = 0.5)")
plt.show()

# ROC curve and its area, computed from the raw probabilities.
fpr, tpr, thresholds = roc_curve(Y_test, Y_prob)
roc_auc = auc(fpr, tpr)

print(f"AUC: {roc_auc}")

plt.figure()
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area={roc_auc:.2f})")
# Diagonal reference line (a classifier with no discrimination).
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.show()

# ---- XGBoost built-in feature importances ---------------------------------
feature_importances = best_model.feature_importances_

# Rank (name, importance) pairs from most to least important. The ranked names
# get their OWN variable: `feature_names` must stay in training-column order,
# because the SHAP arrays below are indexed by column position. Rebinding
# `feature_names` to the ranked order would attach the wrong label to every
# SHAP value and plot.
feature_importance_with_names = sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=True)
ranked_feature_names, importances = zip(*feature_importance_with_names)

print("Feature Importances:")
for feature, importance in feature_importance_with_names:
    print(f"{feature}: {importance:.4f}")

plt.figure(figsize=(10, 8))
sns.barplot(x=list(importances), y=list(ranked_feature_names), color='blue')
plt.title('Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()

# ---- SHAP explanations ----------------------------------------------------
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)

# Column labels taken from the matrix that was explained, so that label i
# always describes shap_values[:, i].
shap_feature_names = list(X_test.columns)

# Mean |SHAP| per feature (averaged over test rows), in column order.
shap_abs_mean = np.abs(shap_values).mean(0)

sorted_feature_importance_with_names = sorted(zip(shap_feature_names, shap_abs_mean), key=lambda x: x[1], reverse=True)
sorted_feature_names, sorted_shap_abs_mean = zip(*sorted_feature_importance_with_names)

shap.summary_plot(shap_values, X_test, feature_names=shap_feature_names, plot_type="dot")

plt.figure(figsize=(10, 8))
plt.title('Average Absolute SHAP Values')
plt.barh(sorted_feature_names, sorted_shap_abs_mean, color='skyblue')

# Print the numeric value at the end of each bar.
for index, value in enumerate(sorted_shap_abs_mean):
    plt.text(value, index, f"{value:.4f}", va='center', ha='left')

plt.gca().invert_yaxis()  # largest bar at the top
plt.xlabel('Average Absolute SHAP Value')
plt.ylabel('Feature')
plt.show()

# One dependence plot per feature, without interaction colouring.
for feature_name in shap_feature_names:
    shap.dependence_plot(feature_name, shap_values, X_test, feature_names=shap_feature_names, interaction_index=None)

# Pairwise interaction plots among the three features with the largest mean |SHAP|.
top3_features = np.argsort(shap_abs_mean)[::-1][:3]
top3_feature_names = [shap_feature_names[i] for i in top3_features]
for i, feature_name1 in enumerate(top3_feature_names):
    for feature_name2 in top3_feature_names[i+1:]:
        shap.dependence_plot(feature_name1, shap_values, X_test, feature_names=shap_feature_names, interaction_index=feature_name2)

# Force plots for the first correctly classified positive and negative test
# rows. Positions are row offsets into X_test / shap_values; a class with no
# correct prediction is skipped instead of raising IndexError.
y_true = np.asarray(Y_test)

correct_pred_1_hits = np.flatnonzero((y_true == 1) & (Y_pred_05 == 1))
if correct_pred_1_hits.size:
    correct_pred_1_index = correct_pred_1_hits[0]
    sample_1 = X_test.iloc[correct_pred_1_index]
    shap.force_plot(
        explainer.expected_value,
        np.round(shap_values[correct_pred_1_index], 3),
        np.round(sample_1, 3),
        feature_names=shap_feature_names,
        matplotlib=True
    )
else:
    print("No correctly predicted positive sample in the test set; skipping force plot.")

correct_pred_0_hits = np.flatnonzero((y_true == 0) & (Y_pred_05 == 0))
if correct_pred_0_hits.size:
    correct_pred_0_index = correct_pred_0_hits[0]
    sample_0 = X_test.iloc[correct_pred_0_index]
    shap.force_plot(
        explainer.expected_value,
        np.round(shap_values[correct_pred_0_index], 3),
        np.round(sample_0, 3),
        feature_names=shap_feature_names,
        matplotlib=True
    )
else:
    print("No correctly predicted negative sample in the test set; skipping force plot.")

In [None]:
#Merge the two ROC curves in period t-1
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
import seaborn as sns
import shap

# Period t-1 sample shared by both models compared in this cell.
excel_file = "E:\\Desk\\Data1.xlsx"
df = pd.read_excel(excel_file)

# Model 1: 16 predictors (corporate + external pressure), target "f_GW".
feature_names1 = ["ROA", "Growth", "Lev", "CR", "TAT", "Tbq", "IDT", "AC", "Size", "Owncon1", "Bdnum", "Nshrsms", "Nshrstt", "Govern", "HHI", "Media"]
features1 = df[feature_names1]
labels1 = df["f_GW"]

# Hold out the test set BEFORE resampling or fitting any preprocessing
# statistic; otherwise synthetic SMOTE points derived from training rows end
# up in the test set and the ROC curve is optimistic. Stratified so the
# untouched test set keeps the original class ratio.
X_train_raw1, X_test_raw1, Y_train_raw1, Y_test1 = train_test_split(
    features1, labels1, test_size=0.2, random_state=42, stratify=labels1
)

# Balance the training portion only.
smoteenn1 = SMOTEENN(random_state=42)
X_resampled1, Y_resampled1 = smoteenn1.fit_resample(X_train_raw1, Y_train_raw1)

# Winsorise at the 1st/99th percentile: bounds from the (resampled) training
# data, applied unchanged to the test data.
quantile_low = 0.01
quantile_high = 0.99
X_test1 = X_test_raw1.copy()  # copy so the column assignments below do not touch df
for feature in feature_names1:
    lower_bound = X_resampled1[feature].quantile(quantile_low)
    upper_bound = X_resampled1[feature].quantile(quantile_high)
    X_resampled1[feature] = np.clip(X_resampled1[feature], lower_bound, upper_bound)
    X_test1[feature] = np.clip(X_test1[feature], lower_bound, upper_bound)

# Standardise with mean/std learned from the training data only.
scaler1 = StandardScaler()
X_resampled1[feature_names1] = scaler1.fit_transform(X_resampled1[feature_names1])
X_test1[feature_names1] = scaler1.transform(X_test1[feature_names1])

X_train1, Y_train1 = X_resampled1, Y_resampled1

# Search space shared by both models: three values for each of six hyper-parameters.
param_grid = {
    "eta": [0.01, 0.02, 0.05],
    "n_estimators": [100, 200, 300],
    "max_depth": [4, 6, 8],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "min_child_weight": [1, 2, 3]
}

# Exhaustive 5-fold cross-validated search, scored by accuracy.
grid_search1 = GridSearchCV(xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss"), param_grid, scoring="accuracy", cv=5, verbose=1, n_jobs=-1)
grid_search1.fit(X_train1, Y_train1)

# Positive-class probabilities of the best estimator on the held-out rows.
best_model1 = grid_search1.best_estimator_
Y_prob1 = best_model1.predict_proba(X_test1)[:, 1]

# Model 2: 13 corporate-characteristic predictors, same target "f_GW".
feature_names2 = ["ROA", "Growth", "Lev", "CR", "TAT", "Tbq", "IDT", "AC", "Size", "Owncon1", "Bdnum", "Nshrsms", "Nshrstt"]
features2 = df[feature_names2]
labels2 = df["f_GW"]

# Hold out the test set BEFORE resampling or fitting any preprocessing
# statistic; otherwise synthetic SMOTE points derived from training rows end
# up in the test set and the ROC curve is optimistic. Stratified so the
# untouched test set keeps the original class ratio.
X_train_raw2, X_test_raw2, Y_train_raw2, Y_test2 = train_test_split(
    features2, labels2, test_size=0.2, random_state=42, stratify=labels2
)

# Balance the training portion only.
smoteenn2 = SMOTEENN(random_state=42)
X_resampled2, Y_resampled2 = smoteenn2.fit_resample(X_train_raw2, Y_train_raw2)

# Winsorise with training-derived bounds (quantile_low / quantile_high are
# defined earlier in this cell), applied unchanged to the test data.
X_test2 = X_test_raw2.copy()  # copy so the column assignments below do not touch df
for feature in feature_names2:
    lower_bound = X_resampled2[feature].quantile(quantile_low)
    upper_bound = X_resampled2[feature].quantile(quantile_high)
    X_resampled2[feature] = np.clip(X_resampled2[feature], lower_bound, upper_bound)
    X_test2[feature] = np.clip(X_test2[feature], lower_bound, upper_bound)

# Standardise with mean/std learned from the training data only.
scaler2 = StandardScaler()
X_resampled2[feature_names2] = scaler2.fit_transform(X_resampled2[feature_names2])
X_test2[feature_names2] = scaler2.transform(X_test2[feature_names2])

X_train2, Y_train2 = X_resampled2, Y_resampled2

# Same search space (param_grid, defined earlier in this cell) and CV settings as model 1.
grid_search2 = GridSearchCV(xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss"), param_grid, scoring="accuracy", cv=5, verbose=1, n_jobs=-1)
grid_search2.fit(X_train2, Y_train2)

# Positive-class probabilities of the best estimator on the held-out rows.
best_model2 = grid_search2.best_estimator_
Y_prob2 = best_model2.predict_proba(X_test2)[:, 1]

# ROC points and area under the curve for each of the two models.
fpr1, tpr1, _ = roc_curve(Y_test1, Y_prob1)
roc_auc1 = auc(fpr1, tpr1)
fpr2, tpr2, _ = roc_curve(Y_test2, Y_prob2)
roc_auc2 = auc(fpr2, tpr2)

# (false-positive rates, true-positive rates, colour, legend text) per curve.
roc_series = (
    (fpr1, tpr1, "blue", f"External pressure characteristics model (AUC = {roc_auc1:.2f})"),
    (fpr2, tpr2, "red", f"Corporate characteristics model(AUC = {roc_auc2:.2f})"),
)

plt.figure()
for curve_fpr, curve_tpr, curve_color, curve_label in roc_series:
    plt.plot(curve_fpr, curve_tpr, color=curve_color, lw=2, label=curve_label)
# Diagonal reference line (a classifier with no discrimination).
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve in period t-1")
plt.legend(loc="lower right")
plt.show()

In [None]:
#Merge the two ROC curves in period t-2
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
import seaborn as sns
import shap

# Period t-2 sample shared by both models compared in this cell.
excel_file = "E:\\Desk\\Data2.xlsx"
df = pd.read_excel(excel_file)

# Model 1: 16 predictors (corporate + external pressure), target "f2_GW".
feature_names1 = ["ROA", "Growth", "Lev", "CR", "TAT", "Tbq", "IDT", "AC", "Size", "Owncon1", "Bdnum", "Nshrsms", "Nshrstt", "Govern", "HHI", "Media"]
features1 = df[feature_names1]
labels1 = df["f2_GW"]

# Hold out the test set BEFORE resampling or fitting any preprocessing
# statistic; otherwise synthetic SMOTE points derived from training rows end
# up in the test set and the ROC curve is optimistic. Stratified so the
# untouched test set keeps the original class ratio.
X_train_raw1, X_test_raw1, Y_train_raw1, Y_test1 = train_test_split(
    features1, labels1, test_size=0.2, random_state=42, stratify=labels1
)

# Balance the training portion only.
smoteenn1 = SMOTEENN(random_state=42)
X_resampled1, Y_resampled1 = smoteenn1.fit_resample(X_train_raw1, Y_train_raw1)

# Winsorise at the 1st/99th percentile: bounds from the (resampled) training
# data, applied unchanged to the test data.
quantile_low = 0.01
quantile_high = 0.99
X_test1 = X_test_raw1.copy()  # copy so the column assignments below do not touch df
for feature in feature_names1:
    lower_bound = X_resampled1[feature].quantile(quantile_low)
    upper_bound = X_resampled1[feature].quantile(quantile_high)
    X_resampled1[feature] = np.clip(X_resampled1[feature], lower_bound, upper_bound)
    X_test1[feature] = np.clip(X_test1[feature], lower_bound, upper_bound)

# Standardise with mean/std learned from the training data only.
scaler1 = StandardScaler()
X_resampled1[feature_names1] = scaler1.fit_transform(X_resampled1[feature_names1])
X_test1[feature_names1] = scaler1.transform(X_test1[feature_names1])

X_train1, Y_train1 = X_resampled1, Y_resampled1

# Search space shared by both models: three values for each of six hyper-parameters.
param_grid = {
    "eta": [0.01, 0.02, 0.05],
    "n_estimators": [100, 200, 300],
    "max_depth": [4, 6, 8],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "min_child_weight": [1, 2, 3]
}

# Exhaustive 5-fold cross-validated search, scored by accuracy.
grid_search1 = GridSearchCV(xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss"), param_grid, scoring="accuracy", cv=5, verbose=1, n_jobs=-1)
grid_search1.fit(X_train1, Y_train1)

# Positive-class probabilities of the best estimator on the held-out rows.
best_model1 = grid_search1.best_estimator_
Y_prob1 = best_model1.predict_proba(X_test1)[:, 1]

# Model 2: 13 corporate-characteristic predictors, same target "f2_GW".
feature_names2 = ["ROA", "Growth", "Lev", "CR", "TAT", "Tbq", "IDT", "AC", "Size", "Owncon1", "Bdnum", "Nshrsms", "Nshrstt"]
features2 = df[feature_names2]
labels2 = df["f2_GW"]

# Hold out the test set BEFORE resampling or fitting any preprocessing
# statistic; otherwise synthetic SMOTE points derived from training rows end
# up in the test set and the ROC curve is optimistic. Stratified so the
# untouched test set keeps the original class ratio.
X_train_raw2, X_test_raw2, Y_train_raw2, Y_test2 = train_test_split(
    features2, labels2, test_size=0.2, random_state=42, stratify=labels2
)

# Balance the training portion only.
smoteenn2 = SMOTEENN(random_state=42)
X_resampled2, Y_resampled2 = smoteenn2.fit_resample(X_train_raw2, Y_train_raw2)

# Winsorise with training-derived bounds (quantile_low / quantile_high are
# defined earlier in this cell), applied unchanged to the test data.
X_test2 = X_test_raw2.copy()  # copy so the column assignments below do not touch df
for feature in feature_names2:
    lower_bound = X_resampled2[feature].quantile(quantile_low)
    upper_bound = X_resampled2[feature].quantile(quantile_high)
    X_resampled2[feature] = np.clip(X_resampled2[feature], lower_bound, upper_bound)
    X_test2[feature] = np.clip(X_test2[feature], lower_bound, upper_bound)

# Standardise with mean/std learned from the training data only.
scaler2 = StandardScaler()
X_resampled2[feature_names2] = scaler2.fit_transform(X_resampled2[feature_names2])
X_test2[feature_names2] = scaler2.transform(X_test2[feature_names2])

X_train2, Y_train2 = X_resampled2, Y_resampled2

# Same search space (param_grid, defined earlier in this cell) and CV settings as model 1.
grid_search2 = GridSearchCV(xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss"), param_grid, scoring="accuracy", cv=5, verbose=1, n_jobs=-1)
grid_search2.fit(X_train2, Y_train2)

# Positive-class probabilities of the best estimator on the held-out rows.
best_model2 = grid_search2.best_estimator_
Y_prob2 = best_model2.predict_proba(X_test2)[:, 1]

# ROC points and area under the curve for each of the two models.
fpr1, tpr1, _ = roc_curve(Y_test1, Y_prob1)
roc_auc1 = auc(fpr1, tpr1)
fpr2, tpr2, _ = roc_curve(Y_test2, Y_prob2)
roc_auc2 = auc(fpr2, tpr2)

# (false-positive rates, true-positive rates, colour, legend text) per curve.
roc_series = (
    (fpr1, tpr1, "blue", f"External pressure characteristics model (AUC = {roc_auc1:.2f})"),
    (fpr2, tpr2, "red", f"Corporate characteristics model(AUC = {roc_auc2:.2f})"),
)

plt.figure()
for curve_fpr, curve_tpr, curve_color, curve_label in roc_series:
    plt.plot(curve_fpr, curve_tpr, color=curve_color, lw=2, label=curve_label)
# Diagonal reference line (a classifier with no discrimination).
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve in period t-2")
plt.legend(loc="lower right")
plt.show()