In [None]:
#Heterogeneity prediction results of the corporate characteristics model in period t-1 (heavy pollution)
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
import seaborn as sns
import shap

# Load the heavy-pollution sub-sample and pick the firm-level predictors
# and the binary target column.
excel_file = "E:\\Desk\\Heavy pollution.xlsx"
df = pd.read_excel(excel_file)

feature_names = ["ROA", "Growth", "Lev", "CR", "TAT", "Tbq", "IDT", "AC", "Size", "Owncon1", "Bdnum", "Nshrsms", "Nshrstt"]
features = df[feature_names]
labels = df["f_GW"]

# Hold out the test partition BEFORE any resampling or fitted preprocessing.
# Resampling/scaling the full data first lets information from the test rows
# (and synthetic neighbours of them) leak into training and inflates metrics.
# Stratify so the untouched, still-imbalanced test set keeps both classes.
X_train_raw, X_test_raw, Y_train_raw, Y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

# Rebalance the training partition only; the test set stays real data.
# NOTE(review): the GridSearchCV below still cross-validates on resampled rows;
# wrapping SMOTEENN in an imblearn Pipeline would keep the CV folds clean too.
smoteenn = SMOTEENN(random_state=42)
X_resampled, Y_resampled = smoteenn.fit_resample(X_train_raw, Y_train_raw)

print("Class distribution after SMOTEENN:")
print(pd.Series(Y_resampled).value_counts())

# Work on copies so the raw/resampled frames are not mutated in place.
X_train = X_resampled.copy()
X_test = X_test_raw.copy()
Y_train = Y_resampled

# Winsorise each feature at the 1st/99th percentile. The bounds are estimated
# on the training data and then applied unchanged to the test data.
quantile_low = 0.01
quantile_high = 0.99
for feature in feature_names:
    lower_bound = X_train[feature].quantile(quantile_low)
    upper_bound = X_train[feature].quantile(quantile_high)
    X_train[feature] = np.clip(X_train[feature], lower_bound, upper_bound)
    X_test[feature] = np.clip(X_test[feature], lower_bound, upper_bound)

# Standardise with statistics learned from the training data only.
scaler = StandardScaler()
X_train[feature_names] = scaler.fit_transform(X_train[feature_names])
X_test[feature_names] = scaler.transform(X_test[feature_names])

# Search space for the XGBoost classifier: six hyper-parameters, three values each.
param_grid = dict(
    eta=[0.01, 0.02, 0.05],
    n_estimators=[100, 200, 300],
    max_depth=[4, 6, 8],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    min_child_weight=[1, 2, 3],
)

# Exhaustive 5-fold cross-validated search ranked by accuracy, on all cores.
grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss"),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, Y_train)

print(f"Best parameters from Grid Search: {grid_search.best_params_}")
print(f"Best accuracy from Grid Search: {grid_search.best_score_}")

# Score the held-out rows with the best estimator found by the search.
best_model = grid_search.best_estimator_
Y_pred = best_model.predict(X_test)
Y_prob = best_model.predict_proba(X_test)[:, 1]  # second column of predict_proba

# Hard labels from an explicit 0.5 cut-off on that probability.
Y_pred_05 = np.where(Y_prob >= 0.5, 1, 0)

accuracy_05, precision_05, recall_05, f1_05 = (
    metric(Y_test, Y_pred_05)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy (Threshold = 0.5): {accuracy_05}")
print(f"Precision (Threshold = 0.5): {precision_05}")
print(f"Recall (Threshold = 0.5): {recall_05}")
print(f"F1 Score (Threshold = 0.5): {f1_05}")

# ROC curve and the area under it, from the predicted probabilities.
fpr, tpr, thresholds = roc_curve(Y_test, Y_prob)
roc_auc = auc(fpr, tpr)

print(f"AUC: {roc_auc}")

plt.figure()
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area={roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")  # chance diagonal
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.show()

In [None]:
#Heterogeneity prediction results of the corporate characteristics model in period t-1 (non-heavy pollution)
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
import seaborn as sns
import shap

# Load the non-heavy-pollution sub-sample and pick the firm-level predictors
# and the binary target column.
excel_file = "E:\\Desk\\Non-heavy pollution.xlsx"
df = pd.read_excel(excel_file)

feature_names = ["ROA", "Growth", "Lev", "CR", "TAT", "Tbq", "IDT", "AC", "Size", "Owncon1", "Bdnum", "Nshrsms", "Nshrstt"]
features = df[feature_names]
labels = df["f_GW"]

# Hold out the test partition BEFORE any resampling or fitted preprocessing.
# Resampling/scaling the full data first lets information from the test rows
# (and synthetic neighbours of them) leak into training and inflates metrics.
# Stratify so the untouched, still-imbalanced test set keeps both classes.
X_train_raw, X_test_raw, Y_train_raw, Y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

# Rebalance the training partition only; the test set stays real data.
# NOTE(review): the GridSearchCV below still cross-validates on resampled rows;
# wrapping SMOTEENN in an imblearn Pipeline would keep the CV folds clean too.
smoteenn = SMOTEENN(random_state=42)
X_resampled, Y_resampled = smoteenn.fit_resample(X_train_raw, Y_train_raw)

print("Class distribution after SMOTEENN:")
print(pd.Series(Y_resampled).value_counts())

# Work on copies so the raw/resampled frames are not mutated in place.
X_train = X_resampled.copy()
X_test = X_test_raw.copy()
Y_train = Y_resampled

# Winsorise each feature at the 1st/99th percentile. The bounds are estimated
# on the training data and then applied unchanged to the test data.
quantile_low = 0.01
quantile_high = 0.99
for feature in feature_names:
    lower_bound = X_train[feature].quantile(quantile_low)
    upper_bound = X_train[feature].quantile(quantile_high)
    X_train[feature] = np.clip(X_train[feature], lower_bound, upper_bound)
    X_test[feature] = np.clip(X_test[feature], lower_bound, upper_bound)

# Standardise with statistics learned from the training data only.
scaler = StandardScaler()
X_train[feature_names] = scaler.fit_transform(X_train[feature_names])
X_test[feature_names] = scaler.transform(X_test[feature_names])

# Search space for the XGBoost classifier: six hyper-parameters, three values each.
param_grid = dict(
    eta=[0.01, 0.02, 0.05],
    n_estimators=[100, 200, 300],
    max_depth=[4, 6, 8],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    min_child_weight=[1, 2, 3],
)

# Exhaustive 5-fold cross-validated search ranked by accuracy, on all cores.
grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss"),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, Y_train)

print(f"Best parameters from Grid Search: {grid_search.best_params_}")
print(f"Best accuracy from Grid Search: {grid_search.best_score_}")

# Score the held-out rows with the best estimator found by the search.
best_model = grid_search.best_estimator_
Y_pred = best_model.predict(X_test)
Y_prob = best_model.predict_proba(X_test)[:, 1]  # second column of predict_proba

# Hard labels from an explicit 0.5 cut-off on that probability.
Y_pred_05 = np.where(Y_prob >= 0.5, 1, 0)

accuracy_05, precision_05, recall_05, f1_05 = (
    metric(Y_test, Y_pred_05)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy (Threshold = 0.5): {accuracy_05}")
print(f"Precision (Threshold = 0.5): {precision_05}")
print(f"Recall (Threshold = 0.5): {recall_05}")
print(f"F1 Score (Threshold = 0.5): {f1_05}")

# ROC curve and the area under it, from the predicted probabilities.
fpr, tpr, thresholds = roc_curve(Y_test, Y_prob)
roc_auc = auc(fpr, tpr)

print(f"AUC: {roc_auc}")

plt.figure()
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area={roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")  # chance diagonal
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.show()

In [None]:
#Heterogeneity prediction results of the corporate characteristics model in period t-1 (state-owned)
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
import seaborn as sns
import shap

# Load the state-owned sub-sample and pick the firm-level predictors
# and the binary target column.
excel_file = "E:\\Desk\\State-owned.xlsx"
df = pd.read_excel(excel_file)

feature_names = ["ROA", "Growth", "Lev", "CR", "TAT", "Tbq", "IDT", "AC", "Size", "Owncon1", "Bdnum", "Nshrsms", "Nshrstt"]
features = df[feature_names]
labels = df["f_GW"]

# Hold out the test partition BEFORE any resampling or fitted preprocessing.
# Resampling/scaling the full data first lets information from the test rows
# (and synthetic neighbours of them) leak into training and inflates metrics.
# Stratify so the untouched, still-imbalanced test set keeps both classes.
X_train_raw, X_test_raw, Y_train_raw, Y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

# Rebalance the training partition only; the test set stays real data.
# NOTE(review): the GridSearchCV below still cross-validates on resampled rows;
# wrapping SMOTEENN in an imblearn Pipeline would keep the CV folds clean too.
smoteenn = SMOTEENN(random_state=42)
X_resampled, Y_resampled = smoteenn.fit_resample(X_train_raw, Y_train_raw)

print("Class distribution after SMOTEENN:")
print(pd.Series(Y_resampled).value_counts())

# Work on copies so the raw/resampled frames are not mutated in place.
X_train = X_resampled.copy()
X_test = X_test_raw.copy()
Y_train = Y_resampled

# Winsorise each feature at the 1st/99th percentile. The bounds are estimated
# on the training data and then applied unchanged to the test data.
quantile_low = 0.01
quantile_high = 0.99
for feature in feature_names:
    lower_bound = X_train[feature].quantile(quantile_low)
    upper_bound = X_train[feature].quantile(quantile_high)
    X_train[feature] = np.clip(X_train[feature], lower_bound, upper_bound)
    X_test[feature] = np.clip(X_test[feature], lower_bound, upper_bound)

# Standardise with statistics learned from the training data only.
scaler = StandardScaler()
X_train[feature_names] = scaler.fit_transform(X_train[feature_names])
X_test[feature_names] = scaler.transform(X_test[feature_names])

# Search space for the XGBoost classifier: six hyper-parameters, three values each.
param_grid = dict(
    eta=[0.01, 0.02, 0.05],
    n_estimators=[100, 200, 300],
    max_depth=[4, 6, 8],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    min_child_weight=[1, 2, 3],
)

# Exhaustive 5-fold cross-validated search ranked by accuracy, on all cores.
grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss"),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, Y_train)

print(f"Best parameters from Grid Search: {grid_search.best_params_}")
print(f"Best accuracy from Grid Search: {grid_search.best_score_}")

# Score the held-out rows with the best estimator found by the search.
best_model = grid_search.best_estimator_
Y_pred = best_model.predict(X_test)
Y_prob = best_model.predict_proba(X_test)[:, 1]  # second column of predict_proba

# Hard labels from an explicit 0.5 cut-off on that probability.
Y_pred_05 = np.where(Y_prob >= 0.5, 1, 0)

accuracy_05, precision_05, recall_05, f1_05 = (
    metric(Y_test, Y_pred_05)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy (Threshold = 0.5): {accuracy_05}")
print(f"Precision (Threshold = 0.5): {precision_05}")
print(f"Recall (Threshold = 0.5): {recall_05}")
print(f"F1 Score (Threshold = 0.5): {f1_05}")

# ROC curve and the area under it, from the predicted probabilities.
fpr, tpr, thresholds = roc_curve(Y_test, Y_prob)
roc_auc = auc(fpr, tpr)

print(f"AUC: {roc_auc}")

plt.figure()
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area={roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")  # chance diagonal
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.show()

In [None]:
#Heterogeneity prediction results of the corporate characteristics model in period t-1 (non-state-owned)
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
import seaborn as sns
import shap

# Load the non-state-owned sub-sample and pick the firm-level predictors
# and the binary target column.
excel_file = "E:\\Desk\\Non-state-owned.xlsx"
df = pd.read_excel(excel_file)

feature_names = ["ROA", "Growth", "Lev", "CR", "TAT", "Tbq", "IDT", "AC", "Size", "Owncon1", "Bdnum", "Nshrsms", "Nshrstt"]
features = df[feature_names]
labels = df["f_GW"]

# Hold out the test partition BEFORE any resampling or fitted preprocessing.
# Resampling/scaling the full data first lets information from the test rows
# (and synthetic neighbours of them) leak into training and inflates metrics.
# Stratify so the untouched, still-imbalanced test set keeps both classes.
X_train_raw, X_test_raw, Y_train_raw, Y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

# Rebalance the training partition only; the test set stays real data.
# NOTE(review): the GridSearchCV below still cross-validates on resampled rows;
# wrapping SMOTEENN in an imblearn Pipeline would keep the CV folds clean too.
smoteenn = SMOTEENN(random_state=42)
X_resampled, Y_resampled = smoteenn.fit_resample(X_train_raw, Y_train_raw)

print("Class distribution after SMOTEENN:")
print(pd.Series(Y_resampled).value_counts())

# Work on copies so the raw/resampled frames are not mutated in place.
X_train = X_resampled.copy()
X_test = X_test_raw.copy()
Y_train = Y_resampled

# Winsorise each feature at the 1st/99th percentile. The bounds are estimated
# on the training data and then applied unchanged to the test data.
quantile_low = 0.01
quantile_high = 0.99
for feature in feature_names:
    lower_bound = X_train[feature].quantile(quantile_low)
    upper_bound = X_train[feature].quantile(quantile_high)
    X_train[feature] = np.clip(X_train[feature], lower_bound, upper_bound)
    X_test[feature] = np.clip(X_test[feature], lower_bound, upper_bound)

# Standardise with statistics learned from the training data only.
scaler = StandardScaler()
X_train[feature_names] = scaler.fit_transform(X_train[feature_names])
X_test[feature_names] = scaler.transform(X_test[feature_names])

# Search space for the XGBoost classifier: six hyper-parameters, three values each.
param_grid = dict(
    eta=[0.01, 0.02, 0.05],
    n_estimators=[100, 200, 300],
    max_depth=[4, 6, 8],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    min_child_weight=[1, 2, 3],
)

# Exhaustive 5-fold cross-validated search ranked by accuracy, on all cores.
grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss"),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, Y_train)

print(f"Best parameters from Grid Search: {grid_search.best_params_}")
print(f"Best accuracy from Grid Search: {grid_search.best_score_}")

# Score the held-out rows with the best estimator found by the search.
best_model = grid_search.best_estimator_
Y_pred = best_model.predict(X_test)
Y_prob = best_model.predict_proba(X_test)[:, 1]  # second column of predict_proba

# Hard labels from an explicit 0.5 cut-off on that probability.
Y_pred_05 = np.where(Y_prob >= 0.5, 1, 0)

accuracy_05, precision_05, recall_05, f1_05 = (
    metric(Y_test, Y_pred_05)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy (Threshold = 0.5): {accuracy_05}")
print(f"Precision (Threshold = 0.5): {precision_05}")
print(f"Recall (Threshold = 0.5): {recall_05}")
print(f"F1 Score (Threshold = 0.5): {f1_05}")

# ROC curve and the area under it, from the predicted probabilities.
fpr, tpr, thresholds = roc_curve(Y_test, Y_prob)
roc_auc = auc(fpr, tpr)

print(f"AUC: {roc_auc}")

plt.figure()
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area={roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")  # chance diagonal
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.show()

In [None]:
#The three-classification prediction model for greenwashing in period t-1
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
import seaborn as sns

# Load the three-class data set and pick the firm-level predictors and the
# "TC" target column, cast to integer class codes.
excel_file = "E:\\Desk\\Three classifications.xlsx"
df = pd.read_excel(excel_file)

feature_names = ["ROA", "Growth", "Lev", "CR", "TAT", "Tbq", "IDT", "AC", "Size", "Owncon1", "Bdnum", "Nshrsms", "Nshrstt"]
features = df[feature_names]
labels = df["TC"].astype(int)

# Hold out the test partition BEFORE any resampling or fitted preprocessing.
# Resampling/scaling the full data first lets information from the test rows
# (and synthetic neighbours of them) leak into training and inflates metrics.
# Stratify so the untouched test set keeps every class represented.
X_train_raw, X_test_raw, Y_train_raw, Y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

# Rebalance the training partition only; the test set stays real data.
# NOTE(review): the GridSearchCV below still cross-validates on resampled rows;
# wrapping SMOTEENN in an imblearn Pipeline would keep the CV folds clean too.
smoteenn = SMOTEENN(random_state=42)
X_resampled, Y_resampled = smoteenn.fit_resample(X_train_raw, Y_train_raw)

print("Class distribution after SMOTEENN:")
print(pd.Series(Y_resampled).value_counts())

# Work on copies so the raw/resampled frames are not mutated in place.
X_train = X_resampled.copy()
X_test = X_test_raw.copy()

# Winsorise each feature at the 1st/99th percentile. The bounds are estimated
# on the training data and then applied unchanged to the test data.
quantile_low = 0.01
quantile_high = 0.99
for feature in feature_names:
    lower_bound = X_train[feature].quantile(quantile_low)
    upper_bound = X_train[feature].quantile(quantile_high)
    X_train[feature] = np.clip(X_train[feature], lower_bound, upper_bound)
    X_test[feature] = np.clip(X_test[feature], lower_bound, upper_bound)

# Standardise with statistics learned from the training data only.
scaler = StandardScaler()
X_train[feature_names] = scaler.fit_transform(X_train[feature_names])
X_test[feature_names] = scaler.transform(X_test[feature_names])

# Keep the targets as integer class codes for the classifier and the metrics.
Y_train = Y_resampled.astype(int)
Y_test = Y_test.astype(int)

# Search space for the XGBoost classifier: six hyper-parameters, three values each.
param_grid = {
    "eta": [0.01, 0.02, 0.05],
    "n_estimators": [100, 200, 300],
    "max_depth": [4, 6, 8],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "min_child_weight": [1, 2, 3]
}

# Exhaustive 5-fold cross-validated search, ranked by weighted F1.
grid_search = GridSearchCV(
    xgb.XGBClassifier(objective="multi:softprob", num_class=3, eval_metric="mlogloss"),
    param_grid,
    scoring="f1_weighted",
    cv=5,
    verbose=1,
    n_jobs=-1
)
grid_search.fit(X_train.values, Y_train.values)

print(f"Best parameters from Grid Search: {grid_search.best_params_}")
# The search is scored with "f1_weighted", so best_score_ is a weighted F1,
# not an accuracy; the label now says so.
print(f"Best weighted F1 from Grid Search: {grid_search.best_score_}")

best_model = grid_search.best_estimator_
Y_pred = best_model.predict(X_test.values)

print(classification_report(Y_test, Y_pred))

# Fix the class order explicitly so the matrix rows/columns and the tick labels
# are guaranteed to line up (one label per class, not a binary pair).
class_labels = np.union1d(Y_test, Y_pred)
conf_matrix = confusion_matrix(Y_test, Y_pred, labels=class_labels)

plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_labels,
            yticklabels=class_labels,
            annot_kws={"size": 18})
plt.xlabel("Predicted")
plt.ylabel("True")
# No probability threshold is applied here: predict() returns class labels directly.
plt.title("Confusion Matrix (Three-class)")
plt.show()

# Feature importances from the fitted model, listed from most to least important.
feature_importances = best_model.feature_importances_

feature_importance_with_names = sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=True)

print("Feature Importances:")
for feature, importance in feature_importance_with_names:
    print(f"{feature}: {importance:.4f}")

In [None]:
#The corporate characteristics model in period t-1 based on Wind
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
import seaborn as sns
import shap

# Load the Wind ESG data set and pick the firm-level predictors and the binary
# target column. Backslashes are escaped: "\D" and "\W" are invalid escape
# sequences that newer Python versions warn about (same runtime path as before).
excel_file = "E:\\Desk\\Wind ESG.xlsx"
df = pd.read_excel(excel_file)

feature_names = ["ROA", "Growth", "Lev", "CR", "TAT", "Tbq", "IDT", "AC", "Size", "Owncon1", "Bdnum", "Nshrsms", "Nshrstt"]
features = df[feature_names]
labels = df["f_GW"]

# Hold out the test partition BEFORE any resampling or fitted preprocessing.
# Resampling/scaling the full data first lets information from the test rows
# (and synthetic neighbours of them) leak into training and inflates metrics.
# Stratify so the untouched, still-imbalanced test set keeps both classes.
X_train_raw, X_test_raw, Y_train_raw, Y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

# Rebalance the training partition only; the test set stays real data.
# NOTE(review): the GridSearchCV below still cross-validates on resampled rows;
# wrapping SMOTEENN in an imblearn Pipeline would keep the CV folds clean too.
smoteenn = SMOTEENN(random_state=42)
X_resampled, Y_resampled = smoteenn.fit_resample(X_train_raw, Y_train_raw)

print("Class distribution after SMOTEENN:")
print(pd.Series(Y_resampled).value_counts())

# Work on copies so the raw/resampled frames are not mutated in place.
X_train = X_resampled.copy()
X_test = X_test_raw.copy()
Y_train = Y_resampled

# Winsorise each feature at the 1st/99th percentile. The bounds are estimated
# on the training data and then applied unchanged to the test data.
quantile_low = 0.01
quantile_high = 0.99
for feature in feature_names:
    lower_bound = X_train[feature].quantile(quantile_low)
    upper_bound = X_train[feature].quantile(quantile_high)
    X_train[feature] = np.clip(X_train[feature], lower_bound, upper_bound)
    X_test[feature] = np.clip(X_test[feature], lower_bound, upper_bound)

# Standardise with statistics learned from the training data only.
scaler = StandardScaler()
X_train[feature_names] = scaler.fit_transform(X_train[feature_names])
X_test[feature_names] = scaler.transform(X_test[feature_names])

# Search space for the XGBoost classifier: six hyper-parameters, three values each.
param_grid = dict(
    eta=[0.01, 0.02, 0.05],
    n_estimators=[100, 200, 300],
    max_depth=[4, 6, 8],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    min_child_weight=[1, 2, 3],
)

# Exhaustive 5-fold cross-validated search ranked by accuracy, on all cores.
grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss"),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, Y_train)

print(f"Best parameters from Grid Search: {grid_search.best_params_}")
print(f"Best accuracy from Grid Search: {grid_search.best_score_}")

# Score the held-out rows with the best estimator found by the search.
best_model = grid_search.best_estimator_
Y_pred = best_model.predict(X_test)
Y_prob = best_model.predict_proba(X_test)[:, 1]  # second column of predict_proba

# Hard labels from an explicit 0.5 cut-off on that probability.
Y_pred_05 = np.where(Y_prob >= 0.5, 1, 0)

accuracy_05, precision_05, recall_05, f1_05 = (
    metric(Y_test, Y_pred_05)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy (Threshold = 0.5): {accuracy_05}")
print(f"Precision (Threshold = 0.5): {precision_05}")
print(f"Recall (Threshold = 0.5): {recall_05}")
print(f"F1 Score (Threshold = 0.5): {f1_05}")

# Confusion matrix of the thresholded predictions, drawn as an annotated heatmap.
conf_matrix = confusion_matrix(Y_test, Y_pred_05)
binary_tick_labels = ["Negative", "Positive"]

plt.figure(figsize=(10, 7))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=binary_tick_labels,
    yticklabels=binary_tick_labels,
    annot_kws={"size": 18},
)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix (Threshold = 0.5)")
plt.show()

# ROC curve and the area under it, from the predicted probabilities.
fpr, tpr, thresholds = roc_curve(Y_test, Y_prob)
roc_auc = auc(fpr, tpr)

print(f"AUC: {roc_auc}")

plt.figure()
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area={roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")  # chance diagonal
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.show()

In [None]:
#The corporate characteristics model in period t-1 based on MSCI
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
import seaborn as sns
import shap

# Load the MSCI ESG data set and pick the firm-level predictors and the binary
# target column. Backslashes are escaped: "\D" and "\M" are invalid escape
# sequences that newer Python versions warn about (same runtime path as before).
excel_file = "E:\\Desk\\MSCI ESG.xlsx"
df = pd.read_excel(excel_file)

feature_names = ["ROA", "Growth", "Lev", "CR", "TAT", "Tbq", "IDT", "AC", "Size", "Owncon1", "Bdnum", "Nshrsms", "Nshrstt"]
features = df[feature_names]
labels = df["f_GW"]

# Hold out the test partition BEFORE any resampling or fitted preprocessing.
# Resampling/scaling the full data first lets information from the test rows
# (and synthetic neighbours of them) leak into training and inflates metrics.
# Stratify so the untouched, still-imbalanced test set keeps both classes.
X_train_raw, X_test_raw, Y_train_raw, Y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

# Rebalance the training partition only; the test set stays real data.
# NOTE(review): the GridSearchCV below still cross-validates on resampled rows;
# wrapping SMOTEENN in an imblearn Pipeline would keep the CV folds clean too.
smoteenn = SMOTEENN(random_state=42)
X_resampled, Y_resampled = smoteenn.fit_resample(X_train_raw, Y_train_raw)

print("Class distribution after SMOTEENN:")
print(pd.Series(Y_resampled).value_counts())

# Work on copies so the raw/resampled frames are not mutated in place.
X_train = X_resampled.copy()
X_test = X_test_raw.copy()
Y_train = Y_resampled

# Winsorise each feature at the 1st/99th percentile. The bounds are estimated
# on the training data and then applied unchanged to the test data.
quantile_low = 0.01
quantile_high = 0.99
for feature in feature_names:
    lower_bound = X_train[feature].quantile(quantile_low)
    upper_bound = X_train[feature].quantile(quantile_high)
    X_train[feature] = np.clip(X_train[feature], lower_bound, upper_bound)
    X_test[feature] = np.clip(X_test[feature], lower_bound, upper_bound)

# Standardise with statistics learned from the training data only.
scaler = StandardScaler()
X_train[feature_names] = scaler.fit_transform(X_train[feature_names])
X_test[feature_names] = scaler.transform(X_test[feature_names])

# Search space for the XGBoost classifier: six hyper-parameters, three values each.
param_grid = dict(
    eta=[0.01, 0.02, 0.05],
    n_estimators=[100, 200, 300],
    max_depth=[4, 6, 8],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    min_child_weight=[1, 2, 3],
)

# Exhaustive 5-fold cross-validated search ranked by accuracy, on all cores.
grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss"),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, Y_train)

print(f"Best parameters from Grid Search: {grid_search.best_params_}")
print(f"Best accuracy from Grid Search: {grid_search.best_score_}")

# Score the held-out rows with the best estimator found by the search.
best_model = grid_search.best_estimator_
Y_pred = best_model.predict(X_test)
Y_prob = best_model.predict_proba(X_test)[:, 1]  # second column of predict_proba

# Hard labels from an explicit 0.5 cut-off on that probability.
Y_pred_05 = np.where(Y_prob >= 0.5, 1, 0)

accuracy_05, precision_05, recall_05, f1_05 = (
    metric(Y_test, Y_pred_05)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy (Threshold = 0.5): {accuracy_05}")
print(f"Precision (Threshold = 0.5): {precision_05}")
print(f"Recall (Threshold = 0.5): {recall_05}")
print(f"F1 Score (Threshold = 0.5): {f1_05}")

# Confusion matrix of the thresholded predictions, drawn as an annotated heatmap.
conf_matrix = confusion_matrix(Y_test, Y_pred_05)
binary_tick_labels = ["Negative", "Positive"]

plt.figure(figsize=(10, 7))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=binary_tick_labels,
    yticklabels=binary_tick_labels,
    annot_kws={"size": 18},
)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix (Threshold = 0.5)")
plt.show()

# ROC curve and the area under it, from the predicted probabilities.
fpr, tpr, thresholds = roc_curve(Y_test, Y_prob)
roc_auc = auc(fpr, tpr)

print(f"AUC: {roc_auc}")

plt.figure()
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area={roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")  # chance diagonal
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.show()

In [None]:
#The corporate characteristics model in period t-1 based on Hong Kong stock market
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
import seaborn as sns
import shap

# Load the Hong Kong market data set and pick the firm-level predictors and the
# binary target column. Backslashes are escaped: "\D" and "\H" are invalid escape
# sequences that newer Python versions warn about (same runtime path as before).
excel_file = "E:\\Desk\\Hong Kong market.xlsx"
df = pd.read_excel(excel_file)

feature_names = ["ROA", "Growth", "Lev", "CR", "TAT", "Tbq", "IDT", "AC", "Size", "Owncon1", "Bdnum", "Nshrsms", "Nshrstt"]
features = df[feature_names]
labels = df["f_GW"]

# Hold out the test partition BEFORE any resampling or fitted preprocessing.
# Resampling/scaling the full data first lets information from the test rows
# (and synthetic neighbours of them) leak into training and inflates metrics.
# Stratify so the untouched, still-imbalanced test set keeps both classes.
X_train_raw, X_test_raw, Y_train_raw, Y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

# Rebalance the training partition only; the test set stays real data.
# NOTE(review): the GridSearchCV below still cross-validates on resampled rows;
# wrapping SMOTEENN in an imblearn Pipeline would keep the CV folds clean too.
smoteenn = SMOTEENN(random_state=42)
X_resampled, Y_resampled = smoteenn.fit_resample(X_train_raw, Y_train_raw)

print("Class distribution after SMOTEENN:")
print(pd.Series(Y_resampled).value_counts())

# Work on copies so the raw/resampled frames are not mutated in place.
X_train = X_resampled.copy()
X_test = X_test_raw.copy()
Y_train = Y_resampled

# Winsorise each feature at the 1st/99th percentile. The bounds are estimated
# on the training data and then applied unchanged to the test data.
quantile_low = 0.01
quantile_high = 0.99
for feature in feature_names:
    lower_bound = X_train[feature].quantile(quantile_low)
    upper_bound = X_train[feature].quantile(quantile_high)
    X_train[feature] = np.clip(X_train[feature], lower_bound, upper_bound)
    X_test[feature] = np.clip(X_test[feature], lower_bound, upper_bound)

# Standardise with statistics learned from the training data only.
scaler = StandardScaler()
X_train[feature_names] = scaler.fit_transform(X_train[feature_names])
X_test[feature_names] = scaler.transform(X_test[feature_names])

# Search space for the XGBoost classifier: six hyper-parameters, three values each.
param_grid = dict(
    eta=[0.01, 0.02, 0.05],
    n_estimators=[100, 200, 300],
    max_depth=[4, 6, 8],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    min_child_weight=[1, 2, 3],
)

# Exhaustive 5-fold cross-validated search ranked by accuracy, on all cores.
grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss"),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, Y_train)

print(f"Best parameters from Grid Search: {grid_search.best_params_}")
print(f"Best accuracy from Grid Search: {grid_search.best_score_}")

# Score the held-out rows with the best estimator found by the search.
best_model = grid_search.best_estimator_
Y_pred = best_model.predict(X_test)
Y_prob = best_model.predict_proba(X_test)[:, 1]  # second column of predict_proba

# Hard labels from an explicit 0.5 cut-off on that probability.
Y_pred_05 = np.where(Y_prob >= 0.5, 1, 0)

accuracy_05, precision_05, recall_05, f1_05 = (
    metric(Y_test, Y_pred_05)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy (Threshold = 0.5): {accuracy_05}")
print(f"Precision (Threshold = 0.5): {precision_05}")
print(f"Recall (Threshold = 0.5): {recall_05}")
print(f"F1 Score (Threshold = 0.5): {f1_05}")

# Confusion matrix of the thresholded predictions, drawn as an annotated heatmap.
conf_matrix = confusion_matrix(Y_test, Y_pred_05)
binary_tick_labels = ["Negative", "Positive"]

plt.figure(figsize=(10, 7))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=binary_tick_labels,
    yticklabels=binary_tick_labels,
    annot_kws={"size": 18},
)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix (Threshold = 0.5)")
plt.show()

# ROC curve and the area under it, from the predicted probabilities.
fpr, tpr, thresholds = roc_curve(Y_test, Y_prob)
roc_auc = auc(fpr, tpr)

print(f"AUC: {roc_auc}")

plt.figure()
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area={roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")  # chance diagonal
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.show()

In [None]:
#Prediction results of Random Forest
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier  

# Load the main data set and pick the firm-level predictors and the binary
# target column.
excel_file = "E:\\Desk\\Data1.xlsx"
df = pd.read_excel(excel_file)

feature_names = ["ROA", "Growth", "Lev", "CR", "TAT", "Tbq", "IDT", "AC", "Size", "Owncon1", "Bdnum", "Nshrsms", "Nshrstt"]
features = df[feature_names]
labels = df["f_GW"]

# Hold out the test partition BEFORE any resampling or fitted preprocessing.
# Resampling/scaling the full data first lets information from the test rows
# (and synthetic neighbours of them) leak into training and inflates metrics.
# Stratify so the untouched, still-imbalanced test set keeps both classes.
X_train_raw, X_test_raw, Y_train_raw, Y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

# Rebalance the training partition only; the test set stays real data.
# NOTE(review): the GridSearchCV below still cross-validates on resampled rows;
# wrapping SMOTEENN in an imblearn Pipeline would keep the CV folds clean too.
smoteenn = SMOTEENN(random_state=42)
X_resampled, Y_resampled = smoteenn.fit_resample(X_train_raw, Y_train_raw)

print("Class distribution after SMOTEENN:")
print(pd.Series(Y_resampled).value_counts())

# Work on copies so the raw/resampled frames are not mutated in place.
X_train = X_resampled.copy()
X_test = X_test_raw.copy()
Y_train = Y_resampled

# Winsorise each feature at the 1st/99th percentile. The bounds are estimated
# on the training data and then applied unchanged to the test data.
quantile_low = 0.01
quantile_high = 0.99
for feature in feature_names:
    lower_bound = X_train[feature].quantile(quantile_low)
    upper_bound = X_train[feature].quantile(quantile_high)
    X_train[feature] = np.clip(X_train[feature], lower_bound, upper_bound)
    X_test[feature] = np.clip(X_test[feature], lower_bound, upper_bound)

# Standardise with statistics learned from the training data only.
scaler = StandardScaler()
X_train[feature_names] = scaler.fit_transform(X_train[feature_names])
X_test[feature_names] = scaler.transform(X_test[feature_names])

# Search space for the random forest: tree count, depth, split/leaf sizes and
# whether class weights are balanced.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[4, 6, 8],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    class_weight=["balanced", None],
)

# Exhaustive 5-fold cross-validated search ranked by accuracy, on all cores.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, Y_train)

print(f"Best parameters from Grid Search: {grid_search.best_params_}")
print(f"Best accuracy from Grid Search: {grid_search.best_score_}")

# Score the held-out rows with the best estimator found by the search.
best_model = grid_search.best_estimator_
Y_pred = best_model.predict(X_test)
Y_prob = best_model.predict_proba(X_test)[:, 1]  # second column of predict_proba

# Hard labels from an explicit 0.5 cut-off on that probability.
Y_pred_05 = np.where(Y_prob >= 0.5, 1, 0)

accuracy_05, precision_05, recall_05, f1_05 = (
    metric(Y_test, Y_pred_05)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy (Threshold = 0.5): {accuracy_05}")
print(f"Precision (Threshold = 0.5): {precision_05}")
print(f"Recall (Threshold = 0.5): {recall_05}")
print(f"F1 Score (Threshold = 0.5): {f1_05}")

# ROC curve and the area under it, from the predicted probabilities.
fpr, tpr, thresholds = roc_curve(Y_test, Y_prob)
roc_auc = auc(fpr, tpr)

print(f"AUC: {roc_auc}")

plt.figure()
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area={roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")  # chance diagonal
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.show()

In [None]:
#Prediction results of LightGBM
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
import seaborn as sns

# Load the main data set and pick the firm-level predictors and the binary
# target column.
excel_file = "E:\\Desk\\Data1.xlsx"
df = pd.read_excel(excel_file)

feature_names = ["ROA", "Growth", "Lev", "CR", "TAT", "Tbq", "IDT", "AC", "Size", "Owncon1", "Bdnum", "Nshrsms", "Nshrstt"]
features = df[feature_names]
labels = df["f_GW"]

# Hold out the test partition BEFORE any resampling or fitted preprocessing.
# Resampling/scaling the full data first lets information from the test rows
# (and synthetic neighbours of them) leak into training and inflates metrics.
# Stratify so the untouched, still-imbalanced test set keeps both classes.
X_train_raw, X_test_raw, Y_train_raw, Y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

# Rebalance the training partition only; the test set stays real data.
# NOTE(review): the GridSearchCV below still cross-validates on resampled rows;
# wrapping SMOTEENN in an imblearn Pipeline would keep the CV folds clean too.
smoteenn = SMOTEENN(random_state=42)
X_resampled, Y_resampled = smoteenn.fit_resample(X_train_raw, Y_train_raw)

print("Class distribution after SMOTEENN:")
print(pd.Series(Y_resampled).value_counts())

# Work on copies so the raw/resampled frames are not mutated in place.
X_train = X_resampled.copy()
X_test = X_test_raw.copy()
Y_train = Y_resampled

# Winsorise each feature at the 1st/99th percentile. The bounds are estimated
# on the training data and then applied unchanged to the test data.
quantile_low = 0.01
quantile_high = 0.99
for feature in feature_names:
    lower_bound = X_train[feature].quantile(quantile_low)
    upper_bound = X_train[feature].quantile(quantile_high)
    X_train[feature] = np.clip(X_train[feature], lower_bound, upper_bound)
    X_test[feature] = np.clip(X_test[feature], lower_bound, upper_bound)

# Standardise with statistics learned from the training data only.
scaler = StandardScaler()
X_train[feature_names] = scaler.fit_transform(X_train[feature_names])
X_test[feature_names] = scaler.transform(X_test[feature_names])

# Search space for the LightGBM classifier: six hyper-parameters, three values each.
param_grid = {
    "learning_rate": [0.01, 0.02, 0.05],
    "n_estimators": [100, 200, 300],
    "max_depth": [4, 6, 8],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "min_child_weight": [1, 2, 3]
}

# LightGBM only applies `subsample` (bagging_fraction) when `subsample_freq`
# (bagging_freq) is non-zero; with the default of 0 the three `subsample` grid
# values would train identical models. Re-sample every iteration so the grid
# dimension is actually searched.
# "binary_logloss" is LightGBM's documented name for the binary log-loss metric
# (the bare "logloss" is not among the documented metric names).
lgbm_estimator = lgb.LGBMClassifier(objective="binary", metric="binary_logloss", subsample_freq=1)

# Exhaustive 5-fold cross-validated search ranked by accuracy, on all cores.
grid_search = GridSearchCV(lgbm_estimator, param_grid, scoring="accuracy", cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, Y_train)

print(f"Best parameters from Grid Search: {grid_search.best_params_}")
print(f"Best accuracy from Grid Search: {grid_search.best_score_}")

# Score the held-out rows with the best estimator found by the search.
best_model = grid_search.best_estimator_
Y_pred = best_model.predict(X_test)
Y_prob = best_model.predict_proba(X_test)[:, 1]  # second column of predict_proba

# Hard labels from an explicit 0.5 cut-off on that probability.
Y_pred_05 = (Y_prob >= 0.5).astype(int)

accuracy_05 = accuracy_score(Y_test, Y_pred_05)
precision_05 = precision_score(Y_test, Y_pred_05)
recall_05 = recall_score(Y_test, Y_pred_05)
f1_05 = f1_score(Y_test, Y_pred_05)

print(f"Accuracy (Threshold = 0.5): {accuracy_05}")
print(f"Precision (Threshold = 0.5): {precision_05}")
print(f"Recall (Threshold = 0.5): {recall_05}")
print(f"F1 Score (Threshold = 0.5): {f1_05}")

# ROC curve and the area under it, from the predicted probabilities.
fpr, tpr, thresholds = roc_curve(Y_test, Y_prob)
roc_auc = auc(fpr, tpr)

print(f"AUC: {roc_auc}")

plt.figure()
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area={roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.show()

In [None]:
#Prediction results of BP neural network
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neural_network import MLPClassifier  

# Load the sample and separate the predictors from the binary target.
excel_file = "E:\\Desk\\Data1.xlsx"
df = pd.read_excel(excel_file)

feature_names = ["ROA", "Growth", "Lev", "CR", "TAT", "Tbq", "IDT", "AC", "Size", "Owncon1", "Bdnum", "Nshrsms", "Nshrstt"]
features = df[feature_names]
labels = df["f_GW"]

# Hold out the test set BEFORE resampling or fitting any preprocessing.
# Resampling first would put synthetic rows (interpolated from rows that end
# up in training) into the test set and inflate every reported metric.
# The split is stratified so the rare class is represented in both parts.
X_train_raw, X_test, Y_train_raw, Y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)
X_test = X_test.copy()  # independent frame; its columns are overwritten below

# Rebalance the training portion only; the test set keeps the real class mix.
smoteenn = SMOTEENN(random_state=42)
X_resampled, Y_resampled = smoteenn.fit_resample(X_train_raw, Y_train_raw)

print("Class distribution after SMOTEENN:")
print(pd.Series(Y_resampled).value_counts())

# Winsorize at the 1st/99th percentiles. The bounds are estimated on the
# training data and the same bounds are then applied to the test data.
quantile_low = 0.01
quantile_high = 0.99
for feature in feature_names:
    lower_bound = X_resampled[feature].quantile(quantile_low)
    upper_bound = X_resampled[feature].quantile(quantile_high)
    X_resampled[feature] = np.clip(X_resampled[feature], lower_bound, upper_bound)
    X_test[feature] = np.clip(X_test[feature], lower_bound, upper_bound)

# Standardize: fit on the training data, only transform the test data.
scaler = StandardScaler()
X_resampled[feature_names] = scaler.fit_transform(X_resampled[feature_names])
X_test[feature_names] = scaler.transform(X_test[feature_names])

# NOTE(review): cross-validation folds cut from this resampled training set
# still share synthetic neighbours, so a CV score computed on it is
# optimistic; the metrics on X_test / Y_test are the unbiased ones.
X_train, Y_train = X_resampled, Y_resampled

# Candidate settings for the multilayer perceptron; every combination of the
# values below is tried (3 * 2 * 2 * 3 * 2 = 72 candidates).
param_grid = dict(
    hidden_layer_sizes=[(50,), (100,), (50, 50)],
    activation=["relu", "tanh"],
    solver=["adam", "sgd"],
    alpha=[0.0001, 0.001, 0.01],
    max_iter=[1000, 2000],
)

# 5-fold cross-validated exhaustive search, ranked by accuracy, run on all
# cores. `fit` returns the search object itself, so the call can be chained.
grid_search = GridSearchCV(
    estimator=MLPClassifier(random_state=42),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    verbose=1,
    n_jobs=-1,
).fit(X_train, Y_train)

print(f"Best parameters from Grid Search: {grid_search.best_params_}")
print(f"Best accuracy from Grid Search: {grid_search.best_score_}")

# Score the selected network on the hold-out split.
best_model = grid_search.best_estimator_
Y_pred = best_model.predict(X_test)
Y_prob = best_model.predict_proba(X_test)[:, 1]

# Hard labels at an explicit 0.5 probability cut-off; all metrics below use
# these labels rather than Y_pred.
Y_pred_05 = (Y_prob >= 0.5).astype(int)

accuracy_05, precision_05, recall_05, f1_05 = [
    score_fn(Y_test, Y_pred_05)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
]

print(f"Accuracy (Threshold = 0.5): {accuracy_05}")
print(f"Precision (Threshold = 0.5): {precision_05}")
print(f"Recall (Threshold = 0.5): {recall_05}")
print(f"F1 Score (Threshold = 0.5): {f1_05}")

# Confusion matrix rendered as an annotated heat map.
conf_matrix = confusion_matrix(Y_test, Y_pred_05)
class_names = ["Negative", "Positive"]

fig_cm, ax_cm = plt.subplots(figsize=(10, 7))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax_cm,
)
ax_cm.set_xlabel("Predicted")
ax_cm.set_ylabel("True")
ax_cm.set_title("Confusion Matrix (Threshold = 0.5)")
plt.show()

# ROC curve and its area, computed from the continuous scores.
fpr, tpr, thresholds = roc_curve(Y_test, Y_prob)
roc_auc = auc(fpr, tpr)

print(f"AUC: {roc_auc}")

fig_roc, ax_roc = plt.subplots()
ax_roc.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area={roc_auc:.2f})")
ax_roc.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")  # chance line
ax_roc.set_xlim([0.0, 1.0])
ax_roc.set_ylim([0.0, 1.05])
ax_roc.set_xlabel("False Positive Rate")
ax_roc.set_ylabel("True Positive Rate")
ax_roc.set_title("Receiver Operating Characteristic (ROC) Curve")
ax_roc.legend(loc="lower right")
plt.show()

In [None]:
#Prediction results of SVM
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC 

# Load the sample and separate the predictors from the binary target.
excel_file = "E:\\Desk\\Data1.xlsx"
df = pd.read_excel(excel_file)

feature_names = ["ROA", "Growth", "Lev", "CR", "TAT", "Tbq", "IDT", "AC", "Size", "Owncon1", "Bdnum", "Nshrsms", "Nshrstt"]
features = df[feature_names]
labels = df["f_GW"]

# Hold out the test set BEFORE resampling or fitting any preprocessing.
# Resampling first would put synthetic rows (interpolated from rows that end
# up in training) into the test set and inflate every reported metric.
# The split is stratified so the rare class is represented in both parts.
X_train_raw, X_test, Y_train_raw, Y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)
X_test = X_test.copy()  # independent frame; its columns are overwritten below

# Rebalance the training portion only; the test set keeps the real class mix.
smoteenn = SMOTEENN(random_state=42)
X_resampled, Y_resampled = smoteenn.fit_resample(X_train_raw, Y_train_raw)

print("Class distribution after SMOTEENN:")
print(pd.Series(Y_resampled).value_counts())

# Winsorize at the 1st/99th percentiles. The bounds are estimated on the
# training data and the same bounds are then applied to the test data.
quantile_low = 0.01
quantile_high = 0.99
for feature in feature_names:
    lower_bound = X_resampled[feature].quantile(quantile_low)
    upper_bound = X_resampled[feature].quantile(quantile_high)
    X_resampled[feature] = np.clip(X_resampled[feature], lower_bound, upper_bound)
    X_test[feature] = np.clip(X_test[feature], lower_bound, upper_bound)

# Standardize: fit on the training data, only transform the test data.
scaler = StandardScaler()
X_resampled[feature_names] = scaler.fit_transform(X_resampled[feature_names])
X_test[feature_names] = scaler.transform(X_test[feature_names])

# NOTE(review): cross-validation folds cut from this resampled training set
# still share synthetic neighbours, so a CV score computed on it is
# optimistic; the metrics on X_test / Y_test are the unbiased ones.
X_train, Y_train = X_resampled, Y_resampled

# One sub-grid per kernel so that only parameters the kernel actually uses
# are varied: `degree` is read by the polynomial kernel only, and `gamma` is
# not used by the linear kernel. A single flat grid would refit identical
# models many times (54 candidates instead of the 27 distinct ones below),
# which is costly because probability=True adds an internal cross-validation
# to every fit.
svm_C_values = [0.1, 1, 10]
svm_gamma_values = ["scale", "auto"]
param_grid = [
    {"kernel": ["linear"], "C": svm_C_values},
    {"kernel": ["rbf"], "C": svm_C_values, "gamma": svm_gamma_values},
    {"kernel": ["poly"], "C": svm_C_values, "degree": [2, 3, 4], "gamma": svm_gamma_values},
]

# 5-fold cross-validated exhaustive search, ranked by accuracy, on all cores.
grid_search = GridSearchCV(SVC(probability=True, random_state=42), param_grid, scoring="accuracy", cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, Y_train)

# best_params_ lists only the parameters of the winning kernel's sub-grid.
print(f"Best parameters from Grid Search: {grid_search.best_params_}")
print(f"Best accuracy from Grid Search: {grid_search.best_score_}")

# Score the selected SVM on the hold-out split.
best_model = grid_search.best_estimator_
Y_pred = best_model.predict(X_test)
Y_prob = best_model.predict_proba(X_test)[:, 1]

# Hard labels at an explicit 0.5 probability cut-off; all metrics below use
# these labels rather than Y_pred.
Y_pred_05 = (Y_prob >= 0.5).astype(int)

accuracy_05, precision_05, recall_05, f1_05 = [
    score_fn(Y_test, Y_pred_05)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
]

print(f"Accuracy (Threshold = 0.5): {accuracy_05}")
print(f"Precision (Threshold = 0.5): {precision_05}")
print(f"Recall (Threshold = 0.5): {recall_05}")
print(f"F1 Score (Threshold = 0.5): {f1_05}")

# Confusion matrix rendered as an annotated heat map.
conf_matrix = confusion_matrix(Y_test, Y_pred_05)
class_names = ["Negative", "Positive"]

fig_cm, ax_cm = plt.subplots(figsize=(10, 7))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax_cm,
)
ax_cm.set_xlabel("Predicted")
ax_cm.set_ylabel("True")
ax_cm.set_title("Confusion Matrix (Threshold = 0.5)")
plt.show()

# ROC curve and its area, computed from the continuous scores.
fpr, tpr, thresholds = roc_curve(Y_test, Y_prob)
roc_auc = auc(fpr, tpr)

print(f"AUC: {roc_auc}")

fig_roc, ax_roc = plt.subplots()
ax_roc.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area={roc_auc:.2f})")
ax_roc.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")  # chance line
ax_roc.set_xlim([0.0, 1.0])
ax_roc.set_ylim([0.0, 1.05])
ax_roc.set_xlabel("False Positive Rate")
ax_roc.set_ylabel("True Positive Rate")
ax_roc.set_title("Receiver Operating Characteristic (ROC) Curve")
ax_roc.legend(loc="lower right")
plt.show()

In [None]:
#ROC curve of five machine learning methods
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Load the sample and separate the predictors from the binary target.
excel_file = "E:\\Desk\\Data1.xlsx"
df = pd.read_excel(excel_file)

feature_names = ["ROA", "Growth", "Lev", "CR", "TAT", "Tbq", "IDT", "AC", "Size", "Owncon1", "Bdnum", "Nshrsms", "Nshrstt"]
features = df[feature_names]
labels = df["f_GW"]

# Hold out the test set BEFORE resampling or fitting any preprocessing.
# Resampling first would put synthetic rows (interpolated from rows that end
# up in training) into the test set and inflate every model's ROC curve.
# The split is stratified so the rare class is represented in both parts.
X_train_raw, X_test, Y_train_raw, Y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)
X_test = X_test.copy()  # independent frame; its columns are overwritten below

# Rebalance the training portion only; the test set keeps the real class mix.
smoteenn = SMOTEENN(random_state=42)
X_resampled, Y_resampled = smoteenn.fit_resample(X_train_raw, Y_train_raw)

# Winsorize at the 1st/99th percentiles. The bounds are estimated on the
# training data and the same bounds are then applied to the test data.
quantile_low = 0.01
quantile_high = 0.99
for feature in feature_names:
    lower_bound = X_resampled[feature].quantile(quantile_low)
    upper_bound = X_resampled[feature].quantile(quantile_high)
    X_resampled[feature] = np.clip(X_resampled[feature], lower_bound, upper_bound)
    X_test[feature] = np.clip(X_test[feature], lower_bound, upper_bound)

# Standardize: fit on the training data, only transform the test data.
scaler = StandardScaler()
X_resampled[feature_names] = scaler.fit_transform(X_resampled[feature_names])
X_test[feature_names] = scaler.transform(X_test[feature_names])

# All five models below are trained on the same resampled training set and
# compared on the same untouched hold-out set.
X_train, Y_train = X_resampled, Y_resampled

def get_roc_data(model, params, X_train, X_test, Y_train, Y_test,
                 scoring="accuracy", cv=5, n_jobs=-1):
    """Tune `model` by grid search and return its hold-out ROC curve.

    Parameters
    ----------
    model : estimator
        Unfitted classifier that exposes `predict_proba`.
    params : dict or list of dict
        Grid passed to `GridSearchCV` as `param_grid`.
    X_train, Y_train
        Data the grid search is fitted on.
    X_test, Y_test
        Hold-out data the ROC curve is computed on.
    scoring : str, default "accuracy"
        Criterion used to rank the grid candidates.
    cv : int, default 5
        Number of cross-validation folds used by the search.
    n_jobs : int, default -1
        Parallelism passed to `GridSearchCV` (-1 uses all cores).

    Returns
    -------
    fpr, tpr, roc_auc
        False-positive rates, true-positive rates and the area under the
        resulting curve.
    """
    grid_search = GridSearchCV(model, params, scoring=scoring, cv=cv, verbose=0, n_jobs=n_jobs)
    grid_search.fit(X_train, Y_train)
    best_model = grid_search.best_estimator_
    # Column 1 of predict_proba is used as the score of the positive class.
    Y_prob = best_model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(Y_test, Y_prob)
    roc_auc = auc(fpr, tpr)
    return fpr, tpr, roc_auc

# --- XGBoost -----------------------------------------------------------------
xgb_model = xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss")
xgb_param_grid = {
    "eta": [0.01, 0.02, 0.05],
    "n_estimators": [100, 200, 300],
    "max_depth": [4, 6, 8],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "min_child_weight": [1, 2, 3]
}
xgb_fpr, xgb_tpr, xgb_auc = get_roc_data(xgb_model, xgb_param_grid, X_train, X_test, Y_train, Y_test)

# --- Random forest -----------------------------------------------------------
rf_model = RandomForestClassifier(random_state=42)
rf_param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [4, 6, 8],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "class_weight": ["balanced", None]
}
rf_fpr, rf_tpr, rf_auc = get_roc_data(rf_model, rf_param_grid, X_train, X_test, Y_train, Y_test)

# --- LightGBM ----------------------------------------------------------------
# LightGBM applies row subsampling (`subsample`, alias of bagging_fraction)
# only when `subsample_freq` (bagging_freq) is non-zero. Without it the three
# `subsample` values in the grid train identical models and triple the search
# time for nothing, so bagging is enabled at every iteration here.
lgb_model = lgb.LGBMClassifier(objective="binary", metric="logloss", subsample_freq=1)
lgb_param_grid = {
    "learning_rate": [0.01, 0.02, 0.05],
    "n_estimators": [100, 200, 300],
    "max_depth": [4, 6, 8],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "min_child_weight": [1, 2, 3]
}
lgb_fpr, lgb_tpr, lgb_auc = get_roc_data(lgb_model, lgb_param_grid, X_train, X_test, Y_train, Y_test)

# --- BP neural network (multilayer perceptron) -------------------------------
mlp_model = MLPClassifier(random_state=42)
mlp_param_grid = {
    "hidden_layer_sizes": [(50,), (100,), (50, 50)],
    "activation": ["relu", "tanh"],
    "solver": ["adam", "sgd"],
    "alpha": [0.0001, 0.001, 0.01],
    "max_iter": [1000, 2000]
}
mlp_fpr, mlp_tpr, mlp_auc = get_roc_data(mlp_model, mlp_param_grid, X_train, X_test, Y_train, Y_test)

# --- SVM ---------------------------------------------------------------------
# One sub-grid per kernel so that only parameters the kernel actually uses
# are varied: `degree` is read by the polynomial kernel only, and `gamma` is
# not used by the linear kernel. This halves the number of candidates
# (27 instead of 54) without removing any distinct model from the search.
svm_model = SVC(probability=True, random_state=42)
svm_C_values = [0.1, 1, 10]
svm_gamma_values = ["scale", "auto"]
svm_param_grid = [
    {"kernel": ["linear"], "C": svm_C_values},
    {"kernel": ["rbf"], "C": svm_C_values, "gamma": svm_gamma_values},
    {"kernel": ["poly"], "C": svm_C_values, "degree": [2, 3, 4], "gamma": svm_gamma_values},
]
svm_fpr, svm_tpr, svm_auc = get_roc_data(svm_model, svm_param_grid, X_train, X_test, Y_train, Y_test)

# Overlay the five hold-out ROC curves on a single axes.
roc_curves = [
    (xgb_fpr, xgb_tpr, "blue", f"XGBoost (AUC = {xgb_auc:.2f})"),
    (rf_fpr, rf_tpr, "red", f"RF (AUC = {rf_auc:.2f})"),
    (lgb_fpr, lgb_tpr, "green", f"LightGBM (AUC = {lgb_auc:.2f})"),
    (mlp_fpr, mlp_tpr, "purple", f"BP (AUC = {mlp_auc:.2f})"),
    (svm_fpr, svm_tpr, "orange", f"SVM (AUC = {svm_auc:.2f})"),
]

fig_cmp, ax_cmp = plt.subplots(figsize=(10, 8))
for curve_fpr, curve_tpr, curve_color, curve_label in roc_curves:
    ax_cmp.plot(curve_fpr, curve_tpr, color=curve_color, lw=2, label=curve_label)

# Diagonal reference line of a random classifier.
ax_cmp.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
ax_cmp.set_xlim([0.0, 1.0])
ax_cmp.set_ylim([0.0, 1.05])
ax_cmp.set_xlabel("False Positive Rate")
ax_cmp.set_ylabel("True Positive Rate")
ax_cmp.set_title("ROC Curve of five machine learning methods.")
ax_cmp.legend(loc="lower right")
plt.show()

In [None]:
#Prediction results of four XGBoost methods.
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (accuracy_score, recall_score, precision_score, f1_score,
                             roc_curve, auc, confusion_matrix)
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import shap

# Load the sample and separate the predictors from the binary target.
excel_file = r"E:\Desk\Data1.xlsx"
df = pd.read_excel(excel_file)
feature_names = ["ROA", "Growth", "Lev", "CR", "TAT", "Tbq", "IDT", "AC", "Size",
                 "Owncon1", "Bdnum", "Nshrsms", "Nshrstt"]
features = df[feature_names]
labels = df["f_GW"]

# One stratified hold-out, taken BEFORE any resampling, shared by every
# variant. Resampling first would (a) place synthetic rows in the test set
# and (b) score each variant on a different test set, which makes the
# Original / SMOTE / SMOTE-ENN comparison meaningless.
X_train_raw, X_test_raw, Y_train_raw, Y_test_raw = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels)

# Rebalance the training portion only.
sm = SMOTE(random_state=42)
X_sm, Y_sm = sm.fit_resample(X_train_raw, Y_train_raw)

smenn = SMOTEENN(random_state=42)
X_en, Y_en = smenn.fit_resample(X_train_raw, Y_train_raw)

# Training data of each variant (the test data is X_test_raw for all of them).
datasets = {
    'Original':        (X_train_raw, Y_train_raw),
    'SMOTE':           (X_sm, Y_sm),
    'SMOTE-ENN':       (X_en, Y_en)
}

quantile_low, quantile_high = 0.01, 0.99
scaler = StandardScaler()

def prep_whole(X, cols, X_holdout=None):
    """Winsorize and standardize `cols` of `X` in place and return `X`.

    Clipping bounds (the `quantile_low` / `quantile_high` quantiles) and the
    scaling statistics are estimated from `X`. When `X_holdout` is given, the
    same bounds and the same fitted module-level `scaler` are applied to it in
    place, so the hold-out data never influences the fitted preprocessing.
    The module-level `scaler` is refitted on every call.
    """
    for col in cols:
        low, high = X[col].quantile(quantile_low), X[col].quantile(quantile_high)
        X[col] = np.clip(X[col], low, high)
        if X_holdout is not None:
            X_holdout[col] = np.clip(X_holdout[col], low, high)
    X[cols] = scaler.fit_transform(X[cols])
    if X_holdout is not None:
        X_holdout[cols] = scaler.transform(X_holdout[cols])
    return X

# Build (X_train, X_test, Y_train, Y_test) per variant. Each variant gets its
# own copy of the hold-out features, transformed with that variant's fitted
# bounds and scaler; the hold-out labels are identical across variants.
splits = {}
for k, (X, y) in datasets.items():
    X_train, X_test = X.copy(), X_test_raw.copy()
    Y_train, Y_test = y, Y_test_raw
    prep_whole(X_train, feature_names, X_holdout=X_test)
    datasets[k] = (X_train, Y_train)
    splits[k] = (X_train, X_test, Y_train, Y_test)

def new_clf():
    """Return a fresh, unfitted XGBoost binary classifier with a fixed seed."""
    clf = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=42)
    return clf

# Search space used only by the grid-search variant (3**6 = 729 candidates).
param_grid = dict(
    eta=[0.01, 0.02, 0.05],
    n_estimators=[100, 200, 300],
    max_depth=[4, 6, 8],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    min_child_weight=[1, 2, 3],
)

# Registry: display name -> (X_train, Y_train, X_test, Y_test, estimator).
models = {}

# Variants 1-3: default XGBoost on the original, SMOTE and SMOTE-ENN data.
# These estimators are stored unfitted.
for model_name, split_key in (
    ('XGBoost', 'Original'),
    ('XGBoost + SMOTE', 'SMOTE'),
    ('XGBoost + SMOTE-ENN', 'SMOTE-ENN'),
):
    X_tr, X_te, Y_tr, Y_te = splits[split_key]
    models[model_name] = (X_tr, Y_tr, X_te, Y_te, new_clf())

# Variant 4: SMOTE-ENN data plus a 5-fold grid search ranked by accuracy.
# The stored estimator is the search's already-fitted best estimator.
X_tr, X_te, Y_tr, Y_te = splits['SMOTE-ENN']
grid_clf = GridSearchCV(
    estimator=new_clf(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=0,
    n_jobs=-1,
)
grid_clf.fit(X_tr, Y_tr)
models['XGBoost + SMOTE-ENN + GridSearchCV'] = (X_tr, Y_tr, X_te, Y_te, grid_clf.best_estimator_)

# Per-variant results: name -> (fpr, tpr, auc, type-I rate, type-II rate).
roc_data = {}
for name, (X_tr, Y_tr, X_te, Y_te, clf) in models.items():
    # The grid-search entry is stored already fitted; the others are fitted here.
    needs_fit = name != 'XGBoost + SMOTE-ENN + GridSearchCV'
    if needs_fit:
        clf.fit(X_tr, Y_tr)

    # Continuous scores drive the ROC curve; labels use a 0.5 cut-off.
    y_prob = clf.predict_proba(X_te)[:, 1]
    fpr, tpr, _ = roc_curve(Y_te, y_prob)
    auc_val = auc(fpr, tpr)
    y_pred = (y_prob >= 0.5).astype(int)

    # Error rates from the flattened 2x2 confusion matrix.
    tn, fp, fn, tp = confusion_matrix(Y_te, y_pred).ravel()
    type1 = fp / (fp + tn)  # false-positive rate
    type2 = fn / (fn + tp)  # false-negative rate

    roc_data[name] = (fpr, tpr, auc_val, type1, type2)

    print(f"\n{name}  Test set indicators")
    acc_val = accuracy_score(Y_te, y_pred)
    prec_val = precision_score(Y_te, y_pred)
    rec_val = recall_score(Y_te, y_pred)
    f1_val = f1_score(Y_te, y_pred)
    print("AUC: {:.4f} | Acc: {:.4f} | Prec: {:.4f} | Rec: {:.4f} | F1: {:.4f}".format(
        auc_val, acc_val, prec_val, rec_val, f1_val))
    print("Type-I (FPR): {:.4f} | Type-II (FNR): {:.4f}".format(type1, type2))

# Overlay the ROC curves of all variants on one axes.
fig_xgb, ax_xgb = plt.subplots(figsize=(8, 6))
for name, (fpr, tpr, auc_val, _, _) in roc_data.items():
    ax_xgb.plot(fpr, tpr, lw=2, label=f"{name} (AUC={auc_val:.2f})")
ax_xgb.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance line
ax_xgb.set_xlim(0, 1)
ax_xgb.set_ylim(0, 1.05)
ax_xgb.set_xlabel("False Positive Rate")
ax_xgb.set_ylabel("True Positive Rate")
ax_xgb.set_title("ROC curve of four XGBoost methods")
ax_xgb.legend(loc="lower right")
plt.show()