### Import the packages, load the data, and split it into training and test sets (only light preprocessing is applied here, because `output.csv` has already been preprocessed)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC

# 1. Load data (please replace with your data file path)
df = pd.read_csv("output.csv")  # Assume you have converted it to CSV format

# 2. Data preprocessing: separate the features from the target column
X = df.drop(columns=["Outcome"])  # Features
y = df["Outcome"]  # Target variable

# 3. Split training and testing sets
# The split happens BEFORE scaling so that the held-out rows cannot influence
# the mean/variance the scaler learns (fitting on all rows leaks test-set
# statistics into the training features).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize data (helpful for logistic regression): fit on the training
# fold only, then apply the same transform to the test fold.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the training-fold statistics; kept so any
# later cell that still refers to `X_scaled` continues to work.
X_scaled = scaler.transform(X)


### Five models are trained here: Logistic Regression, XGBoost, MLP, XGBoost tuned with grid search, and Random Forest. Each trained model is then given a simple evaluation on the test set.

In [None]:
# 4. Train models
# Every model is fitted on (X_train, y_train) and its hard class predictions
# on X_test are stored in `<model>_preds` for the evaluation step.

# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logreg_preds = logreg.predict(X_test)

# XGBoost
# `use_label_encoder` is intentionally not passed: the installed xgboost
# ignores it and emits a "Parameters: { "use_label_encoder" } are not used."
# warning on every fit.
xgb = XGBClassifier(eval_metric='logloss')
xgb.fit(X_train, y_train)
xgb_preds = xgb.predict(X_test)

# MLP
mlp = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500, alpha=0.01, random_state=42)
mlp.fit(X_train, y_train)
mlp_preds = mlp.predict(X_test)

# XGBoost + Optimizer (exhaustive grid search with 5-fold cross-validation)
# 3 * 3 * 3 * 2 = 54 parameter combinations, each fitted on 5 folds.
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}
grid = GridSearchCV(XGBClassifier(eval_metric='logloss'), param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)
grid_preds = grid.predict(X_test)
print("Best params:", grid.best_params_)

# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

# 5. Evaluate models
def evaluate_model(name, y_true, y_pred, y_score=None):
    """Print accuracy, ROC AUC and a classification report for one model.

    Parameters
    ----------
    name : str
        Label printed in the report header.
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Hard class predictions (the output of ``predict``); used for the
        accuracy and the classification report.
    y_score : array-like, optional
        Continuous scores for the positive class, e.g.
        ``predict_proba(X)[:, 1]`` or ``decision_function(X)``. When given,
        ROC AUC is computed from these scores. When omitted, ROC AUC falls
        back to ``y_pred`` (the previous behaviour), which evaluates a single
        decision threshold rather than the model's ranking quality.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    auc_input = y_pred if y_score is None else y_score
    print(f"{name} Model Performance:")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"ROC AUC: {roc_auc_score(y_true, auc_input):.4f}")
    print("Classification Report:")
    print(classification_report(y_true, y_pred))
    print("-"*50)


# (report label, fitted estimator, hard predictions on X_test)
fitted_models = [
    ("Logistic Regression", logreg, logreg_preds),
    ("XGBoost", xgb, xgb_preds),
    ("MLPClassifier", mlp, mlp_preds),
    ("XGBoost (Grid Search)", grid, grid_preds),
    ("Random Forest", rf, rf_preds),
]

for model_label, clf, hard_preds in fitted_models:
    # Column 1 of predict_proba is the probability of the second class in
    # `classes_`; assumes a binary target whose positive class sorts last
    # (e.g. labels 0/1) -- confirm if the label encoding changes.
    positive_scores = clf.predict_proba(X_test)[:, 1]
    evaluate_model(model_label, y_test, hard_preds, y_score=positive_scores)


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.8}
Logistic Regression Model Performance:
Accuracy: 0.7013
ROC AUC: 0.6550
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.81      0.78       100
           1       0.59      0.50      0.54        54

    accuracy                           0.70       154
   macro avg       0.67      0.66      0.66       154
weighted avg       0.69      0.70      0.70       154

--------------------------------------------------
XGBoost Model Performance:
Accuracy: 0.7078
ROC AUC: 0.6770
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.78      0.78       100
           1       0.58      0.57      0.58        54

    accuracy                           0.71       154
   macro avg       0.68      0.68      0.68       154
weighted avg       0.71      0.71      0.71       154

-------------------------

### Two ensemble models are trained here: a voting classifier and a stacking classifier. Unlike the individual models above, each one combines several base models; the same simple test-set evaluation is performed on both.

In [None]:
# Base learners shared by both ensembles, as (name, estimator) pairs.
# Each ensemble receives its own copy of the list so neither can affect the
# other's configuration.
base_learners = [
    ('logreg', logreg),
    ('xgb', xgb),
    ('mlp', mlp),
    ('rf', rf),
]

# Create a voting classifier (hard voting)
voting_clf = VotingClassifier(
    estimators=list(base_learners),
    voting='hard'  # 'hard' represents majority voting, 'soft' uses probability-weighted voting
)
# Train the ensemble model
voting_clf.fit(X_train, y_train)
# Predict
voting_preds = voting_clf.predict(X_test)

# Create a stacking model
stacking_clf = StackingClassifier(
    estimators=list(base_learners),
    # Use Support Vector Machine as the meta-learner. random_state pins the
    # shuffling SVC performs for its probability estimates, so reruns are
    # reproducible.
    final_estimator=SVC(probability=True, random_state=42)
)
# Train the model
stacking_clf.fit(X_train, y_train)
# Predict
stacking_preds = stacking_clf.predict(X_test)

# Evaluate both ensembles on the held-out test set
evaluate_model("Voting Classifier", y_test, voting_preds)
evaluate_model("Stacking Classifier", y_test, stacking_preds)


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Voting Classifier Model Performance:
Accuracy: 0.7403
ROC AUC: 0.6893
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.86      0.81       100
           1       0.67      0.52      0.58        54

    accuracy                           0.74       154
   macro avg       0.72      0.69      0.70       154
weighted avg       0.73      0.74      0.73       154

--------------------------------------------------
Stacking Classifier Model Performance:
Accuracy: 0.7468
ROC AUC: 0.6943
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.87      0.82       100
           1       0.68      0.52      0.59        54

    accuracy                           0.75       154
   macro avg       0.73      0.69      0.70       154
weighted avg       0.74      0.75      0.74       154

--------------------------------------------------


### Conclusion: Random Forest achieved the highest test accuracy of all the models, at 0.7532.