<a href="https://colab.research.google.com/github/Shrajoy92/SEED_CKD/blob/main/XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    classification_report, accuracy_score, confusion_matrix, precision_recall_curve, roc_curve, auc
)
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# Location of the input CSV. The same value is assigned again at the start of
# the main-execution section further down in this cell.
file_path = '/content/NCBI_ff0492384198294892.csv'
def load_and_preprocess_data(file_path):
    """Read a CSV file and return it without ID columns or incomplete rows.

    Args:
        file_path: Location of the CSV file; forwarded unchanged to
            ``pd.read_csv``.

    Returns:
        A DataFrame with the "Participant ID" column removed (when the file
        has one) and with every row that contained a missing value removed.
        Surviving rows keep their original index labels.
    """
    irrelevant_columns = ["Participant ID"]
    raw_frame = pd.read_csv(file_path)
    # errors='ignore' lets files that lack the ID column pass through as-is.
    without_ids = raw_frame.drop(columns=irrelevant_columns, errors='ignore')
    # Discard any row that still has at least one missing value.
    return without_ids.dropna()

def calculate_class_weight(y_train):
    """Calculate the negative-to-positive class ratio used for balancing.

    The value is the number of samples labelled 0 divided by the number of
    samples labelled 1, which is the quantity passed on as XGBoost's
    ``scale_pos_weight`` elsewhere in this file.

    Args:
        y_train: 1-D array-like of non-negative integer class labels, where
            0 marks the negative class and 1 the positive class.

    Returns:
        ``count(label == 0) / count(label == 1)``.

    Raises:
        ValueError: If ``y_train`` holds no sample labelled 1, because the
            ratio is undefined in that case.
    """
    # minlength=2 guarantees both counts exist even when a class is absent,
    # so a missing class is reported explicitly instead of surfacing as an
    # IndexError (or as a silent division by zero).
    class_counts = np.bincount(y_train, minlength=2)
    if class_counts[1] == 0:
        raise ValueError(
            "y_train contains no positive (label 1) samples; "
            "cannot compute scale_pos_weight."
        )
    weight = class_counts[0] / class_counts[1]
    return weight

def train_model(X_train, y_train, class_weight):
    """Fit an XGBoost classifier on the training split.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        class_weight: Value forwarded as ``scale_pos_weight`` to compensate
            for class imbalance.

    Returns:
        The fitted ``XGBClassifier`` instance.
    """
    hyperparameters = {
        "scale_pos_weight": class_weight,  # Adjust for class imbalance
        "use_label_encoder": False,
        "eval_metric": 'logloss',
        "random_state": 45,  # fixed seed
    }
    classifier = XGBClassifier(**hyperparameters)
    classifier.fit(X_train, y_train)
    return classifier

def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data and print a summary.

    Prints the accuracy followed by the classification report.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Test feature matrix.
        y_test: True labels for ``X_test``.

    Returns:
        A ``(y_pred, y_pred_proba)`` tuple: the predicted labels and the
        values from column index 1 of ``predict_proba``.
    """
    predicted_labels = model.predict(X_test)
    # Keep only column 1 of the probability matrix (the second class).
    class_probabilities = model.predict_proba(X_test)
    positive_proba = class_probabilities[:, 1]

    accuracy = accuracy_score(y_test, predicted_labels)
    print("Accuracy:", accuracy)

    report = classification_report(y_test, predicted_labels)
    print("\nClassification Report:\n", report)

    return predicted_labels, positive_proba

def plot_confusion_matrix(y_test, y_pred, class_labels):
    """Show the confusion matrix of the predictions as an annotated heatmap.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
        class_labels: Tick labels applied to both the x and the y axis.
    """
    counts = confusion_matrix(y_test, y_pred)

    # A fresh 8x6 figure with a single axes for the heatmap to draw on.
    _, axes = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        counts,
        annot=True,
        fmt='d',  # annotate cells as integers
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=axes,
    )
    axes.set_title("Confusion Matrix")
    axes.set_xlabel("Predicted")
    axes.set_ylabel("Actual")
    plt.show()

def plot_precision_recall_curve(y_test, y_pred_proba):
    """Show precision as a function of recall for the predicted scores.

    Args:
        y_test: True binary labels.
        y_pred_proba: Predicted scores/probabilities for the positive class.
    """
    # The third return value (the thresholds) is not needed for the plot.
    precision_values, recall_values, _ = precision_recall_curve(y_test, y_pred_proba)

    # Draw on the current axes, exactly where pyplot-level calls would draw.
    axes = plt.gca()
    axes.plot(recall_values, precision_values, marker='.', label='XGBoost')
    axes.set_title("Precision-Recall Curve")
    axes.set_xlabel("Recall")
    axes.set_ylabel("Precision")
    axes.legend()
    axes.grid()
    plt.show()

def plot_roc_curve(y_test, y_pred_proba):
    """Show the ROC curve with its AUC in the legend.

    A dashed red diagonal from (0, 0) to (1, 1) is drawn alongside the curve
    as a visual reference.

    Args:
        y_test: True binary labels.
        y_pred_proba: Predicted scores/probabilities for the positive class.
    """
    # The third return value (the thresholds) is not needed for the plot.
    false_positive_rate, true_positive_rate, _ = roc_curve(y_test, y_pred_proba)
    area = auc(false_positive_rate, true_positive_rate)
    curve_label = f'ROC Curve (AUC = {area:.2f})'

    # Draw on the current axes, exactly where pyplot-level calls would draw.
    axes = plt.gca()
    axes.plot(false_positive_rate, true_positive_rate, label=curve_label)
    axes.plot([0, 1], [0, 1], color='red', linestyle='--')
    axes.set_title("ROC Curve")
    axes.set_xlabel("False Positive Rate")
    axes.set_ylabel("True Positive Rate")
    axes.legend()
    axes.grid()
    plt.show()

# Main Execution
# Pipeline: load the CSV, encode the target, split, train an XGBoost
# classifier with class-imbalance weighting, evaluate, plot, and persist.
file_path = '/content/NCBI_ff0492384198294892.csv'  # Update path as necessary
data = load_and_preprocess_data(file_path)

# Encode target variable
# NOTE(review): LabelEncoder assigns integer codes in sorted order of the raw
# values. Confirm that code 1 really corresponds to CKD, because the
# scale_pos_weight ratio and the ['Healthy', 'CKD'] plot labels below both
# assume 0 = Healthy and 1 = CKD.
X = data.drop(columns=["Healthy/CKD"])
y = LabelEncoder().fit_transform(data["Healthy/CKD"])

# Split data: 70% train / 30% test, stratified on the target so both splits
# keep the same class proportions; the seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=45)

# Calculate class weight
class_weight = calculate_class_weight(y_train)
print(f"Calculated class weight (scale_pos_weight): {class_weight}")

# Train model
xgb_model = train_model(X_train, y_train, class_weight)

# Evaluate model
y_pred, y_pred_proba = evaluate_model(xgb_model, X_test, y_test)

# Plotting
plot_confusion_matrix(y_test, y_pred, class_labels=['Healthy', 'CKD'])
plot_precision_recall_curve(y_test, y_pred_proba)
plot_roc_curve(y_test, y_pred_proba)

# Save the model
filename = 'trained_ckd.sav'
# The context manager closes the file handle even if pickling raises, so the
# saved model is fully written and no descriptor is leaked.
with open(filename, 'wb') as model_file:
    pickle.dump(xgb_model, model_file)
