# XGBoost Classifier (Ensemble)
## Dataset: Heart Disease UCI

### 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef,
    confusion_matrix, classification_report, roc_curve
)
import warnings
warnings.filterwarnings('ignore')

### 2. Load and Preprocess Data

In [None]:
# Read the Heart Disease UCI CSV into `df`, report its dimensions, and
# preview the first rows. The path is relative to the working directory.
csv_path = '../heart_disease_uci.csv'
df = pd.read_csv(csv_path)

print("Original Dataset Shape:", df.shape)
df.head()

In [None]:
# Remove the 'id' and 'dataset' columns so they never reach the feature matrix.
# errors='ignore' keeps this cell safe to re-run: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['id', 'dataset'], errors='ignore')

# Discard every row that still contains at least one missing value.
df = df.dropna()
print(f"Dataset Shape after dropping missing values: {df.shape}")

In [None]:
# Collapse the 'num' column into a 0/1 label: any value above zero becomes 1,
# everything else 0. The source column is removed once 'target' exists.
has_disease = df['num'] > 0
df['target'] = has_disease.astype(int)
df = df.drop(columns='num')

# Show how many rows fall into each class.
print("Target Distribution:")
print(df['target'].value_counts())

In [None]:
# Integer-encode the categorical feature columns in place.
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']
le = LabelEncoder()

# Only columns actually present in the frame are encoded; absent ones are skipped.
columns_to_encode = [name for name in categorical_cols if name in df.columns]
for col in columns_to_encode:
    # Values are cast to strings first so each column is encoded from one dtype.
    # `le` is refit on every column, so after the loop it only remembers the
    # classes of the last column processed.
    as_text = df[col].astype(str)
    df[col] = le.fit_transform(as_text)

df.head()

### 3. Prepare Features and Target

In [None]:
# Separate the label column from the feature columns.
y = df['target']
X = df.drop(columns='target')

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes it repeatable.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Learn the scaling statistics from the training partition only, then apply
# that same transformation to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set: {X_train.shape[0]}, Test set: {X_test.shape[0]}")

### 4. Train XGBoost Model

In [None]:
# Configure the XGBoost classifier and fit it on the scaled training data.
# NOTE(review): `use_label_encoder` is reportedly deprecated in newer XGBoost
# releases; confirm against the installed version before removing it.
xgb_settings = {
    'n_estimators': 100,
    'random_state': 42,
    'use_label_encoder': False,
    'eval_metric': 'logloss',
}
model = XGBClassifier(**xgb_settings)
model.fit(X_train_scaled, y_train)

print("XGBoost Model trained successfully!")
print(f"Number of estimators: {model.n_estimators}")

### 5. Model Evaluation

In [None]:
# Hard class predictions, plus the predicted probability of class 1
# (second column of predict_proba), for the scaled test partition.
y_pred = model.predict(X_test_scaled)
y_prob = model.predict_proba(X_test_scaled)[:, 1]

# Collect the scores in the order they are printed below.
metrics = {}
metrics['Accuracy'] = accuracy_score(y_test, y_pred)
metrics['AUC'] = roc_auc_score(y_test, y_prob)  # AUC is computed from probabilities, not labels
# Precision, recall and F1 are support-weighted averages over both classes,
# not scores for the positive class alone.
weighted_scorers = (
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1', f1_score),
)
for label, scorer in weighted_scorers:
    metrics[label] = scorer(y_test, y_pred, average='weighted')
metrics['MCC'] = matthews_corrcoef(y_test, y_pred)

print("XGBOOST - EVALUATION METRICS")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")

In [None]:
# Draw the test-set confusion matrix as an annotated heatmap
# (rows are the actual class, columns the predicted class).
class_names = ['No Disease', 'Disease']
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',  # cell counts are integers
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - XGBoost')
plt.show()

In [None]:
# Text summary of per-class precision, recall, F1 and support on the test partition.
report = classification_report(
    y_test,
    y_pred,
    target_names=['No Disease', 'Disease'],
)
print(report)

In [None]:
# ROC curve built from the test-set class-1 probabilities; the thresholds
# returned by roc_curve are not needed and are discarded.
fpr, tpr, _ = roc_curve(y_test, y_prob)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'AUC = {metrics["AUC"]:.4f}')
# Dashed diagonal: the reference line of a classifier with no discriminative power.
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - XGBoost')
ax.legend()
plt.show()

### 6. Feature Importance

In [None]:
# Pair each feature name with the importance score reported by the fitted
# model, then order the table from most to least important.
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
})
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print("Feature Importance:")
print(feature_importance)

# Horizontal bar chart of the same table, one bar per feature.
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature')
plt.title('XGBoost - Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()