In [None]:
# Step 1: Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Step 2: Load the dataset
df = pd.read_csv("heart.csv")  # put the CSV file in the same folder as your notebook

# Remove rows where Cholesterol is 0 so they are excluded from everything below.
# NOTE(review): presumably 0 is a placeholder for a missing measurement — confirm
# against the dataset description.
df = df[df['Cholesterol'] != 0]

# Step 3: Explore the dataset
print("Shape of dataset:", df.shape)
print("First 5 rows:")
print(df.head())

# Step 4: Check for missing values (per-column count of nulls)
print("\nMissing values:")
print(df.isnull().sum())

# Step 5: Data types and basic info
print("\nInfo:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() — doing so appended a stray "None" line.
df.info()

# Step 6: Summary statistics
print("\nStatistics:")
print(df.describe())


In [None]:
from sklearn.model_selection import train_test_split

# Step 1: Turn categorical columns into indicator columns; drop_first=True
# drops the first level of each encoded column.
df_encoded = pd.get_dummies(df, drop_first=True)

# Step 2: Pull out the target, then keep every remaining column as a feature
y = df_encoded["HeartDisease"]
X = df_encoded.drop(columns=["HeartDisease"])

# Step 3: Hold out 20% of the rows for testing, with a fixed seed so the split
# is repeatable.
# NOTE(review): the split is not stratified on y — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Train set size:", X_train.shape)
print("Test set size:", X_test.shape)


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 1: Train model
# The `use_label_encoder` flag is intentionally not passed: it is deprecated in
# recent XGBoost releases, where supplying it only produces an
# "unused parameter" warning. Log-loss is kept as the evaluation metric.
model = XGBClassifier(eval_metric='logloss')
model.fit(X_train, y_train)

# Step 2: Predict class labels for the held-out test rows
y_pred = model.predict(X_test)

# Step 3: Evaluate predictions against the true test labels
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Draw the confusion matrix for the fitted model on the test set and title the
# axes through the returned display object.
cm_display = ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)
cm_display.ax_.set_title("XGBoost Confusion Matrix")
plt.show()


In [None]:
# Feature Importance: one score per feature column, read from the fitted model
importances = model.feature_importances_

# Horizontal bar chart with one bar per column of X, in column order
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(X.columns, importances)
ax.set_title("Feature Importance (XGBoost)")
ax.set_xlabel("Importance Score")
ax.set_ylabel("Features")
plt.show()


In [None]:
# Count of rows per value of the target column HeartDisease in df
ax = sns.countplot(data=df, x='HeartDisease')
ax.set_title("Distribution of Heart Disease in Dataset")
ax.set_xlabel("Heart Disease (0 = No, 1 = Yes)")
ax.set_ylabel("Number of Patients")
plt.show()
