In [None]:
# ==============================================================
# Task 5: Decision Trees and Random Forests
# Objective: Learn tree-based models for classification & regression
# Tools: Scikit-learn, Graphviz, Matplotlib, Seaborn
# ==============================================================

# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Load the dataset; "heart.csv" is resolved relative to the current working
# directory.
df = pd.read_csv("heart.csv")

# Basic Info
print("📊 Dataset Preview:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() would emit a stray "None".
df.info()
print("\nMissing Values:\n", df.isnull().sum())

# Separate the label column ('target') from the predictor columns
y = df['target']
X = df.drop(columns=['target'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train Decision Tree Classifier
# No depth limit is given here, so the tree is grown with scikit-learn's
# default stopping rules; the seed makes the fit reproducible.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Score the unrestricted tree on the held-out test rows.
y_pred_dt = dt.predict(X_test)
print("\n🌳 Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))

# Visualize Decision Tree
plt.figure(figsize=(20,10))
# feature_names is passed as a plain list: some scikit-learn releases validate
# this parameter as list-or-None and reject a pandas Index.
# NOTE(review): class_names are applied in ascending label order, so this
# assumes target 0 means "No Disease" and 1 means "Disease" -- confirm
# against the dataset description.
plot_tree(dt, filled=True, feature_names=list(X.columns), class_names=['No Disease', 'Disease'])
plt.title("Decision Tree Visualization", fontsize=16)
plt.show()

# Control Overfitting (Pruned Tree)
# Same seed as the unrestricted tree, but growth is capped at depth 4.
dt_pruned = DecisionTreeClassifier(random_state=42, max_depth=4).fit(X_train, y_train)
y_pred_pruned = dt_pruned.predict(X_test)
print(
    "\n✂️ Pruned Decision Tree Accuracy:",
    accuracy_score(y_true=y_test, y_pred=y_pred_pruned),
)

# Train Random Forest
# An ensemble of 100 trees, seeded so repeated runs build the same forest.
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Score the forest on the held-out test rows.
y_pred_rf = rf.predict(X_test)
print(
    "\n🌲 Random Forest Accuracy:",
    accuracy_score(y_true=y_test, y_pred=y_pred_rf),
)

# Compare Results
# Each row is labelled with the model whose test-set predictions it scores.
# The depth-limited tree gets its own "Pruned" label so it is not reported
# under the label already used for the unrestricted tree.
print("\n===================== MODEL COMPARISON =====================")
print("Decision Tree Accuracy:", round(accuracy_score(y_test, y_pred_dt), 3))
print("Pruned Decision Tree Accuracy:", round(accuracy_score(y_test, y_pred_pruned), 3))
print("Random Forest Accuracy:", round(accuracy_score(y_test, y_pred_rf), 3))
print("============================================================")

# Feature Importance
# Pair each random-forest importance score with its column name, then draw
# the scores as horizontal bars sorted in ascending order.
importances = pd.Series(data=rf.feature_importances_, index=X.columns)
plt.figure(figsize=(10, 6))
importances.sort_values().plot.barh(color='skyblue')
plt.xlabel("Importance Score")
plt.title("Feature Importance (Random Forest)", fontsize=14)
plt.show()

# Cross-Validation
# Score the random forest with 5-fold cross-validation over the full dataset
# (X, y) and report the mean of the fold scores.
scores = cross_val_score(estimator=rf, X=X, y=y, cv=5)
print(
    "\nCross-Validation Accuracy:",
    round(scores.mean(), 3),
)

# Classification Report and Confusion Matrix
# Both are computed from the random forest's predictions on the test rows.
print(
    "\n Classification Report (Random Forest):\n",
    classification_report(y_true=y_test, y_pred=y_pred_rf),
)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_rf)

# Draw the matrix as an annotated heatmap with integer cell labels.
plt.figure(figsize=(6, 4))
sns.heatmap(data=cm, cmap='Blues', annot=True, fmt='d')
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix - Random Forest")
plt.show()
