# 🧠 Task 5: Decision Trees & Random Forests
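This notebook follows the internship PDF instructions strictly: it trains a decision tree and a random forest on the UCI Heart Disease (Cleveland) dataset, compares their test accuracy, inspects feature importances, and evaluates the random forest with 5-fold cross-validation.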

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Load UCI Heart Disease dataset (processed Cleveland file).
# Column names are supplied explicitly via `names=`, so the file is read
# without treating any row as a header.
column_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'

# '?' is treated as the missing-value marker. Declaring it at read time lets
# pandas parse the affected columns as numeric directly, instead of reading
# them as strings and patching them afterwards. Rows with any missing value
# are dropped, then every column is cast to its intended dtype.
df = (
    pd.read_csv(url, names=column_names, na_values='?')
    .dropna()
    .astype({
        'age': 'float', 'sex': 'int', 'cp': 'int', 'trestbps': 'float',
        'chol': 'float', 'fbs': 'int', 'restecg': 'int', 'thalach': 'float',
        'exang': 'int', 'oldpeak': 'float', 'slope': 'int', 'ca': 'float',
        'thal': 'float', 'target': 'int'
    })
)

# Binarize the label: any target value greater than 0 becomes 1, otherwise 0.
# Vectorized comparison replaces the row-by-row lambda.
df['target'] = (df['target'] > 0).astype('int64')
df.head()

In [None]:
# Train-test split
# Separate the predictor columns from the binary label.
X = df.drop(columns='target')
y = df['target']

# Hold out 20% of the rows for testing. `stratify=y` keeps the class ratio the
# same in both partitions so the test accuracy is not skewed by an unlucky
# class mix in the hold-out set; `random_state` makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# 1. Train and visualize Decision Tree
# max_depth=4 limits the tree to four levels; random_state fixes the seed so
# repeated runs build the same tree.
dt = DecisionTreeClassifier(max_depth=4, random_state=42)
dt.fit(X_train, y_train)

# Draw on an explicit Axes rather than the implicit pyplot "current figure".
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(
    dt,
    # Pass a plain list: some scikit-learn releases validate this parameter
    # strictly and reject a pandas Index.
    feature_names=list(X.columns),
    class_names=['No Disease', 'Disease'],
    filled=True,
    ax=ax,
)
ax.set_title("Decision Tree")
plt.show()

# Score the fitted tree on the held-out test rows.
y_pred_dt = dt.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))

In [None]:
# 2. Random Forest & Compare Accuracy
# Fit a 100-tree forest on the training rows (seeded for repeatability),
# then score its predictions on the held-out test rows.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
rf_test_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_test_accuracy)

In [None]:
# 3. Feature Importance
# Pair each predictor column with the importance reported by the fitted
# forest, order from most to least important, and draw a horizontal bar chart.
importances = rf.feature_importances_
feat_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

importance_ax = sns.barplot(data=feat_df, x='Importance', y='Feature')
importance_ax.set_title("Feature Importances")
plt.show()

In [None]:
# 4. Cross-validation
# Score the random forest with 5-fold cross-validation over the full feature
# matrix and label vector, then report the per-fold scores and their mean.
cv_scores = cross_val_score(estimator=rf, X=X, y=y, cv=5)

print("Cross-validation scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())