# In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris, load_diabetes
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import (accuracy_score, confusion_matrix, 
                            mean_squared_error, r2_score)

# ------------------------- Classification (Iris Dataset) -------------------------
# a. Split dataset: hold out 30% of the samples for testing (fixed seed).
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# b. Build decision tree: fitted without pruning, as the baseline.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# c. Check performance
train_acc = clf.score(X_train, y_train)
test_acc = clf.score(X_test, y_test)
print("Classification - Before Pruning:")
print(f"  Train Accuracy: {train_acc:.3f}, Test Accuracy: {test_acc:.3f}")

# d. Cost complexity pruning
path = clf.cost_complexity_pruning_path(X_train, y_train)
# Floating-point round-off in the pruning path may leave alphas marginally
# below zero, which the tree estimator rejects as an invalid ccp_alpha.
# Clamp at zero and drop duplicates so every grid candidate is valid and
# distinct (np.unique also keeps the values in ascending order).
ccp_alphas = np.unique(np.clip(path.ccp_alphas, 0.0, None))

# Cross-validation to find optimal alpha
clf_pruned = GridSearchCV(DecisionTreeClassifier(random_state=42),
                         param_grid={'ccp_alpha': ccp_alphas},
                         cv=5)
clf_pruned.fit(X_train, y_train)

print(f"\nAfter Pruning (Best alpha: {clf_pruned.best_params_['ccp_alpha']:.4f}):")
print(f"  Train Accuracy: {clf_pruned.score(X_train, y_train):.3f}")
print(f"  Test Accuracy: {clf_pruned.score(X_test, y_test):.3f}")

# Visualize pruning effect
# cv_results_ follows the order of the single-parameter grid, so the scores
# line up with ccp_alphas. The largest alpha is left out of the plot: it is
# the end of the pruning path, where the tree has been pruned down the most.
mean_cv_accuracy = clf_pruned.cv_results_['mean_test_score']
plt.figure(figsize=(10, 6))
plt.plot(ccp_alphas[:-1], mean_cv_accuracy[:-1], marker='o')
plt.xlabel("Alpha")
plt.ylabel("Mean Accuracy")
plt.title("Classification: Accuracy vs Alpha for Cost Complexity Pruning")
plt.show()

# ------------------------- Regression (Diabetes Dataset) -------------------------
# a. Split dataset: hold out 30% of the samples for testing (fixed seed).
X_reg, y_reg = load_diabetes(return_X_y=True)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.3, random_state=42)

# b. Build decision tree: fitted without pruning, as the baseline.
reg = DecisionTreeRegressor(random_state=42)
reg.fit(X_train_reg, y_train_reg)

# c. Check performance
train_mse = mean_squared_error(y_train_reg, reg.predict(X_train_reg))
test_mse = mean_squared_error(y_test_reg, reg.predict(X_test_reg))
print("\nRegression - Before Pruning:")
print(f"  Train MSE: {train_mse:.2f}, Test MSE: {test_mse:.2f}")

# d. Cost complexity pruning
path_reg = reg.cost_complexity_pruning_path(X_train_reg, y_train_reg)
# Floating-point round-off in the pruning path may leave alphas marginally
# below zero, which the tree estimator rejects as an invalid ccp_alpha.
# Clamp at zero and drop duplicates so every grid candidate is valid and
# distinct (np.unique also keeps the values in ascending order).
ccp_alphas_reg = np.unique(np.clip(path_reg.ccp_alphas, 0.0, None))

# Cross-validation for regression
# NOTE: no `scoring` is passed, so candidates are ranked by the regressor's
# own score (R^2), while the figures printed below are MSE.
reg_pruned = GridSearchCV(DecisionTreeRegressor(random_state=42),
                         param_grid={'ccp_alpha': ccp_alphas_reg},
                         cv=5)
reg_pruned.fit(X_train_reg, y_train_reg)

# Predict once per split with the refit best estimator, then report.
pruned_train_mse = mean_squared_error(y_train_reg, reg_pruned.predict(X_train_reg))
pruned_test_mse = mean_squared_error(y_test_reg, reg_pruned.predict(X_test_reg))
print(f"\nAfter Pruning (Best alpha: {reg_pruned.best_params_['ccp_alpha']:.4f}):")
print(f"  Train MSE: {pruned_train_mse:.2f}")
print(f"  Test MSE: {pruned_test_mse:.2f}")

# ------------------------- Ensemble Methods -------------------------
# e. Random Forest: 100 trees fitted on the Iris training split.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)
rf_train_acc = rf_clf.score(X_train, y_train)
rf_test_acc = rf_clf.score(X_test, y_test)
print("\nRandom Forest Classifier:")
print(f"  Train Accuracy: {rf_train_acc:.3f}")
print(f"  Test Accuracy: {rf_test_acc:.3f}")

# f. AdaBoost with Decision Stumps: 50 boosted depth-1 trees.
decision_stump = DecisionTreeClassifier(max_depth=1)
ada_clf = AdaBoostClassifier(decision_stump, n_estimators=50,
                             learning_rate=1.0, random_state=42)
ada_clf.fit(X_train, y_train)
ada_train_acc = ada_clf.score(X_train, y_train)
ada_test_acc = ada_clf.score(X_test, y_test)
print("\nAdaBoost Classifier:")
print(f"  Train Accuracy: {ada_train_acc:.3f}")
print(f"  Test Accuracy: {ada_test_acc:.3f}")

# Feature Importance Visualization
# One horizontal bar per input feature, labelled with the Iris feature names.
bar_positions = range(X.shape[1])
feature_labels = load_iris().feature_names
plt.figure(figsize=(12, 6))
plt.barh(bar_positions, rf_clf.feature_importances_, align='center')
plt.yticks(bar_positions, feature_labels)
plt.xlabel("Feature Importance")
plt.title("Random Forest Feature Importance")
plt.show()
