In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import datasets

# Load the Iris dataset
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Split the dataset into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Decision Tree with Hyperparameter Tuning
param_grid = {'criterion': ['gini', 'entropy'], 'max_depth': range(2, 11)}
best_tree = GridSearchCV(DecisionTreeClassifier(), param_grid, scoring='accuracy', cv=5).fit(X_train, y_train).best_estimator_

# Evaluate Model Performance
print("Best Parameters:", best_tree.get_params())
y_pred = best_tree.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Cost-Complexity Pruning
ccp_alphas = best_tree.cost_complexity_pruning_path(X_train, y_train).ccp_alphas
scores = [DecisionTreeClassifier(ccp_alpha=a).fit(X_train, y_train).score(X_test, y_test) for a in ccp_alphas]


plt.plot(ccp_alphas, train_scores, label='Train Accuracy')
plt.plot(ccp_alphas, test_scores, label='Test Accuracy')
plt.xlabel('ccp_alpha')
plt.ylabel('Accuracy')
plt.legend()
plt.title('Pruning Analysis')
plt.show()

# Feature Importance Analysis
importances = best_tree.feature_importances_
plt.bar(iris.feature_names, importances)
plt.xticks(rotation=45)
plt.title("Feature Importance")
plt.show()

# Retrain without least important feature
least_important = np.argmin(importances)
X_train_reduced, X_test_reduced = np.delete(X_train, least_important, axis=1), np.delete(X_test, least_important, axis=1)
reduced_tree = DecisionTreeClassifier().fit(X_train_reduced, y_train)
print("Accuracy after dropping least important feature:", reduced_tree.score(X_test_reduced, y_test))


Best Parameters: {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 7, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


NameError: name 'train_scores' is not defined