**Building the decision Tree from scratch**

In [None]:
import numpy as np

class DecisionTree:

    def __init__(self, min_samples_split=2, max_depth=5, task="classification", max_features=None, random_state=None):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.task = task
        self.max_features = max_features
        self.random_state = random_state


    class Node:
        def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
            self.feature = feature
            self.threshold = threshold
            self.left = left
            self.right = right
            self.value = value

    def _split(self, dataset, feature, threshold):
        left = dataset[dataset[:, feature] <= threshold]
        right = dataset[dataset[:, feature] > threshold]
        return left, right

    def _gini(self, labels):
        _, counts = np.unique(labels, return_counts=True)
        probs = counts / labels.size
        return 1 - np.sum(probs ** 2)

    def _mse(self, values):
        return np.mean((values - np.mean(values)) ** 2)

    def _cost(self, left_labels, right_labels):
        if self.task == "classification":
            return self._gini(left_labels) + self._gini(right_labels)
        elif self.task == "regression":
            return self._mse(left_labels) + self._mse(right_labels)

    def _choose_best_split(self, dataset, feature_indices):
        best_cost, best_feature, best_threshold = np.inf, None, None

        for feature in feature_indices:
            feature_values = dataset[:, feature]
            possible_thresholds = np.unique(feature_values)

            for threshold in possible_thresholds:
                left, right = self._split(dataset, feature, threshold)
                cost = self._cost(left[:, -1], right[:, -1])

                if cost < best_cost:
                    best_cost, best_feature, best_threshold = cost, feature, threshold

        return best_feature, best_threshold

    def _build(self, dataset, depth):
        n_samples, n_features = dataset.shape

        if n_samples >= self.min_samples_split and depth <= self.max_depth:
            feature_indices = np.random.choice(n_features - 1, self.max_features, replace=False) if self.max_features else np.arange(n_features - 1)
            feature, threshold = self._choose_best_split(dataset, feature_indices)
            if feature is not None:
                left, right = self._split(dataset, feature, threshold)
                left_child = self._build(left, depth + 1)
                right_child = self._build(right, depth + 1)
                return self.Node(feature=feature, threshold=threshold, left=left_child, right=right_child)

        value = np.mean(dataset[:, -1]) if self.task == "regression" else np.bincount(dataset[:, -1].astype(int)).argmax()
        return self.Node(value=value)

    def fit(self, X, y):
        dataset = np.column_stack((X, y))
        self.root = self._build(dataset, 1)

    def _predict(self, x, node):
        if node.value is not None:
            return node.value

        if x[node.feature] <= node.threshold:
            return self._predict(x, node.left)
        else:
            return self._predict(x, node.right)

    def predict(self, X):
        return np.array([self._predict(x, self.root) for x in X])

    def get_params(self, deep=True):
        return {
            "min_samples_split": self.min_samples_split,
            "max_depth": self.max_depth,
            "task": self.task,
            "max_features": self.max_features,
            "random_state": self.random_state,
        }

    def set_params(self, **params):
        for key, value in params.items():
            setattr(self, key, value)
        return self


**Testing the decision tree on a classification task using the Iris dataset**

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

def cross_val_score(model, X, y, cv):
    scores = []
    for train_idx, test_idx in cv.split(X, y):
        model.fit(X[train_idx], y[train_idx])
        y_pred = model.predict(X[test_idx])
        score = accuracy_score(y[test_idx], y_pred)
        scores.append(score)
    return np.array(scores)

# Load the Iris dataset
iris = load_iris()
X, y = iris.data, iris.target

# Create a decision tree for classification 
dt = DecisionTree()

# Set up cross-validation with 5 folds
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation and calculate the average accuracy
scores = cross_val_score(dt, X, y, cv)
print(f"Cross-validated accuracy: {np.mean(scores):.3f} +/- {np.std(scores):.3f}")

Cross-validated accuracy: 0.960 +/- 0.025


**Evaluation of the performance of our Decision Tree on the Iris dataset**

To evaluate our decision tree classifier's performance, we decided to calculate its cross-validated accuracy on a classic dataset for classification (the Iris dataset).

The average accuracy score on this classification task is 96% which suggests that our decision tree performed really well and could accurately predict the species of iris flowers based on their features. 
The standard deviation of 0.025 is very low which suggests that the model is consistent in its predictions across different splits of the dataset.

These results also show that the default values we chose for min_samples_split (2), max_depth (5) and max_features (None) work perfectly well for simple classification tasks

In conclusion, the results indicate that the custom-built decision tree classifier is a reliable model for classifying iris flowers in the given dataset.


**Testing the decision tree on a regression task using the Auto MPG dataset**

In [None]:
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the Auto MPG dataset
mpg_data = sns.load_dataset("mpg").dropna()

# Extract features and target variable
X = mpg_data.drop(columns=["origin", "name", "mpg"])
y = mpg_data["mpg"]

# Preprocessing: Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)


In [None]:
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Your updated DecisionTree class implementation goes here

# Load the Auto MPG dataset
mpg_data = sns.load_dataset("mpg").dropna()

# Extract features and target variable
X = mpg_data.drop(columns=["origin", "name", "mpg"])
y = mpg_data["mpg"]

# Preprocessing: Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the hyperparameter search space
param_dist = {
    "min_samples_split": np.arange(2, 21),
    "max_depth": np.arange(2, 21),
    "max_features": [None] + list(np.arange(1, X_train.shape[1] + 1)),
}

# Create the randomized search with cross-validation
rand_search = RandomizedSearchCV(
    DecisionTree(task="regression", random_state=42),
    param_distributions=param_dist,
    n_iter=100,  # Increased number of iterations
    cv=10,  # Increased number of cross-validation folds
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    random_state=42,
)

# Fit the randomized search to the training data
rand_search.fit(X_train, y_train)

# Get the best model from the randomized search
best_dt = rand_search.best_estimator_

# Make predictions and evaluate the best model
y_pred = best_dt.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Best Parameters:", rand_search.best_params_)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)


Best Parameters: {'min_samples_split': 17, 'max_features': 1, 'max_depth': 19}
Mean Squared Error: 22.951057079327718
R^2 Score: 0.5503370921239449


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


**Evaluation of the performance of our Decision Tree on the Auto MPG dataset**

To evaluate our decision tree regressor's performance, we calculated its MSE and R^2 score on the Auto MPG dataset, which predicts fuel efficiency based on vehicle features.

The MSE of 22.95 suggests that while the model has learned patterns in the data and provides a reasonable approximation, there is still room for improvement as there is still some difference between the predicted fuel efficiency values and the actual values

The R^2 score of 0.5503 suggests that our decision tree model provides a reasonable approximation of fuel efficiency. The R^2 score indicates that the model can explain approximately 55.03% of the variance in the target variable.

These results demonstrate that the tuned hyperparameters min_samples_split (17), max_depth (19), and max_features (1) make the model perform relatively well on this regression task.

In conclusion, our decision tree regressor is a fairly reliable model for simple regression tasks