<a href="https://colab.research.google.com/github/Shreeyamshu/Ai/blob/main/Worksheet8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris, load_wine
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error

In [2]:
class CustomDecisionTree:
    """Minimal entropy-based decision tree classifier built on NumPy.

    The fitted tree is stored in ``self.tree`` as nested dicts. A leaf is
    ``{'class': label}``; an internal node holds ``'feature_idx'``,
    ``'threshold'``, ``'left_tree'`` and ``'right_tree'``. Samples whose
    feature value is ``<= threshold`` go left, all others go right.

    Labels must be non-negative integers because class counts are computed
    with ``np.bincount``.
    """

    def __init__(self, max_depth=None):
        """Initializes the tree with a max_depth constraint.

        Args:
            max_depth: Maximum number of splits on any root-to-leaf path.
                ``None`` means unlimited; ``0`` yields a single leaf.
        """
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        """Trains the model.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of non-negative integer class labels.

        Raises:
            ValueError: If X is not 2-D, is empty, or its length differs
                from that of y.
        """
        # Coerce once so lists / DataFrames work with the boolean-mask
        # indexing used while building the tree.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s).")
        if X.shape[0] == 0:
            raise ValueError("Cannot fit a decision tree on zero samples.")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}."
            )
        self.tree = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        """Recursively builds the tree.

        Args:
            X: Feature matrix for the samples reaching this node.
            y: Integer labels for those samples.
            depth: Number of splits already applied above this node.

        Returns:
            A leaf dict ``{'class': label}`` or an internal-node dict.
        """
        num_samples, num_features = X.shape
        if num_samples == 0:
            # A majority vote over nothing is undefined; fail loudly instead
            # of letting argmax() raise on an empty count array.
            raise ValueError("Cannot build a tree node from zero samples.")

        unique_classes = np.unique(y)

        # Pure node: nothing left to separate.
        if len(unique_classes) == 1:
            return {'class': unique_classes[0]}

        # Compare against None explicitly: a plain truthiness test would
        # treat max_depth=0 as "unlimited" because 0 is falsy.
        if self.max_depth is not None and depth >= self.max_depth:
            return {'class': np.bincount(y).argmax()}

        best_info_gain = float('-inf')
        best_split = None

        for feature_idx in range(num_features):
            column = X[:, feature_idx]
            # Every distinct observed value is a candidate threshold.
            for threshold in np.unique(column):
                left_mask = column <= threshold
                right_mask = ~left_mask

                left_y = y[left_mask]
                right_y = y[right_mask]

                # A split that leaves one side empty separates nothing.
                if len(left_y) == 0 or len(right_y) == 0:
                    continue

                info_gain = self._information_gain(y, left_y, right_y)

                # Strict '>' keeps the first of several equally good splits.
                if info_gain > best_info_gain:
                    best_info_gain = info_gain
                    best_split = {
                        'feature_idx': feature_idx,
                        'threshold': threshold,
                        'left_mask': left_mask,
                        'right_mask': right_mask
                    }

        # No usable split (every feature is constant): fall back to majority.
        if best_split is None:
            return {'class': np.bincount(y).argmax()}

        left_mask = best_split['left_mask']
        right_mask = best_split['right_mask']
        left_tree = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right_tree = self._build_tree(X[right_mask], y[right_mask], depth + 1)

        return {
            'feature_idx': best_split['feature_idx'],
            'threshold': best_split['threshold'],
            'left_tree': left_tree,
            'right_tree': right_tree
        }

    def _information_gain(self, parent, left, right):
        """Calculates Information Gain.

        Returns the parent's entropy minus the size-weighted average entropy
        of the two children.
        """
        parent_entropy = self._entropy(parent)
        left_entropy = self._entropy(left)
        right_entropy = self._entropy(right)

        weight_left = len(left) / len(parent)
        weight_right = len(right) / len(parent)
        weighted_avg = (weight_left * left_entropy) + (weight_right * right_entropy)

        return parent_entropy - weighted_avg

    def _entropy(self, y):
        """Calculates Entropy (base 2) of the integer label array ``y``."""
        class_probs = np.bincount(y) / len(y)
        # The small epsilon keeps log2 finite for classes with zero count.
        return -np.sum(class_probs * np.log2(class_probs + 1e-9))

    def predict(self, X):
        """Predicts classes for input features X.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).

        Returns:
            NumPy array with one predicted label per row of X.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("CustomDecisionTree.predict() called before fit().")
        # Row-wise iteration needs an array; iterating a DataFrame would
        # yield column names instead of samples.
        X = np.asarray(X)
        return np.array([self._predict_single(x, self.tree) for x in X])

    def _predict_single(self, x, tree):
        """Walks one sample ``x`` from node ``tree`` down to a leaf label."""
        node = tree
        while 'class' not in node:
            if x[node['feature_idx']] <= node['threshold']:
                node = node['left_tree']
            else:
                node = node['right_tree']
        return node['class']

In [3]:
# Load Iris and hold out 20% of the samples for testing.
data = load_iris()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Hand-written tree, capped at depth 3.
custom_tree = CustomDecisionTree(max_depth=3)
custom_tree.fit(X_train, y_train)
y_pred_custom = custom_tree.predict(X_test)

# scikit-learn reference tree with the same depth cap.
sklearn_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
sklearn_tree.fit(X_train, y_train)
y_pred_sklearn = sklearn_tree.predict(X_test)

# Score both models on the same held-out split.
acc_custom = accuracy_score(y_test, y_pred_custom)
acc_sklearn = accuracy_score(y_test, y_pred_sklearn)

print(f"Custom Tree Accuracy:      {acc_custom:.4f}")
print(f"Scikit-Learn Tree Accuracy: {acc_sklearn:.4f}")

Custom Tree Accuracy:      1.0000
Scikit-Learn Tree Accuracy: 1.0000


# Exercise


In [4]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import f1_score, mean_squared_error

## Loading the Dataset


In [5]:
# Wine dataset: feature matrix and class labels.
wine = load_wine()
X_wine = wine.data
y_wine = wine.target

## Splitting the Data (80/20)

In [6]:
# Hold out 20% of the wine samples; the fixed seed keeps the split reproducible.
X_train_w, X_test_w, y_train_w, y_test_w = train_test_split(
    X_wine,
    y_wine,
    test_size=0.2,
    random_state=30,
)

## Task 1

### Training a Decision Tree and a Random Forest

In [7]:
# Baseline classifiers with default hyperparameters and a shared seed.
dt_clf = DecisionTreeClassifier(random_state=30).fit(X_train_w, y_train_w)
rf_clf = RandomForestClassifier(random_state=30).fit(X_train_w, y_train_w)

# fit() returns the estimator itself, so ending on the fitted forest keeps
# the same object as the cell's displayed value.
rf_clf



### Comparison Based on F1 Score

In [8]:
# Weighted F1 on the held-out wine test set for both baseline classifiers.
y_pred_dt_w = dt_clf.predict(X_test_w)
y_pred_rf_w = rf_clf.predict(X_test_w)

f1_dt = f1_score(y_test_w, y_pred_dt_w, average='weighted')
f1_rf = f1_score(y_test_w, y_pred_rf_w, average='weighted')

print(f"Decision Tree F1 Score: {f1_dt:.4f}")
print(f"Random Forest F1 Score: {f1_rf:.4f}")

Decision Tree F1 Score: 0.8327
Random Forest F1 Score: 0.9724


## Task 2

### Hyperparameter Grid

In [9]:
# Search space for the random-forest grid search (2 x 3 x 2 = 12 combinations).
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)

### Grid Search

In [10]:
# Exhaustive search over param_grid using 3-fold cross-validation on the
# training split, ranking candidates by weighted F1.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=3,
    scoring='f1_weighted',
)
grid_search.fit(X_train_w, y_train_w)

print(f"Best Tuned Parameters: {grid_search.best_params_}")
# best_score_ is the mean cross-validated score on the *training* split, so
# it is not directly comparable with F1 scores measured on the test split.
print(f"Best Tuned F1 Score:   {grid_search.best_score_:.4f}")

# Score the refitted best model on the held-out test set so the tuned forest
# can be compared like-for-like with test-set F1 scores. With the default
# refit=True, grid_search.predict delegates to the best estimator.
f1_rf_tuned = f1_score(y_test_w, grid_search.predict(X_test_w), average='weighted')
print(f"Tuned Test F1 Score:   {f1_rf_tuned:.4f}")

Best Tuned Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Best Tuned F1 Score:   0.9931


## Task 3

In [11]:
# Regression task: predict the first wine feature (column 0) from the
# remaining feature columns.
X_reg = wine.data[:, 1:]
y_reg = wine.data[:, 0]
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=30
)

# Baselines with default hyperparameters, scored by test-set MSE.
dt_reg = DecisionTreeRegressor(random_state=42)
dt_reg.fit(X_train_r, y_train_r)
mse_dt = mean_squared_error(y_test_r, dt_reg.predict(X_test_r))

rf_reg = RandomForestRegressor(random_state=42)
rf_reg.fit(X_train_r, y_train_r)
mse_rf = mean_squared_error(y_test_r, rf_reg.predict(X_test_r))

print(f"Baseline Decision Tree MSE: {mse_dt:.4f}")
print(f"Baseline Random Forest MSE: {mse_rf:.4f}")

# Randomized search: 5 sampled configurations, 3-fold CV on the training
# split. No `scoring` is given, so candidates are ranked by the estimator's
# own score method rather than by MSE.
param_dist = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

random_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=30),
    param_dist,
    n_iter=5,
    cv=3,
    random_state=30,
)
random_search.fit(X_train_r, y_train_r)

print(f"Best Regressor Params: {random_search.best_params_}")

# Evaluate the refitted best model on the same held-out split as the
# baselines; without this the tuned forest cannot be compared with them.
# With the default refit=True, random_search.predict uses the best estimator.
mse_rf_tuned = mean_squared_error(y_test_r, random_search.predict(X_test_r))
print(f"Tuned Random Forest MSE:    {mse_rf_tuned:.4f}")

Baseline Decision Tree MSE: 0.5151
Baseline Random Forest MSE: 0.2304
Best Regressor Params: {'n_estimators': 100, 'min_samples_leaf': 4, 'max_depth': 5}
