<a href="https://colab.research.google.com/github/Rohit0562-hub/2513246_RohitJoshi/blob/main/Worksheet_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
import numpy as np

In [32]:
class CustomDecisionTree:
  """Binary decision-tree classifier that splits on entropy-based information gain.

  The fitted tree is stored as nested dicts. An internal node has the keys
  ``feature_idx``, ``threshold``, ``left_tree`` and ``right_tree``; a sample
  goes left when ``x[feature_idx] <= threshold``. A leaf is ``{'class': label}``.

  Parameters
  ----------
  max_depth : int or None, default=None
      Maximum depth of the tree. ``None`` keeps splitting until every node is
      pure or no threshold separates the samples.

  Attributes
  ----------
  tree : dict or None
      Root node of the fitted tree; ``None`` until ``fit`` has been called.
  classes_ : numpy.ndarray or None
      Sorted unique labels seen by ``fit``; ``None`` until ``fit`` has been called.
  """

  def __init__(self, max_depth=None):
    self.max_depth = max_depth
    self.tree = None
    self.classes_ = None

  def fit(self, X, y):
    """Grow the tree on the training data.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Numeric feature matrix.
    y : array-like of shape (n_samples,)
        Class labels. Any sortable label type is accepted (ints, negative
        ints, strings, ...); labels are encoded internally.

    Raises
    ------
    ValueError
        If ``X`` is not 2-dimensional, is empty, or its row count differs
        from the number of labels.
    """
    X = np.asarray(X)
    y = np.asarray(y).ravel()

    if X.ndim != 2:
      raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
    if X.shape[0] == 0:
      raise ValueError("Cannot fit a decision tree on zero samples")
    if X.shape[0] != y.shape[0]:
      raise ValueError(
          f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}")

    # Encode labels as 0..k-1 so np.bincount works for any label type;
    # leaves are decoded back to the original labels in _make_leaf.
    self.classes_, y_encoded = np.unique(y, return_inverse=True)
    self.tree = self._build_tree(X, y_encoded)

  def _make_leaf(self, y):
    """Return a leaf node predicting the most frequent label in ``y``.

    ``y`` holds non-negative integer codes. Ties are resolved in favour of the
    smallest code (``argmax`` returns the first maximum).
    """
    majority = np.bincount(y).argmax()
    # classes_ is None only when _build_tree is called without fit(); in that
    # case the integer code is itself the label.
    label = majority if self.classes_ is None else self.classes_[majority]
    return {'class': label}

  def _build_tree(self, X, y, depth=0):
    """Recursively build the subtree for the samples ``(X, y)``.

    ``y`` must contain non-negative integer class codes (``fit`` takes care of
    the encoding). Returns a leaf or an internal node dict.
    """
    num_features = X.shape[1]

    # Stop when the node is pure or the depth budget is exhausted.
    is_pure = np.unique(y).size <= 1
    depth_exhausted = self.max_depth is not None and depth >= self.max_depth
    if is_pure or depth_exhausted:
      return self._make_leaf(y)

    best_info_gain = -float('inf')
    best_feature_idx = None
    best_threshold = None

    for feature_idx in range(num_features):
      column = X[:, feature_idx]
      # The largest unique value would send every sample left, so it can
      # never be a valid threshold; skip it up front.
      for threshold in np.unique(column)[:-1]:
        left_mask = column <= threshold
        n_left = np.count_nonzero(left_mask)

        # Guard against degenerate splits (e.g. NaN thresholds).
        if n_left == 0 or n_left == len(y):
          continue

        info_gain = self.information_gain(y, y[left_mask], y[~left_mask])

        # Strict '>' keeps the first candidate on ties (lowest feature index,
        # then lowest threshold).
        if info_gain > best_info_gain:
          best_info_gain = info_gain
          best_feature_idx = feature_idx
          best_threshold = threshold

    # No threshold separates the samples (all rows identical): majority leaf.
    if best_feature_idx is None:
      return self._make_leaf(y)

    # Recompute the winning mask once instead of storing a mask per candidate.
    left_mask = X[:, best_feature_idx] <= best_threshold
    right_mask = ~left_mask

    return {
        'feature_idx': best_feature_idx,
        'threshold': best_threshold,
        'left_tree': self._build_tree(X[left_mask], y[left_mask], depth + 1),
        'right_tree': self._build_tree(X[right_mask], y[right_mask], depth + 1),
    }

  def information_gain(self, parent, left, right):
    """Return the entropy reduction achieved by splitting ``parent`` into ``left`` and ``right``.

    All three arguments are arrays of non-negative integer labels; ``left``
    and ``right`` are expected to partition ``parent``.
    """
    n_parent = len(parent)
    weighted_entropy = (
        (len(left) / n_parent) * self._entropy(left)
        + (len(right) / n_parent) * self._entropy(right))

    return self._entropy(parent) - weighted_entropy

  def _entropy(self, y):
    """Return the Shannon entropy (in bits) of the integer label array ``y``.

    Only classes that actually occur are included, so the result is exact
    (0 for a pure node) without adding an epsilon inside the logarithm.
    """
    counts = np.bincount(y)
    class_probs = counts[counts > 0] / len(y)
    return -np.sum(class_probs * np.log2(class_probs))

  def predict(self, X):
    """Predict a class label for every row of ``X``.

    Returns
    -------
    numpy.ndarray
        One predicted label per row, in the label type passed to ``fit``.

    Raises
    ------
    RuntimeError
        If called before ``fit``.
    """
    if self.tree is None:
      raise RuntimeError(
          "CustomDecisionTree is not fitted yet; call fit() before predict()")
    return np.array([self._predict_single(x, self.tree) for x in X])

  def _predict_single(self, x, tree):
    """Walk ``tree`` from its root for one sample ``x`` and return the leaf label."""
    node = tree
    while 'class' not in node:
      if x[node['feature_idx']] <= node['threshold']:
        node = node['left_tree']
      else:
        node = node['right_tree']
    return node['class']


In [33]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the iris dataset and hold out 20% of the rows as a test set.
# random_state=42 pins the split so the accuracies below are reproducible.
data = load_iris()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the from-scratch CustomDecisionTree (depth capped at 3) and score it on
# the held-out rows. accuracy_custom is reused by the comparison cell further down.
custom_tree = CustomDecisionTree(max_depth=3)
custom_tree.fit(X_train, y_train)
y_pred_custom = custom_tree.predict(X_test)
accuracy_custom = accuracy_score(y_test, y_pred_custom)
print(f"Custom Decision Tree Accuracy: {accuracy_custom:.4f}")

Custom Decision Tree Accuracy: 1.0000


In [34]:
# Train the Scikit-learn decision tree
# Uses the same depth cap (max_depth=3) and the same train/test split as the
# custom tree above, so the two accuracy figures are directly comparable.
sklearn_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
sklearn_tree.fit(X_train, y_train)
# Predict on the test set
y_pred_sklearn = sklearn_tree.predict(X_test)
# Calculate accuracy
# accuracy_sklearn is reused by the comparison cell below.
accuracy_sklearn = accuracy_score(y_test, y_pred_sklearn)
print(f"Scikit-learn Decision Tree Accuracy: {accuracy_sklearn:.4f}")

Scikit-learn Decision Tree Accuracy: 1.0000


In [35]:
# Side-by-side summary of the two iris test-set accuracies computed above.
# The header has no placeholders, so it is a plain string rather than an f-string.
print("Accuracy Comparison:")
print(f"Custom Decision Tree: {accuracy_custom:.4f}")
print(f"Scikit-learn Decision Tree: {accuracy_sklearn:.4f}")

Accuracy Comparison:
Custom Decision Tree: 1.0000
Scikit-learn Decision Tree: 1.0000


In [36]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import make_scorer, f1_score, mean_squared_error, r2_score
import numpy as np

In [37]:
# Switch to the wine dataset. NOTE: this rebinds X, y and the train/test names
# that the iris cells above used, so those cells must not be re-run after this
# one without re-running their own split first.
wine = load_wine()
X = wine.data
y = wine.target

# 80/20 split; stratify=y asks train_test_split to preserve the class
# proportions of y in both partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Baseline: a single decision tree with default hyperparameters.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)

# Macro averaging weights every class equally in the F1 score.
f1_dt = f1_score(y_test, y_pred_dt, average='macro')
print("Decision Tree F1 Score:", f1_dt)

Decision Tree F1 Score: 0.9457411645054665


In [38]:
# Random forest of 100 trees on the same wine split, for comparison with the
# single decision tree above.
rf = RandomForestClassifier(n_estimators=100, random_state=42)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

# Same macro-F1 metric as the decision tree so the two scores are comparable.
f1_rf = f1_score(y_test, y_pred_rf, average='macro')
print("Random Forest F1 Score:", f1_rf)

Random Forest F1 Score: 1.0


In [39]:
# Side-by-side summary of the macro-F1 scores computed above on the wine test split.
# The header has no placeholders, so it is a plain string rather than an f-string.
print("\nModel Comparison (F1 Score)")
print(f"Decision Tree : {f1_dt:.4f}")
print(f"Random Forest : {f1_rf:.4f}")


Model Comparison (F1 Score)
Decision Tree : 0.9457
Random Forest : 1.0000


In [None]:
# Exhaustive hyperparameter grid for the random-forest classifier:
# 3 x 4 x 3 = 36 combinations, each evaluated with 5-fold CV below.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# NOTE: rebinds `rf`, replacing the forest fitted in the earlier cell with a
# fresh, unfitted estimator that GridSearchCV uses as its template.
rf = RandomForestClassifier(random_state=42)

# Score candidates with macro-averaged F1, matching the metric reported above.
f1_macro = make_scorer(f1_score, average='macro')

# n_jobs=-1 requests all available cores for the search.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, scoring=f1_macro, cv=5, n_jobs=-1)

# Fit on the training split only; the test split stays untouched for the final check.
grid_search.fit(X_train, y_train)

In [None]:
# Report the winning parameter combination and its cross-validated macro-F1
# from the grid search fitted in the previous cell.
print("Best Hyperparameters:")
print(grid_search.best_params_)

print("\nBest Cross-Validated F1 Score:")
print(grid_search.best_score_)


In [None]:
# Evaluate the best estimator found by the grid search on the held-out wine
# test split, using the same macro-F1 metric as the untuned models.
best_rf = grid_search.best_estimator_

y_pred_best = best_rf.predict(X_test)

f1_best = f1_score(y_test, y_pred_best, average='macro')
print("\nTest F1 Score after tuning:", f1_best)

In [None]:
# Regression section: reload the wine data and treat its target as a numeric
# value to regress on.
# NOTE(review): this is the same `load_wine` target the classification cells
# above used as class labels — confirm that regressing on it is intended.
data = load_wine()
X = data.data
y = data.target

# NOTE: rebinds X_train / X_test / y_train / y_test. Unlike the classification
# split above, this one is not stratified, so the partitions differ; re-running
# the classification cells after this cell would silently use this split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Baseline regressor: a single decision tree with default hyperparameters,
# trained on the regression split created in the previous cell.
dt_reg = DecisionTreeRegressor(random_state=42)
dt_reg.fit(X_train, y_train)

# NOTE: rebinds y_pred_dt, which earlier held the decision-tree *classifier* predictions.
y_pred_dt = dt_reg.predict(X_test)

# Evaluate on the held-out split with mean squared error and R².
mse_dt = mean_squared_error(y_test, y_pred_dt)
r2_dt = r2_score(y_test, y_pred_dt)

print("Decision Tree Regressor")
print("MSE:", mse_dt)
print("R² Score:", r2_dt)

In [None]:
from sklearn.metrics import r2_score
# Random-forest regressor with default hyperparameters on the same regression
# split, for comparison with the single-tree regressor above.
rf_reg = RandomForestRegressor(random_state=42)
rf_reg.fit(X_train, y_train)

# NOTE: rebinds y_pred_rf, which earlier held the random-forest *classifier* predictions.
y_pred_rf = rf_reg.predict(X_test)

# Same metrics as the decision-tree regressor so the numbers are comparable.
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("\nRandom Forest Regressor")
print("MSE:", mse_rf)
print("R² Score:", r2_rf)

In [None]:
# Candidate hyperparameter values for the random-forest regressor
# (4 x 5 x 3 = 60 combinations). The randomized search in the next cell draws
# only n_iter of them instead of trying all 60.
param_dist = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 5, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}


In [None]:
# NOTE: rebinds `rf` again — it is now an unfitted RandomForestRegressor used as
# the template estimator for the randomized search (it was a classifier earlier).
rf = RandomForestRegressor(random_state=42)

# Randomized search over param_dist: 10 sampled combinations, each scored with
# 5-fold CV. The scorer is *negative* MSE, so a higher (closer to zero) score is better.
# random_state=42 makes the sampled combinations reproducible.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    random_state=42)

# Fit on the training split only; the test split is reserved for the final report.
random_search.fit(X_train, y_train)

In [None]:
# Report the parameters chosen by the randomized search and evaluate the
# resulting regressor on the held-out split.
print("Best Hyperparameters:")
print(random_search.best_params_)

# NOTE: rebinds best_rf and y_pred_best, which earlier referred to the tuned
# random-forest *classifier* and its predictions.
best_rf = random_search.best_estimator_

y_pred_best = best_rf.predict(X_test)

# Same metrics as the untuned regressors above, for a like-for-like comparison.
mse_best = mean_squared_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

print("\nTuned Random Forest Regressor")
print("MSE:", mse_best)
print("R² Score:", r2_best)