<a href="https://colab.research.google.com/github/Varshini-svnit/ML_LABS/blob/main/lab7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import KFold

In [None]:
def accuracy_from_scratch(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_true: Array-like of ground-truth labels.
        y_pred: Array-like of predicted labels with the same shape as
            ``y_true``.

    Returns:
        The accuracy as a float in ``[0, 1]``; ``0.0`` when there are no
        samples.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` have different shapes.
    """
    # Coerce to arrays so plain lists are compared element-wise; on lists
    # ``==`` yields a single bool for the whole sequence.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        # Without this check mismatched shapes would broadcast silently.
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.size == 0:
        # Avoid a 0/0 division (NaN plus RuntimeWarning) on empty input.
        return 0.0
    return float(np.mean(y_true == y_pred))

def precision_recall_f1_from_scratch(y_true, y_pred, average='macro'):
    """Compute precision, recall and F1-score for multi-class labels.

    Args:
        y_true: Array-like of ground-truth labels.
        y_pred: Array-like of predicted labels, aligned with ``y_true``.
        average: How per-class scores are combined.
            ``'macro'`` takes the unweighted mean over classes,
            ``'weighted'`` weights each class by its number of true samples,
            ``'micro'`` pools true/false positives and false negatives over
            all classes before computing the scores. Any other value falls
            back to ``'macro'``.

    Returns:
        A ``(precision, recall, f1)`` tuple of floats. A score whose
        denominator is zero is reported as ``0``; empty input yields
        ``(0.0, 0.0, 0.0)``.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.size == 0:
        # No samples: avoid the NaN that a mean over zero classes would give.
        return 0.0, 0.0, 0.0

    def ratio(numerator, denominator):
        # Scores with an empty denominator are defined as 0.
        return numerator / denominator if denominator > 0 else 0.0

    def f_score(precision, recall):
        return ratio(2 * precision * recall, precision + recall)

    # Classes seen in either vector, so spurious predicted classes count too.
    classes = np.unique(np.concatenate((y_true, y_pred)))
    tp = np.array(
        [np.sum((y_true == cls) & (y_pred == cls)) for cls in classes], dtype=float
    )
    fp = np.array(
        [np.sum((y_true != cls) & (y_pred == cls)) for cls in classes], dtype=float
    )
    fn = np.array(
        [np.sum((y_true == cls) & (y_pred != cls)) for cls in classes], dtype=float
    )

    if average == 'micro':
        precision = ratio(tp.sum(), tp.sum() + fp.sum())
        recall = ratio(tp.sum(), tp.sum() + fn.sum())
        return float(precision), float(recall), float(f_score(precision, recall))

    precisions = np.array([ratio(t, t + f) for t, f in zip(tp, fp)])
    recalls = np.array([ratio(t, t + f) for t, f in zip(tp, fn)])
    f1s = np.array([f_score(p, r) for p, r in zip(precisions, recalls)])

    if average == 'weighted':
        # Support = number of true samples per class; its sum is the sample
        # count, which is non-zero here.
        support = tp + fn
        return (
            float(np.average(precisions, weights=support)),
            float(np.average(recalls, weights=support)),
            float(np.average(f1s, weights=support)),
        )

    # 'macro' and, for backward compatibility, any unrecognised value.
    return float(np.mean(precisions)), float(np.mean(recalls)), float(np.mean(f1s))

def evaluate_classifier_from_scratch(y_true, y_pred, average='macro'):
    """Bundle the from-scratch classification metrics into one tuple.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        average: Averaging mode forwarded to
            ``precision_recall_f1_from_scratch``.

    Returns:
        ``(accuracy, precision, recall, f1)``.
    """
    acc = accuracy_from_scratch(y_true, y_pred)
    prec, rec, f1_score = precision_recall_f1_from_scratch(y_true, y_pred, average)
    return (acc, prec, rec, f1_score)

def mse_from_scratch(y_true, y_pred):
    """Return the mean squared error between targets and predictions.

    Args:
        y_true: Array-like of true target values.
        y_pred: Array-like of predicted values, aligned with ``y_true``.

    Returns:
        The mean of the squared differences.
    """
    # Work in floating point: plain lists do not support ``-`` and
    # unsigned-integer arrays would wrap around on negative differences.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean((y_true - y_pred) ** 2)

def mae_from_scratch(y_true, y_pred):
    """Return the mean absolute error between targets and predictions.

    Args:
        y_true: Array-like of true target values.
        y_pred: Array-like of predicted values, aligned with ``y_true``.

    Returns:
        The mean of the absolute differences.
    """
    # Work in floating point: plain lists do not support ``-`` and
    # unsigned-integer arrays would wrap around (e.g. uint8 1 - 2 == 255).
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(y_true - y_pred))

def r2_from_scratch(y_true, y_pred):
    """Return the coefficient of determination (R-squared).

    Args:
        y_true: Array-like of true target values.
        y_pred: Array-like of predicted values, aligned with ``y_true``.

    Returns:
        ``1 - ss_res / ss_tot``. When the targets are constant
        (``ss_tot == 0``) the score is ``1.0`` for a perfect fit and ``0.0``
        otherwise.
    """
    # Work in floating point: plain lists do not support ``-`` and
    # unsigned-integer arrays would wrap around on negative differences.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    if ss_tot == 0:
        # Constant targets: the ratio is undefined, so use a fixed convention.
        return 1.0 if ss_res == 0 else 0.0
    return 1 - (ss_res / ss_tot)

def evaluate_regressor_from_scratch(y_true, y_pred):
    """Bundle the from-scratch regression metrics into one tuple.

    Args:
        y_true: True target values.
        y_pred: Predicted values.

    Returns:
        ``(mse, mae, rmse, r2)``, where ``rmse`` is the square root of
        ``mse``.
    """
    mean_sq = mse_from_scratch(y_true, y_pred)
    mean_abs = mae_from_scratch(y_true, y_pred)
    return mean_sq, mean_abs, np.sqrt(mean_sq), r2_from_scratch(y_true, y_pred)

In [None]:
class Node:
    """A single vertex of a binary decision tree.

    A node built with ``feature``/``threshold`` and two children describes a
    split; a node built with the keyword-only ``value`` carries a prediction.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        # Split description (left unset on leaves).
        self.feature, self.threshold = feature, threshold
        # Child subtrees (left unset on leaves).
        self.left, self.right = left, right
        # Stored prediction; ``None`` means "not a leaf".
        self.value = value

    def is_leaf_node(self):
        """Return True when this node stores a prediction value."""
        if self.value is None:
            return False
        return True


In [None]:
class DecisionTreeClassifier:
    """Binary-split decision tree classifier with entropy-based criteria.

    Every candidate split has the form ``feature <= threshold``. With
    ``criterion="c4.5"`` candidates are ranked by gain ratio; with any other
    value (including the default ``"id3"``) they are ranked by information
    gain.

    Args:
        min_samples_split: A node with fewer samples than this becomes a leaf.
        max_depth: Nodes at this depth become leaves.
        criterion: ``"id3"`` or ``"c4.5"``.
    """

    def __init__(self, min_samples_split=2, max_depth=100, criterion="id3"):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.criterion = criterion
        self.root = None

    def fit(self, X, y):
        """Grow the tree from a 2-D feature matrix ``X`` and label vector ``y``."""
        # Accept lists / other array-likes as well as NumPy arrays.
        self.root = self._grow_tree(np.asarray(X), np.asarray(y))

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``(X, y)``."""
        n_samples, n_features = X.shape
        if (depth >= self.max_depth or len(np.unique(y)) == 1 or n_samples < self.min_samples_split):
            return Node(value=self._most_common_label(y))

        # All features are considered; the random permutation only changes
        # which of several equally good splits is found first.
        feat_idxs = np.random.choice(n_features, n_features, replace=False)
        best_feat, best_thresh = self._best_criteria(X, y, feat_idxs)

        if best_feat is None:
            return Node(value=self._most_common_label(y))

        left_idxs, right_idxs = self._split(X[:, best_feat], best_thresh)

        # A split that sends every sample to one side cannot make progress.
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return Node(value=self._most_common_label(y))

        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth + 1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth + 1)
        return Node(best_feat, best_thresh, left, right)

    def _best_criteria(self, X, y, feat_idxs):
        """Return ``(feature_index, threshold)`` of the best-scoring split.

        Every unique value of every listed feature is tried as a threshold.
        Returns ``(None, None)`` when there is no candidate at all.
        """
        best_gain = -1
        split_idx, split_thresh = None, None
        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            thresholds = np.unique(X_column)
            for threshold in thresholds:
                gain = self._information_gain(y, X_column, threshold)
                if self.criterion == "c4.5":
                    gain = self._gain_ratio(y, X_column, threshold, gain)
                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_thresh = threshold
        return split_idx, split_thresh

    def _information_gain(self, y, X_column, split_thresh):
        """Return parent entropy minus the size-weighted child entropy.

        A split that leaves one side empty scores ``0``.
        """
        parent_entropy = self._entropy(y)
        left_idxs, right_idxs = self._split(X_column, split_thresh)
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0
        n, n_l, n_r = len(y), len(left_idxs), len(right_idxs)
        e_l, e_r = self._entropy(y[left_idxs]), self._entropy(y[right_idxs])
        child_entropy = (n_l / n) * e_l + (n_r / n) * e_r
        return parent_entropy - child_entropy

    def _gain_ratio(self, y, X_column, split_thresh, information_gain):
        """Return information gain divided by split info (``0`` if that is 0)."""
        split_info = self._split_info(y, X_column, split_thresh)
        return information_gain / split_info if split_info != 0 else 0

    def _split_info(self, y, X_column, split_thresh):
        """Return the entropy of the left/right partition sizes."""
        left_idxs, right_idxs = self._split(X_column, split_thresh)
        n, n_l, n_r = len(y), len(left_idxs), len(right_idxs)
        if n_l == 0 or n_r == 0:
            return 0
        p_l, p_r = n_l / n, n_r / n
        return -p_l * np.log2(p_l) - p_r * np.log2(p_r)

    def _split(self, X_column, split_thresh):
        """Return index arrays for ``X_column <= thresh`` and ``X_column > thresh``."""
        return np.argwhere(X_column <= split_thresh).flatten(), np.argwhere(X_column > split_thresh).flatten()

    def _entropy(self, y):
        """Return the Shannon entropy (base 2) of the label vector ``y``."""
        # np.unique counts any label type; np.bincount would reject negative
        # codes (such as -1) and non-integer labels.
        _, counts = np.unique(y, return_counts=True)
        ps = counts / len(y)
        # Every count is positive, so log2 is always defined here.
        return -np.sum(ps * np.log2(ps))

    def _most_common_label(self, y):
        """Return the most frequent label in ``y``."""
        return Counter(y).most_common(1)[0][0]

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``."""
        return np.array([self._traverse_tree(x, self.root) for x in np.asarray(X)])

    def _traverse_tree(self, x, node):
        """Follow the split rules from ``node`` down to a leaf for sample ``x``."""
        if node.is_leaf_node():
            return node.value
        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)


In [None]:
class DecisionTreeRegressor:
    """Binary-split decision tree regressor using variance reduction.

    Every candidate split has the form ``feature <= threshold``; leaves
    predict the mean target of the training samples that reached them.

    Args:
        min_samples_split: A node with fewer samples than this becomes a leaf.
        max_depth: Nodes at this depth become leaves.
    """

    def __init__(self, min_samples_split=2, max_depth=100):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.root = None

    def fit(self, X, y):
        """Grow the tree from a 2-D feature matrix ``X`` and target vector ``y``."""
        # Accept lists / other array-likes as well as NumPy arrays.
        self.root = self._grow_tree(np.asarray(X), np.asarray(y))

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``(X, y)``."""
        n_samples, n_features = X.shape
        # Constant targets cannot be improved by splitting: every descendant
        # leaf would predict the same mean.
        if (depth >= self.max_depth or n_samples < self.min_samples_split
                or np.unique(y).size <= 1):
            return Node(value=np.mean(y))

        # All features are considered; the random permutation only changes
        # which of several equally good splits is found first.
        feat_idxs = np.random.choice(n_features, n_features, replace=False)
        best_feat, best_thresh = self._best_criteria(X, y, feat_idxs)
        if best_feat is None:
            return Node(value=np.mean(y))

        left_idxs, right_idxs = self._split(X[:, best_feat], best_thresh)
        # Defensive guard: never recurse into an empty branch, whose mean
        # would be NaN.
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return Node(value=np.mean(y))

        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth + 1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth + 1)
        return Node(best_feat, best_thresh, left, right)

    def _best_criteria(self, X, y, feat_idxs):
        """Return ``(feature_index, threshold)`` with the largest variance reduction.

        Only splits that put at least one sample on each side are considered.
        Returns ``(None, None)`` when no such split exists (for example when
        all rows are identical).
        """
        best_vr = -1
        split_idx, split_thresh = None, None
        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            for threshold in np.unique(X_column):
                left_idxs, right_idxs = self._split(X_column, threshold)
                if len(left_idxs) == 0 or len(right_idxs) == 0:
                    # E.g. the column maximum: everything goes left.
                    continue
                vr = self._variance_reduction(y, X_column, threshold)
                if vr > best_vr:
                    best_vr = vr
                    split_idx = feat_idx
                    split_thresh = threshold
        return split_idx, split_thresh

    def _variance_reduction(self, y, X_column, split_thresh):
        """Return parent variance minus the size-weighted child variance.

        A split that leaves one side empty scores ``0``.
        """
        parent_variance = np.var(y)
        left_idxs, right_idxs = self._split(X_column, split_thresh)
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0
        n, n_l, n_r = len(y), len(left_idxs), len(right_idxs)
        var_l, var_r = np.var(y[left_idxs]), np.var(y[right_idxs])
        child_variance = (n_l / n) * var_l + (n_r / n) * var_r
        return parent_variance - child_variance

    def _split(self, X_column, split_thresh):
        """Return index arrays for ``X_column <= thresh`` and ``X_column > thresh``."""
        return np.argwhere(X_column <= split_thresh).flatten(), np.argwhere(X_column > split_thresh).flatten()

    def predict(self, X):
        """Return an array with one predicted value per row of ``X``."""
        return np.array([self._traverse_tree(x, self.root) for x in np.asarray(X)])

    def _traverse_tree(self, x, node):
        """Follow the split rules from ``node`` down to a leaf for sample ``x``."""
        if node.is_leaf_node():
            return node.value
        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)


__play cricket__

In [None]:
# Load the play-cricket table and discard the 'Day' column.
df_cricket = pd.read_csv("playCricket.csv")
df_cricket = df_cricket.drop('Day', axis=1)
# Replace every remaining column (features and target) by its integer
# category codes so the tree can compare values with <= / >.
for col in df_cricket.columns: df_cricket[col] = df_cricket[col].astype('category').cat.codes
# The last column is used as the target, all others as features.
X_cricket, y_cricket = df_cricket.iloc[:, :-1].values, df_cricket.iloc[:, -1].values
# 5-fold shuffled cross-validation with a fixed seed; the same splitter is
# reused for both criteria below (per scikit-learn's KFold docs an integer
# random_state should yield the same folds on every split() call - confirm).
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# ID3: rank splits by information gain.
print("\n--- ID3 ---")
# One (accuracy, precision, recall, f1) tuple per fold.
id3_scores = []
for train_index, test_index in kf.split(X_cricket):
    X_train, X_test = X_cricket[train_index], X_cricket[test_index]
    y_train, y_test = y_cricket[train_index], y_cricket[test_index]
    # A fresh tree per fold, with the class defaults for depth/min split.
    model = DecisionTreeClassifier(criterion='id3')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    id3_scores.append(evaluate_classifier_from_scratch(y_test, y_pred))
# Average each metric over the folds (axis 0 = fold axis).
avg_id3 = np.mean(id3_scores, axis=0)
print(f"Average Accuracy: {avg_id3[0]:.4f}\nAverage Precision: {avg_id3[1]:.4f}\nAverage Recall: {avg_id3[2]:.4f}\nAverage F1-score: {avg_id3[3]:.4f}")

# C4.5: rank splits by gain ratio; otherwise the same procedure as above.
print("\n--- C4.5 ---")
# One (accuracy, precision, recall, f1) tuple per fold.
c45_scores = []
for train_index, test_index in kf.split(X_cricket):
    X_train, X_test = X_cricket[train_index], X_cricket[test_index]
    y_train, y_test = y_cricket[train_index], y_cricket[test_index]
    model = DecisionTreeClassifier(criterion='c4.5')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    c45_scores.append(evaluate_classifier_from_scratch(y_test, y_pred))
# Average each metric over the folds (axis 0 = fold axis).
avg_c45 = np.mean(c45_scores, axis=0)
print(f"Average Accuracy: {avg_c45[0]:.4f}\nAverage Precision: {avg_c45[1]:.4f}\nAverage Recall: {avg_c45[2]:.4f}\nAverage F1-score: {avg_c45[3]:.4f}")



--- ID3 ---
Average Accuracy: 0.7000
Average Precision: 0.6667
Average Recall: 0.6500
Average F1-score: 0.6133

--- C4.5 ---
Average Accuracy: 0.7000
Average Precision: 0.6667
Average Recall: 0.6500
Average F1-score: 0.6133


__drug_200__

In [None]:
# Load the drug data set.
df_drug = pd.read_csv("drug_200.csv")
# Replace the listed columns (including the 'Drug' target) by their integer
# category codes.
for col in ['Sex', 'BP', 'Cholesterol', 'Drug']: df_drug[col] = df_drug[col].astype('category').cat.codes
# Binarise 'Age' and 'Na_to_K': 1 when the value is above the column mean,
# else 0.
# NOTE(review): the mean is computed on the full data set before the
# cross-validation split, so each threshold also reflects test-fold rows.
for col in ['Age', 'Na_to_K']: df_drug[col] = (df_drug[col] > df_drug[col].mean()).astype(int)
# 'Drug' is the target; every other column is a feature.
X_drug, y_drug = df_drug.drop('Drug', axis=1).values, df_drug['Drug'].values
# 5-fold shuffled cross-validation with a fixed seed, reused for both
# criteria below.
kf_drug = KFold(n_splits=5, shuffle=True, random_state=42)

# ID3: rank splits by information gain.
print("\n--- ID3 ---")
# One (accuracy, precision, recall, f1) tuple per fold.
id3_drug_scores = []
for train_index, test_index in kf_drug.split(X_drug):
    X_train, X_test = X_drug[train_index], X_drug[test_index]
    y_train, y_test = y_drug[train_index], y_drug[test_index]
    # A fresh tree per fold, with the class defaults for depth/min split.
    model = DecisionTreeClassifier(criterion='id3')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    id3_drug_scores.append(evaluate_classifier_from_scratch(y_test, y_pred))
# Average each metric over the folds (axis 0 = fold axis).
avg_id3_drug = np.mean(id3_drug_scores, axis=0)
print(f"Average Accuracy: {avg_id3_drug[0]:.4f}\nAverage Precision: {avg_id3_drug[1]:.4f}\nAverage Recall: {avg_id3_drug[2]:.4f}\nAverage F1-score: {avg_id3_drug[3]:.4f}")

# C4.5: rank splits by gain ratio; otherwise the same procedure as above.
print("\n--- C4.5 ---")
# One (accuracy, precision, recall, f1) tuple per fold.
c45_drug_scores = []
for train_index, test_index in kf_drug.split(X_drug):
    X_train, X_test = X_drug[train_index], X_drug[test_index]
    y_train, y_test = y_drug[train_index], y_drug[test_index]
    model = DecisionTreeClassifier(criterion='c4.5')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    c45_drug_scores.append(evaluate_classifier_from_scratch(y_test, y_pred))
# Average each metric over the folds (axis 0 = fold axis).
avg_c45_drug = np.mean(c45_drug_scores, axis=0)
print(f"Average Accuracy: {avg_c45_drug[0]:.4f}\nAverage Precision: {avg_c45_drug[1]:.4f}\nAverage Recall: {avg_c45_drug[2]:.4f}\nAverage F1-score: {avg_c45_drug[3]:.4f}")



--- ID3 ---
Average Accuracy: 0.8950
Average Precision: 0.8231
Average Recall: 0.8699
Average F1-score: 0.8276

--- C4.5 ---
Average Accuracy: 0.8950
Average Precision: 0.8231
Average Recall: 0.8699
Average F1-score: 0.8276


__petrol_consumption__

In [None]:
# Load the petrol-consumption table; the last column is used as the
# regression target and all other columns as features.
df_petrol = pd.read_csv("petrol_consumption.csv")
X_petrol, y_petrol = df_petrol.iloc[:, :-1].values, df_petrol.iloc[:, -1].values
# 5-fold shuffled cross-validation with a fixed seed.
kf_petrol = KFold(n_splits=5, shuffle=True, random_state=42)

# One (mse, mae, rmse, r2) tuple per fold.
reg_scores = []
for train_index, test_index in kf_petrol.split(X_petrol):
    X_train, X_test = X_petrol[train_index], X_petrol[test_index]
    y_train, y_test = y_petrol[train_index], y_petrol[test_index]
    # A fresh tree per fold, with the class defaults for depth/min split.
    model = DecisionTreeRegressor()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    reg_scores.append(evaluate_regressor_from_scratch(y_test, y_pred))
# Average each metric over the folds (axis 0 = fold axis).
avg_reg = np.mean(reg_scores, axis=0)
print(f"\nAverage MSE: {avg_reg[0]:.4f}\nAverage MAE: {avg_reg[1]:.4f}\nAverage RMSE: {avg_reg[2]:.4f}\nAverage R-squared: {avg_reg[3]:.4f}")



Average MSE: 12152.9089
Average MAE: 75.8600
Average RMSE: 105.8226
Average R-squared: -0.3261


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
