In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
pd.options.display.max_columns = None

In [2]:
# Short, readable names for the six CSV columns; they are assigned by
# position, so their order has to follow the column order of the file.
COLUMN_NAMES = [
    "Sleep Quality",
    "Headache Frequency",
    "Academic Performance",
    "Study Load",
    "Extracurricular Frequency",
    "Stress Level",
]

dataset = pd.read_csv("./Student Stress Factors (2).csv")
dataset.columns = COLUMN_NAMES

## Train-Validation Split

In [3]:
import random

def train_valid_split(df: pd.DataFrame, test_size: float = 0.2, random_state: int=10) -> tuple:
    """
    Randomly splits a DataFrame into a training part and a validation part.

    Rows are selected by position, so the split is correct for any index
    (default, non-contiguous or with duplicated labels) and the two parts
    never overlap.

    Parameters
    ----------
    df: pd.DataFrame
        Full dataset to split
    test_size: float, optional
        Fraction of the rows placed in the validation set, by default 0.2
    random_state: int, optional
        Seed of the row sampling, by default 10

    Returns
    -------
    train_data: pd.DataFrame
        Rows that were not sampled, in their original order
    valid_data: pd.DataFrame
        Sampled rows, in sampling order

    Raises
    ------
    ValueError
        If test_size is not within [0, 1]
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    # A private generator draws the same sample as seeding the module-level
    # one would, but leaves the global random state of the caller untouched.
    rng = random.Random(random_state)

    num_rows = len(df)
    num_valid = round(test_size * num_rows)

    # Sample row positions rather than index labels: positions are always
    # valid for .iloc, whereas labels are only valid for a default RangeIndex.
    valid_positions = rng.sample(range(num_rows), k=num_valid)

    is_valid = np.zeros(num_rows, dtype=bool)
    is_valid[valid_positions] = True

    train_data = df.iloc[~is_valid]
    valid_data = df.iloc[valid_positions]

    return train_data, valid_data


## Implementing a Decision Tree

In [4]:
class Node:
    """
    Decision (non-terminal) node of the tree.

    Stores the splitting rule (feature and threshold), the two child
    subtrees and the information gain that was obtained by the split.
    """

    def __init__(self, feature, threshold, left, right, info_gain):
        # Flag that lets tree traversal tell decision nodes from leaves
        self.is_leaf = False

        # Splitting rule of this node
        self.feature, self.threshold = feature, threshold

        # Child subtrees on either side of the threshold
        self.left, self.right = left, right

        # Information gain achieved by this split
        self.info_gain = info_gain

class Leaf:
    """
    Terminal node of the tree; holds the value returned as the prediction.
    """

    def __init__(self, value):
        # Flag first, payload second: traversal stops on is_leaf and reads value
        self.is_leaf, self.value = True, value

In [5]:
class DecisionTree:
    """
    Decision tree classifier that chooses splits by entropy-based
    information gain.

    The training data is a single 2-D table whose last column is the target
    label and whose remaining columns are the features.
    """

    def __init__(self, min_samples: int=3, max_depth: int=2, num_classes: int=2) -> None:
        """
        Parameters
        ----------
        min_samples: int, optional
            Minimum number of rows a node must hold to be split, by default 3
        max_depth: int, optional
            Maximum number of splits on any path from the root to a leaf,
            by default 2
        num_classes: int, optional
            Logarithm base used for the entropy, by default 2

        Raises
        ------
        ValueError
            If num_classes is smaller than 2
        """

        # log(num_classes) is the divisor of the entropy; a base of 1 (or
        # less) makes it zero/undefined and every gain would become NaN.
        if num_classes < 2:
            raise ValueError(f"num_classes must be at least 2, got {num_classes}")

        self.min_samples = min_samples
        self.max_depth = max_depth
        self.num_classes = num_classes

        # Set by fit(); None means the tree has not been trained yet
        self.root = None


    def __split(self, dataset: np.ndarray, feature: int, split_thresh: float) -> tuple:
        """
        Splits data into left and right branches

        Parameters
        ----------
        dataset: np.ndarray
            2-D data (rows are samples)
        feature: int
            Index of feature (column) for splitting
        split_thresh: float
            Rows with a feature value <= split_thresh go left, all others right

        Returns
        -------
        left_dataset: np.ndarray
        right_dataset: np.ndarray
        """

        goes_left = dataset[:, feature] <= split_thresh

        return dataset[goes_left], dataset[~goes_left]


    def __get_entropy(self, y: np.ndarray) -> float:
        """
        Computes the entropy of a label array, using num_classes as the
        base of the logarithm

        Parameters
        ----------
        y: np.ndarray
            Label

        Returns
        -------
        Entropy: float
        """

        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / counts.sum()
        entropy = np.sum(probabilities * -(np.log(probabilities)/np.log(self.num_classes)))

        return entropy


    def __get_information_gain(self, parent: np.ndarray, left: np.ndarray, right: np.ndarray) -> float:
        """
        Get Information Gain: entropy of the parent minus the size-weighted
        entropy of the two children

        Parameters
        ----------
        parent: np.ndarray
            Labels of the parent node
        left: np.ndarray
            Labels of the left child
        right: np.ndarray
            Labels of the right child

        Returns
        -------
        information_gain: float
        """

        w_left = len(left)/len(parent)
        w_right = len(right)/len(parent)

        entropy_left, entropy_right = self.__get_entropy(left), self.__get_entropy(right)

        weighted_entropy = w_left*entropy_left + w_right*entropy_right

        information_gain = self.__get_entropy(parent) - weighted_entropy

        return information_gain


    def __get_best_split(self, dataset: np.ndarray, num_features: int) -> dict:
        """
        Get best split parameters

        Every distinct value of every feature is tried as a threshold; the
        first candidate with the highest information gain is kept.

        Parameters
        ----------
        dataset: np.ndarray
            2-D data whose last column is the label
        num_features: int
            Number of leading columns to consider as features

        Returns
        -------
        best_split: dict
            keys: gain, feature, split_thresh, left, right
            (gain stays -1 and the other values None when no threshold
            separates the rows into two non-empty parts)
        """

        best_split = {'gain': -1, 'feature': None, 'split_thresh': None,
                      'left': None, 'right': None}

        # Parent labels do not depend on the candidate split
        y = dataset[:, -1]

        for feature_indx in range(num_features):
            thresholds = np.unique(dataset[:, feature_indx])
            for threshold in thresholds:
                left, right = self.__split(dataset, feature_indx, threshold)
                # A split that leaves one side empty separates nothing
                if len(left) and len(right):
                    information_gain = self.__get_information_gain(y, left[:, -1], right[:, -1])
                    if information_gain > best_split["gain"]:
                        best_split["feature"] = feature_indx
                        best_split["split_thresh"] = threshold
                        best_split["left"] = left
                        best_split["right"] = right
                        best_split["gain"] = information_gain

        return best_split


    def __calculate_leaf_value(self, y: np.ndarray) -> int:
        """
        Calculate Leaf Value: the most frequent label; ties are resolved in
        favour of the label that appears first in y

        Parameters
        ----------
        y: np.ndarray

        Returns
        -------
        leaf_value: int
        """

        # Single counting pass; dicts keep insertion order, so max() returns
        # the earliest-seen label among those with the highest count.
        counts = {}
        for label in y:
            counts[label] = counts.get(label, 0) + 1

        return max(counts, key=counts.get)


    def __build_tree_recur(self, dataset: np.ndarray, depth: int=0) -> "Node | Leaf":
        """
        Build the decision tree recursively

        Parameters
        ----------
        dataset: np.ndarray
            2-D data whose last column is the label
        depth: int, optional
            Depth of the node being built (the root is at depth 0)

        Returns
        -------
        root: Node or Leaf
        """

        X, y = dataset[:,:-1], dataset[:,-1]
        num_samples, num_features = X.shape

        # Strict comparison: a node at depth == max_depth must become a leaf,
        # otherwise the tree would grow one level deeper than max_depth.
        if num_samples >= self.min_samples and depth < self.max_depth:
            best_split = self.__get_best_split(dataset, num_features)
            # Only split when it actually reduces the entropy
            if best_split["gain"] > 0:
                left_subtree = self.__build_tree_recur(best_split["left"], depth+1)
                right_subtree = self.__build_tree_recur(best_split["right"], depth+1)

                return Node(best_split["feature"], best_split["split_thresh"],
                            left_subtree, right_subtree, best_split["gain"])

        leaf_value = self.__calculate_leaf_value(y)

        return Leaf(leaf_value)


    def fit(self, dataset: pd.DataFrame) -> None:
        """
        Fit model to data

        Parameters
        ----------
        dataset: pd.DataFrame
            Training Data; the last column is the label, all preceding
            columns are features. A 2-D array-like is accepted as well.

        Raises
        ------
        ValueError
            If the data is not 2-D or contains no rows
        """

        data = dataset.to_numpy() if hasattr(dataset, "to_numpy") else np.asarray(dataset)

        if data.ndim != 2 or data.shape[0] == 0:
            raise ValueError("dataset must be a non-empty 2-D table whose last column is the label")

        self.root = self.__build_tree_recur(data)


    def __make_prediction(self, X: np.ndarray, node: "Node | Leaf"):
        """
        Predict the label of a single sample

        Parameters
        ----------
        X: np.ndarray
            Feature values of one sample
        node: Node or Leaf
            Node at which the traversal starts

        Returns
        -------
        Value of the leaf reached by following the split rules
        """

        while not node.is_leaf:
            node = node.left if X[node.feature] <= node.threshold else node.right

        return node.value


    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """
        Make prediction

        Parameters
        ----------
        X: pd.DataFrame
            Data (feature columns in the same order as used for fitting).
            A 2-D array-like is accepted as well.

        Returns
        -------
        predictions: np.ndarray
            Output Label

        Raises
        ------
        RuntimeError
            If called before fit()
        """

        if self.root is None:
            raise RuntimeError("This DecisionTree is not fitted yet; call fit() before predict()")

        rows = X.to_numpy() if hasattr(X, "to_numpy") else np.asarray(X)
        predictions = [self.__make_prediction(row, self.root) for row in rows]
        return np.array(predictions)
    

## Training the model

In [6]:
# Hold out 10% of the rows for validation, with a fixed seed.
train_data, validation_data = train_valid_split(dataset, test_size=0.1, random_state=10)

# Separate the validation target from the validation features without
# mutating validation_data itself.
y_valid = validation_data["Stress Level"].copy()
X_valid = validation_data.drop(["Stress Level"], axis=1)

In [7]:
# num_classes is the logarithm base of the entropy inside DecisionTree;
# 5 presumably matches the number of distinct stress levels — TODO confirm.
tree = DecisionTree(min_samples=3, max_depth=20, num_classes=5)

In [8]:
# fit() receives the whole training frame: its last column ("Stress Level")
# is used as the label and all preceding columns as features.
tree.fit(train_data)

## Making predictions

In [9]:
# One predicted label per validation row; X_valid holds the feature columns
# only (the "Stress Level" column was removed above).
pred = tree.predict(X_valid)

## Metrics

In [10]:
def get_accuracy_score(y_true: pd.DataFrame, y_pred: np.ndarray) -> float:
    """
    Computes the accuracy of a classification model

    Both inputs are flattened to 1-D before they are compared, so a Series,
    a single-column DataFrame, an ndarray (including a column vector) or a
    plain list can be passed for either argument.

    Parameters
    ----------
    y_true : pd.DataFrame
        True Labels (any array-like)
    y_pred : np.ndarray
        Predicted Labels (any array-like)

    Returns
    -------
    accuracy_score: float
        Fraction of predictions that equal the true label

    Raises
    ------
    ValueError
        If the inputs are empty or do not contain the same number of labels
    """

    truth = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()

    if truth.size == 0:
        raise ValueError("y_true is empty; accuracy is undefined")

    # Without this check a shape mismatch would be broadcast (or compared as
    # a single False) and silently produce a meaningless score.
    if truth.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of labels, "
            f"got {truth.size} and {predicted.size}"
        )

    correct_predictions = np.count_nonzero(truth == predicted)
    return correct_predictions / truth.size

In [11]:
# Fraction of validation rows whose predicted label equals the true label.
print(f"Accuracy Score: {get_accuracy_score(y_valid, pred)}")

Accuracy Score: 0.8846153846153846
