In [1]:
import random
from math import inf

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import f1_score

import decision_tree, utils

%matplotlib inline
# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.options.display.max_columns = None

In [2]:
# Read the student stress CSV, then overwrite the column labels produced by
# read_csv with short, consistent names. The labels are assigned by position,
# so the file must contain exactly these six columns in this order.
dataset = pd.read_csv("../dataset/Student Stress Factors (2).csv")
dataset.columns = [
    "Sleep Quality",
    "Headache Frequency",
    "Academic Performance",
    "Study Load",
    "Extracurricular Frequency",
    "Stress Level",
]

In [3]:
class MajorityVoter:
    def predict(self, models: list[decision_tree.DecisionTree], X: np.array) -> int:
        """
        Return the label predicted by the largest number of models

        Parameters
        ----------
        models : list[decision_tree.DecisionTree]
            Trees that each cast one vote via ``make_prediction``
        X : np.array
            Data handed unchanged to every tree, together with that tree's root

        Returns
        -------
        label: int
            The most frequent prediction; when several labels share the top
            count, the one that was predicted first (in ``models`` order) wins
        """
        vote_counts = {}
        for tree in models:
            label = tree.make_prediction(X, tree.root)
            vote_counts[label] = vote_counts.get(label, 0) + 1

        # max() returns the first maximal key it meets and dicts iterate in
        # insertion order, so ties resolve to the earliest-seen label.
        return max(vote_counts, key=vote_counts.get)

In [4]:
class RandomForest:
    def __init__(self, n_estimators: int=100, max_depth: int=2, min_samples: int=2, bootstrap_sample_size: int=None, num_classes: int=2, random_state: int=None) -> None:
        """
        Random Forest class

        Parameters
        ----------
        n_estimators : int, optional
            number of estimators, by default 100
        max_depth : int, optional
            max depth of each estimator, by default 2
        min_samples : int, optional
            minimum number of samples required in the leaf node, by default 2
        bootstrap_sample_size : int, optional
            number of rows drawn (with replacement) for each estimator; when
            None or 0 each estimator receives as many rows as the dataset
            passed to ``fit``, by default None
        num_classes : int, optional
            number of output labels, by default 2
        random_state : int, optional
            seed for the bootstrap row sampling; when None the module-level
            ``random`` generator is used, so ``random.seed`` still controls
            the draws, by default None
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples = min_samples
        self.num_classes = num_classes
        self.bootstrap_sample_size = bootstrap_sample_size
        self.random_state = random_state
        # A private generator makes a seeded forest repeatable no matter what
        # else consumes the global ``random`` state; without a seed we keep
        # drawing from the global generator exactly as before.
        self._rng = random.Random(random_state) if random_state is not None else random
        self.majority_voter = MajorityVoter()
        self.estimators = [decision_tree.DecisionTree(min_samples=self.min_samples, max_depth=self.max_depth, num_classes=self.num_classes) 
                                for _ in range(n_estimators)]


    def __getBootstrapSample(self, df: pd.DataFrame, sample_size: int=100) -> pd.DataFrame:
        """
        Get bootstrap data with replacement

        Parameters
        ----------
        df : pd.DataFrame
            Dataset
        sample_size : int, optional
            Size of each sample, by default 100

        Returns
        -------
        df: pd.DataFrame
            Bootstrap Dataset with exactly ``sample_size`` rows; the original
            index labels are kept, so labels may repeat
        """
        # Draw row *positions*, not index labels: a label lookup through
        # ``.loc`` returns every row sharing that label, which silently
        # inflates the sample whenever the index contains duplicates.
        positions = self._rng.choices(range(df.shape[0]), k=sample_size)

        return df.iloc[positions]
    

    def fit(self, df:pd.DataFrame) -> None:
        """
        Fit the estimators

        Every estimator is fitted on its own bootstrap sample of ``df``.

        Parameters
        ----------
        df : pd.DataFrame
            Dataset
        """
        # The sample size does not change between estimators, so resolve it once.
        sample_size = self.bootstrap_sample_size or df.shape[0]

        for estimator in self.estimators:
            estimator.fit(self.__getBootstrapSample(df, sample_size))


    def predict(self, X: pd.DataFrame) -> np.array:
        """
        Make predictions

        Parameters
        ----------
        X : pd.DataFrame
            Data; each row is voted on separately by all estimators

        Returns
        -------
        y_hat: np.array
            Output labels, one per row of ``X``
        """
        y_hat = [self.majority_voter.predict(self.estimators, x) for x in X.to_numpy()]
        
        return np.array(y_hat)

In [5]:
# Split once with a fixed random_state; test_size=0.1 presumably holds out
# 10% of the rows for validation (confirm against utils.train_valid_split).
train_data, validation_data = utils.train_valid_split(dataset, test_size=0.1, random_state=11)

# Separate the target column from the features without mutating either split:
# drop() returns a new frame and the target is copied out explicitly.
y_valid = validation_data["Stress Level"].copy()
X_valid = validation_data.drop(columns=["Stress Level"])

y_train = train_data["Stress Level"].copy()
X_train = train_data.drop(columns=["Stress Level"])

In [6]:
# Seed Python's global ``random`` generator, which drives the bootstrap row
# sampling inside RandomForest, so the fitted forest is repeatable on
# Restart & Run All (provided the trees themselves add no other randomness).
# 11 mirrors the random_state used for the train/validation split.
random.seed(11)

# The full frame, target column included, is passed to fit().
rf = RandomForest(
    n_estimators=150,
    max_depth=20,
    bootstrap_sample_size=100,
    num_classes=5,
)
rf.fit(train_data)

In [7]:
# Majority-vote prediction for every validation row, scored against the
# held-out labels with the project's accuracy helper.
pred = rf.predict(X_valid)
print(
    f"Accuracy Score on Validation Set: {utils.get_accuracy_score(y_valid, pred)}"
)

Accuracy Score on Validation Set: 0.9230769230769231


In [8]:
# f1_score is imported in the notebook's top import cell.
# average='weighted' asks scikit-learn to average the per-class F1 scores
# weighted by how many true samples each class has.
print(f"F1 Score on Validation Set: {f1_score(y_valid, pred, average='weighted')}")

F1 Score on Validation Set: 0.9228500459621742


In [9]:
# Sweep over the number of trees (1, 11, ..., 191) and plot validation
# accuracy for each setting. The sweep refits the forest 20 times, so it is
# disabled by default; set run = 1 to execute it.
run = 0

# Defined outside the guard so the max_depth sweep in the next cell has a
# usable tree count even when this sweep is skipped; 150 matches the forest
# trained above. A completed sweep overwrites it with the best value found.
best_n_estimator = 150

if run:
    accuracy_vs_estimators = {"x": [], "y": []}
    highest_score = -inf

    for n_estimators in range(1, 200, 10):
        rf = RandomForest(n_estimators=n_estimators, max_depth=20, num_classes=5)
        rf.fit(train_data)
        pred = rf.predict(X_valid)
        accuracy = utils.get_accuracy_score(y_valid, pred)
        # Strict comparison keeps the smallest forest among equal scores.
        if accuracy>highest_score:
            best_n_estimator = n_estimators
            highest_score = accuracy

        accuracy_vs_estimators["x"].append(n_estimators)
        accuracy_vs_estimators["y"].append(accuracy)

    plt.plot(accuracy_vs_estimators["x"], accuracy_vs_estimators["y"])
    plt.xlabel("n_estimators")
    plt.ylabel("accuracy")
    plt.title("n_estimators vs Accuracy")
    plt.show()

In [10]:
# Sweep over tree depth (2, 7, ..., 57) and plot validation accuracy for each
# setting. Disabled by default; set run = 1 to execute. The sweep reads
# best_n_estimator, which must already exist (it is assigned by the
# n_estimators sweep cell).
run = 0
if run:
    accuracy_vs_max_depth = dict(x=[], y=[])
    best_max_depth, highest_score = inf, -inf

    for max_depth in range(2, 60, 5):
        rf = RandomForest(n_estimators=best_n_estimator, max_depth=max_depth, num_classes=5)
        rf.fit(train_data)
        pred = rf.predict(X_valid)
        accuracy = utils.get_accuracy_score(y_valid, pred)

        accuracy_vs_max_depth["x"].append(max_depth)
        accuracy_vs_max_depth["y"].append(accuracy)

        # Strict comparison keeps the shallowest depth among equal scores.
        if accuracy > highest_score:
            best_max_depth, highest_score = max_depth, accuracy

    plt.plot(accuracy_vs_max_depth["x"], accuracy_vs_max_depth["y"])
    plt.xlabel("max_depth")
    plt.ylabel("accuracy")
    plt.title("max_depth vs Accuracy")
    plt.show()