In [11]:
import os
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer

In [12]:
import kagglehub

path = kagglehub.dataset_download("mdsultanulislamovi/sleep-disorder-diagnosis-dataset")

print("Path to dataset files:", path)

for file in os.listdir(path):
  print(file)

df = pd.read_csv(path + '/Sleep_health_and_lifestyle_dataset.csv')
df.columns = df.columns.str.strip().str.replace(' ', '_')
df = df.drop(columns='Person_ID')
print(df.isna().sum())
df = df.dropna()
df.head()
numeric_cols = [
    "Age",
    "Sleep_Duration",
    "Quality_of_Sleep",
    "Physical_Activity_Level",
    "Stress_Level",
    "Heart_Rate",
    "Daily_Steps"
]
for col in numeric_cols:
  df[col] = pd.to_numeric(df[col], errors='coerce')
df["Systolic_BP"] = df["Blood_Pressure"].str.split("/", expand=True)[0].astype(int)
df["Diastolic_BP"] = df["Blood_Pressure"].str.split("/", expand=True)[1].astype(int)
df = df.drop(columns='Blood_Pressure')
df["Sleep_Binary"] = (df["Quality_of_Sleep"] >= 7).astype(int)
y = df["Sleep_Binary"]
X = df.drop(columns=["Quality_of_Sleep", "Sleep_Disorder", "Sleep_Binary"])

categorical_ordinal = ['Gender', 'BMI_Category']
categorical_onehot = ['Occupation']
preprocessor = ColumnTransformer(
    transformers = [
        ('ord', OrdinalEncoder(), categorical_ordinal),
        ('onehot', OneHotEncoder(drop='first', sparse_output=False), categorical_onehot)
    ],
    remainder='passthrough'
)
X_processed = preprocessor.fit_transform(X)
X_processed = pd.DataFrame(X_processed)

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_processed = scaler.fit_transform(X_processed)
X_processed = pd.DataFrame(X_processed)

X_temp, X_test, y_temp, y_test = train_test_split(X_processed, y, test_size=0.15, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.1765, random_state=42)

X_train = np.array(X_train)
X_val   = np.array(X_val)
X_test  = np.array(X_test)

y_train = np.array(y_train)
y_val   = np.array(y_val)
y_test  = np.array(y_test)

Path to dataset files: C:\Users\Shush\.cache\kagglehub\datasets\mdsultanulislamovi\sleep-disorder-diagnosis-dataset\versions\1
Sleep_health_and_lifestyle_dataset.csv
Gender                       0
Age                          0
Occupation                   0
Sleep_Duration               0
Quality_of_Sleep             0
Physical_Activity_Level      0
Stress_Level                 0
BMI_Category                 0
Blood_Pressure               0
Heart_Rate                   0
Daily_Steps                  0
Sleep_Disorder             219
dtype: int64


In [13]:
def accuracy(y_true, y_pred):
    return np.mean(y_true.flatten() == y_pred.flatten())

def precision(y_true, y_pred):
    TP = np.sum((y_pred == 1) & (y_true == 1))
    FP = np.sum((y_pred == 1) & (y_true == 0))
    return TP / (TP + FP + 1e-9)

def recall(y_true, y_pred):
    TP = np.sum((y_pred == 1) & (y_true == 1))
    FN = np.sum((y_pred == 0) & (y_true == 1))
    return TP / (TP + FN + 1e-9)

def f1_score(y_true, y_pred):
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return 2 * p * r / (p + r + 1e-9)

def confusion_matrix(y_true, y_pred):
    TP = np.sum((y_pred == 1) & (y_true == 1))
    TN = np.sum((y_pred == 0) & (y_true == 0))
    FP = np.sum((y_pred == 1) & (y_true == 0))
    FN = np.sum((y_pred == 0) & (y_true == 1))
    return np.array([[TP, FP],
                     [FN, TN]])

In [15]:
class SplitNode:
    def __init__(self, feature, threshold, left, right):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right

    def go_right(self, x):
        return x[self.feature] >= self.threshold


class LeafNode:
    def __init__(self, values):
        self.value = sum(values) / len(values)


class DecisionTreeRegressorScratch:
    def __init__(self, max_depth=4, min_samples_split=25,
                 min_samples_leaf=10, min_improvement=0.02):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_improvement = min_improvement
        self.root = None

    def _mse(self, labels):
        if len(labels) == 0:
            return 0
        mean_val = sum(labels) / len(labels)
        return sum((y - mean_val) ** 2 for y in labels) / len(labels)

    def _split(self, X, y, feature, threshold):
        X_left, y_left = [], []
        X_right, y_right = [], []

        for xi, yi in zip(X, y):
            if xi[feature] >= threshold:
                X_right.append(xi)
                y_right.append(yi)
            else:
                X_left.append(xi)
                y_left.append(yi)

        return X_left, y_left, X_right, y_right

    def _mse_gain(self, X, y, feature, threshold):
        X_left, y_left, X_right, y_right = self._split(X, y, feature, threshold)

        if len(y_left) == 0 or len(y_right) == 0:
            return 0

        parent = self._mse(y)
        n = len(y)

        child = (len(y_left) / n) * self._mse(y_left) + \
                (len(y_right) / n) * self._mse(y_right)

        return parent - child

    def _best_split(self, X, y):
        best_gain = 0
        best_feature = None
        best_threshold = None

        n_features = len(X[0])

        for f in range(n_features):
            thresholds = set(x[f] for x in X)

            for t in thresholds:
                gain = self._mse_gain(X, y, f, t)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = f
                    best_threshold = t

        if best_gain > 0:
            return best_feature, best_threshold, best_gain
        return None

    def _build_tree(self, X, y, depth=0):
        # Stopping rules
        if depth >= self.max_depth:
            return LeafNode(y)

        if len(y) < self.min_samples_split:
            return LeafNode(y)

        split = self._best_split(X, y)
        if split is None:
            return LeafNode(y)

        feature, threshold, gain = split

        if gain < self.min_improvement:
            return LeafNode(y)

        X_left, y_left, X_right, y_right = self._split(X, y, feature, threshold)

        if len(y_left) < self.min_samples_leaf or len(y_right) < self.min_samples_leaf:
            return LeafNode(y)

        left_child = self._build_tree(X_left, y_left, depth + 1)
        right_child = self._build_tree(X_right, y_right, depth + 1)

        return SplitNode(feature, threshold, left_child, right_child)

    def fit(self, X, y):
        """Train the decision tree."""
        self.root = self._build_tree(X, y)
        return self

    def _predict_one(self, x):
        node = self.root
        while isinstance(node, SplitNode):
            node = node.right if node.go_right(x) else node.left
        return node.value

    def predict(self, X):
        return [self._predict_one(x) for x in X]

tree = DecisionTreeRegressorScratch(
    max_depth=4,
    min_samples_split=25,
    min_samples_leaf=10,
    min_improvement=0.02
)

tree.fit(train_data, train_labels)

y_pred_train = tree.predict(train_data)
y_pred_val   = tree.predict(val_data)
y_pred_test  = tree.predict(test_data)


def mse_manual(y_true, y_pred):
    return sum((yt - yp) ** 2 for yt, yp in zip(y_true, y_pred)) / len(y_true)

print("Train MSE:", mse_manual(train_labels, y_pred_train))
print("Val MSE:",   mse_manual(val_labels, y_pred_val))
print("Test MSE:",  mse_manual(test_labels, y_pred_test))

NameError: name 'train_data' is not defined