In [138]:
import matplotlib.pyplot as plt
import kagglehub
import pandas as pd
import numpy as np
import os

path = kagglehub.dataset_download("varishabatool/disorder")
print("Path to dataset files:", path)

os.listdir(path)
# looks inside the folder stored in path, shows all files downloaded by kagglehub.
df = pd.read_csv(path + '/Sleep_health_and_lifestyle_dataset.csv')
# read the csv file, converst it into a pandas dataframe
print("Original Quality of Sleep counts:")
print(df['Quality of Sleep'].value_counts())


Path to dataset files: C:\Users\User\.cache\kagglehub\datasets\varishabatool\disorder\versions\1
Original Quality of Sleep counts:
Quality of Sleep
8    109
6    105
7     77
9     71
5      7
4      5
Name: count, dtype: int64


In [122]:
# preproccessing
# Map original classes to binary
binary_mapping = {4:0, 5:0, 6:0, 7:1, 8:1, 9:1}
df['Sleep_Binary'] = df['Quality of Sleep'].map(binary_mapping)

# Drop rows with NaN (if any)
df = df.dropna(subset=['Sleep_Binary'])

# Separate features and target
X = df.drop(['Quality of Sleep', 'Sleep_Binary'], axis=1)
y = df['Sleep_Binary'].astype(int)

# One-hot encode categorical variables if needed
X = pd.get_dummies(X, drop_first=True)
X = X.astype(float)

# Normalize features
X_min = X.min(axis=0)
X_max = X.max(axis=0)
X_norm = (X - X_min) / (X_max - X_min + 1e-8)

# Convert to NumPy arrays
X_norm = X_norm.values
y = y.values

# Shuffle dataset
np.random.seed(42)
indices = np.arange(X_norm.shape[0])
np.random.shuffle(indices)
X_norm = X_norm[indices]
y = y[indices]

# Split dataset
total = X_norm.shape[0]
train_size = int(0.6 * total)
val_size = int(0.2 * total)

X_train = X_norm[:train_size]
y_train = y[:train_size]

X_val = X_norm[train_size:train_size+val_size]
y_val = y[train_size:train_size+val_size]

X_test = X_norm[train_size+val_size:]
y_test = y[train_size+val_size:]

# Check class distribution
print("Class distribution (train):", np.bincount(y_train))
print("Class distribution (val):", np.bincount(y_val))
print("Class distribution (test):", np.bincount(y_test))





Class distribution (train): [ 78 146]
Class distribution (val): [19 55]
Class distribution (test): [20 56]


In [123]:
# Linear regression
class LinearRegression:
    def __init__(self, lr = 0.1, epochs = 1000):
        self.lr = lr
        self.epochs = epochs
        self.W = None
        self.b = None
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.W = np.zeros(n_features)
        self.b = 0

        for i in range(self.epochs):
            y_pred = np.dot(X, self.W) + self.b
            # compute gradients
            dw = (1/n_samples) * np.dot(X.T, (y_pred - y))
            db = (1/n_samples) * np.sum(y_pred - y)

            # update weights
            self.W -= self.lr * dw
            self.b -= self.lr * db
    def predict(self, X):
        return np.dot(X, self.W) + self.b

linreg = LinearRegression(lr = 0.01, epochs = 2000)
linreg.fit(X_train, y_train)

y_val_pred_cont = linreg.predict(X_val)
y_test_pred_cont = linreg.predict(X_test)

y_val_pred = (y_val_pred_cont >= 0.5).astype(int)
y_test_pred = (y_test_pred_cont >=0.5).astype(int)

print("Validation_accuracy:", np.mean(y_val_pred == y_val))
print("Test accuracy:", np.mean(y_test_pred == y_test))

print("Predicted classes (val):", np.bincount(y_val_pred))
print("Predicted classes (test):", np.bincount(y_test_pred))
print(X_train.shape)

            

Validation_accuracy: 0.9864864864864865
Test accuracy: 0.9605263157894737
Predicted classes (val): [20 54]
Predicted classes (test): [23 53]
(224, 46)


In [129]:
#  Logistic regression
class LogisticRegression:
    def __init__(self, lr = 0.1, epochs = 500):
        self.lr = lr
        self.epochs = epochs
        self.W = None
        self.b = None
    def _softmax(self, z):
        exp_z = np.exp(z - np.max(z, axis=1, keepdims = True))
        return exp_z / np.sum(exp_z, axis=1, keepdims = True)
    def _one_hot(self, y):
        n_classes = np.max(y) +1
        one_hot = np.zeros((y.shape[0], n_classes))
        one_hot[np.arange(y.shape[0]), y] = 1
        return one_hot
    def fit(self, X, y):
        n_samples, n_features = X.shape
        n_classes = np.max(y) + 1
        self.W = np.zeros((n_features, n_classes))
        self.b = np.zeros((1, n_classes))
        y_onehot = self._one_hot(y)
        for i in range(self.epochs):
            z = np.dot(X, self.W) + self.b
            y_pred = self._softmax(z)
            # gradient
            dw = (1/n_samples) * np.dot(X.T, (y_pred - y_onehot))
            db = (1/n_samples) * np.sum(y_pred - y_onehot, axis = 0, keepdims = True)
            self.W -= self.lr * dw
            self.b -= self.lr * db
    def predict(self, X):
        z = np.dot(X, self.W) + self.b
        y_pred = self._softmax(z)
        return np.argmax(y_pred, axis = 1)
model = LogisticRegression(lr = 0.1, epochs = 1000)
model.fit(X_train, y_train)

y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

print("Validation_accuracy:", np.mean(y_val_pred == y_val))
print("Test accuracy:", np.mean(y_test_pred == y_test))


Validation_accuracy: 0.9864864864864865
Test accuracy: 1.0


In [128]:
# K-Nearest Neighbors
class KNN:
    def __init__(self, k=3):
        self.k = k
        self.X_train = None
        self.y_train = None
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
    def predict(self, X):
        y_pred = []
        for x in X:
            # Euclidean distance
            distances = np.sqrt(np.sum((self.X_train - x)**2, axis = 1))
            k_idx = np.argsort(distances)[:self.k]
            k_labels = self.y_train[k_idx]
            # majority vote
            counts = np.bincount(k_labels)
            y_pred.append(np.argmax(counts))
        return np.array(y_pred)
knn = KNN(k=5)
knn.fit(X_train, y_train)
y_val_pred = knn.predict(X_val)
y_test_pred = knn.predict(X_test)

print("Validation_accuracy:", np.mean(y_val_pred == y_val))
print("Test accuracy:", np.mean(y_test_pred == y_test))


Validation_accuracy: 0.9864864864864865
Test accuracy: 0.9736842105263158


In [126]:
# Decision tree
class DecisionTreeNode:
    def __init__(self, gini, num_samples, num_samples_per_class, predicted_class):
        self.gini = gini
        self.num_samples = num_samples
        self.num_samples_per_class = num_samples_per_class
        self.predicted_class = predicted_class
        self.feature_index = None
        self.threshold = None
        self.left = None
        self.right = None
class DecisionTree:
    def __init__(self, max_depth = None):
        self.max_depth = max_depth
        self.n_classes = None
        self.n_features = None
        self.tree = None

    def _gini(self, y):
        m = y.size
        if m==0:
            return 0
        counts = np.bincount(y, minlength = self.n_classes)
        prob = counts / m
        return 1 - np.sum(prob**2)
    def _best_split(self, X, y):
        m, n = X.shape
        if m <= 1:
            return None, None
        best_gini = 1.0
        best_idx, best_thr = None, None
        for idx in range(n):
            thresholds, classes = zip(*sorted(zip(X[:, idx], y)))
            num_left = np.zeros(self.n_classes)
            num_right = np.bincount(classes, minlength = self.n_classes)
            for i in range(1, m):
                c = classes[i-1]
                num_left[c] += 1
                num_right[c] -= 1
                gini_left = 1 - np.sum((num_left / i) ** 2)
                gini_right = 1 - np.sum(num_right / (m-i) ** 2)
                gini = (i * gini_left + (m-i) * gini_right) / m
                if thresholds[i] == thresholds[i-1]:
                    continue
                if gini < best_gini:
                    best_gini = gini
                    best_idx = idx
                    best_thr = (thresholds[i] + thresholds[i-1]) / 2

        return best_idx, best_thr
    def _grow_tree(self, X, y, depth = 0):
        num_samples_per_class = [np.sum(y==i) for i in range(self.n_classes)]
        predicted_class = np.argmax(num_samples_per_class)
        node = DecisionTreeNode(
            gini = self._gini(y), 
            num_samples = y.size, 
            num_samples_per_class = num_samples_per_class, 
            predicted_class = predicted_class
        )
        if depth < self.max_depth:
            idx, thr = self._best_split(X, y)
            if idx is not None:
                indices_left = X[:, idx] <=thr
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                node.feature_index = idx
                node.threshold = thr
                node.left = self._grow_tree(X_left, y_left, depth + 1)
                node.right = self._grow_tree(X_right, y_right, depth + 1)
        return node

    def fit(self, X, y):
        self.n_classes = len(np.unique(y))
        self.n_features = X.shape[1]
        self.tree = self._grow_tree(X, y)

    def _predict(self, inputs):
        node = self.tree
        while node.left:
            if inputs[node.feature_index] <= node.threshold:
                node = node.right
            else:
                node = node.right
        return node.predicted_class

    def predict(self, X):
        return np.array([self._predict(inputs) for inputs in X])

dt = DecisionTree(max_depth = 10)
dt.fit(X_train, y_train)
y_val_pred = dt.predict(X_val)
y_test_pred = dt.predict(X_test)
print("Validation_accuracy:", np.mean(y_val_pred == y_val))
print("Test accuracy:", np.mean(y_test_pred == y_test))

Validation_accuracy: 0.7432432432432432
Test accuracy: 0.7368421052631579


In [134]:
def accuracy(y_true, y_pred):
    return np.mean(y_true == y_pred)

def confusion_matrix(y_true, y_pred, n_classes = None):
    y_true = y_true.astype(int)
    y_pred = y_pred.astype(int)
    if n_classes is None:
        n_classes = len(np.unique(np.concatenate([y_true, y_pred])))
    cm = np.zeros((n_classes, n_classes), dtype = int)
    for t, p in zip(y_true, y_pred):
        cm[t, p] += 1
    return cm

def precision_recall_f1(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    TP = cm[1, 1]
    FP = cm[0, 1]
    FN = cm[1, 0]
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    f1 = 2* precision * recall/ (precision + recall)
    return precision, recall, f1
def class_counts(y_pred):
    classes, counts = np.unique(y_pred, return_counts=True)
    for c, cnt in zip(classes, counts):
        print(f"Class {c}: {cnt} predictions")

        
y_train = y_train.astype(int)
y_val   = y_val.astype(int)
y_test  = y_test.astype(int)

models = {
    "Linear regression": linreg,
    "Logistic regression": model,
    "KNN": knn,
    "Decision tree": dt
}

for name, clf in models.items():
    print(f"\n--- Evaluatong {name} ---")
    y_val_pred = clf.predict(X_val)
    y_test_pred = clf.predict(X_test)

    if y_val_pred.dtype.kind == 'f':
        y_val_pred = (y_val_pred >= 0.5).astype(int)
    if y_test_pred.dtype.kind == 'f':
        y_test_pred = (y_test_pred >= 0.5).astype(int)

    print("Validation Accuracy:", accuracy(y_val, y_val_pred))
    print("Test Accuracy:", accuracy(y_test, y_test_pred))

    print("Validation confusion matrix:")
    print(confusion_matrix(y_val, y_val_pred))
    print("Test Confusion matrix")
    print(confusion_matrix(y_test, y_test_pred))

    val_prec, val_rec, val_f1 = precision_recall_f1(y_val, y_val_pred)
    test_prec, test_rec, test_f1 = precision_recall_f1(y_test, y_test_pred)
    print(f"Validation - Precision: {val_prec:.3f}, Recal: {val_rec:.3f}, F1_score: {val_f1:.3f}")
    print(f"Test - Precision: {test_prec:.3f}, Recal: {test_rec:.3f}, F1_score: {test_f1:.3f}")
    print("Predicted class counts (test):")
    class_counts(y_test_pred)


--- Evaluatong Linear regression ---
Validation Accuracy: 0.9864864864864865
Test Accuracy: 0.9605263157894737
Validation confusion matrix:
[[19  0]
 [ 1 54]]
Test Confusion matrix
[[20  0]
 [ 3 53]]
Validation - Precision: 1.000, Recal: 0.982, F1_score: 0.991
Test - Precision: 1.000, Recal: 0.946, F1_score: 0.972
Predicted class counts (test):
Class 0: 23 predictions
Class 1: 53 predictions

--- Evaluatong Logistic regression ---
Validation Accuracy: 0.9864864864864865
Test Accuracy: 1.0
Validation confusion matrix:
[[19  0]
 [ 1 54]]
Test Confusion matrix
[[20  0]
 [ 0 56]]
Validation - Precision: 1.000, Recal: 0.982, F1_score: 0.991
Test - Precision: 1.000, Recal: 1.000, F1_score: 1.000
Predicted class counts (test):
Class 0: 20 predictions
Class 1: 56 predictions

--- Evaluatong KNN ---
Validation Accuracy: 0.9864864864864865
Test Accuracy: 0.9736842105263158
Validation confusion matrix:
[[19  0]
 [ 1 54]]
Test Confusion matrix
[[18  2]
 [ 0 56]]
Validation - Precision: 1.000, Rec