In [121]:
import os

import numpy as np
import polars as pl

In [122]:
SEED = 462
np.random.seed(SEED)

In [123]:
data_path = os.path.join("data", "tabular")

In [124]:
class Dataset:
    def __init__(self, train_path, val_path, test_path):
        self.train_path = train_path
        self.val_path = val_path
        self.test_path = test_path
        self.mean = None
        self.std = None
        self.label_map = None

    def load_csv(self, path):
        df = pl.read_csv(path)
        data = df.to_numpy()
        X = data[:, :-1].astype(float)
        Y_str = data[:, -1]
        return X, Y_str

    def normalize(self, X, fit=False):
        if fit:
            self.mean = np.mean(X, axis=0)
            self.std = np.std(X, axis=0)
            self.std[self.std == 0] = 1.0

        return (X - self.mean) / self.std

    def get_data(self):
        X_train, Y_train = self.load_csv(self.train_path)
        X_val, Y_val = self.load_csv(self.val_path)
        X_test, Y_test = self.load_csv(self.test_path)

        X_train = self.normalize(X_train, fit=True)
        X_val = self.normalize(X_val, fit=False)
        X_test = self.normalize(X_test, fit=False)

        return (X_train, Y_train), (X_val, Y_val), (X_test, Y_test)

In [125]:
class LogisticRegression:
    def __init__(self, learning_rate, num_iters):
        self.learning_rate = learning_rate
        self.num_iters = num_iters
        self.weights = None
        self.bias = None
        self.cls = None

    def sigmoid(self, z):
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))

    def predict_proba(self, X):
        return self.sigmoid(np.dot(X, self.weights) + self.bias)

    def logistic_loss(self, y_true, y_pred):
        eps = 1e-15
        y_pred = np.clip(y_pred, eps, 1 - eps)
        return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

    def train(self, X, Y):
        n_examples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for i in range(self.num_iters):
            y_pred = self.predict_proba(X)
            y_diff = y_pred - Y

            self.weights -= self.learning_rate * np.dot(X.T, y_diff) / n_examples
            self.bias -= self.learning_rate * np.mean(y_diff)

In [126]:
class LogisticRegressionOVA:
    def __init__(self, learning_rate=0.01, num_iters=1000):
        self.learning_rate = learning_rate
        self.num_iters = num_iters
        self.models = []
        self.classes = None

    def train(self, X_train, Y_train, X_val, Y_val):
        self.classes = np.unique(Y_train)
        self.models = []

        for cls in self.classes:
            print(f"Training for class {cls}:")
            Y_train_bin = (Y_train == cls).astype(float)
            Y_val_bin = (Y_val == cls).astype(float)

            model = LogisticRegression(self.learning_rate, self.num_iters)
            # Custom training loop to print loss
            n_examples, n_features = X_train.shape
            model.weights = np.zeros(n_features)
            model.bias = 0
            model.cls = cls

            for i in range(model.num_iters):
                y_pred = model.predict_proba(X_train)
                y_diff = y_pred - Y_train_bin

                model.weights -= model.learning_rate * np.dot(X_train.T, y_diff) / n_examples
                model.bias -= model.learning_rate * np.mean(y_diff)

                if i % 100 == 0:
                    train_loss = model.logistic_loss(Y_train_bin, y_pred)
                    val_pred = model.predict_proba(X_val)
                    val_loss = model.logistic_loss(Y_val_bin, val_pred)
                    print(f"Iter {i}: Train Loss {train_loss:.4f}, Val Loss {val_loss:.4f}")

            self.models.append(model)

    def predict(self, X):
        probs = np.column_stack([model.predict_proba(X) for model in self.models])
        return [self.models[idx].cls for idx in np.argmax(probs, axis=1) ]


def accuracy(y_true, y_pred):
    return np.mean(y_true == y_pred) * 100

In [127]:
def accuracy(y_true, y_pred):
    return np.mean(y_true == y_pred) * 100

In [128]:
if __name__ == "__main__":
    dataset = Dataset(
        train_path=os.path.join(data_path, "train_processed.csv"),
        val_path=os.path.join(data_path, "validation_processed.csv"),
        test_path=os.path.join(data_path, "test_processed.csv"),
    )

    (X_train, Y_train), (X_val, Y_val), (X_test, Y_test) = dataset.get_data()

    model = LogisticRegressionOVA(learning_rate=0.1, num_iters=1000)
    model.train(X_train, Y_train, X_val, Y_val)

    test_pred = model.predict(X_test)
    print(f"Test Accuracy: {accuracy(Y_test, test_pred):.2f}%")

Training for class banana:
Iter 0: Train Loss 0.6931, Val Loss 0.6467
Iter 100: Train Loss 0.1630, Val Loss 0.1670
Iter 200: Train Loss 0.1103, Val Loss 0.1153
Iter 300: Train Loss 0.0883, Val Loss 0.0938
Iter 400: Train Loss 0.0757, Val Loss 0.0816
Iter 500: Train Loss 0.0674, Val Loss 0.0734
Iter 600: Train Loss 0.0614, Val Loss 0.0676
Iter 700: Train Loss 0.0568, Val Loss 0.0631
Iter 800: Train Loss 0.0531, Val Loss 0.0595
Iter 900: Train Loss 0.0501, Val Loss 0.0566
Training for class carrot:
Iter 0: Train Loss 0.6931, Val Loss 0.6626
Iter 100: Train Loss 0.1528, Val Loss 0.1567
Iter 200: Train Loss 0.0995, Val Loss 0.1032
Iter 300: Train Loss 0.0775, Val Loss 0.0808
Iter 400: Train Loss 0.0650, Val Loss 0.0681
Iter 500: Train Loss 0.0569, Val Loss 0.0597
Iter 600: Train Loss 0.0510, Val Loss 0.0537
Iter 700: Train Loss 0.0466, Val Loss 0.0491
Iter 800: Train Loss 0.0431, Val Loss 0.0456
Iter 900: Train Loss 0.0403, Val Loss 0.0426
Training for class cucumber:
Iter 0: Train Loss 0.