In [24]:
import numpy as np
import pandas as pd

In [25]:
class Dataset:
    """Loader for the train / validation / test CSV splits.

    Every CSV is read with ``pandas.read_csv``; all columns except the last
    are treated as features and the last column as the class name.
    """

    def __init__(self, train_path=None, val_path=None, test_path=None):
        self.train_path, self.val_path, self.test_path = train_path, val_path, test_path

    def load_csv(self, path):
        """Read one CSV file and split it into features and numeric labels.

        Args:
            path: location of the CSV file, passed straight to ``pd.read_csv``.

        Returns:
            Tuple ``(X, Y)``: ``X`` holds every column but the last cast to
            float, ``Y`` is a float array with one encoded label per row.
        """
        table = pd.read_csv(path).to_numpy()
        features = table[:, :-1]
        names = table[:, -1]

        # Class name -> label index. Anything that is not one of these four
        # names (whatever it is) is encoded as 4.
        known = {"banana": 0, "carrot": 1, "cucumber": 2, "mandarin": 3}
        labels = np.array([known.get(name, 4) for name in names], dtype=float)

        return features.astype(float), labels

    def get_data(self):
        """Load the three splits in train, validation, test order.

        Returns:
            ``((X_train, Y_train), (X_val, Y_val), (X_test, Y_test))``.
        """
        train, val, test = (
            self.load_csv(split_path)
            for split_path in (self.train_path, self.val_path, self.test_path)
        )
        return train, val, test


In [None]:
class LogisticRegression:
    """Binary logistic regression fitted with full-batch gradient descent.

    Attributes:
        learning_rate: step size applied to every gradient update.
        num_iters: number of gradient steps performed by ``train``.
        weights: 1-D coefficient array (one per feature); ``None`` until
            ``train`` has been called.
        bias: scalar intercept; ``None`` until ``train`` has been called.
        loss_hist: training loss recorded at each iteration, before the update.
        val_loss_hist: validation loss per iteration; stays empty when no
            validation data is passed to ``train``.
    """

    def __init__(self, learning_rate=0.01, num_iters=1000):
        self.learning_rate = learning_rate
        self.num_iters = num_iters
        self.weights = None
        self.bias = None
        self.loss_hist = []
        # Created here (not only in train) so the attribute always exists.
        self.val_loss_hist = []

    def sigmoid(self, z):
        """Element-wise logistic function ``1 / (1 + exp(-z))``."""
        # Clipping keeps np.exp from overflowing for float64 inputs
        # (avoids "RuntimeWarning: overflow encountered in exp").
        z = np.clip(z, -512, 512)
        return 1 / (1 + np.exp(-z))

    def predict_raw(self, X):
        """Return the linear scores ``X @ weights + bias``."""
        return np.dot(X, self.weights) + self.bias

    def predict_proba(self, X):
        """Return the sigmoid of the linear scores for each row of ``X``."""
        return self.sigmoid(self.predict_raw(X))

    def logisticLoss(self, y_true, y_pred):
        """Mean binary cross-entropy between ``y_true`` and ``y_pred``.

        Args:
            y_true: array of 0/1 targets.
            y_pred: array of predicted probabilities.

        Returns:
            The scalar mean loss.
        """
        # Clip away exact 0 and 1 so np.log never sees 0 (which would give
        # divide-by-zero / invalid-value warnings).
        eps = 1e-15
        y_pred = np.clip(y_pred, eps, 1 - eps)
        y0 = y_true * np.log(y_pred)
        y1 = (1 - y_true) * np.log(1 - y_pred)
        return -np.mean(y0 + y1)

    def train(self, X, Y, X_val=None, Y_val=None):
        """Fit weights and bias by gradient descent on the cross-entropy loss.

        Args:
            X: 2-D array-like of shape (n_examples, n_features).
            Y: array-like of n_examples binary targets (0 or 1); a column
                vector is flattened.
            X_val: optional validation features; when given, the validation
                loss is recorded at every iteration.
            Y_val: validation targets, required whenever ``X_val`` is given.

        Raises:
            ValueError: if ``X`` is not 2-D, is empty, does not match ``Y``
                in length, or if ``X_val`` is given without ``Y_val``.
        """
        X = np.asarray(X, dtype=float)
        # Flatten so an (n, 1) target cannot broadcast against the (n,)
        # predictions into an (n, n) matrix.
        Y = np.asarray(Y, dtype=float).ravel()

        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (n_examples, n_features), got shape {X.shape}")
        n_examples, n_features = X.shape
        if n_examples == 0:
            raise ValueError("cannot train on an empty dataset")
        if Y.shape[0] != n_examples:
            raise ValueError(
                f"X has {n_examples} rows but Y has {Y.shape[0]} targets"
            )

        has_val = X_val is not None
        if has_val:
            if Y_val is None:
                raise ValueError("Y_val is required when X_val is provided")
            X_val = np.asarray(X_val, dtype=float)
            Y_val = np.asarray(Y_val, dtype=float).ravel()

        # Every call starts from scratch: zero parameters, empty histories.
        self.weights = np.zeros(n_features)
        self.bias = 0.0
        self.loss_hist = []
        self.val_loss_hist = []

        for _ in range(self.num_iters):
            # Forward pass with the current parameters.
            y_pred = self.predict_proba(X)
            self.loss_hist.append(self.logisticLoss(Y, y_pred))

            if has_val:
                y_val_pred = self.predict_proba(X_val)
                self.val_loss_hist.append(self.logisticLoss(Y_val, y_val_pred))

            # Gradient of the mean binary cross-entropy w.r.t. weights / bias.
            y_diff = y_pred - Y
            self.weights -= self.learning_rate * np.dot(X.T, y_diff) / n_examples
            self.bias -= self.learning_rate * np.mean(y_diff)

In [27]:
class LogisticRegressionOVA:
    """Multi-class classifier built from one-vs-all binary logistic models.

    ``train`` fits one ``LogisticRegression`` per distinct label; ``predict``
    returns, for each row, the label whose binary model gives the highest score.

    Attributes:
        learning_rate: forwarded to every binary model.
        num_iters: forwarded to every binary model.
        models: fitted binary models, in the same order as ``classes``.
        classes: sorted unique labels seen by ``train``; ``None`` before that.
    """

    def __init__(self, learning_rate=0.01, num_iters=1000):
        self.learning_rate = learning_rate
        self.num_iters = num_iters
        self.models = []
        self.classes = None

    def train(self, X, Y, X_val=None, Y_val=None):
        """Fit one "class vs rest" binary model per distinct label in ``Y``.

        Args:
            X: training features, passed unchanged to each binary model.
            Y: array-like of class labels, one per row of ``X``.
            X_val: optional validation features; when given, each binary
                model also records its validation loss history.
            Y_val: validation labels, required whenever ``X_val`` is given.

        Raises:
            ValueError: if ``X_val`` is given without ``Y_val``.
        """
        # Accept plain lists too: the element-wise ``Y == cls`` needs an array.
        Y = np.asarray(Y)
        if X_val is not None:
            if Y_val is None:
                raise ValueError("Y_val is required when X_val is provided")
            Y_val = np.asarray(Y_val)

        self.classes = np.unique(Y)
        self.models = []

        for cls in self.classes:
            print(f"Train class {cls} vs rest")
            # 1.0 for rows of the current class, 0.0 for every other row.
            Y_binary = (Y == cls).astype(float)
            Y_val_binary = None if X_val is None else (Y_val == cls).astype(float)
            model = LogisticRegression(learning_rate=self.learning_rate, num_iters=self.num_iters)
            model.train(X, Y_binary, X_val, Y_val_binary)
            self.models.append(model)

    def predict_proba(self, X):
        """Return the per-class scores as an (n_rows, n_classes) array.

        Column ``j`` is the output of the binary model for ``classes[j]``.
        The columns come from independent models, so rows are not normalised
        to sum to 1.

        Raises:
            ValueError: if no model has been trained yet.
        """
        if not self.models:
            raise ValueError("model is not trained yet; call train() before predicting")
        return np.column_stack([model.predict_proba(X) for model in self.models])

    def predict(self, X):
        """Return, for each row of ``X``, the label with the highest score.

        Raises:
            ValueError: if no model has been trained yet.
        """
        max_idx = np.argmax(self.predict_proba(X), axis=1)
        return self.classes[max_idx]

In [28]:
def accuracy(y_true, y_pred):
    """Return the percentage of predictions that equal their true label."""
    return np.mean(y_true == y_pred) * 100


# Locations of the three pre-processed splits.
dataset = Dataset(
    train_path="./data/tabular/train_processed.csv",
    val_path="./data/tabular/validation_processed.csv",
    test_path="./data/tabular/test_processed.csv"
)

(X_train, Y_train), (X_val, Y_val), (X_test, Y_test) = dataset.get_data()

# Train the one-vs-all classifier with its default hyper-parameters
# (learning_rate=0.01, num_iters=1000).
model = LogisticRegressionOVA()
model.train(X_train, Y_train)

# Report accuracy on every split.
print(f"Train Accuracy: {accuracy(Y_train, model.predict(X_train)):.2f}%")
print(f"Val Accuracy:   {accuracy(Y_val, model.predict(X_val)):.2f}%")
print(f"Test Accuracy:  {accuracy(Y_test, model.predict(X_test)):.2f}%")

Train class 0.0 vs rest


  return 1 / (1 + np.exp(-z))


Train class 1.0 vs rest
Train class 2.0 vs rest
Train class 3.0 vs rest
Train class 4.0 vs rest
Train Accuracy: 42.57%
Val Accuracy:   40.09%
Test Accuracy:  54.04%
