In [None]:
import numpy as np
import pandas as pd

In [None]:
class Dataset:
    """Loader for a CSV file whose last column holds the class name of each row."""

    # Integer code assigned to each recognised class name.
    _LABEL_CODES = {"banana": 0, "carrot": 1, "cucumber": 2, "mandarin": 3}
    # Code assigned to every value that is not a key of _LABEL_CODES.
    _FALLBACK_CODE = 4

    def __init__(self, dataset_path=None):
        """Remember the location of the CSV file; nothing is read until get_data()."""
        self.path = dataset_path

    def get_data(self):
        """Read the CSV at ``self.path`` and split it into inputs and coded targets.

        Returns:
            tuple: ``(X, Y)`` where ``X`` is every column except the last, exactly
            as produced by ``DataFrame.to_numpy()`` (no dtype conversion is done
            here), and ``Y`` is a 1-D float array with one class code per row:
            banana -> 0, carrot -> 1, cucumber -> 2, mandarin -> 3, anything else -> 4.
        """
        table = pd.read_csv(self.path).to_numpy()

        # The last column is the textual target; everything before it is input.
        features = table[:, :-1]
        class_names = table[:, -1]

        codes = [
            self._LABEL_CODES.get(name, self._FALLBACK_CODE)
            for name in class_names
        ]
        return features, np.array(codes, dtype=float)


In [None]:
class LogisticRegression:
    """Binary logistic regression fitted with full-batch gradient descent.

    Attributes:
        learning_rate: Step size applied to every gradient update.
        num_iters: Number of gradient-descent iterations run by ``train``.
        weights: 1-D array with one coefficient per feature (``None`` until trained).
        bias: Scalar intercept (``None`` until trained).
        loss_hist: Training loss recorded at the start of every iteration.
        val_loss_hist: Validation loss per iteration; stays empty when ``train``
            is called without validation data.
    """

    # Probabilities are clipped to [_EPS, 1 - _EPS] before taking logarithms so
    # that saturated predictions (exactly 0 or 1) yield a large finite loss
    # instead of inf / nan.
    _EPS = 1e-15

    def __init__(self, learning_rate=0.01, num_iters=1000):
        self.learning_rate = learning_rate
        self.num_iters = num_iters
        self.weights = None
        self.bias = None
        self.loss_hist = []
        # Created here as well as in train() so the attribute always exists.
        self.val_loss_hist = []

    def sigmoid(self, z):
        """Return the logistic function 1 / (1 + exp(-z)), element-wise.

        Evaluated as exp(-log(1 + exp(-z))) via ``np.logaddexp`` so that large
        negative ``z`` does not overflow inside ``exp``.
        """
        return np.exp(-np.logaddexp(0.0, -z))

    def predict_raw(self, X):
        """Return the linear scores ``X @ weights + bias``."""
        return np.dot(X, self.weights) + self.bias

    def predict_proba(self, X):
        """Return the sigmoid of the linear scores for every row of ``X``."""
        return self.sigmoid(self.predict_raw(X))

    def logisticLoss(self, y_true, y_pred):
        """Mean binary cross entropy between targets and predicted probabilities.

        Args:
            y_true: Array of 0/1 targets.
            y_pred: Array of predicted probabilities, same shape as ``y_true``.

        Returns:
            The scalar mean of ``-(y*log(p) + (1-y)*log(1-p))``, with ``p``
            clipped to ``[_EPS, 1 - _EPS]``.
        """
        # Without clipping, p == 0 or p == 1 gives 0 * log(0) = nan.
        y_pred = np.clip(y_pred, self._EPS, 1 - self._EPS)
        y0 = y_true * np.log(y_pred)
        y1 = (1 - y_true) * np.log(1 - y_pred)
        return -np.mean(y0 + y1)

    def train(self, X, Y, X_val=None, Y_val=None):
        """Fit ``weights`` and ``bias`` by gradient descent on the cross-entropy loss.

        Any previous fit and loss history is discarded; parameters start at zero.

        Args:
            X: 2-D array of shape ``(n_examples, n_features)``.
            Y: 1-D array of 0/1 targets, one per row of ``X``.
            X_val: Optional validation inputs. When given, the validation loss
                is appended to ``val_loss_hist`` on every iteration.
            Y_val: Validation targets; required whenever ``X_val`` is given.

        Raises:
            ValueError: If ``X_val`` is provided without ``Y_val``.
        """
        if X_val is not None and Y_val is None:
            raise ValueError("Y_val is required when X_val is provided")

        n_examples, n_features = X.shape

        self.weights = np.zeros(n_features)
        self.bias = 0.0
        self.loss_hist = []
        self.val_loss_hist = []

        for _ in range(self.num_iters):
            # Forward pass with the current parameters.
            y_pred = self.predict_proba(X)

            # Losses are recorded before the parameters are updated.
            self.loss_hist.append(self.logisticLoss(Y, y_pred))
            if X_val is not None:
                y_val_pred = self.predict_proba(X_val)
                self.val_loss_hist.append(self.logisticLoss(Y_val, y_val_pred))

            # Gradient of the mean binary cross entropy w.r.t. weights and bias.
            y_diff = y_pred - Y
            self.weights -= self.learning_rate * np.dot(X.T, y_diff) / n_examples
            self.bias -= self.learning_rate * np.mean(y_diff)

In [None]:
class LogisticRegressionOVA:
    """One-vs-all multiclass classifier built from binary LogisticRegression models."""

    def __init__(self, learning_rate=0.01, num_iters=1000):
        """Store the hyper-parameters handed to every per-class binary model."""
        self.learning_rate = learning_rate
        self.num_iters = num_iters
        self.models = []
        self.classes = None

    def train(self, X, Y):
        """Fit one binary model per distinct value in ``Y``.

        ``self.classes`` is set to ``np.unique(Y)`` and ``self.models`` is
        rebuilt so that ``self.models[k]`` separates ``self.classes[k]`` from
        every other class. A progress line is printed for each class.
        """
        self.classes = np.unique(Y)
        self.models = []
        hyperparams = {
            "learning_rate": self.learning_rate,
            "num_iters": self.num_iters,
        }

        for cls in self.classes:
            print(f"Train class {cls} vs rest")
            # 1.0 where the example belongs to the current class, 0.0 elsewhere.
            is_current_class = (Y == cls).astype(float)
            binary_model = LogisticRegression(**hyperparams)
            binary_model.train(X, is_current_class)
            self.models.append(binary_model)

    def predict(self, X):
        """Return, for each row of ``X``, the class whose binary model scores highest."""
        # One column of probabilities per class, in the order of self.classes.
        class_scores = np.column_stack(
            [binary_model.predict_proba(X) for binary_model in self.models]
        )
        return self.classes[class_scores.argmax(axis=1)]

In [None]:
# Load the feature table and its integer-coded labels from disk.
dataset_path = "./data/tabular/feature_extraction.csv"
dataset = Dataset(dataset_path)

# Cast both arrays to float before they are used for training.
X, Y = dataset.get_data()
X, Y = X.astype(float), Y.astype(float)
print(X, Y, sep="\n")

# Fit the one-vs-all classifier with its default hyper-parameters.
logistic_regression = LogisticRegressionOVA()
logistic_regression.train(X, Y)

# Predictions are made on the same inputs the model was trained on.
predictions = logistic_regression.predict(X)
print(predictions)