In [7]:
import numpy as np
import pandas as pd

In [9]:
class Dataset:
    def __init__(self, dataset_path=None):
        self.path = dataset_path

    def get_data(self):
        data = pd.read_csv(self.path).to_numpy()
        X, Y_str = data[:, :-1], data[:, -1]  # remove the target column from the input and extract our targets
        # n_classes = len(set(Y_str))
        n_examples = len(Y_str)
        Y = np.zeros(n_examples)
        for i in range(len(Y_str)):
            category = Y_str[i]
            if category == "banana":
                Y[i] = 0
            elif category == "carrot":
                Y[i] = 1
            elif category == "cucumber":
                Y[i] = 2
            elif category == "mandarin":
                Y[i] = 3
            else:
                Y[i] = 4
        return X, Y


In [33]:
class LogisticRegression:
    def __init__(self, learning_rate=0.01, num_iters=1000):
        self.learning_rate = learning_rate
        self.num_iters = num_iters
        self.weights = None
        self.bias = None
        self.loss_hist = []

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    
    def predict(y):
        return 1 if y > 0.5 else 0
    
    def logisticLoss(self, y_true, y_pred):
        """
            binary cross entropy
        """
        y0 = y_true * np.log(y_pred)
        y1 = (1 - y_true) * np.log(1 - y_pred)
        return -np.mean(y0 + y1)
    
    def train(self, X, Y):
        n_examples, n_features = X.shape

        self.weights = np.zeros(n_features)
        self.bias    = 0
        
        for i in range(self.num_iters):
            y_pred = self.sigmoid(np.dot(X, self.weights) + self.bias)

            # gradient of binary cross entropy
            y_diff = (y_pred - Y)
            self.weights -= self.learning_rate * np.dot(X.T, y_diff) / n_examples
            self.bias    -= self.learning_rate * np.mean(y_diff)
            # print(f"---------- WEIGHTS (in step {i + 1}) ----------")
            # print(self.weights)
            # print("------------------------------------------------")

In [34]:
dataset_path = "./data/tabular/feature_extraction.csv"
dataset = Dataset(dataset_path)
X, Y = dataset.get_data()
X = X.astype(float)
Y = Y.astype(float)
print(X)
print(Y)
logistic_regression = LogisticRegression()
logistic_regression.train(X, Y)
print(logistic_regression.weights)
print(logistic_regression.bias)

[[ 75.19295502  95.64546964  92.35715485 ... 235.         136.69693422
   20.32672843]
 [169.61224747  75.30395898 187.84392166 ... 210.         130.65284701
   16.25080312]
 [157.57231903  73.4237332  193.68634415 ... 193.         111.54250186
   19.40190755]
 ...
 [117.6336441   47.24371812 118.35143661 ...  61.          95.74628637
    8.54891902]
 [112.62142944  42.76008329 115.11558151 ...  87.         106.43180303
    7.07523644]
 [ 66.81015396  63.37149468  77.40125275 ...  35.          85.86439753
    6.53533473]]
[0. 0. 0. ... 4. 4. 4.]
[1343.98476732  921.44477754 1709.63121991  827.24919638 2192.32148814
  739.05915705 1773.49979764 1793.5167179  1796.03594092 1800.02480191
 1800.98337346 1801.59322746 1786.3047763  1768.20035605 1881.13066855
 1906.34394723 1890.27291667 1857.472979   1858.68544484 1894.94101861
 1898.33126195 1880.47366547 1984.32727459 1940.47099727 1851.9882915
 1764.12499402 1762.97723702 1877.17565147 1956.12076247 1988.53872268
 2040.86528518 1868.560