In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def zScoreScale(x):
    """Standardize every column of ``x`` to zero mean and unit standard deviation.

    Parameters
    ----------
    x : array-like
        Data with samples on axis 0 and features on axis 1.

    Returns
    -------
    scaled_x : numpy.ndarray
        Float array with the same shape as ``x`` holding ``(x - mean) / sig``.
    mean : numpy.ndarray
        Per-column mean (computed along axis 0).
    sig : numpy.ndarray
        Per-column population standard deviation (``ddof=0``). Columns with
        zero spread report ``1.0`` so the division is safe and
        ``scaled_x * sig + mean`` still reproduces the input.
    """
    data = np.asarray(x, dtype=float)

    # Statistics are taken per feature: a single global mean/scale would let
    # the feature with the largest magnitude dominate all the others.
    mean = data.mean(axis=0)
    sig = data.std(axis=0)

    # A constant column has sig == 0; dividing by 1 leaves it at exactly 0.
    sig = np.where(sig == 0, 1.0, sig)

    scaled_x = (data - mean) / sig
    return scaled_x, mean, sig

def zScoreDescale(x, mean, sig):
    """Undo a z-score scaling by computing ``x * sig + mean``.

    Parameters
    ----------
    x : numpy.ndarray
        Two-dimensional array of scaled values.
    mean, sig : scalar or array broadcastable against ``x``
        Offset and scale to restore.

    Returns
    -------
    numpy.ndarray
        Newly allocated float array with the same shape as ``x``.
    """
    restored = np.zeros(x.shape)
    # Fill the freshly allocated buffer in one shot instead of column by column.
    restored[:, :] = x * sig + mean
    return restored

In [15]:
class BinaryLogisticalRegression:
    """Binary logistic regression fitted with batch gradient descent.

    The update rule uses ``y - sigmoid(x @ w)``, so the labels are assumed to
    be encoded as 0/1. Labels may be given either as a 1-D array or as an
    ``(n, 1)`` column; they are stored untouched and converted to a column
    only where the arithmetic needs it.
    """

    # Constructor
    def __init__(self, x, y, addOnes = True):
        """Store the training data and start from all-zero weights.

        Parameters
        ----------
        x : numpy.ndarray of shape (n_samples, n_features)
        y : array-like with one label per sample
        addOnes : bool
            When true, a leading column of ones (bias term) is prepended to ``x``.
        """
        if addOnes:
            self.x = np.c_[np.ones(x.shape[0]), x]
        else:
            self.x = x
        self.y = y
        self.w = np.zeros((self.x.shape[1], 1))
        self.MSE = 0.0

    # Getters
    def getX(self):
        """Return the stored design matrix (including the bias column, if added)."""
        return self.x

    def getY(self):
        """Return the labels exactly as they were supplied."""
        return self.y

    def getW(self):
        """Return the weight column vector of shape (n_columns, 1)."""
        return self.w

    def getMSE(self):
        """Return the half mean squared error recorded by the last ``trainGD`` call."""
        return self.MSE

    # Setters
    def setX(self, x, addOnes = True):
        """Replace the design matrix and reset the weights to zero."""
        if addOnes:
            x = np.c_[np.ones(x.shape[0]), x]
        self.x = x
        self.w = np.zeros((self.x.shape[1], 1))

    def setY(self, y):
        """Replace the stored labels."""
        self.y = y

    # Methods
    def _targets(self):
        """Return the labels as an (n, 1) float column.

        Without this, a 1-D ``y`` broadcasts against the (n, 1) predictions
        into an (n, n) matrix and silently corrupts the gradient.
        """
        return np.asarray(self.y, dtype=float).reshape(-1, 1)

    def sigmoid(self, x):
        """Logistic function ``1 / (1 + exp(-x))``.

        Evaluated through the identity ``0.5 * (1 + tanh(x / 2))``, which
        saturates to 0 or 1 instead of overflowing for large ``|x|``.
        """
        return 0.5 * (1.0 + np.tanh(0.5 * np.asarray(x, dtype=float)))

    def trainGD(self, alpha = 0.1, max_iterations = 100):
        """Fit the weights with batch gradient descent, starting from zeros.

        Parameters
        ----------
        alpha : float
            Learning rate.
        max_iterations : int
            Number of full-batch updates to perform.

        After the loop, ``self.MSE`` is set to ``sum(e**2) / (2 * n)`` where
        ``e`` is the residual of the final weights.
        """
        y = self._targets()
        n = y.shape[0]
        self.w = np.zeros((self.x.shape[1], 1))

        for _ in range(max_iterations):
            e = y - self.sigmoid(self.x @ self.w)
            # One matrix product updates the bias and every feature weight.
            self.w = self.w + (alpha / n) * (self.x.T @ e)

        # Recompute the residual so the error reflects the final weights and
        # is defined even when no iteration ran.
        e = y - self.sigmoid(self.x @ self.w)
        self.MSE = ((e ** 2).sum()) / (2 * n)

    def test(self):
        """Return the (n, 1) residuals ``y - sigmoid(x @ w)`` on the stored data."""
        return self._targets() - self.sigmoid(self.x @ self.w)

    def predict(self, x, addOnes = True):
        """Predict 0/1 labels for the rows of ``x``.

        Returns an (n, 1) float array; a sample is labelled 1 when its
        predicted probability is at least 0.5, i.e. when ``x @ w >= 0``.
        """
        if addOnes:
            x = np.c_[np.ones(x.shape[0]), x]
        return (x @ self.w >= 0).astype(float)

In [None]:
class MulticlassLogisticalRegression:
    """Softmax (multinomial logistic) regression fitted with batch gradient descent.

    Labels are expected as a one-hot matrix of shape (n_samples, n_classes);
    the weight matrix has one column per class.
    """

    # Constructor
    def __init__(self, x, y, addOnes = True):
        """Store the training data and start from all-zero weights.

        Parameters
        ----------
        x : numpy.ndarray of shape (n_samples, n_features)
        y : numpy.ndarray of shape (n_samples, n_classes)
        addOnes : bool
            When true, a leading column of ones (bias term) is prepended to ``x``.
        """
        if addOnes:
            self.x = np.c_[np.ones(x.shape[0]), x]
        else:
            self.x = x
        self.y = y
        self.w = np.zeros((self.x.shape[1], y.shape[1]))
        self.MSE = 0.0

    # Getters
    def getX(self):
        """Return the stored design matrix (including the bias column, if added)."""
        return self.x

    def getY(self):
        """Return the stored one-hot label matrix."""
        return self.y

    def getW(self):
        """Return the weight matrix of shape (n_columns, n_classes)."""
        return self.w

    def getMSE(self):
        """Return the half mean squared error recorded by the last ``trainGD`` call."""
        return self.MSE

    # Setters
    def setXY(self, x, y, addOnes = True):
        """Replace the data and labels together and reset the weights to zero."""
        if addOnes:
            x = np.c_[np.ones(x.shape[0]), x]
        self.x = x
        self.y = y
        self.w = np.zeros((self.x.shape[1], y.shape[1]))

    # Methods
    def softmax(self, w, x):
        """Row-wise softmax of the scores ``x @ w``.

        The per-row maximum is subtracted before exponentiating. This leaves
        the result mathematically unchanged but keeps ``exp`` from overflowing
        to ``inf`` (and the ratio from becoming NaN) for large scores.
        """
        scores = x @ w
        shifted = scores - scores.max(axis=1, keepdims=True)
        exps = np.exp(shifted)
        return exps / exps.sum(axis=1, keepdims=True)

    def trainGD(self, alpha = 0.1, max_iterations = 100):
        """Run batch gradient descent from the current weights.

        Parameters
        ----------
        alpha : float
            Learning rate.
        max_iterations : int
            Number of full-batch updates to perform.

        The weights are not reset here, so repeated calls continue training.
        After the loop, ``self.MSE`` is set to ``sum(e**2) / (2 * n)`` where
        ``e`` is the residual of the final weights.
        """
        n = self.y.shape[0]

        for _ in range(max_iterations):
            e = self.y - self.softmax(self.w, self.x)
            self.w = self.w + (alpha / n) * (self.x.T @ e)

        # Recompute the residual so the error reflects the final weights.
        e = self.y - self.softmax(self.w, self.x)
        self.MSE = ((e ** 2).sum()) / (2 * n)

    def test(self):
        """Return the residuals ``y - softmax(x @ w)`` on the stored data."""
        return self.y - self.softmax(self.w, self.x)

    def predict(self, x, addOnes = True):
        """Return one-hot predictions, one row per sample of ``x``.

        Each row has a single 1 in the column of the most probable class.
        """
        if addOnes:
            x = np.c_[np.ones(x.shape[0]), x]
        probabilities = self.softmax(self.w, x)
        max_indexes = np.argmax(probabilities, axis=1)
        prediction = np.zeros(probabilities.shape)
        prediction[np.arange(len(max_indexes)), max_indexes] = 1
        return prediction

In [11]:
class NaiveBayesGaussiano:
    """Gaussian Naive Bayes classifier for one-hot encoded labels.

    Every feature is modelled, per class, by an independent normal
    distribution. ``variances`` keeps one diagonal matrix per class, with the
    per-feature sample variances on the diagonal.
    """

    # Lower bound applied to variances at prediction time so a feature that is
    # constant inside a class cannot cause a division by zero or log(0).
    _VAR_FLOOR = 1e-9

    # Constructor
    def __init__(self, x, y):
        """Estimate class priors, per-class feature means and variances.

        Parameters
        ----------
        x : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples, n_classes)
            One-hot labels; a sample belongs to class ``c`` when ``y[:, c] == 1``.
        """
        x = np.asarray(x, dtype=float)
        y = np.asarray(y)
        n_classes = y.shape[1]
        n_features = x.shape[1]

        self.classes = np.eye(n_classes)
        self.n_per_class = np.zeros((n_classes, 1))
        self.class_priors = np.zeros((n_classes, 1))  # Probability of each class
        self.means = np.zeros((n_classes, n_features))
        self.variances = np.zeros((n_classes, n_features, n_features))

        for clss in range(n_classes):
            members = x[y[:, clss] == 1]
            count = members.shape[0]
            self.n_per_class[clss] = count
            if count == 0:
                # No samples: leave mean/variance at zero; the prior will be 0.
                continue

            # Mean of every feature for this class
            self.means[clss] = members.mean(axis=0)

            # Unbiased variance of every feature; with a single sample the
            # denominator is clamped to 1, which yields a variance of zero.
            squared_dev = ((members - self.means[clss]) ** 2).sum(axis=0)
            self.variances[clss] = np.diag(squared_dev / max(count - 1, 1))

        # Class priors are the relative class frequencies
        self.class_priors = self.n_per_class / self.n_per_class.sum()

    def predict(self, x):
        """Classify a single sample.

        Parameters
        ----------
        x : array-like of shape (n_features,)

        Returns
        -------
        numpy.ndarray
            The one-hot row of ``self.classes`` with the highest log-posterior
            ``log P(c) + sum_f log N(x_f | mean_cf, var_cf)``.
        """
        x = np.asarray(x, dtype=float)
        prob = np.zeros((self.classes.shape[0], 1))
        for clss in range(self.classes.shape[0]):
            # Each feature is scored with its own variance; collapsing them
            # into one total would turn the model into a distance-to-mean rule.
            var = np.maximum(np.diag(self.variances[clss]), self._VAR_FLOOR)

            with np.errstate(divide="ignore"):
                # A class that never occurred has prior 0 and scores -inf.
                prob[clss] = np.log(self.class_priors[clss])
            prob[clss] -= 0.5 * np.sum(np.log(2 * np.pi * var))
            prob[clss] -= 0.5 * np.sum(((x - self.means[clss, :]) ** 2) / var)
        return self.classes[np.argmax(prob)]

In [12]:
class DiscriminanteGaussiano:
    """Gaussian discriminant classifier with one full covariance matrix per class.

    Labels are expected as a one-hot matrix. ``variances[c]`` holds the sample
    covariance matrix of the training rows that belong to class ``c``.
    """

    # Constructor
    def __init__(self, x, y):
        """Estimate class priors, means and covariance matrices.

        Parameters
        ----------
        x : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples, n_classes)
            One-hot labels; a sample belongs to class ``c`` when ``y[:, c] == 1``.

        Raises
        ------
        numpy.linalg.LinAlgError
            If a class covariance matrix is singular and cannot be inverted.
        """
        x = np.asarray(x, dtype=float)
        y = np.asarray(y)
        n_classes = y.shape[1]
        n_features = x.shape[1]

        self.classes = np.eye(n_classes)
        self.n_per_class = np.zeros((n_classes, 1))
        self.class_priors = np.zeros((n_classes, 1))  # Probability of each class
        self.means = np.zeros((n_classes, n_features))
        self.variances = np.zeros((n_classes, n_features, n_features))

        for clss in range(n_classes):
            members = x[y[:, clss] == 1, :]
            self.n_per_class[clss] = members.shape[0]

            # Mean of every feature for this class
            self.means[clss] = members.mean(axis=0)

            # np.cov already normalises by (n - 1); dividing by the class
            # count again would shrink the covariance and distort the scores.
            self.variances[clss] = np.atleast_2d(np.cov(members, rowvar=False))

        # Class priors are the relative class frequencies
        self.class_priors = self.n_per_class / self.n_per_class.sum()

        # Invert once here instead of on every predict() call. slogdet returns
        # log|det| directly, avoiding the underflow of det() to 0 (and hence
        # log(0)) that happens with many small-variance features.
        self._inv_covs = np.linalg.inv(self.variances)
        self._log_dets = np.linalg.slogdet(self.variances)[1]

    def predict(self, x):
        """Classify a single sample.

        Parameters
        ----------
        x : array-like of shape (n_features,)

        Returns
        -------
        numpy.ndarray
            The one-hot row of ``self.classes`` maximising
            ``log P(c) - 0.5 * log|S_c| - 0.5 * (x - m_c)^T S_c^-1 (x - m_c)``.
        """
        x = np.asarray(x, dtype=float)
        prob = np.zeros((self.classes.shape[0], 1))
        for clss in range(self.classes.shape[0]):
            diff = x - self.means[clss, :]

            prob[clss] = np.log(self.class_priors[clss])
            prob[clss] -= 0.5 * self._log_dets[clss]
            prob[clss] -= 0.5 * (diff.T @ self._inv_covs[clss] @ diff)
        return self.classes[np.argmax(prob)]

In [13]:
def kfold(data, splits=10, seed=None):
    """Build shuffled k-fold train/test index pairs.

    Parameters
    ----------
    data : numpy.ndarray
        Only ``data.shape[0]`` (the number of samples) is used.
    splits : int, default 10
        Number of folds; must be at least 2 so every fold has a training part.
    seed : int or None, default None
        When given, the shuffle is drawn from ``np.random.default_rng(seed)``
        and is reproducible. When ``None``, NumPy's global random state is
        used, so ``np.random.seed`` still controls the result.

    Returns
    -------
    list of (train_idx, test_idx) tuples
        One pair of integer index arrays per fold. The test parts are disjoint
        and together cover every sample exactly once.

    Raises
    ------
    ValueError
        If ``splits`` is smaller than 2.
    """
    if splits < 2:
        raise ValueError("kfold requires at least 2 splits")

    n_samples = data.shape[0]
    if seed is None:
        indices = np.random.permutation(n_samples)
    else:
        indices = np.random.default_rng(seed).permutation(n_samples)

    # array_split tolerates sample counts that are not a multiple of `splits`.
    folds_idx = np.array_split(indices, splits)

    return [
        (np.concatenate(folds_idx[:i] + folds_idx[i + 1:]), folds_idx[i])
        for i in range(splits)
    ]

In [None]:
# Fixed seed so the fold assignment is the same on every fresh run.
SEED = 42

data = np.genfromtxt("breastcancer.csv", delimiter=',')

# The first 30 columns are used as features and column 30 as the label.
x = data[:, :30]
y = data[:, 30]

# NOTE(review): the scaling statistics are computed on the full data set
# before it is split, so the test folds influence the scaling — confirm this
# is acceptable for the evaluation.
scaled_x, mean, sig = zScoreScale(x)

# One-hot encoding: label 1 -> [1, 0], anything else -> [0, 1].
one_hot_y = np.column_stack((y == 1, y != 1)).astype(float)

np.random.seed(SEED)
folds = kfold(x)

for i, (train_idx, test_idx) in enumerate(folds):
    x_train = scaled_x[train_idx, :]
    y_train = y[train_idx]
    one_hot_y_train = one_hot_y[train_idx]
    
    x_test = scaled_x[test_idx, :]
    y_test = y[test_idx]
    one_hot_y_test = one_hot_y[test_idx]
    
    # The labels are passed as an (n, 1) column: a 1-D vector would broadcast
    # against the (n, 1) predictions into an (n, n) error matrix.
    logistical_gd = BinaryLogisticalRegression(x_train, y_train.reshape(-1, 1))
    # NOTE(review): the two Gaussian models are fitted but not evaluated in
    # this cell.
    gaussian_discriminant = DiscriminanteGaussiano(x_train, one_hot_y_train)
    gaussian_naive_bayes = NaiveBayesGaussiano(x_train, one_hot_y_train)
    
    logistical_gd.trainGD(max_iterations = 200)
    yhat_lgd = logistical_gd.predict(x_test)
    print(yhat_lgd)
