# Bibliotecas

In [260]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Importando o dataset, renomeando coluna e split em treino e teste

In [261]:
data = pd.read_csv('Covid Data.csv')
data = data.rename(columns = {'CLASIFFICATION_FINAL': 'CLASSIFICATION_FINAL'})
data1 = data.copy()
data1.loc[data1['DATE_DIED'] == '9999-99-99', 'DIED'] = 2
data1.loc[data1['DATE_DIED'] != '9999-99-99', 'DIED'] = 1
data1.drop(columns = ['DATE_DIED'], inplace = True)

In [262]:
label = 'DIED'
features = list(set(data1.columns).difference({label}))
X = data1[features]
y = data1[label]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Pré-Processamento

In [263]:
boolean_features = ['PNEUMONIA', 'PREGNANT', 'DIABETES', 'COPD', 'ASTHMA', 'INMSUPR', 'HIPERTENSION',
                   'CARDIOVASCULAR', 'RENAL_CHRONIC', 'OTHER_DISEASE', 'OBESITY', 'TOBACCO',
                   'INTUBED', 'ICU']

In [264]:
def create_boolean_columns(data, boolean_features):
    new_data = data.copy()
    for feature in boolean_features:
        new_data.loc[new_data[feature] < 3, f'is_{feature}_defined'] = 1
        new_data.loc[new_data[feature] >= 3, f'is_{feature}_defined'] = 2
    return new_data

def correct_pregnant_for_men(data):
    new_data = data.copy()
    new_data.loc[new_data['SEX'] == 2, 'PREGNANT'] = 0
    return new_data

def mode_imputing(data, pre_imputing_train_data, boolean_features):
    new_data = data.copy()
    for feature in boolean_features:
        most_common = pre_imputing_train_data[feature].mode()[0]
        new_data.loc[new_data[feature] >= 3, feature] = most_common
    return new_data

def intubed_and_icu_imputing(data):
    new_data = data.copy()
    more_nan_features = ['INTUBED', 'ICU']
    for feature in more_nan_features:
        new_data.loc[new_data[feature] >= 3, feature] = 3
    return new_data

def covid_degree(data):
    new_data = data.copy()
    new_data.loc[new_data['CLASSIFICATION_FINAL'] >= 4, 'covid_degree'] = 0
    new_data.loc[new_data['CLASSIFICATION_FINAL'] < 4, 'covid_degree'] = new_data['CLASSIFICATION_FINAL']
    new_data.drop('CLASSIFICATION_FINAL', axis = 1, inplace = True)
    return new_data

def scale(feature, unscaled_train_feature):
    minimum = min(unscaled_train_feature)
    maximum = max(unscaled_train_feature)
    return (feature - minimum)/(maximum - minimum)

def binary_change(data):  
    new_data = data.copy()
    new_data.loc[new_data == 2] = 0
    return new_data

In [265]:
def pre_processing_pipeline(X_train, X_test, y_train, y_test, boolean_features):
    X_train = create_boolean_columns(X_train, boolean_features)
    X_test = create_boolean_columns(X_test, boolean_features)
    
    X_train = correct_pregnant_for_men(X_train)
    X_test = correct_pregnant_for_men(X_test)
    
    pre_imputing_X_train = X_train.copy()
    X_train = mode_imputing(X_train, pre_imputing_X_train, boolean_features)
    X_test = mode_imputing(X_test, pre_imputing_X_train, boolean_features)
    
    X_train = intubed_and_icu_imputing(X_train)
    X_test = intubed_and_icu_imputing(X_test)
    
    X_train = covid_degree(X_train)
    X_test = covid_degree(X_test)
    
    features = X_train.columns
    unscaled_X_train = X_train.copy()
    
    for feature in features:
        X_train[feature] = scale(X_train[feature], unscaled_X_train[feature])
        X_test[feature] = scale(X_test[feature], unscaled_X_train[feature])
        
    y_train = binary_change(y_train)
    y_test = binary_change(y_test)
    
    return X_train, X_test, y_train, y_test

In [266]:
X_train, X_test, y_train, y_test = pre_processing_pipeline(X_train, X_test, y_train, y_test, boolean_features)

In [267]:
features = X_train.columns

In [268]:
X_train.head()

Unnamed: 0,USMER,INMSUPR,INTUBED,ICU,COPD,PNEUMONIA,DIABETES,RENAL_CHRONIC,MEDICAL_UNIT,HIPERTENSION,...,is_INMSUPR_defined,is_HIPERTENSION_defined,is_CARDIOVASCULAR_defined,is_RENAL_CHRONIC_defined,is_OTHER_DISEASE_defined,is_OBESITY_defined,is_TOBACCO_defined,is_INTUBED_defined,is_ICU_defined,covid_degree
592908,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.916667,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
184386,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.25,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1021782,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.916667,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
59606,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.25,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
93792,0.0,1.0,0.5,0.5,1.0,1.0,0.0,1.0,0.25,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [346]:
y_train.head()

592908     0.0
184386     0.0
1021782    0.0
59606      0.0
93792      0.0
Name: DIED, dtype: float64

# Modelo

A função de custo utilizada no modelo de regressão logística é a log loss:

$$Log\;Loss = -\frac{1}{n}\sum_{i = 1}^{n} y\;log(\hat{y}) + (1-y)\;log(1-\hat{y})$$

sendo y a label do dataset de treino (0 ou 1) e $\hat{y}$ o valor da label previsto pelo modelo (algo entre 0 e 1) para dados valores de features. O valor de $\hat{y}$ é obtido pelo uso da função de ativação sigmoid em um modelo linear. Assim, definimos:

$$z = b + \sum_{i = 1}^{m} w_{i} x_{i}$$

$$\hat{y} = sigmoid(z) = sig(z) = \frac{1}{1 + e^{-z}}$$

em que $w_{i}$ é o peso da feature $x_{i}$ no modelo.

Nesse modelo, também aplicaremos a regularização $L_{1}$, responsável por levar os pesos de features pouco importantes (com pesos muito pequenos) a exatamente 0, realizando, assim, uma seleção de features que gera um modelo mais esparso. A função de regularização $L_{1}$ é dada por:

$L_{1}\;regularization\;term = \sum_{i = 1}^{m} |w_{i}|$

Assim, a função que vamos querer minimizar é:

$$f(\theta) = Log\;Loss + L_{1}\;regularization\;term = -\frac{1}{n}\sum_{i = 1}^{n} y_{i}\;log(\hat{y}_{i}) + (1-y_{i})\;log(1-\hat{y}_{i}) + \lambda\sum_{i = 1}^{m} |w_{i}|$$

onde $\lambda$ é a constante de regularização, $\theta = (w_{1}, w_{2}, ..., w_{n}, b)$, sendo $n$ o número de linhas de dados que temos no dataset de treino e $m$ o número de features.

Sabemos que: $\frac{d\hat{y}_{i}}{dz_{i}} = sig(z_{i}) \cdot (1-sig(z_{i}))$ (verifique!)

Assim, podemos calcular as derivadas parciais de $\hat{y}_{i}$: 

$$\frac{\partial{\hat{y}_{i}}}{\partial{w_{k}}} = \frac{d\hat{y}_{i}}{dz_{i}} \cdot \frac{\partial{z_{i}}}{\partial{w_{k}}} = sig(z_{i})\cdot(1-sig(z_{i}))\cdot x_{k}$$

para toda feature $x_{k}$, e:

$$\frac{\partial{\hat{y}_{i}}}{\partial{b}} = \frac{d\hat{y}_{i}}{dz_{i}} \cdot \frac{\partial{z_{i}}}{\partial{b}} = sig(z_{i})\cdot(1-sig(z_{i}))$$

Calculando agora as derivadas parciais de $f(\theta)$:

$$\frac{\partial{f}}{\partial{w_{k}}} = -\frac{1}{n}\sum_{i = 1}^{n}\left[y_{i}\cdot \frac{sig(z_{i})\cdot(1-sig(z_{i}))\cdot (x_{k})_{i}}{\hat{y}_{i}} - (1-y_{i})\cdot\frac{sig(z_{i})\cdot(1-sig(z_{i}))\cdot (x_{k})_{i}}{(1-\hat{y}_{i})}\right] + \lambda \frac{|w_{k}|}{w_{k}}$$

$$\frac{\partial{f}}{\partial{b}} = -\frac{1}{n}\sum_{i = 1}^{n}\left[y_{i}\cdot \frac{sig(z_{i})\cdot(1-sig(z_{i}))}{\hat{y}_{i}} - (1-y_{i})\cdot\frac{sig(z_{i})\cdot(1-sig(z_{i}))}{(1-\hat{y}_{i})}\right]$$

Assim, sendo $\alpha$ a taxa de aprendizado do modelo, os novos valores dos pesos e do viés serão:

$$w_{k}' = w_{k} - \alpha \frac{\partial{f}}{\partial{w_{k}}}$$

$$b' = b - \alpha \frac{\partial{f}}{\partial{b}}$$

Desenvolvendo, temos, por fim:

$$w_{k}' = w_{k} + \frac{\alpha}{n}\sum_{i = 1}^{n}(x_{k})_{i}\cdot\left[y_{i}\cdot {(1-\hat{y}_{i})} - (1-y_{i})\cdot \hat{y}_{i}\right] - \alpha \lambda \frac{|w_{k}|}{w_{k}}$$

$$b' = b + \frac{\alpha}{n}\sum_{i = 1}^{n}\left[y_{i}\cdot {(1-\hat{y}_{i})} - (1-y_{i})\cdot \hat{y}_{i}\right] $$

# Implementação

In [270]:
class Logistic_Regression():
    def __init__(self, X_train, y_train, ws: list,
                 b = 0, alpha = 0.1, lambda_reg = 0.1, random_state = 0):
        self.features = X_train
        self.label = y_train
        for i in range(0, len(ws)):
            ws[i] = float(ws[i])
        self.ws = np.array(ws) # weights
        self.b = b
        self.alpha = alpha
        self.lambda_reg = lambda_reg
        self.rand = np.random.RandomState(random_state)
        
    def print_parameters(self):
        for i in range(1, len(self.ws) + 1):
            print(f'w{i} = {self.ws[i - 1]}')
        print (f'b = {self.b}')
        
    def get_parameters(self):
        i_vals = list(range(1, len(self.ws) + 1))
        parameters = {f'w{i}': self.ws[i - 1] for i in i_vals}
        parameters['b'] = self.b
        return parameters
    
    @staticmethod
    def sigmoid(z):
        return 1/(1 + np.exp(-z))
    
    def get_single_prediction(self, xs: list):
        '''Get the prediction for a list with all the features' values.'''
        for i in range(0, len(xs)):
            xs[i] = float(xs[i])
        xs = np.array(xs)
        z = np.dot(self.ws, xs) + self.b
        prediction = Logistic_Regression.sigmoid(z)
        return prediction
    
    def predict(self, X_test):
        n = len(X_test)
        X_test_copy = X_test.copy()
        X_test_copy.reset_index(inplace = True, drop = True)
        k = 0
        w_cols = []
        for col in X_test_copy.columns:
            X_test_copy.loc[0:n-1, f'weighted_{col}'] = self.ws[k]*X_test_copy[col]
            w_cols.append(f'weighted_{col}')
            k += 1
        X_test_copy['Dot'] = X_test_copy[w_cols].sum(axis = 1)
        X_test_copy['z'] = X_test_copy['Dot'] + self.b
        X_test_copy.loc[0:n-1, 'Predictions'] = 1/(1 + np.exp(-X_test_copy['z']))
        predictions = np.array(X_test_copy['Predictions'])
        return predictions
    
    def get_loss(self, X_test, y_test):
        n = len(X_test)
        predictions = self.predict(X_test)
        fst_term = np.dot(y_test, np.log(predictions))
        sec_term = np.dot(1 - y_test, np.log(1 - predictions))
        loss = -(1/n)*(fst_term + sec_term)
        return loss
    
    def get_l1_term(self):
        return np.sum(np.abs(self.ws))
    
    def get_structural_risk(self, X_test, y_test):
        return self.get_loss(X_test, y_test) + self.lambda_reg * self.get_l1_term
    
    def sgd_update_parameters(self, batch_size: int):
        n = len(self.label)
        index_list = list(range(0, n))
        random_indices = self.rand.choice(index_list, size = batch_size, replace = True) # bootstrap sample
        xs_sample = list()
        y_sample = np.array(self.label.iloc[random_indices])
        preds_sample = np.zeros(batch_size)
        for i in range(0, batch_size):
            xs = list(self.features.iloc[random_indices[i]])
            preds_sample[i] += self.get_single_prediction(xs)
        for col in self.features:
            xs_sample.append(np.array(self.features[col].iloc[random_indices])) # len(xs_sample) = len(self.ws)
        partial_w = np.zeros(len(self.ws))
        for k in range(0, len(self.ws)):
            if self.ws[k] == 0:
                partial_reg_term = 0
            else:
                partial_reg_term = np.abs(self.ws[k])/self.ws[k]
            fst_term = (y_sample) * (1 - preds_sample)
            sec_term = (1 - y_sample) * (preds_sample)
            partial_w[k] += -(1/batch_size) * np.dot(xs_sample[k], fst_term - sec_term) + self.lambda_reg * partial_reg_term
        partial_b = -(1/batch_size) * np.sum(fst_term - sec_term)
        self.ws -= self.alpha * partial_w
        self.b -= self.alpha * partial_b
        
    def sgd(self, iterations: int, batch_size: float, print_loss = False): # stochastic gradient descent
        for i in range(0, iterations):
            self.sgd_update_parameters(batch_size)
            if print_loss:
                print(f'loss = {self.get_loss(self.features, self.label)}')

In [336]:
ws = list(np.zeros(len(features)))
model = Logistic_Regression(X_train = X_train, y_train = y_train, ws = ws, lambda_reg = 0.05, alpha = 0.1)

In [337]:
model.sgd(iterations = 40, batch_size = 100, print_loss = True)

loss = 0.44959754569474175
loss = 0.35680894940755653
loss = 0.3169411413930308
loss = 0.29838621222226314
loss = 0.2765721215829366
loss = 0.2717039086114165
loss = 0.2580306466793227
loss = 0.25067064653386195
loss = 0.2454664350698992
loss = 0.24182927054444495
loss = 0.23946622298096598
loss = 0.23766402293369632
loss = 0.236645060760695
loss = 0.23722821316904877
loss = 0.2361099474849269
loss = 0.23510364164027325
loss = 0.23440350601537754
loss = 0.2324461203275588
loss = 0.2331542851234061
loss = 0.2335351863325628
loss = 0.23335221365252254
loss = 0.2325670499613375
loss = 0.2316148793098996
loss = 0.23484770819623552
loss = 0.2336953504597814
loss = 0.23347468443863745
loss = 0.2327363674447556
loss = 0.2306857886386171
loss = 0.22902112733454524
loss = 0.22767427375550248
loss = 0.2290536724222965
loss = 0.2298482632883099
loss = 0.2319992136201092
loss = 0.23051457938386044
loss = 0.23012800601585684
loss = 0.2314292199872328
loss = 0.23081272895655866
loss = 0.230998826696

In [338]:
model.predict(X_test)

array([0.0914716 , 0.09653352, 0.08854787, ..., 0.0907446 , 0.10233488,
       0.09145578])

In [339]:
(pd.DataFrame(model.predict(X_test)) > 0.19).astype(int).value_counts()

0    193057
1     16658
dtype: int64