# Bibliotecas

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Importando o dataset, renomeando coluna e split em treino e teste

In [2]:
data = pd.read_csv('Covid Data.csv')
data = data.rename(columns = {'CLASIFFICATION_FINAL': 'CLASSIFICATION_FINAL'})
data1 = data.copy()
data1.loc[data1['DATE_DIED'] == '9999-99-99', 'DIED'] = 2
data1.loc[data1['DATE_DIED'] != '9999-99-99', 'DIED'] = 1
data1.drop(columns = ['DATE_DIED'], inplace = True)

In [3]:
label = 'DIED'
features = list(set(data1.columns).difference({label}))
X = data1[features]
y = data1[label]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Pré-Processamento

In [4]:
boolean_features = ['PNEUMONIA', 'PREGNANT', 'DIABETES', 'COPD', 'ASTHMA', 'INMSUPR', 'HIPERTENSION',
                   'CARDIOVASCULAR', 'RENAL_CHRONIC', 'OTHER_DISEASE', 'OBESITY', 'TOBACCO',
                   'INTUBED', 'ICU']

In [5]:
def create_boolean_columns(data, boolean_features):
    new_data = data.copy()
    for feature in boolean_features:
        new_data.loc[new_data[feature] < 3, f'is_{feature}_defined'] = 1
        new_data.loc[new_data[feature] >= 3, f'is_{feature}_defined'] = 2
    return new_data

def correct_pregnant_for_men(data):
    new_data = data.copy()
    new_data.loc[new_data['SEX'] == 2, 'PREGNANT'] = 0
    return new_data

def mode_imputing(data, pre_imputing_train_data, boolean_features):
    new_data = data.copy()
    for feature in boolean_features:
        most_common = pre_imputing_train_data[feature].mode()[0]
        new_data.loc[new_data[feature] >= 3, feature] = most_common
    return new_data

def intubed_and_icu_imputing(data):
    new_data = data.copy()
    more_nan_features = ['INTUBED', 'ICU']
    for feature in more_nan_features:
        new_data.loc[new_data[feature] >= 3, feature] = 3
    return new_data

def covid_degree(data):
    new_data = data.copy()
    new_data.loc[new_data['CLASSIFICATION_FINAL'] >= 4, 'covid_degree'] = 0
    new_data.loc[new_data['CLASSIFICATION_FINAL'] < 4, 'covid_degree'] = new_data['CLASSIFICATION_FINAL']
    new_data.drop('CLASSIFICATION_FINAL', axis = 1, inplace = True)
    return new_data

def scale(feature, unscaled_train_feature):
    minimum = min(unscaled_train_feature)
    maximum = max(unscaled_train_feature)
    return (feature - minimum)/(maximum - minimum)

In [6]:
def pre_processing_pipeline(X_train, X_test, boolean_features):
    X_train = create_boolean_columns(X_train, boolean_features)
    X_test = create_boolean_columns(X_test, boolean_features)
    
    X_train = correct_pregnant_for_men(X_train)
    X_test = correct_pregnant_for_men(X_test)
    
    pre_imputing_X_train = X_train.copy()
    X_train = mode_imputing(X_train, pre_imputing_X_train, boolean_features)
    X_test = mode_imputing(X_test, pre_imputing_X_train, boolean_features)
    
    X_train = intubed_and_icu_imputing(X_train)
    X_test = intubed_and_icu_imputing(X_test)
    
    X_train = covid_degree(X_train)
    X_test = covid_degree(X_test)
    
    features = X_train.columns
    unscaled_X_train = X_train.copy()
    
    for feature in features:
        X_train[feature] = scale(X_train[feature], unscaled_X_train[feature])
        X_test[feature] = scale(X_test[feature], unscaled_X_train[feature])
    
    return X_train, X_test

In [7]:
X_train, X_test = pre_processing_pipeline(X_train, X_test, boolean_features)

In [8]:
X_train.head()

Unnamed: 0,OBESITY,PNEUMONIA,OTHER_DISEASE,CARDIOVASCULAR,DIABETES,PATIENT_TYPE,AGE,COPD,INTUBED,USMER,...,is_INMSUPR_defined,is_HIPERTENSION_defined,is_CARDIOVASCULAR_defined,is_RENAL_CHRONIC_defined,is_OTHER_DISEASE_defined,is_OBESITY_defined,is_TOBACCO_defined,is_INTUBED_defined,is_ICU_defined,covid_degree
592908,0.0,1.0,1.0,1.0,0.0,0.0,0.239669,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
184386,1.0,1.0,1.0,1.0,1.0,0.0,0.22314,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1021782,1.0,1.0,1.0,1.0,1.0,0.0,0.206612,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
59606,0.0,1.0,1.0,1.0,1.0,0.0,0.214876,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
93792,1.0,1.0,1.0,1.0,0.0,1.0,0.380165,1.0,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [9]:
X_test.head()

Unnamed: 0,OBESITY,PNEUMONIA,OTHER_DISEASE,CARDIOVASCULAR,DIABETES,PATIENT_TYPE,AGE,COPD,INTUBED,USMER,...,is_INMSUPR_defined,is_HIPERTENSION_defined,is_CARDIOVASCULAR_defined,is_RENAL_CHRONIC_defined,is_OTHER_DISEASE_defined,is_OBESITY_defined,is_TOBACCO_defined,is_INTUBED_defined,is_ICU_defined,covid_degree
875680,1.0,1.0,1.0,1.0,1.0,0.0,0.413223,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1046906,1.0,1.0,1.0,1.0,1.0,0.0,0.338843,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
646861,1.0,1.0,1.0,1.0,1.0,0.0,0.322314,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
704385,1.0,1.0,1.0,1.0,1.0,0.0,0.157025,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
798051,1.0,1.0,1.0,1.0,1.0,0.0,0.421488,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


# Implementação do modelo

A função de custo utilizada no modelo de regressão logística é a log loss:

$$Log\;Loss = -\frac{1}{n}\sum_{i = 1}^{n} y\;log(\hat{y}) + (1-y)\;log(1-\hat{y})$$

sendo y a label do dataset de treino (0 ou 1) e $\hat{y}$ o valor da label previsto pelo modelo (algo entre 0 e 1) para dados valores de features. O valor de $\hat{y}$ é obtido pelo uso da função de ativação sigmoid em um modelo linear. Assim, definimos:

$$z = b + \sum_{i = 1}^{m} w_{i} x_{i}$$

$$\hat{y} = sigmoid(z) = sig(z) = \frac{1}{1 + e^{-z}}$$

em que $w_{i}$ é o peso da feature $x_{i}$ no modelo.

Nesse modelo, também aplicaremos a regularização $L_{1}$, responsável por levar os pesos de features pouco importantes (com pesos muito pequenos) a exatamente 0, realizando, assim, uma seleção de features que gera um modelo mais esparso. A função de regularização $L_{1}$ é dada por:

$L_{1}\;regularization\;term = \sum_{i = 1}^{m} |w_{i}|$

Assim, a função que vamos querer minimizar é:

$$f(\theta) = Log\;Loss + L_{1}\;regularization\;term = -\frac{1}{n}\sum_{i = 1}^{n} y_{i}\;log(\hat{y}_{i}) + (1-y_{i})\;log(1-\hat{y}_{i}) + \lambda\sum_{i = 1}^{m} |w_{i}|$$

onde $\lambda$ é a constante de regularização, $\theta = (w_{1}, w_{2}, ..., w_{n}, b)$, sendo $n$ o número de linhas de dados que temos no dataset de treino e $m$ o número de features.

Sabemos que: $\frac{d\hat{y}_{i}}{dz_{i}} = sig(z_{i}) \cdot (1-sig(z_{i}))$ (verifique!)

Assim, podemos calcular as derivadas parciais de $\hat{y}_{i}$: 

$$\frac{\partial{\hat{y}_{i}}}{\partial{w_{k}}} = \frac{d\hat{y}_{i}}{dz_{i}} \cdot \frac{\partial{z_{i}}}{\partial{w_{k}}} = sig(z_{i})\cdot(1-sig(z_{i}))\cdot x_{k}$$

para toda feature $x_{k}$, e:

$$\frac{\partial{\hat{y}_{i}}}{\partial{b}} = \frac{d\hat{y}_{i}}{dz_{i}} \cdot \frac{\partial{z_{i}}}{\partial{b}} = sig(z_{i})\cdot(1-sig(z_{i}))$$

Calculando agora as derivadas parciais de $f(\theta)$:

$$\frac{\partial{f}}{\partial{w_{k}}} = -\frac{1}{n}\sum_{i = 1}^{n}\left[y_{i}\cdot \frac{sig(z_{i})\cdot(1-sig(z_{i}))\cdot (x_{k})_{i}}{\hat{y}_{i}} - (1-y_{i})\cdot\frac{sig(z_{i})\cdot(1-sig(z_{i}))\cdot (x_{k})_{i}}{(1-\hat{y}_{i})}\right] + \lambda \frac{|w_{k}|}{w_{k}}$$

$$\frac{\partial{f}}{\partial{b}} = -\frac{1}{n}\sum_{i = 1}^{n}\left[y_{i}\cdot \frac{sig(z_{i})\cdot(1-sig(z_{i}))}{\hat{y}_{i}} - (1-y_{i})\cdot\frac{sig(z_{i})\cdot(1-sig(z_{i}))}{(1-\hat{y}_{i})}\right]$$

Assim, sendo $\alpha$ a taxa de aprendizado do modelo, os novos valores dos pesos e do viés serão:

$$w_{k}' = w_{k} - \alpha \frac{\partial{f}}{\partial{w_{k}}}$$

$$b' = b - \alpha \frac{\partial{f}}{\partial{b}}$$

Desenvolvendo, temos, por fim:

$$w_{k}' = w_{k} + \frac{\alpha}{n}\sum_{i = 1}^{n}\left[y_{i}\cdot \frac{sig(z_{i})\cdot(1-sig(z_{i}))\cdot (x_{k})_{i}}{\hat{y}_{i}} - (1-y_{i})\cdot\frac{sig(z_{i})\cdot(1-sig(z_{i}))\cdot (x_{k})_{i}}{(1-\hat{y}_{i})}\right] + \alpha \lambda \frac{|w_{k}|}{w_{k}}$$

$$b' = b + \frac{\alpha}{n}\sum_{i = 1}^{n}\left[y_{i}\cdot \frac{sig(z_{i})\cdot(1-sig(z_{i}))}{\hat{y}_{i}} - (1-y_{i})\cdot\frac{sig(z_{i})\cdot(1-sig(z_{i}))}{(1-\hat{y}_{i})}\right]$$