# Analysing The COVID-19 Mexico Database With Neural Networks
### The purpose of this notebook is to estimate a patient's probability of death from the features available in the database.


In [None]:
# for numbers and datasets
import numpy as np
import pandas as pd
# for the plots
%matplotlib inline
import matplotlib.pyplot as plt

# from sklearn import metrics
# import the cross-validation module
# from sklearn.model_selection import cross_val_score
# the train/test data splitter


# location of the dataset CSV file (read later with pd.read_csv)
path = '/kaggle/input/cov19-open-data-mexico/data.csv'

# Data Processing

In [None]:
# load the CSV into a DataFrame and report its dimensions
dataset = pd.read_csv(path, encoding='unicode_escape')
n_rows, n_cols = dataset.shape
print('Shape:', (n_rows, n_cols))
# preview the first rows
dataset.head()

In [None]:
# how many rows carry each 'CLASIFICACION_FINAL' code, shown as a bar chart
classification_counts = dataset['CLASIFICACION_FINAL'].value_counts()
classification_counts.plot(kind='bar')

## See if the dataset has null values

In [None]:
# one boolean per column: True when the column contains at least one missing value
# (`isna` is the same check as `isnull`)
dataset.isna().any()

## Select Positive COVID-19 Tests

For this research we only need the patients who tested positive for COVID-19. `So we keep only the rows whose 'CLASIFICACION_FINAL' value marks a positive case, by deleting every row whose value is one of the non-positive codes (4, 5, 6, 7 or 97)`.

In [None]:
# 'CLASIFICACION_FINAL' codes that are treated as negative; rows carrying them are removed
negative_values = (4,5,6,7,97)

# flag every row with one of those codes and keep the others
is_negative = dataset['CLASIFICACION_FINAL'].isin(negative_values)
dataset = dataset.loc[~is_negative]

# see the new shape
dataset.shape

After this step only the positive cases remain, so the first value of the shape is the number of cases Mexico had on the date the dataset was downloaded.

## Select the Features
The most useful features are the `patients' health issues`, so that the model can show which patients are at greater risk of dying. The target column is selected together with the features.

## Why is the target the date of death?
This is because that column holds a placeholder value (9999-99-99) for recovered patients and a real date for deceased patients. The dates need to be converted to 0 (recovered) or 1 (death).

In [None]:
# health issues and other clinical indicators used as model inputs
health_columns = ['DIABETES','EPOC','ASMA','INMUSUPR','HIPERTENSION','CARDIOVASCULAR','OBESIDAD',
                  'RENAL_CRONICA','TABAQUISMO','INTUBADO','NEUMONIA']
# the inputs, then the age, then the date-of-death column the target is built from
features = health_columns + ['EDAD', 'FECHA_DEF']

# keep only those columns, in that order
df = dataset.loc[:, features]
df.head()

## Labels

These are the labels for the columns:
- Sexo(gender): 1-Woman, 2-Man
- Enfermedades (health issues): 1-sick, 2-not sick, 98 or 99-unknown.
- Fecha de defunción (date of death): 9999-99-99 - recovered, `a normal date` - death.

**Note:** The 98 and 99 values mean *unknown value*, so they are **like null values** and need to be cleaned.

In [None]:
# frequency of each code in the 'DIABETES' column, as a bar chart
diabetes_counts = df['DIABETES'].value_counts()
diabetes_counts.plot(kind='bar')

## Cleaning the "null" values

In [None]:
# this process applies to the coded columns only: 98 / 99 mean "unknown" there,
# but 'EDAD' holds real ages (98 and 99 are valid ages) and 'FECHA_DEF' holds
# date strings, so neither of them may be filtered by these codes
unknown_codes = (98, 99)
coded_columns = [column for column in df.columns if column not in ('EDAD', 'FECHA_DEF')]

# drop the rows (patients) that have an unknown code in any coded column;
# .copy() gives an independent frame so that later column assignments are safe
has_unknown = df[coded_columns].isin(unknown_codes).any(axis=1)
df = df.loc[~has_unknown].copy()

In [None]:
# the same bar chart again, now without the unknown values
diabetes_counts_clean = df['DIABETES'].value_counts()
diabetes_counts_clean.plot(kind='bar')

## Convert to Numerical Values

In [None]:
# frequency of each code in the 'INTUBADO' column before it is converted to 0 / 1
intubado_counts = df['INTUBADO'].value_counts()
intubado_counts.plot(kind='bar')

In [None]:
# it's needed to change the values to only 1 and 0

# the yes/no columns are coded 1 = yes and 2 = no, so 1 stays and 2 becomes 0;
# 'EDAD' (a real age, where 2 means two years old) and 'FECHA_DEF' (a date
# string) are not yes/no codes and must be left untouched
binary_columns = [column for column in df.columns if column not in ('EDAD', 'FECHA_DEF')]
df[binary_columns] = df[binary_columns].replace({2: 0})

# 'INTUBADO' uses the same 1 = yes / 2 = no coding and additionally carries 97,
# which the official data dictionary defines as "not applicable"; it is treated
# here as "not intubated" as well
df['INTUBADO'] = df['INTUBADO'].replace({97: 0})

df.head()

## The Target

In [None]:
# label each patient from the date-of-death column:
# the '9999-99-99' placeholder means recovered (0), anything else means death (1)
is_recovered = df['FECHA_DEF'].eq('9999-99-99')
target = np.where(is_recovered, 0, 1)
# see a slice of the target
print(target[:30], target.shape)

# the date column is no longer needed once the labels exist
df = df.drop(columns=['FECHA_DEF'])

In [None]:
# how many patients in the dataset are labelled as deceased (target == 1)
death_total = int(np.count_nonzero(target == 1))
print('Deaths:', death_total)

Check the sizes

In [None]:
# the label vector and the feature table must have the same number of rows
n_labels, n_samples = target.shape[0], df.shape[0]
print('target: ', n_labels)
print('Dataset: ', n_samples)

## Final visualization of the features

In [None]:
rows = 4
cols = 3
# create the plot
fig, axs = plt.subplots(rows, cols)
# every yes/no feature column; the age is excluded because it is not a 0/1 value
titles = [column for column in df.columns if column != 'EDAD']
# add the deaths (target) panel at the end
titles.append('DEFUNCIONES')

# walk the grid row by row, pairing each axes with its title
panels = axs.ravel()
for ax, title in zip(panels, titles):
    if title == 'DEFUNCIONES':
        # data from the target: [recovered, deaths]
        vals = [int(np.count_nonzero(target == 0)), int(np.count_nonzero(target == 1))]
    else:
        # data from the feature itself: [negative, positive];
        # reindex so that a class with no rows counts as 0 instead of raising KeyError
        vals = df[title].value_counts().reindex([0, 1], fill_value=0).tolist()
    # plot the values and set the plot title
    ax.pie(vals, labels=['Not','Yes'])
    ax.set_title(title)

# hide any grid cell that did not receive a chart
for ax in panels[len(titles):]:
    ax.axis('off')

# keep the titles and pies from overlapping
fig.tight_layout()

In [None]:
# preview the first five rows of the cleaned feature table
df.head(5)

## Data Normalization

In [None]:
from sklearn.preprocessing import StandardScaler
# raw feature values as a plain array
feature_matrix = df.values
# NOTE(review): the scaling statistics are learned from every row, including the
# rows that the later split assigns to validation and testing
# fit() returns the fitted scaler itself, so creation and fitting are chained
scaler = StandardScaler().fit(feature_matrix)
# apply the learned scaling to the same rows
X = scaler.transform(feature_matrix)

In [None]:
# turn the 1-D label vector into a single-column 2-D array, as needed for the nn
Y = target[:, np.newaxis]
# check the sizes
X.shape[0], Y.shape[0]

# Split the data into Training, Validation and Testing sets

In [None]:
from sklearn.model_selection import train_test_split

# same seed for both splits so the partition is repeatable
split_seed = 2020
# first hold out 20 % of the rows for testing
x_rest, x_test, y_rest, y_test = train_test_split(X, Y, test_size=0.2, random_state=split_seed)
# then take 10 % of the remaining rows for validation; the rest is the training set
x_train, x_val, y_train, y_val = train_test_split(x_rest, y_rest, test_size=0.1, random_state=split_seed)

# plot the relative sizes of the three sets
split_sizes = {'Training': len(x_train), 'Testing': len(x_test), 'Validation': len(x_val)}
plt.title('Data sets sizes')
plt.pie(list(split_sizes.values()), labels=list(split_sizes.keys()))
plt.show()

# The Model (a neural network)

In [None]:
from tensorflow import keras
from keras import Sequential, layers
from keras.layers import Dense, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping


# one input per feature column
n_features = X.shape[1]

# two hidden stages (Dense -> BatchNormalization -> Dropout) followed by a
# single sigmoid unit
model = Sequential()
model.add(Dense(128, input_shape=[n_features], activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(rate=0.3))
model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(rate=0.3))
model.add(Dense(1, activation='sigmoid'))

# binary cross-entropy matches the 0/1 target; accuracy is tracked as a metric
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# stop after 10 epochs without an improvement of at least 0.001 in the monitored
# quantity, and roll back to the best weights seen
early_stopping = EarlyStopping(min_delta=0.001, patience=10, restore_best_weights=True)

# train on the training split, validating on the validation split after each epoch
history = model.fit(
    x_train,
    y_train,
    batch_size=256,
    epochs=30,
    validation_data=(x_val, y_val),
    callbacks=[early_stopping],
)

In [None]:
# evaluate the trained model on the testing split (never used during training)
results = model.evaluate(x=x_test, y=y_test, batch_size=128)
print("test loss, test acc:", results)

In [None]:
# write the trained model to an HDF5 file in the working directory
model_file = "cov_model_nn.h5"
model.save(model_file)