# Testes de modelos de machine learning:

## Como rodar
Para testar os modelos, primeiro é necessário criar os arquivos já preparados. Para isso, rode o notebook _prepare_data_. Depois disso, este notebook poderá ser usado.

Para instalar as bibliotecas usadas aqui, basta rodar o comando:
```Shell
pip install <nome do pacote>
```


In [1]:
import glob
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, plot_confusion_matrix
from sklearn.linear_model import LinearRegression
import scikitplot as skplt
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

2021-12-30 10:30:44.701190: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory
2021-12-30 10:30:44.701235: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Read the data from the already-prepared per-city CSV files.
all_files = glob.glob('./data_city' + '/*.csv')

# Column names assigned to every CSV when it is read.
df_labels = [
    'condicoes',
    'idade',
    'municipioIBGE',
    'resultadoTeste',
    'sexo',
    'paladar',
    'fadiga',
    'olfato',
    'garganta',
    'respirar',
    'febre',
    'tosse',
    'diarreia',
    'coriza',
    'dispneia',
]

# Feature columns: every column except the city code and the test result.
X_labels = [
    label
    for label in df_labels
    if label not in ('municipioIBGE', 'resultadoTeste')
]

# One DataFrame per file; tqdm displays a progress bar while loading.
df = [
    pd.read_csv(file, encoding='latin-1', delimiter=';', names=df_labels)
    for file in tqdm(all_files)
]

def df_length(e):
    """Return the ``size`` attribute of ``e`` (used as the sort key for the DataFrames)."""
    element_count = e.size
    return element_count

# Order the DataFrames from largest to smallest (in place), so the loops
# further down can stop at the first dataset that is too small.
df[:] = sorted(df, key=df_length, reverse=True)

# Preview the first rows of the largest dataset.
df[0].head()

  0%|          | 0/1865 [00:00<?, ?it/s]

Unnamed: 0,condicoes,idade,municipioIBGE,resultadoTeste,sexo,paladar,fadiga,olfato,garganta,respirar,febre,tosse,diarreia,coriza,dispneia
0,0,31.0,5208707,1,0,0,0,0,0,0,1,1,0,1,0
1,0,90.0,5208707,0,0,0,0,0,0,0,1,1,0,0,0
2,0,44.0,5208707,0,1,0,0,0,0,0,1,1,0,0,0
3,0,51.0,5208707,1,1,0,0,0,0,0,0,1,0,0,0
4,0,59.0,5208707,1,1,0,0,0,0,0,1,0,0,0,0


In [3]:
# Função para separar em conjunto de treino e de teste.

def train_test_set(X, Y):
    """Split features and targets into train and test subsets.

    20% of the samples are held out for testing. No ``random_state`` is
    passed, so the split is not pinned to a seed.

    Returns:
        The result of ``train_test_split``: Xtrain, Xtest, Ytrain, Ytest.
    """
    held_out_fraction = 0.2
    splits = train_test_split(X, Y, test_size=held_out_fraction)
    return splits

In [4]:
# Função para separar input e output

def input_output_set(df):
    """Split a prepared DataFrame into a feature matrix and a target vector.

    The target is the ``resultadoTeste`` column. The features are every
    remaining column except ``municipioIBGE``. Both are selected by column
    name, so the result does not depend on the order of the columns.

    Args:
        df: DataFrame containing at least the ``resultadoTeste`` and
            ``municipioIBGE`` columns.

    Returns:
        A tuple ``(X, Y)`` of NumPy arrays: the feature matrix and the
        target vector.

    Raises:
        KeyError: if ``resultadoTeste`` or ``municipioIBGE`` is missing.
    """
    target_column = 'resultadoTeste'
    Y = df[target_column].values
    X = df.drop(columns=[target_column, 'municipioIBGE']).values
    return X, Y
    

In [5]:
# Função para normalizar os dados

def normalize(X):
    """Standardise ``X`` with a freshly fitted ``StandardScaler``.

    The scaler is fitted on the data passed in and then discarded, so every
    call scales with the statistics of its own input.
    """
    return StandardScaler().fit_transform(X)

In [6]:
# Function to balance the data

def balance_set(X, Y):
    """Resample ``X`` and ``Y`` with a default ``RandomOverSampler``.

    Returns:
        The resampled ``(X, Y)`` pair produced by ``fit_resample``.
    """
    return RandomOverSampler().fit_resample(X, Y)

In [7]:
# Function that plots the confusion matrix for analysing the results.

def print_confusion_matrix(test, pred):
    """Plot a normalised confusion matrix with scikit-plot.

    Args:
        test: true labels.
        pred: predicted labels.
    """
    plot_matrix = skplt.metrics.plot_confusion_matrix
    plot_matrix(test, pred, normalize=True)

## Como testar. 

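Para testar os modelos foram criados conjuntos de funções, um para cada tipo de modelo. Para avaliar um tipo de modelo em todas as cidades, aplicando o melhor modelo a cada cidade, execute a função _start_ correspondente (`start_test_LR`, `start_xgboost`, `start_RF` ou `start_Keras`). Caso queira testar apenas uma cidade, use a função com o nome do tipo de modelo (`linear_regression`, `xgboost`, `RandomForest` ou `keras`) para criar um modelo e passe o retorno dessa função como parâmetro da função de teste correspondente (`test_LR`, `test_XGB`, `test_model_RF` ou `test_model_Keras`). Além disso, passe também o mesmo conjunto de dados da lista _df_.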

In [8]:
# Função para executar a regressão linear

def linear_regression(df):
    """Fit a ``LinearRegression`` on the training part of one dataset.

    The features are standardised and the data is split with
    ``train_test_set``; only the training portion is used for fitting.

    Args:
        df: prepared DataFrame for one city.

    Returns:
        The fitted ``LinearRegression`` instance.
    """
    features, target = input_output_set(df)
    features = normalize(features)
    train_features, _, train_target, _ = train_test_set(features, target)

    return LinearRegression().fit(train_features, train_target)

def test_LR(df, regressor):
    """Score a fitted regressor on a held-out part of ``df``.

    The data is standardised and split again with ``train_test_set``, so the
    test portion is a new split rather than the one made during training.
    Predictions are rounded to the nearest integer before being compared
    with the true labels.

    Args:
        df: prepared DataFrame for one city.
        regressor: fitted model exposing ``predict``.

    Returns:
        Accuracy on the test portion, as a percentage.
    """
    features, target = input_output_set(df)
    features = normalize(features)
    _, test_features, _, test_target = train_test_set(features, target)

    # The regressor outputs continuous values; round them into class labels.
    predicted_labels = list(map(round, regressor.predict(test_features)))

    return accuracy_score(test_target, predicted_labels) * 100

def get_all_LR():
    """Train one linear regression per dataset in the module-level ``df`` list.

    Iteration stops at the first dataset with 100 rows or fewer. This assumes
    ``df`` is ordered from largest to smallest.

    Returns:
        List of fitted regressors, in the same order as the datasets used.
    """
    regressors = []
    for frame in tqdm(df):
        if len(frame) <= 100:
            break
        regressors.append(linear_regression(frame))
    return regressors

# Começa a testar a regressão linear. Demora
def start_test_LR():
    """Evaluate every linear regression on every sufficiently large dataset.

    For each dataset in the module-level ``df`` list with more than 100 rows,
    every regressor from ``get_all_LR`` is scored and the best accuracy is
    kept. The per-dataset best accuracies are then averaged, weighted by the
    number of rows. Iteration stops at the first dataset with 100 rows or
    fewer, which assumes ``df`` is ordered from largest to smallest.

    Prints the accuracy and size for each dataset, followed by the weighted
    overall accuracy. If no dataset qualifies, a message is printed instead
    of dividing by zero.

    This is slow: it scores every model against every dataset.
    """
    all_reg = get_all_LR()
    weighted_sum = 0
    total_rows = 0
    for data in tqdm(df):
        tam = len(data)
        if tam <= 100:
            break
        # Best accuracy any of the trained regressors reaches on this dataset.
        acc = max((test_LR(data, regressor) for regressor in all_reg), default=0)
        weighted_sum += acc * tam
        total_rows += tam
        print(f'Acurácia: {acc}%\nTamanho do conjunto:{tam}')

    if total_rows == 0:
        # Nothing was evaluated, so the weighted mean is undefined.
        print('Nenhum conjunto de dados com mais de 100 registros para avaliar.')
        return
    print(f'Acurácia total: {weighted_sum/total_rows}')

In [9]:
def xgboost(df):
    """Train an ``XGBClassifier`` on the training part of one dataset.

    The features are standardised and the data is split with
    ``train_test_set``; only the training portion is used for fitting.

    Args:
        df: prepared DataFrame for one city.

    Returns:
        The fitted ``XGBClassifier``.
    """
    X, Y = input_output_set(df)
    X = normalize(X)
    # The test portion is not needed here; scoring is done by test_XGB.
    Xtrain, _, Ytrain, _ = train_test_set(X, Y)

    # NOTE(review): 'reg:logistic' is kept from the original code. The usual
    # XGBoost objective for binary classification is 'binary:logistic' —
    # confirm which one is intended.
    model = xgb.XGBClassifier(
        objective='reg:logistic',
        use_label_encoder=False,
        colsample_bytree=0.3,
        learning_rate=0.1,
        max_depth=5,
        alpha=10,
        n_estimators=10,
    )

    model.fit(Xtrain, Ytrain)

    return model
    
def test_XGB(df, model):
    """Score a fitted classifier on a held-out part of ``df``.

    The data is standardised and split again with ``train_test_set``, so the
    test portion is a new split rather than the one made during training.

    Args:
        df: prepared DataFrame for one city.
        model: fitted model exposing ``predict``.

    Returns:
        Accuracy on the test portion, as a percentage.
    """
    features, target = input_output_set(df)
    features = normalize(features)
    _, test_features, _, test_target = train_test_set(features, target)

    predicted = model.predict(test_features)
    return accuracy_score(test_target, predicted) * 100
    
def get_all_XGB():
    """Train one XGBoost model per dataset in the module-level ``df`` list.

    Iteration stops at the first dataset with 1000 rows or fewer. This
    assumes ``df`` is ordered from largest to smallest.

    Returns:
        List of fitted models, in the same order as the datasets used.
    """
    trained = []
    for frame in df:
        if len(frame) <= 1000:
            break
        trained.append(xgboost(frame))
    return trained

def start_xgboost():
    """Evaluate every XGBoost model on every sufficiently large dataset.

    For each dataset in the module-level ``df`` list with more than 1000
    rows, every model from ``get_all_XGB`` is scored and the best accuracy is
    kept. The per-dataset best accuracies are then averaged, weighted by the
    number of rows. Iteration stops at the first dataset with 1000 rows or
    fewer, which assumes ``df`` is ordered from largest to smallest.

    Prints the accuracy for each dataset, followed by the weighted overall
    accuracy. If no dataset qualifies, a message is printed instead of
    dividing by zero.
    """
    models = get_all_XGB()
    weighted_sum = 0
    total_rows = 0
    for data in df:
        tam = len(data)
        if tam <= 1000:
            break
        # Best accuracy any of the trained models reaches on this dataset.
        acc = max((test_XGB(data, model) for model in models), default=0)
        weighted_sum += acc * tam
        total_rows += tam
        print(f'Acurácia: {acc}%')

    if total_rows == 0:
        # Nothing was evaluated, so the weighted mean is undefined.
        print('Nenhum conjunto de dados com mais de 1000 registros para avaliar.')
        return
    print(f'Aucrácia total: {weighted_sum/total_rows}')

In [10]:
def RandomForest(df):
    """Train a 100-tree ``RandomForestClassifier`` on one dataset's training part.

    The features are standardised and the data is split with
    ``train_test_set``; only the training portion is used for fitting.

    Args:
        df: prepared DataFrame for one city.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    X, Y = input_output_set(df)
    X = normalize(X)
    # The test portion is not needed here; scoring is done by test_model_RF.
    Xtrain, _, Ytrain, _ = train_test_set(X, Y)

    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(Xtrain, Ytrain)

    return clf
    
def get_all_RF():
    """Train one random forest per dataset in the module-level ``df`` list.

    Iteration stops at the first dataset with 1000 rows or fewer. This
    assumes ``df`` is ordered from largest to smallest.

    Returns:
        List of fitted classifiers, in the same order as the datasets used.
    """
    forests = []
    for frame in df:
        if len(frame) <= 1000:
            break
        forests.append(RandomForest(frame))
    return forests

def test_model_RF(clf, data):
    """Score a fitted classifier on a held-out part of ``data``.

    The data is standardised and split again with ``train_test_set``, so the
    test portion is a new split rather than the one made during training.

    Args:
        clf: fitted model exposing ``predict``.
        data: prepared DataFrame for one city.

    Returns:
        Accuracy on the test portion, as a float percentage.
    """
    features, target = input_output_set(data)
    features = normalize(features)
    _, test_features, _, test_target = train_test_set(features, target)

    predicted = clf.predict(test_features)
    score = accuracy_score(test_target, predicted)
    return float(score * 100)
    
def start_RF():
    """Evaluate every random forest on every sufficiently large dataset.

    For each dataset in the module-level ``df`` list with more than 1000
    rows, every classifier from ``get_all_RF`` is scored and the best
    accuracy is kept. The per-dataset best accuracies are then averaged,
    weighted by the number of rows. Iteration stops at the first dataset with
    1000 rows or fewer, which assumes ``df`` is ordered from largest to
    smallest.

    Prints the accuracy for each dataset, followed by the weighted overall
    accuracy. If no dataset qualifies, a message is printed instead of
    dividing by zero.
    """
    models = get_all_RF()
    weighted_sum = 0
    total_rows = 0
    for data in tqdm(df):
        tam = len(data)
        if tam <= 1000:
            break
        # Best accuracy any of the trained forests reaches on this dataset.
        acc = max((test_model_RF(model, data) for model in models), default=0)
        weighted_sum += acc * tam
        total_rows += tam
        print(f'Acurácia: {acc}%')

    if total_rows == 0:
        # Nothing was evaluated, so the weighted mean is undefined.
        print('Nenhum conjunto de dados com mais de 1000 registros para avaliar.')
        return
    print(f'Aucrácia total: {weighted_sum/total_rows}')

In [11]:
# To speed up testing, the number of units in the second layer was reduced, as was the number of training passes (epochs).

def keras(df):
    """Build and compile an untrained dense binary classifier.

    The network has a 13-unit ReLU layer, a 100-unit sigmoid layer and a
    single sigmoid output. It is compiled with binary cross-entropy, the Adam
    optimiser and the accuracy metric. No training happens here.

    Args:
        df: prepared DataFrame for one city; used only to determine how many
            input features the network must accept.

    Returns:
        The compiled, unfitted ``Sequential`` model.
    """
    X, _ = input_output_set(df)
    # Size the input from the data instead of hard-coding the column count.
    n_features = X.shape[1]

    model = Sequential()
    model.add(Dense(13, input_dim=n_features, activation='relu'))
    model.add(Dense(100, activation='sigmoid'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

def get_all_Keras():
    """Build one compiled Keras model per dataset in the module-level ``df`` list.

    Iteration stops at the first dataset with 1000 rows or fewer. This
    assumes ``df`` is ordered from largest to smallest.

    Returns:
        List of compiled models, in the same order as the datasets used.
    """
    networks = []
    for frame in df:
        if len(frame) <= 1000:
            break
        networks.append(keras(frame))
    return networks

def test_model_Keras(model, data):
    """Train ``model`` on ``data`` and report its accuracy on that same data.

    The model is fitted in place for 4 epochs (batch size 100, silent) on the
    standardised features, then evaluated on the very same samples. The
    returned figure is therefore a training accuracy, and the model passed in
    is modified by the call.

    Args:
        model: compiled Keras model.
        data: prepared DataFrame for one city.

    Returns:
        Accuracy reported by ``model.evaluate``, as a float percentage.
    """
    features, target = input_output_set(data)
    features = normalize(features)

    model.fit(features, target, epochs=4, batch_size=100, verbose=0)

    _loss, score = model.evaluate(features, target)
    return float(score * 100)
    
def start_Keras():
    """Train and score every Keras model on every sufficiently large dataset.

    For each dataset in the module-level ``df`` list with more than 1000
    rows, every model from ``get_all_Keras`` is passed to
    ``test_model_Keras`` and the best accuracy is kept. The per-dataset best
    accuracies are then averaged, weighted by the number of rows. Iteration
    stops at the first dataset with 1000 rows or fewer, which assumes ``df``
    is ordered from largest to smallest.

    Prints the accuracy for each dataset, followed by the weighted overall
    accuracy. If no dataset qualifies, a message is printed instead of
    dividing by zero.
    """
    models = get_all_Keras()
    weighted_sum = 0
    total_rows = 0
    for data in tqdm(df):
        tam = len(data)
        if tam <= 1000:
            break
        # Best accuracy any of the models reaches on this dataset.
        acc = max((test_model_Keras(model, data) for model in models), default=0)
        weighted_sum += acc * tam
        total_rows += tam
        print(f'Acurácia: {acc}%')

    if total_rows == 0:
        # Nothing was evaluated, so the weighted mean is undefined.
        print('Nenhum conjunto de dados com mais de 1000 registros para avaliar.')
        return
    print(f'Aucrácia total: {weighted_sum/total_rows}')

In [None]:
# Run the Keras evaluation over the loaded city datasets.
start_Keras()

2021-12-30 10:30:52.966882: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-12-30 10:30:52.966919: W tensorflow/stream_executor/cuda/cuda_driver.cc:312] failed call to cuInit: UNKNOWN ERROR (303)
2021-12-30 10:30:52.966951: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (jps12-Z450UA): /proc/driver/nvidia/version does not exist
2021-12-30 10:30:52.967731: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-12-30 10:30:52.996895: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 240000000

  0%|          | 0/1865 [00:00<?, ?it/s]

