In [1]:
import glob
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, plot_confusion_matrix
from sklearn.linear_model import LinearRegression
import scikitplot as skplt
import xgboost as xgb

In [2]:
# Lendo os dados dos arquivos ja preparados.
all_files = glob.glob('./data_city' + '/*.csv')

df = []

for file in tqdm(all_files):
    df.append(pd.read_csv(file, encoding='latin-1', delimiter=';', names=[
         'condicoes',
         'idade',
         'municipioIBGE',
         'resultadoTeste',
         'sexo',
         'paladar',
         'fadiga',
         'olfato',
         'garganta',
         'respirar',
         'febre',
         'tosse',
         'diarreia',
         'coriza',
         'dispneia'
    ]))

def df_length(e):
    return e.size

df.sort(reverse=True, key=df_length)

df[0].head()

  0%|          | 0/1865 [00:00<?, ?it/s]

Unnamed: 0,condicoes,idade,municipioIBGE,resultadoTeste,sexo,paladar,fadiga,olfato,garganta,respirar,febre,tosse,diarreia,coriza,dispneia
0,0,31.0,5208707,1,0,0,0,0,0,0,1,1,0,1,0
1,0,90.0,5208707,0,0,0,0,0,0,0,1,1,0,0,0
2,0,44.0,5208707,0,1,0,0,0,0,0,1,1,0,0,0
3,0,51.0,5208707,1,1,0,0,0,0,0,0,1,0,0,0
4,0,59.0,5208707,1,1,0,0,0,0,0,1,0,0,0,0


In [3]:
# Função para separar em conjunto de treino e de teste.

def train_test_set(X, Y):
    return train_test_split(X, Y, test_size = 0.2)

In [4]:
# Função para separar input e output

def input_output_set(df):
    Y = df.iloc[:,3].values
    X = df.drop(['resultadoTeste', 'municipioIBGE'], axis=1).values
    return X, Y
    

In [5]:
# Função para normalizar os dados

def normalize(X):
    scaler = StandardScaler()
    return scaler.fit_transform(X)

In [6]:
# Funcão para balancear os dados

def balance_set(X, Y):
    balancer = RandomOverSampler()
    return balancer.fit_resample(X, Y)

In [7]:
def print_confusion_matrix(test, pred):
    skplt.metrics.plot_confusion_matrix(test, pred, normalize=True)

In [8]:
# Função para executar a regressão linear

def linear_regression(df):
    X, Y = input_output_set(df)
    X = normalize(X)
    Xtrain, Xtest, Ytrain, Ytest = train_test_set(X, Y)
    
    regressor = LinearRegression()
    regressor.fit(Xtrain, Ytrain)
    
    return regressor

def test_regressor(df, regressor):
    X, Y = input_output_set(df)
    X = normalize(X)
    Xtrain, Xtest, Ytrain, Ytest = train_test_set(X, Y)
    
    Ypred = regressor.predict(Xtest)
    Ypred = [round(value) for value in Ypred]

    accuracy = accuracy_score(Ytest, Ypred)
    
    return accuracy*100

def get_all_regressors():
    all_reg = []
    for data in tqdm(df):
        tam = len(data.iloc[:,:])
        if tam <= 100:
            break
        all_reg.append(linear_regression(data))
    return all_reg

# Começa a testar a regressão linear. Demora
def start_test_reg():
    all_reg = get_all_regressors()
    mean = 0
    size = 0
    for data in tqdm(df):
        acc = 0
        tam = len(data.iloc[:,:])
        if tam <= 100:
            break
        for regressor in all_reg:
            acc = max(test_regressor(data, regressor), acc)
        mean += acc*tam
        size += tam
        print(f'Acurácia: {acc}%\nTamanho do conjunto:{tam}')
    print(f'Acurácia total: {mean/size}')

In [34]:
def xgboost(df):
    X, Y = input_output_set(df)
    X = normalize(X)
    Xtrain, Xtest, Ytrain, Ytest = train_test_set(X, Y)

    model = xgb.XGBClassifier(
        objective='reg:logistic',
        use_label_encoder=False,
        colsample_bytree = 0.3,
        learning_rate = 0.1,
        max_depth = 5,
        alpha = 10,
        n_estimators = 10
    )
    
    model.fit(Xtrain, Ytrain)
    
    return model

    Ypred = model.predict(Xtest)

    accuracy = accuracy_score(Ytest, Ypred)
    print(f'Acurácia: {accuracy*100}%')
    
def test_model(df, model):
    X, Y = input_output_set(df)
    X = normalize(X)
    Xtrain, Xtest, Ytrain, Ytest = train_test_set(X, Y)
    Ypred = model.predict(Xtest)

    accuracy = accuracy_score(Ytest, Ypred)
    return accuracy*100
    
def get_all_model():
    models = []
    for data in df:
        tam = len(data.iloc[:,:])
        if tam <= 1000:
            break
        models.append(xgboost(data))
    return models

def start_xgboost():
    models = get_all_model()
    s = 0
    t = 0
    for data in df:
        tam = len(data.iloc[:,:])
        acc = 0    
        if tam <= 1000:
            break
        for model in models:
            acc = max(acc, test_model(data, model))
        s += acc*tam
        t += tam
        print(f'Acurácia: {acc}%')
    print(f'Aucrácia total: {s/t}')

In [35]:
start_xgboost()

Acurácia: 56.41702596440579%
Acurácia: 62.39477462766406%
Acurácia: 70.34359660865684%
Acurácia: 66.7989161989549%
Acurácia: 65.60245131956114%
Acurácia: 61.71945701357466%
Acurácia: 60.27139087825104%
Acurácia: 77.6803913575214%
Acurácia: 57.55838641188959%
Acurácia: 70.94933820173436%
Acurácia: 68.64848299111247%
Acurácia: 60.133418043202035%
Acurácia: 68.80638445523942%
Acurácia: 85.09425690486628%
Acurácia: 71.75819104753116%
Acurácia: 59.675925925925924%
Acurácia: 59.65828191741813%
Acurácia: 64.27525622254758%
Acurácia: 57.56811301715439%
Acurácia: 57.64955002646903%
Acurácia: 63.1695196024296%
Acurácia: 80.08201523140012%
Acurácia: 55.52927261975162%
Acurácia: 58.37903717245582%
Acurácia: 78.82111034955449%
Acurácia: 58.85496183206107%
Acurácia: 58.26086956521739%
Acurácia: 65.18936341659952%
Acurácia: 57.817589576547235%
Acurácia: 71.55963302752293%
Acurácia: 61.30653266331658%
Acurácia: 64.66367713004485%
Acurácia: 58.03814713896458%
Acurácia: 56.901408450704224%
Acurácia: 83.