In [1]:
import glob
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, plot_confusion_matrix
from sklearn.linear_model import LinearRegression
import scikitplot as skplt
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Lendo os dados dos arquivos ja preparados.
all_files = glob.glob('./data_city' + '/*.csv')

df = []
df_labels = [
         'condicoes',
         'idade',
         'municipioIBGE',
         'resultadoTeste',
         'sexo',
         'paladar',
         'fadiga',
         'olfato',
         'garganta',
         'respirar',
         'febre',
         'tosse',
         'diarreia',
         'coriza',
         'dispneia'
    ]
X_labels = [
         'condicoes',
         'idade',
         'sexo',
         'paladar',
         'fadiga',
         'olfato',
         'garganta',
         'respirar',
         'febre',
         'tosse',
         'diarreia',
         'coriza',
         'dispneia'
    ]

for file in tqdm(all_files):
    df.append(pd.read_csv(file, encoding='latin-1', delimiter=';', names=df_labels))

def df_length(e):
    return e.size

df.sort(reverse=True, key=df_length)

df[0].head()

  0%|          | 0/1865 [00:00<?, ?it/s]

Unnamed: 0,condicoes,idade,municipioIBGE,resultadoTeste,sexo,paladar,fadiga,olfato,garganta,respirar,febre,tosse,diarreia,coriza,dispneia
0,0,31.0,5208707,1,0,0,0,0,0,0,1,1,0,1,0
1,0,90.0,5208707,0,0,0,0,0,0,0,1,1,0,0,0
2,0,44.0,5208707,0,1,0,0,0,0,0,1,1,0,0,0
3,0,51.0,5208707,1,1,0,0,0,0,0,0,1,0,0,0
4,0,59.0,5208707,1,1,0,0,0,0,0,1,0,0,0,0


In [3]:
# Função para separar em conjunto de treino e de teste.

def train_test_set(X, Y):
    return train_test_split(X, Y, test_size = 0.2)

In [4]:
# Função para separar input e output

def input_output_set(df):
    Y = df.iloc[:,3].values
    X = df.drop(['resultadoTeste', 'municipioIBGE'], axis=1).values
    return X, Y
    

In [5]:
# Função para normalizar os dados

def normalize(X):
    scaler = StandardScaler()
    return scaler.fit_transform(X)

In [6]:
# Funcão para balancear os dados

def balance_set(X, Y):
    balancer = RandomOverSampler()
    return balancer.fit_resample(X, Y)

In [7]:
def print_confusion_matrix(test, pred):
    skplt.metrics.plot_confusion_matrix(test, pred, normalize=True)

In [8]:
# Função para executar a regressão linear

def linear_regression(df):
    X, Y = input_output_set(df)
    X = normalize(X)
    Xtrain, Xtest, Ytrain, Ytest = train_test_set(X, Y)
    
    regressor = LinearRegression()
    regressor.fit(Xtrain, Ytrain)
    
    return regressor

def test_regressor(df, regressor):
    X, Y = input_output_set(df)
    X = normalize(X)
    Xtrain, Xtest, Ytrain, Ytest = train_test_set(X, Y)
    
    Ypred = regressor.predict(Xtest)
    Ypred = [round(value) for value in Ypred]

    accuracy = accuracy_score(Ytest, Ypred)
    
    return accuracy*100

def get_all_regressors():
    all_reg = []
    for data in tqdm(df):
        tam = len(data.iloc[:,:])
        if tam <= 100:
            break
        all_reg.append(linear_regression(data))
    return all_reg

# Começa a testar a regressão linear. Demora
def start_test_reg():
    all_reg = get_all_regressors()
    mean = 0
    size = 0
    for data in tqdm(df):
        acc = 0
        tam = len(data.iloc[:,:])
        if tam <= 100:
            break
        for regressor in all_reg:
            acc = max(test_regressor(data, regressor), acc)
        mean += acc*tam
        size += tam
        print(f'Acurácia: {acc}%\nTamanho do conjunto:{tam}')
    print(f'Acurácia total: {mean/size}')

In [9]:
def xgboost(df):
    X, Y = input_output_set(df)
    X = normalize(X)
    Xtrain, Xtest, Ytrain, Ytest = train_test_set(X, Y)

    model = xgb.XGBClassifier(
        objective='reg:logistic',
        use_label_encoder=False,
        colsample_bytree = 0.3,
        learning_rate = 0.1,
        max_depth = 5,
        alpha = 10,
        n_estimators = 10
    )
    
    model.fit(Xtrain, Ytrain)
    
    return model

    Ypred = model.predict(Xtest)

    accuracy = accuracy_score(Ytest, Ypred)
    print(f'Acurácia: {accuracy*100}%')
    
def test_model(df, model):
    X, Y = input_output_set(df)
    X = normalize(X)
    Xtrain, Xtest, Ytrain, Ytest = train_test_set(X, Y)
    Ypred = model.predict(Xtest)

    accuracy = accuracy_score(Ytest, Ypred)
    return accuracy*100
    
def get_all_model():
    models = []
    for data in df:
        tam = len(data.iloc[:,:])
        if tam <= 1000:
            break
        models.append(xgboost(data))
    return models

def start_xgboost():
    models = get_all_model()
    s = 0
    t = 0
    for data in df:
        tam = len(data.iloc[:,:])
        acc = 0    
        if tam <= 1000:
            break
        for model in models:
            acc = max(acc, test_model(data, model))
        s += acc*tam
        t += tam
        print(f'Acurácia: {acc}%')
    print(f'Aucrácia total: {s/t}')

In [10]:
def RandomForest(df):
    X, Y = input_output_set(df)
    X = normalize(X)
    Xtrain, Xtest, Ytrain, Ytest = train_test_set(X, Y)
    
    clf = RandomForestClassifier(n_estimators=100)
    
    clf.fit(Xtrain, Ytrain)
    
    ypred=clf.predict(Xtest)
    
    accuracy = accuracy_score(Ytest, ypred)
    
    return clf
    
def get_all_RF():
    all_models = [] 
    for data in df:
        tam = len(data.iloc[:,:])
        if tam <= 1000:
            break
        all_models.append(RandomForest(data))
    return all_models

def test_model_RF(clf, data):
    X, Y = input_output_set(data)
    X = normalize(X)
    Xtrain, Xtest, Ytrain, Ytest = train_test_set(X, Y)
    
    ypred=clf.predict(Xtest)
    
    accuracy = accuracy_score(Ytest, ypred)
    return float(accuracy*100)
    
def start_RF():
    models = get_all_RF()
    s = 0
    t = 0
    for data in tqdm(df):
        tam = len(data.iloc[:,:])
        acc = 0    
        if tam <= 1000:
            break
        for model in models:
            acc = max(acc, test_model_RF(model, data))
        s += acc*tam
        t += tam
        print(f'Acurácia: {acc}%')
    print(f'Aucrácia total: {s/t}') 

In [11]:
start_RF()

  0%|          | 0/1865 [00:00<?, ?it/s]

Acurácia: 58.77604512906228%
Acurácia: 63.56316337734733%
Acurácia: 70.68273092369478%
Acurácia: 68.96651828914263%
Acurácia: 67.00602945537216%
Acurácia: 66.49773755656109%
Acurácia: 63.362231436110065%
Acurácia: 78.39380350591114%
Acurácia: 64.69214437367303%
Acurácia: 73.0260155180283%
Acurácia: 70.24210848912044%
Acurácia: 69.05972045743329%
Acurácia: 72.83136710617627%
Acurácia: 84.1297676457694%
Acurácia: 73.28103368712505%
Acurácia: 69.21296296296296%
Acurácia: 69.29283341243475%
Acurácia: 70.76622742801366%
Acurácia: 69.07164480322906%
Acurácia: 67.81365802011646%
Acurácia: 68.8017669795693%
Acurácia: 80.84358523725835%
Acurácia: 68.83500887049084%
Acurácia: 70.62766605728214%
Acurácia: 80.26045236463331%
Acurácia: 72.44274809160305%
Acurácia: 69.48616600790514%
Acurácia: 71.31345688960515%
Acurácia: 71.41693811074919%
Acurácia: 75.0625521267723%
Acurácia: 71.44053601340033%
Acurácia: 74.88789237668162%
Acurácia: 73.56948228882834%
Acurácia: 74.27230046948357%
Acurácia: 83.2391