In [4]:
import pandas as pd
import os
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [5]:
dsa = pd.read_csv('./data/respiratory_symptoms_and_treatment.csv')

In [6]:
dsa.head()

Unnamed: 0,Symptoms,Age,Sex,Disease,Treatment,Nature
0,coughing,5.0,female,Asthma,Omalizumab,high
1,tight feeling in the chest,4.0,female,Asthma,Mepolizumab,high
2,wheezing,6.0,male,Asthma,Mepolizumab,high
3,shortness of breath,7.0,male,Asthma,Mepolizumab,high
4,shortness of breath,9.0,male,Asthma,Mepolizumab,high


In [7]:
dsa.isnull().sum()

Symptoms      696
Age           342
Sex           922
Disease       340
Treatment    2841
Nature       2190
dtype: int64

In [8]:
dsa.describe()

Unnamed: 0,Age
count,38195.0
mean,36.997356
std,27.713575
min,1.0
25%,12.0
50%,30.0
75%,63.0
max,98.0


In [9]:
dsa["Symptoms"].sort_values(ascending=True).unique()

array([' coughing', ' coughing ', ' fatigue', ' low energy',
       ' shortness of breath', ' wheezing',
       'A cough that lasts more than three weeks',
       'A dry, crackling sound in the lungs while breathing in',
       'Bluish skin', 'Chest congestion', 'Chest pain',
       'Chest tightness or chest pain', 'Chills', 'Coughing up blood',
       'Coughing up yellow or green mucus daily', 'Daytime sleepiness',
       'Difficulties with memory and concentration', 'Dry mouth',
       'Fatigue', 'Fatigue, feeling run-down or tired',
       'Feeling run-down or tired', 'Fever', 'Frequently waking',
       'Headache', 'Loss of appetite',
       'Loss of appetite and unintentional weight loss',
       'Low-grade fever', 'Morning headaches', 'Nasal congestion',
       'Nausea', 'Night sweats', 'Pauses in breathing',
       'Persistent dry coug', 'Persistent dry cough', 'Rapid breathing',
       'Rapid heartbeat\n', 'Runny nose', 'Shortness of breath',
       'Shortness of breath ',
    

In [10]:
dsa["Nature"].sort_values(ascending=True).unique()

array([' ', 'high', 'low', 'medium', nan], dtype=object)

In [11]:
dsa["Treatment"].sort_values(ascending=True).unique()

array(['Adaptive servo-ventilation', 'Antibiotic', 'Antibiotics',
       'Antibiotics.', 'Chemotherapy', 'Cough medicine', 'Diuretics',
       'Intrapulmonary Percussive Ventilation', 'Intravenous fluids',
       'Mepolizumab', 'Omalizumab', 'Oseltamivir',
       'Pulmonary rehabilitation', 'Surgery', 'antibiotics', 'aspirin',
       'consult a doctor', 'consult doctor', 'ethambutol',
       'hypertonic saline', 'inhaler', 'inhealer',
       'isotonic sodium chloride solution', 'itraconazole', 'oxygen',
       'oxyzen', 'pyrazinamide', 'rifampin', 'saline nose drops',
       'stay away from cold places', 'steroids to reduce inflammation',
       'surgery', 'x-ray', nan], dtype=object)

In [12]:
if "Antibiotics." in dsa["Treatment"]:
    dsa.drop(index="Antibiotics.")

In [13]:
dsa["Treatment"].value_counts()

Treatment
antibiotics                              5135
Antibiotic                               4128
Chemotherapy                             2928
isotonic sodium chloride solution        2880
Oseltamivir                              1872
saline nose drops                        1800
Diuretics                                1680
oxyzen                                   1536
consult a doctor                         1344
Pulmonary rehabilitation                 1104
consult doctor                            992
Cough medicine                            960
hypertonic saline                         850
Adaptive servo-ventilation                816
Intravenous fluids                        720
ethambutol                                720
steroids to reduce inflammation           672
x-ray                                     624
Antibiotics.                              528
pyrazinamide                              528
inhaler                                   525
rifampin                

In [14]:
dsa.isnull().sum()

Symptoms      696
Age           342
Sex           922
Disease       340
Treatment    2841
Nature       2190
dtype: int64

In [15]:
dsa.dtypes

Symptoms      object
Age          float64
Sex           object
Disease       object
Treatment     object
Nature        object
dtype: object

In [16]:
tratamento = dsa["Treatment"].value_counts().index[0]

In [17]:
dsa["Treatment"].fillna(tratamento, inplace=True)

In [18]:
dsa.isnull().sum()

Symptoms      696
Age           342
Sex           922
Disease       340
Treatment       0
Nature       2190
dtype: int64

In [19]:
def moda(coluna,index):
    moda = dsa[coluna].value_counts().index[index]
    return dsa[coluna].fillna(moda, inplace=True)

In [20]:
moda('Symptoms',0)
moda('Age',0)
moda('Sex',0)
moda('Disease',0)
moda('Treatment',0)
moda('Nature',0)

In [21]:
dsa.isnull().sum()

Symptoms     0
Age          0
Sex          0
Disease      0
Treatment    0
Nature       0
dtype: int64

In [22]:
dsa['Nature'].unique()

array(['high', 'medium', 'low', ' '], dtype=object)

In [23]:
dsa['Symptoms'].sort_values(ascending=True).unique()

array([' coughing', ' coughing ', ' fatigue', ' low energy',
       ' shortness of breath', ' wheezing',
       'A cough that lasts more than three weeks',
       'A dry, crackling sound in the lungs while breathing in',
       'Bluish skin', 'Chest congestion', 'Chest pain',
       'Chest tightness or chest pain', 'Chills', 'Coughing up blood',
       'Coughing up yellow or green mucus daily', 'Daytime sleepiness',
       'Difficulties with memory and concentration', 'Dry mouth',
       'Fatigue', 'Fatigue, feeling run-down or tired',
       'Feeling run-down or tired', 'Fever', 'Frequently waking',
       'Headache', 'Loss of appetite',
       'Loss of appetite and unintentional weight loss',
       'Low-grade fever', 'Morning headaches', 'Nasal congestion',
       'Nausea', 'Night sweats', 'Pauses in breathing',
       'Persistent dry coug', 'Persistent dry cough', 'Rapid breathing',
       'Rapid heartbeat\n', 'Runny nose', 'Shortness of breath',
       'Shortness of breath ',
    

In [24]:
dsa[dsa.Symptoms.str.startswith(" ")].Symptoms.unique()

array([' coughing ', ' wheezing', ' shortness of breath', ' coughing',
       ' low energy', ' fatigue'], dtype=object)

In [25]:
dsa['Symptoms'] = dsa['Symptoms'].replace(' coughing ', 'coughing')

In [26]:
dsa['Symptoms'] = dsa['Symptoms'].replace(' shortness of breath', 'shortness of breath')

In [27]:
dsa['Symptoms'] = dsa['Symptoms'].replace(' wheezing', 'wheezing')

In [28]:
dsa['Symptoms'] = dsa['Symptoms'].replace(' coughing', 'coughing')

In [29]:
dsa['Symptoms'] = dsa['Symptoms'].replace(' low energy', 'low energy')

In [30]:
dsa['Symptoms'] = dsa['Symptoms'].replace(' fatigue', 'fatigue')
dsa['Symptoms'] = dsa['Symptoms'].replace('whistling sound while breathing', 'whistling sound while you breathe')
dsa['Symptoms'] = dsa['Symptoms'].replace('Fatigue, feeling run-down or tired', 'Fatigue')
dsa['Symptoms'] = dsa['Symptoms'].replace('Loss of appetite and unintentional weight loss', 'Loss of appetite')
dsa['Symptoms'] = dsa['Symptoms'].replace('Rapid heartbeat\n', 'Rapid heartbeat')
dsa['Symptoms'] = dsa['Symptoms'].replace('Shortness of breath that gets worse during flare-ups', 'Shortness of breath')
dsa['Symptoms'] = dsa['Symptoms'].replace('Weight loss from loss of appetite', 'Loss of appetite')
dsa['Symptoms'] = dsa['Symptoms'].replace('shortness of breath', 'Shortness of breath')
dsa['Symptoms'] = dsa['Symptoms'].replace('short of breath', 'Shortness of breath')
dsa['Symptoms'] = dsa['Symptoms'].replace('short, shallow and rapid breathing', 'Rapid heartbeat')
dsa['Symptoms'] = dsa['Symptoms'].replace('allergy', 'Allergy')
dsa['Symptoms'] = dsa['Symptoms'].replace('fatigue', 'Fatigue')
dsa['Symptoms'] = dsa['Symptoms'].replace('nausea', 'Nausea')
dsa['Symptoms'] = dsa['Symptoms'].replace('loss of appetite', 'Loss of appetite')
dsa['Symptoms'] = dsa['Symptoms'].replace('wheezing cough', 'Wheezing')
dsa['Symptoms'] = dsa['Symptoms'].replace('high fever', 'Fever')
dsa['Symptoms'] = dsa['Symptoms'].replace('fever', 'Fever')
dsa['Symptoms'] = dsa['Symptoms'].replace('Persistent dry coug', 'Cough')
dsa['Symptoms'] = dsa['Symptoms'].replace('chest pain', 'Chest congestion')
dsa['Symptoms'] = dsa['Symptoms'].replace('coughing', 'Cough')
dsa['Symptoms'] = dsa['Symptoms'].replace('cough', 'Cough')
dsa['Symptoms'] = dsa['Symptoms'].replace('chronic cough', 'Cough')
dsa['Symptoms'] = dsa['Symptoms'].replace('cough with blood', 'Cough')
dsa['Symptoms'] = dsa['Symptoms'].replace('Morning headaches', 'Headache')
dsa['Symptoms'] = dsa['Symptoms'].replace('pain', 'joint pain')
dsa['Symptoms'] = dsa['Symptoms'].replace('yellow cough', 'Cough')
dsa['Symptoms'] = dsa['Symptoms'].replace('Coughing up blood', 'Cough')

In [31]:
dsa['Symptoms'].sort_values(ascending=True).unique()

array(['A cough that lasts more than three weeks',
       'A dry, crackling sound in the lungs while breathing in',
       'Allergy', 'Bluish skin', 'Chest congestion', 'Chest pain',
       'Chest tightness or chest pain', 'Chills', 'Cough',
       'Coughing up yellow or green mucus daily', 'Daytime sleepiness',
       'Difficulties with memory and concentration', 'Dry mouth',
       'Fatigue', 'Feeling run-down or tired', 'Fever',
       'Frequently waking', 'Headache', 'Loss of appetite',
       'Low-grade fever', 'Nasal congestion', 'Nausea', 'Night sweats',
       'Pauses in breathing', 'Persistent dry cough', 'Rapid breathing',
       'Rapid heartbeat', 'Runny nose', 'Shortness of breath',
       'Shortness of breath ', 'Snoring', 'Sore throat',
       'Unusual moodiness', 'Wheezing',
       'Wider and rounder than normal fingertips and toes', 'breath',
       'cold', 'diarrhea', 'distressing', 'dizziness', 'dry cough',
       'edema', 'fainting', 'faster heart beating', 'greenish

In [32]:
dsa['Treatment'].sort_values(ascending=True).unique()

array(['Adaptive servo-ventilation', 'Antibiotic', 'Antibiotics',
       'Antibiotics.', 'Chemotherapy', 'Cough medicine', 'Diuretics',
       'Intrapulmonary Percussive Ventilation', 'Intravenous fluids',
       'Mepolizumab', 'Omalizumab', 'Oseltamivir',
       'Pulmonary rehabilitation', 'Surgery', 'antibiotics', 'aspirin',
       'consult a doctor', 'consult doctor', 'ethambutol',
       'hypertonic saline', 'inhaler', 'inhealer',
       'isotonic sodium chloride solution', 'itraconazole', 'oxygen',
       'oxyzen', 'pyrazinamide', 'rifampin', 'saline nose drops',
       'stay away from cold places', 'steroids to reduce inflammation',
       'surgery', 'x-ray'], dtype=object)

In [33]:
dsa['Disease'].sort_values(ascending=True).unique()

array(['Acute Respiratory Distress Syndrome', 'Asbestosis',
       'Aspergillosis', 'Asthma', 'Bronchiectasis', 'Chronic Bronchitis',
       'Chronic cough', 'Influenza', 'Mesothelioma', 'Pneumonia',
       'Pneumothorax', 'Pulmonary hypertension',
       'Respiratory syncytial virus', 'Tuberculosis', 'bronchiolitis',
       'bronchitis', 'chronic obstructive pulmonary disease',
       'sleep apnea'], dtype=object)

In [34]:
dsa[dsa.Symptoms.str.endswith(" ")].Symptoms.unique()

array(['Shortness of breath '], dtype=object)

In [35]:
dsa['Symptoms'] = dsa['Symptoms'].replace('Shortness of breath ', 'shortness of breath')

In [36]:
dsa[dsa.Symptoms.str.startswith(" ")].Symptoms.unique()

array([], dtype=object)

In [37]:
dsa[dsa.Treatment.str.startswith("A") | dsa.Treatment.str.startswith("a")].Treatment.unique()

array(['antibiotics', 'aspirin', 'Antibiotics', 'Antibiotics.',
       'Antibiotic', 'Adaptive servo-ventilation'], dtype=object)

In [38]:
dsa['Treatment'] = dsa['Treatment'].replace('Antibiotics.', 'Antibiotic')
dsa['Treatment'] = dsa['Treatment'].replace('Antibiotics', 'Antibiotic')
dsa['Treatment'] = dsa['Treatment'].replace('antibiotics', 'Antibiotic')

In [39]:
dsa[dsa.Treatment.str.endswith(" ")].Treatment.unique()

array([], dtype=object)

In [40]:
dsa.Nature.unique()

array(['high', 'medium', 'low', ' '], dtype=object)

In [41]:
dsa[dsa.Disease.str.startswith(" ")].Disease.unique()

array([], dtype=object)

In [42]:
dsa[dsa.Disease.str.endswith(" ")].Disease.unique()

array([], dtype=object)

In [43]:
dsa[dsa.Sex.str.endswith(" ")].Sex.unique()

array([], dtype=object)

In [44]:
dsa[dsa.Sex.str.startswith(" ")].Sex.unique()

array([], dtype=object)

In [45]:
label_encoder = LabelEncoder()

In [46]:
dsa['Disease'] = label_encoder.fit_transform(dsa['Disease'])

In [47]:
dsa['Symptoms'] = label_encoder.fit_transform(dsa['Symptoms'])

In [48]:
dsa['Sex'] = label_encoder.fit_transform(dsa['Sex'])

In [49]:
dsa['Treatment'] = label_encoder.fit_transform(dsa['Treatment'])

In [50]:
dsa['Nature'] = label_encoder.fit_transform(dsa['Nature'])

In [51]:
dsa.dtypes

Symptoms       int32
Age          float64
Sex            int32
Disease        int32
Treatment      int32
Nature         int32
dtype: object

In [52]:
X = dsa.drop('Disease', axis=1)
y = dsa['Disease']

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [54]:
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Treinar o modelo
model.fit(X_train, y_train)

In [55]:
y_pred = model.predict(X_test)

# Calcular a acurácia
accuracy = accuracy_score(y_test, y_pred)
print(f'Acurácia: {accuracy:.2f}')

# Imprimir relatório de classificação
print(classification_report(y_test, y_pred))

# Matriz de confusão
conf_matrix = confusion_matrix(y_test, y_pred)

Acurácia: 0.99
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       168
           1       1.00      0.94      0.97       109
           2       1.00      0.95      0.98       108
           3       1.00      0.98      0.99       210
           4       0.99      0.99      0.99       398
           5       1.00      1.00      1.00       419
           6       0.93      1.00      0.97       172
           7       1.00      1.00      1.00       325
           8       1.00      1.00      1.00       647
           9       0.99      0.98      0.99      1276
          10       1.00      1.00      1.00       600
          11       1.00      1.00      1.00       315
          12       1.00      1.00      1.00       154
          13       1.00      1.00      1.00       326
          14       0.99      0.99      0.99       522
          15       0.99      1.00      1.00      1001
          16       1.00      1.00      1.00       801
          17

In [56]:
pd.DataFrame(conf_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,168,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,103,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0
2,0,0,103,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,205,0,0,0,0,0,5,0,0,0,0,0,0,0,0
4,0,0,0,0,393,0,5,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,419,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,172,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,325,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,647,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,7,0,0,1254,0,0,0,0,6,9,0,0


In [57]:
# Definir a grade de parâmetros para ajuste
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Inicializar GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Ajustar o modelo
grid_search.fit(X_train, y_train)

# Melhor modelo encontrado
best_model = grid_search.best_estimator_
print(best_model)

# Avaliar o melhor modelo
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f'Acurácia após ajuste: {accuracy_best:.2f}')
print(classification_report(y_test, y_pred_best))


Fitting 3 folds for each of 108 candidates, totalling 324 fits
RandomForestClassifier(random_state=42)
Acurácia após ajuste: 0.99
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       168
           1       1.00      0.94      0.97       109
           2       1.00      0.95      0.98       108
           3       1.00      0.98      0.99       210
           4       0.99      0.99      0.99       398
           5       1.00      1.00      1.00       419
           6       0.93      1.00      0.97       172
           7       1.00      1.00      1.00       325
           8       1.00      1.00      1.00       647
           9       0.99      0.98      0.99      1276
          10       1.00      1.00      1.00       600
          11       1.00      1.00      1.00       315
          12       1.00      1.00      1.00       154
          13       1.00      1.00      1.00       326
          14       0.99      0.99      0.99       522
     

EXPORTAR MODELO

In [58]:
with open("../api/models/best_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

with open("../api/models/label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)

In [59]:
with open("../api/models/best_model.pkl", "rb") as f:
  best_model = pickle.load(f)

with open("../api/models/label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)

In [63]:
patient_data = ["cough", 5, "female", "Omalizumab", "high"]

patient_df = pd.DataFrame(
        [patient_data], columns=["Symptoms", "Age", "Sex", "Treatment", "Nature"]
    )

# Fazer o pré-processamento usando o LabelEncoder carregado
patient_df["Symptoms"] = label_encoder.transform(patient_df["Symptoms"])
patient_df["Sex"] = label_encoder.transform(patient_df["Sex"])
patient_df["Treatment"] = label_encoder.transform(patient_df["Treatment"])
patient_df["Nature"] = label_encoder.transform(patient_df["Nature"])

ValueError: y contains previously unseen labels: 'cough'