In [24]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from imblearn.combine import SMOTETomek
import joblib


In [42]:

data0 = pd.read_csv("/content/drive/MyDrive/colab/BaseCovid/lbp-train-fold_0.csv")
data1 = pd.read_csv("/content/drive/MyDrive/colab/BaseCovid/lbp-train-fold_1.csv")
data2 = pd.read_csv("/content/drive/MyDrive/colab/BaseCovid/lbp-train-fold_2.csv")
data3 = pd.read_csv("/content/drive/MyDrive/colab/BaseCovid/lbp-train-fold_3.csv")
data4 = pd.read_csv("/content/drive/MyDrive/colab/BaseCovid/lbp-train-fold_4.csv")

df = pd.concat([data0, data1, data2, data3, data4], ignore_index=True)

df = df.dropna()
X = df.drop('class', axis=1)
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Normalizar os dados com MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Supondo que você tenha um scaler
scaler = MinMaxScaler()
scaler.fit(X_train)

joblib.dump(scaler, '/content/drive/MyDrive/colab/BaseCovid/minmax_scaler.joblib')

# Aplicar SMOTETomek com taxa aumentada para classes minoritárias
smote_tomek = SMOTETomek(sampling_strategy='auto', random_state=42)
X_train_balanced, y_train_balanced = smote_tomek.fit_resample(X_train, y_train)

print("Distribuição de classes após SMOTETomek:\n", pd.Series(y_train_balanced).value_counts())

# Treinar o modelo com peso para classes minoritárias
clf = RandomForestClassifier(class_weight='balanced_subsample', random_state=42)
clf.fit(X_train_balanced, y_train_balanced)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Acurácia: {accuracy:.2f}")

report = classification_report(y_test, y_pred, zero_division=1)
print("Relatório de Classificação:")
print(report)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Matriz de Confusão:")
print(conf_matrix)

model_filename = '/content/drive/MyDrive/colab/BaseCovid/random_forest_model_smote_tomek_normalized.joblib'
joblib.dump(clf, model_filename)
print(f"Modelo salvo em: {model_filename}")


Distribuição de classes após SMOTETomek:
 class
R/Normal                                            560
R/Pneumonia/Acellular/Viral/Coronavirus/COVID-19    560
R/Pneumonia/Celullar/Bacterial/Streptococcus        560
R/Pneumonia/Acellular/Viral/Coronavirus/SARS        560
R/Pneumonia/Acellular/Viral/Varicella               560
R/Pneumonia/Acellular/Viral/Coronavirus/MERS        560
R/Pneumonia/Celullar/Fungus/Pneumocystis            560
Name: count, dtype: int64
Acurácia: 0.96
Relatório de Classificação:
                                                  precision    recall  f1-score   support

                                        R/Normal       0.97      0.99      0.98       140
R/Pneumonia/Acellular/Viral/Coronavirus/COVID-19       0.85      0.85      0.85        13
    R/Pneumonia/Acellular/Viral/Coronavirus/MERS       1.00      0.00      0.00         1
    R/Pneumonia/Acellular/Viral/Coronavirus/SARS       1.00      0.50      0.67         2
           R/Pneumonia/Acellular/Viral/V