In [1]:
import math

import numpy  as np
import pandas as pd
import seaborn as sns

from lightgbm import LGBMClassifier
from matplotlib import pyplot as plt
from sklearn  import linear_model    as lm
from sklearn  import model_selection as ms
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, log_loss
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier

In [54]:
# Read the two raw CSV exports: the previous-years file (later used to fit the
# model) and the present-year file (later used only for evaluation).
train_csv_path = 'datasets/air_system_previous_years.csv'
test_csv_path = 'datasets/air_system_present_year.csv'

df0 = pd.read_csv(train_csv_path)
df_test0 = pd.read_csv(test_csv_path)

In [55]:
# Encode the target column: 'neg' -> 0, 'pos' -> 1.
CLASS_TO_INT = {'neg': 0, 'pos': 1}


def encode_class(labels):
    """Return ``labels`` with 'neg'/'pos' replaced by the integers 0/1.

    Values that are already 0 or 1 are passed through unchanged, so running
    this cell a second time is harmless (a plain ``.map(dict)`` would turn the
    already-encoded labels into NaN).

    Raises:
        ValueError: if any value other than 'neg', 'pos', 0 or 1 (including
            a missing value) is present.
    """
    encoded = labels.map(lambda value: CLASS_TO_INT.get(value, value))
    unexpected = set(encoded.unique()) - {0, 1}
    if unexpected:
        raise ValueError(
            f"Unexpected values in 'class' column: {sorted(unexpected, key=str)}"
        )
    return encoded.astype('int64')


df0['class'] = encode_class(df0['class'])
df_test0['class'] = encode_class(df_test0['class'])

# Split off the target variable.
y = df0['class']
y_test = df_test0['class']

In [68]:

def _to_numeric_features(frame):
    """Drop the 'class' column and parse every remaining column as numeric.

    The literal string 'na' is treated as a missing value before parsing.
    """
    features = frame.drop(columns='class')
    return features.replace('na', np.nan).apply(pd.to_numeric)


X = _to_numeric_features(df0)
X_test = _to_numeric_features(df_test0)

# Impute missing values with the training-set column means; the test set is
# filled with the training means as well, never with its own.
X = X.fillna(X.mean())
X_test = X_test.fillna(X.mean())

# Rescale every feature with a min-max scaler fitted on the training set only;
# the test set is transformed with the training ranges.
mms = MinMaxScaler()
X_scaled = mms.fit_transform(X)
X_test_scaled = mms.transform(X_test)

# Apply PCA. A float n_components in (0, 1) asks scikit-learn to keep as many
# components as needed to explain that fraction of the variance.
PCA_EXPLAINED_VARIANCE = 0.95
pca = PCA(n_components=PCA_EXPLAINED_VARIANCE)
X_pca = pca.fit_transform(X_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Hyper-parameters of the final XGBoost classifier.
# NOTE(review): presumably the output of an earlier hyper-parameter search — confirm.
xgb_params = dict(
    n_estimators=226,
    max_depth=8,
    min_child_weight=8,
    eta=0.07036202197513876,
    subsample=0.388820765999555,
    colsample_bytree=0.8587727056650034,
    random_state=42,
)
xgb_final = XGBClassifier(**xgb_params)

# Fit on the PCA-projected training features.
xgb_final.fit(X_pca, y)

# Cut-off applied to the predicted probability of class 1.
# NOTE(review): this literal equals the "Melhor Threshold" value printed by the
# cost search in the following cell, which is selected using y_test — so the
# thresholded metrics below are likely optimistic. Confirm with a validation split.
DECISION_THRESHOLD = 0.010101010101010102

# Column 1 of predict_proba holds the probability of the positive class.
y_pred_proba = xgb_final.predict_proba(X_test_pca)[:, 1]
y_pred_binary = (y_pred_proba > DECISION_THRESHOLD).astype(int)

# Threshold-dependent metrics use the binary predictions; ROC AUC uses the
# raw probabilities.
conf_matrix = confusion_matrix(y_test, y_pred_binary)
precision = precision_score(y_test, y_pred_binary)
recall = recall_score(y_test, y_pred_binary)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f'Área sob a curva ROC: {roc_auc:.4f}')
print(f'Precisão: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print('Matriz de Confusão:')
print(conf_matrix)

Área sob a curva ROC: 0.9891
Precisão: 0.2800
Recall: 0.9840
Matriz de Confusão:
[[14676   949]
 [    6   369]]


In [69]:
# Unit cost charged for each confusion-matrix outcome.
# NOTE(review): presumably taken from the problem statement — confirm.
COST_FALSE_NEGATIVE = 500
COST_TRUE_POSITIVE = 25
COST_FALSE_POSITIVE = 10

# Candidate thresholds to evaluate.
thresholds = np.linspace(0, 1, 100)

# NOTE(review): the search below scores each threshold on y_test, so the
# selected threshold is tuned on the evaluation data — consider a validation split.
best_threshold = None
min_cost = float('inf')

for threshold in thresholds:
    # Use a loop-local name so the `y_pred_binary` produced by the evaluation
    # cell is not overwritten with the predictions of the last threshold tried.
    candidate_pred = (y_pred_proba > threshold).astype(int)

    # One confusion matrix per threshold. labels=[0, 1] pins the 2x2 layout
    # even when every prediction falls in a single class; ravel() yields
    # (tn, fp, fn, tp).
    _, fp, fn, tp = confusion_matrix(y_test, candidate_pred, labels=[0, 1]).ravel()

    # Total cost of this threshold.
    total_cost = (
        fn * COST_FALSE_NEGATIVE
        + tp * COST_TRUE_POSITIVE
        + fp * COST_FALSE_POSITIVE
    )

    # Strict '<' keeps the first (lowest) threshold among ties.
    if total_cost < min_cost:
        min_cost = total_cost
        best_threshold = threshold

print("Melhor Threshold:", best_threshold)
print("Custo Mínimo:", min_cost)

Melhor Threshold: 0.010101010101010102
Custo Mínimo: 21715
