##### Criando modelo Baseline

##### Importando bibliotecas para o projeto

In [None]:
!pip install category_encoders
!pip install pycaret

In [None]:
import numpy             as np
import matplotlib.pyplot as plt
import pandas            as pd
import seaborn           as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, accuracy_score, precision_score, recall_score, auc
from sklearn.linear_model import LogisticRegression

##### Funções que iremos utilizar para metrificar os modelos

In [None]:
# Métricas do modelo

def _ks_statistic(y_true, y_score):
    """Return the Kolmogorov-Smirnov statistic of a binary scorer.

    KS is the largest gap between the cumulative positive rate (TPR) and the
    cumulative negative rate (FPR) over all score thresholds. It is read off
    the ROC curve so that rows sharing the same score are treated as a single
    threshold; accumulating row by row would make the result depend on the
    arbitrary order of tied rows.
    """
    # drop_intermediate=False keeps every distinct threshold on the curve.
    fpr, tpr, _ = roc_curve(y_true, y_score, drop_intermediate=False)
    return float(np.max(np.abs(tpr - fpr)))


def _split_metrics(model, X, y):
    """Compute all evaluation metrics of `model` on a single (X, y) split."""
    y_pred = model.predict(X)
    # Score the split once and reuse the probabilities for AUC, Gini and KS.
    y_prob = model.predict_proba(X)[:, 1]
    auc_roc = roc_auc_score(y, y_prob)
    return {
        'Acuracia': accuracy_score(y, y_pred),
        # zero_division=0 keeps the value sklearn already returns when the model
        # predicts no positives, without emitting an UndefinedMetricWarning.
        'Precisao': precision_score(y, y_pred, zero_division=0),
        'Recall': recall_score(y, y_pred),
        'AUC_ROC': auc_roc,
        'GINI': 2 * auc_roc - 1,
        'KS': _ks_statistic(y, y_prob),
    }


def calculate_metrics(nm_modelo, model, X_train, y_train, X_test, y_test):
    """Evaluate a fitted binary classifier on the train and test sets.

    Parameters
    ----------
    nm_modelo : str
        Label written to the 'Algoritmo' column.
    model : fitted estimator
        Must expose `predict` and `predict_proba`; column 1 of `predict_proba`
        is used as the score of the positive class.
    X_train, y_train, X_test, y_test
        Features and 0/1 labels of each split.

    Returns
    -------
    pandas.DataFrame
        Two rows ('Treino', 'Teste') with the columns Algoritmo, Conjunto,
        Acuracia, Precisao, Recall, AUC_ROC, GINI and KS.
    """
    rows = []
    for conjunto, X, y in (('Treino', X_train, y_train), ('Teste', X_test, y_test)):
        row = {'Algoritmo': nm_modelo, 'Conjunto': conjunto}
        row.update(_split_metrics(model, X, y))
        rows.append(row)

    return pd.DataFrame(
        rows,
        columns=['Algoritmo', 'Conjunto', 'Acuracia', 'Precisao', 'Recall', 'AUC_ROC', 'GINI', 'KS'],
    )

In [None]:
def plot_metrics(model, X_train, y_train, X_test, y_test, features):
    """Draw a 4x2 diagnostic panel for a fitted binary classifier.

    Panels: event rate, top-10 feature importances, ROC curves, Gini/KS bars,
    confusion matrices (train and test) and score histograms (train and test).

    Parameters
    ----------
    model : fitted estimator
        Must expose `predict` and `predict_proba`. `feature_importances_` is
        used when available; otherwise the importance panel shows a notice.
    X_train, y_train, X_test, y_test
        Features and 0/1 labels of each split.
    features : sequence of str
        Feature names in the same order as the columns seen by the model
        (list, numpy array or pandas Index).
    """
    splits = [(X_train, y_train, 'Treino'), (X_test, y_test, 'Teste')]
    # Score every split once; the probabilities are reused by several panels.
    scores = {label: model.predict_proba(X)[:, 1] for X, _, label in splits}

    plt.figure(figsize=(15, 18))

    # Event rate
    plt.subplot(4, 2, 1)
    event_rate = y_train.mean()
    non_event_rate = 1 - event_rate
    plt.barh(['Não Evento', 'Evento'], [non_event_rate, event_rate], color=['lightsteelblue', 'skyblue'])
    plt.xlabel('Taxa')
    plt.title('Taxa do Evento')

    # Top-10 feature importances
    plt.subplot(4, 2, 2)
    importances = getattr(model, 'feature_importances_', None)
    if importances is not None:
        importances = np.asarray(importances)
        # np.asarray allows `features` to be a plain list as well as an Index/array.
        feature_names = np.asarray(features)
        top = np.argsort(importances)[-10:]  # ascending order, 10 largest last
        plt.barh(feature_names[top], importances[top], color='skyblue')
    else:
        # Estimators such as LogisticRegression have no feature_importances_;
        # show a notice instead of aborting the whole figure.
        plt.text(0.5, 0.5, 'Importância indisponível para este modelo',
                 ha='center', va='center', transform=plt.gca().transAxes)
    plt.xlabel('Importância')
    plt.title('Importância das Variáveis')

    # ROC curves for train and test
    plt.subplot(4, 2, 3)
    for _, y, label in splits:
        fpr, tpr, _ = roc_curve(y, scores[label])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f'{label} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Curva ROC')
    plt.legend(loc='lower right')

    # Gini and KS bars
    plt.subplot(4, 2, 4)
    metrics = ['Índice Gini', 'Estatística KS']
    for _, y, label in splits:
        # KS is the largest |TPR - FPR| over the distinct thresholds of the ROC
        # curve, so rows with tied scores cannot distort it.
        fpr, tpr, _ = roc_curve(y, scores[label], drop_intermediate=False)
        gini = 2 * auc(fpr, tpr) - 1
        ks = np.max(np.abs(tpr - fpr))
        plt.bar([f'{label}\n{metric}' for metric in metrics], [gini, ks], color='skyblue')
    plt.title('Índice Gini e Estatística KS')

    # Confusion matrices for train and test (panels 5 and 6)
    for i, (X, y, label) in enumerate(splits, 5):
        plt.subplot(4, 2, i)
        conf_matrix = confusion_matrix(y, model.predict(X))
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar_kws={'label': 'Count'})
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.title(f'Matriz de Confusão - {label}')

    # Score histograms for train and test
    plt.subplot(4, 2, 7)
    plt.hist(scores['Treino'], bins=30, color='skyblue', edgecolor='black', alpha=0.7, label='Treino')
    plt.xlabel('Score')
    plt.ylabel('Frequência')
    plt.title('Histograma do Score - Treino')
    plt.legend(loc='upper right')

    plt.subplot(4, 2, 8)
    plt.hist(scores['Teste'], bins=30, color='lightsteelblue', edgecolor='black', alpha=0.7, label='Teste')
    plt.xlabel('Score')
    plt.ylabel('Frequência')
    plt.title('Histograma do Score - Teste')
    plt.legend(loc='upper right')

    plt.tight_layout()
    plt.show()

def custom_fillna(df, ignore_column='class'):
    """Fill missing values of the numeric columns with each column's mean.

    The frame is modified in place and also returned, together with the means
    that were used, so the same values can be applied to other datasets.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to impute; its float64/float32/int64/int32 columns are filled.
    ignore_column : str, default 'class'
        Column that is never imputed.

    Returns
    -------
    tuple (pandas.DataFrame, dict)
        The same `df` object and a {column name: mean} dictionary.
    """
    numerical_cols = df.select_dtypes(include=['float64', 'float32', 'int64', 'int32']).columns
    means = {}

    for col in numerical_cols:
        if col == ignore_column:
            continue
        means[col] = df[col].mean()
        # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
        # the chained in-place form is deprecated and does not update `df`
        # when pandas Copy-on-Write is active.
        df[col] = df[col].fillna(means[col])

    return df, means


def custom_fillna_prod(df, means):
    """Fill missing values using previously computed column means.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to impute; it must contain every column listed in `means`.
    means : dict
        {column name: fill value}, as returned by `custom_fillna`.

    Returns
    -------
    pandas.DataFrame
        The same `df` object with the listed columns filled.

    Raises
    ------
    KeyError
        If a column named in `means` is missing from `df`.
    """
    for col, mean_value in means.items():
        # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
        # the chained in-place form is deprecated and does not update `df`
        # when pandas Copy-on-Write is active.
        df[col] = df[col].fillna(mean_value)

    return df

# Show floats in fixed-point notation (no scientific notation) with four decimal
# places. A '%.0f' format would round every metric in the [0, 1] range to 0 or 1
# when the result tables are displayed.
pd.set_option('display.float_format', lambda x: '%.4f' % x)

# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Show every row when a DataFrame is displayed
pd.set_option('display.max_rows', None)

##### Leitura dos Dados

In [None]:
# Read the previous-years file (training) and the present-year file (final test)
df_train, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ('air_system_previous_years.csv', 'air_system_present_year.csv')
)

##### Visão Inicial da Base

In [None]:
df_train.head()

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,76698,na,2130706438,280,0,0,0,0,0,...,1240520,493384,721044,469792,339156,157956,73224,0,0,0
1,neg,33058,na,0,na,0,0,0,0,0,...,421400,178064,293306,245416,133654,81140,97576,1500,0,0
2,neg,41040,na,228,100,0,0,0,0,0,...,277378,159812,423992,409564,320746,158022,95128,514,0,0
3,neg,12,0,70,66,0,10,0,0,0,...,240,46,58,44,10,0,0,0,4,32
4,neg,60874,na,1368,458,0,0,0,0,0,...,622012,229790,405298,347188,286954,311560,433954,1218,0,0


In [None]:
df_train.shape

(60000, 171)

In [None]:
df_train['class'].value_counts()

class
neg    59000
pos     1000
Name: count, dtype: int64

##### Tratamento Inicial da Base

In [None]:
# Turn the literal 'na' placeholder into a real NaN in both datasets (in place)
for _raw_frame in (df_train, df_test):
    _raw_frame.replace('na', np.nan, inplace=True)

In [None]:
def convert_columns_to_float(df, ignore_column='class'):
    """Cast every column except `ignore_column` to float.

    The frame is modified in place and the same object is returned.
    """
    columns_to_cast = [name for name in df.columns if name != ignore_column]
    for name in columns_to_cast:
        df[name] = df[name].astype(float)
    return df

# Cast the feature columns of both datasets to float. The conversion happens in
# place, so each *_01 name refers to the same object as its source frame.
df_train_01, df_test_01 = map(convert_columns_to_float, (df_train, df_test))



In [None]:
# Encode the target as integers ('neg' -> 0, 'pos' -> 1) without changing the class counts.
# map() plus an explicit cast is used instead of replace(), whose implicit
# object -> int downcasting is deprecated in pandas 2.2. The 0/1 entries make the
# cell safe to re-run; any unexpected label becomes NaN and makes the cast fail
# loudly instead of leaking a string label into the models.
class_codes = {'neg': 0, 'pos': 1, 0: 0, 1: 1}
df_train_01['class'] = df_train_01['class'].map(class_codes).astype('int64')
df_test_01['class'] = df_test_01['class'].map(class_codes).astype('int64')

##### Split da base de treino

In [None]:
# Hold out 30% of the previous-years data for validation. The split is stratified
# on the target because positives are under 2% of the rows; stratifying keeps the
# event rate the same in both parts.
abt_train, abt_test = train_test_split(
    df_train_01,
    test_size=0.3,
    random_state=13,
    stratify=df_train_01['class'],
)

In [None]:
abt_train.shape, abt_test.shape

((42000, 171), (18000, 171))

In [None]:
# Compute the column means on the training split only and fill its missing values.
# `means` is reused below to impute the other datasets; displaying it shows the
# fill value chosen for each column.
abt_train_01, means = custom_fillna(abt_train)
means

{'aa_000': 59514.002095238095,
 'ab_000': 0.6886960158945937,
 'ac_000': 354947711.97530365,
 'ad_000': 272466.82924047025,
 'ae_000': 6.48460122394149,
 'af_000': 10.673615602766306,
 'ag_000': 170.01497460095817,
 'ag_001': 957.6014156053639,
 'ag_002': 8497.247273515179,
 'ag_003': 87594.38076895298,
 'ag_004': 435420.6345667718,
 'ag_005': 1113559.7424946434,
 'ag_006': 1668979.03512531,
 'ag_007': 500796.1512868045,
 'ag_008': 34780.73852228134,
 'ag_009': 4975.882851433662,
 'ah_000': 1815174.4951119672,
 'ai_000': 9220.448194511315,
 'aj_000': 1341.012469908522,
 'ak_000': 1222.6525151164287,
 'al_000': 57694.908985842245,
 'am_0': 91273.0911892152,
 'an_000': 3477134.300539343,
 'ao_000': 3013413.564221618,
 'ap_000': 1008051.9598381971,
 'aq_000': 446230.2016022325,
 'ar_000': 0.5092897812406353,
 'as_000': 142.3300914780934,
 'at_000': 4512.01020702937,
 'au_000': 238.4363023591719,
 'av_000': 1097.2673947113112,
 'ax_000': 374.5229115876412,
 'ay_000': 11850.38280046225,
 'a

In [None]:
# Impute the hold-out split and the present-year data with the means computed on
# the training split.
abt_test_01 = custom_fillna_prod(abt_test, means)
df_test_02 = custom_fillna_prod(df_test_01, means)

##### Treinamento do modelo Baseline

In [None]:
# Separate the predictors from the target for the training and hold-out splits
X_train = abt_train_01.drop('class', axis=1)
y_train = abt_train_01['class']
X_test = abt_test_01.drop('class', axis=1)
y_test = abt_test_01['class']

# Candidate baseline models, all with a fixed seed for reproducibility

algoritmos = [DecisionTreeClassifier(criterion='gini',random_state=0),
              LogisticRegression(solver='liblinear',random_state=0),
              RandomForestClassifier(random_state=0),
              GradientBoostingClassifier(random_state=0),
              XGBClassifier(random_state=0),
              # verbose=-1 silences LightGBM's training log
              lgb.LGBMClassifier(random_state=0, verbose=-1)]


for algoritmo in algoritmos:

    # The class name is more robust than slicing the estimator's repr at "("
    nome_algoritmo = type(algoritmo).__name__

    # Fit the model
    algoritmo.fit(X_train,y_train)

    # Evaluate the model on both splits
    metricas = calculate_metrics(nome_algoritmo,algoritmo, X_train, y_train, X_test, y_test)
    display(metricas)

Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,DecisionTreeClassifier,Treino,1,1,1,1,1,1
1,DecisionTreeClassifier,Teste,1,1,1,1,1,1




Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,LogisticRegression,Treino,1,0,0,0,0,0
1,LogisticRegression,Teste,1,0,0,0,0,0


Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,RandomForestClassifier,Treino,1,1,1,1,1,1
1,RandomForestClassifier,Teste,1,1,1,1,1,1


Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,GradientBoostingClassifier,Treino,1,1,1,1,1,1
1,GradientBoostingClassifier,Teste,1,1,1,1,1,1


Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,XGBClassifier,Treino,1,1,1,1,1,1
1,XGBClassifier,Teste,1,1,1,1,1,1


[LightGBM] [Info] Number of positive: 700, number of negative: 41300
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.045224 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 38955
[LightGBM] [Info] Number of data points in the train set: 42000, number of used features: 169
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.016667 -> initscore=-4.077537
[LightGBM] [Info] Start training from score -4.077537


Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,LGBMClassifier,Treino,1,1,1,1,1,1
1,LGBMClassifier,Teste,1,1,1,1,1,1
