In [6]:
import pandas as pd
from funcoes import remover_missings,escolher_estrategia_imputacao,selecionar_variaveis_lightgbm_var_aleatoria,perfil_base,aplicar_imputacao_treino,aplicar_imputacao_teste
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from typing import List, Tuple
import joblib
import pycaret
from typing import Dict, Tuple

In [7]:
treino = pd.read_csv("treino.csv",sep=",")
treino.shape

(8211, 81)

In [8]:
#Remove colunas que possuem um percentual de valores ausentes (missings) maior ou igual ao valor definido em perc_miss.
#Definimos que variáveis com 50% de valores ausentes serão descartadas.
#Modelos de machine learning precisam de um conjunto de dados completo e representativo. 
#Se muitas variáveis tiverem altos níveis de missings, o modelo pode:
    #Perder generalização ao imputar valores incorretos. 
    #Diminuir a capacidade preditiva, pois pode aprender padrões errados.

treino_pos_miss = remover_missings(treino, perc_miss= 50)

Colunas removidas(38): ['VAR_10', 'VAR_12', 'VAR_14', 'VAR_16', 'VAR_18', 'VAR_21', 'VAR_23', 'VAR_26', 'VAR_27', 'VAR_29', 'VAR_31', 'VAR_36', 'VAR_37', 'VAR_41', 'VAR_42', 'VAR_43', 'VAR_46', 'VAR_47', 'VAR_48', 'VAR_49', 'VAR_50', 'VAR_51', 'VAR_55', 'VAR_56', 'VAR_61', 'VAR_62', 'VAR_63', 'VAR_66', 'VAR_67', 'VAR_68', 'VAR_69', 'VAR_70', 'VAR_71', 'VAR_73', 'VAR_74', 'VAR_75', 'VAR_77', 'VAR_78']


In [9]:
#Estratégia 
#Média: Se a coluna for numérica, sem outliers e missing < 5%.
#Mediana: Se houver outliers e missing entre 5%-20%.
#Mediana: Para outros casos de dados numéricos. (ex: sem outlier e missing alto etc..)

#Utilizar modelos para imputação (criar um modelo que faça previsões dos valores faltantes) aumenta a complexidade, deploy e avaliação.
regra_imputacao = escolher_estrategia_imputacao(treino_pos_miss)

In [10]:
#Não vamos salvar as regras imputação pois esse modelo é apenas para auxiliar na seleção de variáveis
df_imputado, regra_imputacao, dict_mediana, dict_media = aplicar_imputacao_treino(treino_pos_miss,regra_imputacao)

In [16]:
from typing import List, Literal
from pycaret.classification import ClassificationExperiment
from collections import Counter
import pandas as pd

def select_features(
    data: pd.DataFrame,
    target: str,
    methods: List[Literal['classic', 'univariate', 'boruta']],
    selection_rule: Literal['intersection', 'union', 'voting'] = 'intersection'
) -> List[str]:
    """
    Realiza a seleção de variáveis no PyCaret usando diferentes métodos e regras de combinação.

    Parâmetros:
    - data (pd.DataFrame): Dataset contendo as variáveis preditoras e a variável alvo.
    - target (str): Nome da variável alvo.
    - methods (List[str]): Lista com os métodos de seleção a serem aplicados. Opções:
        - 'classic' (RFE - Recursive Feature Elimination)
        - 'univariate' (Testes estatísticos ANOVA/qui-quadrado)
        - 'boruta' (Algoritmo Boruta baseado em Random Forest)
    - selection_rule (str): Método de combinação das variáveis selecionadas. Opções:
        - 'intersection': Mantém apenas as variáveis escolhidas por todos os métodos.
        - 'union': Mantém todas as variáveis selecionadas por pelo menos um método.
        - 'voting': Mantém variáveis selecionadas por pelo menos 2 dos métodos escolhidos.

    Retorno:
    - List[str]: Lista final de variáveis selecionadas.
    """
    
    selected_features_sets = []
    
    for method in methods:
        exp = ClassificationExperiment()  # Inicializa o experimento
        exp.setup(data, target=target, feature_selection=True, 
                  feature_selection_method=method, verbose=False)
        
        # Pegamos as features selecionadas via get_config
        selected_features = exp.get_config("X_train").columns.to_list()
        selected_features_sets.append(set(selected_features))
    
    # Combinação das seleções
    if selection_rule == 'intersection':
        final_features = list(set.intersection(*selected_features_sets))
    elif selection_rule == 'union':
        final_features = list(set.union(*selected_features_sets))
    elif selection_rule == 'voting':
        feature_counts = Counter([feat for features in selected_features_sets for feat in features])
        final_features = [feat for feat, count in feature_counts.items() if count >= 2]
    else:
        raise ValueError("selection_rule deve ser 'intersection', 'union' ou 'voting'")
    
    return final_features



In [17]:
selected_features = select_features(
    data=df_imputado, 
    target='y', 
    methods=['classic', 'univariate', 'boruta'], 
    selection_rule='intersection'
)
print(selected_features)

[LightGBM] [Info] Number of positive: 1621, number of negative: 4126
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000398 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6974
[LightGBM] [Info] Number of data points in the train set: 5747, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.282060 -> initscore=-0.934265
[LightGBM] [Info] Start training from score -0.934265


ValueError: Invalid value for the feature_selection_method parameter, got boruta. Possible values are: 'classic', 'univariate' or 'sequential'.

In [8]:
#A ideia aqui é simples, utilizar um modelo para selecionar variáveis
#Se a variável tiver menos que 10% do valor preditivo da variável mais importante, então descartamos.
#Nosso dataset é pequeno 78 variáveis, talvez nem haja necessidade desse passo, mas é um bom momento para selecionar variáveis. 
df_selecionado, variaveis_mantidas, variaveis_descartadas = selecionar_variaveis_lightgbm_var_aleatoria(
                                                                                            df=df_imputado,
                                                                                            target = 'y',
                                                                                            id_column="id",
                                                                                            ignore_features=["safra"],
                                                                                            percentual_corte= 0.10,
                                                                                            random_var=False,
                                                                                            session_id= 1234)


❌ Variáveis Descartadas:
    Nome_Variavel  Importancia_Variavel
21        VAR_33                    72
15        VAR_22                    68
3          VAR_4                    56
20        VAR_32                    40
2          VAR_3                    37
36        VAR_64                     4


In [9]:
#Agora vamos retornar a base de treino original e Salvar a base de treino_selecionada (após seleção de variáveis)
treino_selecionado = treino[['id','safra','y']+variaveis_mantidas['Nome_Variavel'].tolist()]

In [10]:
treino_selecionado.head()

Unnamed: 0,id,safra,y,VAR_1,VAR_72,VAR_53,VAR_65,VAR_6,VAR_17,VAR_30,...,VAR_58,VAR_39,VAR_20,VAR_40,VAR_44,VAR_25,VAR_60,VAR_2,VAR_28,VAR_19
0,1,201404,0,0.0,151.0,3380.0,1303.79,3277.0,3000.0,500.0,...,369.0,348.0,8,9.0,11.0,12.0,-0.030478,0.0,12.0,45.0
1,2,201407,0,64.0,187.0,1000.0,1486.26,2443.0,,1000.0,...,228.0,179.0,12,5.0,9.0,9.0,0.357324,0.0,10.0,18.0
2,3,201405,0,99.0,96.0,1893.35,800.27,1824.0,,1893.35,...,264.0,,12,4.0,12.0,3.0,-0.411787,2.0,5.0,2.0
3,5,201403,1,0.0,75.0,2946.29,1457.78,437.0,,2000.0,...,332.0,,12,0.0,0.0,0.0,-0.231735,0.0,0.0,0.0
4,6,201405,0,61.0,5.0,400.0,390.54,140.0,,400.0,...,,,10,0.0,0.0,0.0,-0.354108,0.0,0.0,0.0


In [11]:
#Verificando nossa base após seleção de variáveis
resultado = perfil_base(treino_selecionado, id_col='id', target_col='y', safra_col='safra')

Calcula métricas básicas do perfil da base de dados.
Shape da base: Essa base possui 8211 linhas e 37 colunas
Tipos de variáveis: {dtype('float64'): 32, dtype('int64'): 5}
IDs únicos: 8211
Taxa de maus (bad rate): Bons: 5895(71.8 %), Maus: 2316 (28.2%)
Volumetria das safras: {201401: 854, 201402: 898, 201403: 873, 201404: 955, 201405: 972, 201406: 902, 201407: 968, 201408: 912, 201409: 877}




In [17]:
#Salvando a base de treino_selecionada
treino_selecionado.to_csv("treino_selecionada.csv",sep=",",index=False,header=True)

In [18]:
treino_selecionado

Unnamed: 0,id,safra,y,VAR_1,VAR_72,VAR_53,VAR_65,VAR_6,VAR_17,VAR_30,...,VAR_58,VAR_39,VAR_20,VAR_40,VAR_44,VAR_25,VAR_60,VAR_2,VAR_28,VAR_19
0,1,201404,0,0.0,151.0,3380.00,1303.79,3277.0,3000.0,500.00,...,369.0,348.0,8,9.0,11.0,12.0,-0.030478,0.0,12.0,45.0
1,2,201407,0,64.0,187.0,1000.00,1486.26,2443.0,,1000.00,...,228.0,179.0,12,5.0,9.0,9.0,0.357324,0.0,10.0,18.0
2,3,201405,0,99.0,96.0,1893.35,800.27,1824.0,,1893.35,...,264.0,,12,4.0,12.0,3.0,-0.411787,2.0,5.0,2.0
3,5,201403,1,0.0,75.0,2946.29,1457.78,437.0,,2000.00,...,332.0,,12,0.0,0.0,0.0,-0.231735,0.0,0.0,0.0
4,6,201405,0,61.0,5.0,400.00,390.54,140.0,,400.00,...,,,10,0.0,0.0,0.0,-0.354108,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8206,10734,201406,0,44.0,-67.0,500.00,1004.50,515.0,,500.00,...,467.0,,10,0.0,0.0,0.0,-0.411787,0.0,0.0,0.0
8207,10735,201407,0,0.0,404.0,5524.65,,1029.0,,1724.65,...,39.0,,12,7.0,10.0,0.0,-0.133344,0.0,0.0,0.0
8208,10736,201403,1,169.0,7.0,3759.03,599.81,819.0,2000.0,2759.03,...,442.0,418.0,12,1.0,3.0,0.0,-0.030478,3.0,0.0,0.0
8209,10737,201402,1,46.0,6.0,2000.00,,1740.0,1000.0,1000.00,...,869.0,94.0,12,5.0,8.0,8.0,-0.411787,0.0,11.0,6.0
