# **Construção do ATOp-PredictiveModel**
Este notebook organiza o código gerado para a análise da autonomia de uma plataforma naval da Marinha do Brasil, utilizando dados coletados entre 2013 e 2017.

## **Ambiente e Bibliotecas**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import os

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    VotingRegressor,
    StackingRegressor
)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_squared_error, root_mean_squared_error

import ImbalancedLearningRegression as iblr
import optuna
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler

from itertools import combinations
from itertools import permutations
import time
from sklearn.model_selection import RepeatedKFold

from scipy.stats import friedmanchisquare
import scikit_posthocs as sp

## **Integração de Dados Históricos**

### **Carregamento dos Dados**

In [None]:
# Base directory holding the raw CSV files. It can be overridden with the
# ATOP_DATA_DIR environment variable; the default is the original location.
DATA_DIR = os.environ.get('ATOP_DATA_DIR', 'C:/Users/12078956/Documents/Projeto/Dados')

# Load the SINGRA dataset (PERIODO parsed as datetime, numeric columns as float64)
df_Singra = pd.read_csv(f'{DATA_DIR}/SINGRA/RMC_MEIO1.csv',
                        sep=',', header=0, parse_dates=['PERIODO'],
                        dtype={'QTDE_ITENS_RM': np.float64, 'PRECO_UNITARIO_VENDA': np.float64})
# Load the RFC dataset (column dtypes are inferred by pandas)
df_RFC = pd.read_csv(f'{DATA_DIR}/RFC/RFC.csv',
                     sep=',', header=0)

### **Exploração dos Dados**

In [None]:
# Data cleaning and standardisation of the SINGRA records

# Remove consumable items outside the scope of work
fora_do_escopo = df_Singra['NOME_PORTUGUES'].isin(['REFEIÇÃO INDIVIDUAL', 'RAÇÃO DE SOBREVIVÊNCIA'])
df_Singra = df_Singra.drop(index=df_Singra.index[fora_do_escopo])

# Standardization of the Supply Unit (UF)
print('UF antes:')
print(df_Singra['UF'].value_counts())
print('\n')

# Factor that converts a quantity expressed in each packaging unit into the
# base unit (kilogram or liter). Units that are not listed keep their quantity.
FATOR_CONVERSAO_UF = {
    'PACOTE': 0.5,                          # 1 pack of coffee = 0.5 kilogram
    'GARRAFA': 0.9,                         # 1 bottle of vegetable oil = 0.9 liter
    'QUARTO DE GALAO AMERICANO': 0.946353,  # 1 US quarter gallon of lubricant = 0.946353 liter
    'TAMBOR': 200.0,                        # 1 drum of lubricant = 200 liters
    'GALAO': 3.785412,                      # 1 gallon of lubricant = 3.785412 liters
    'BALDE': 20.0,                          # 1 bucket = 20 liters (lubricant) / 20 kilograms (grease)
    'LATA': 5.0,                            # 1 can of grease = 5 kilograms
}

# A single vectorized pass replaces one row-wise apply() per unit
fator = df_Singra['UF'].map(FATOR_CONVERSAO_UF).fillna(1.0)
df_Singra['QTDE_ITENS_RM'] = df_Singra['QTDE_ITENS_RM'] * fator

# Change the unit description of the converted food items
df_Singra['UF'] = df_Singra['UF'].replace({'PACOTE': 'QUILOGRAMA', 'GARRAFA': 'LITRO'})

# Change the unit description for Lubricant and Grease
df_Singra.loc[df_Singra['DESCRICAO_CLG'] == 'LUBRIFICANTE', 'UF'] = 'LITRO'
df_Singra.loc[df_Singra['DESCRICAO_CLG'] == 'GRAXA', 'UF'] = 'QUILOGRAMA'

print('UF depois:')
print(df_Singra['UF'].value_counts())

In [None]:
# Keep only the records of the fuel jurisdiction group (COMBUSTIVEIS)
df_CLG = df_Singra[df_Singra['GRUPO_JURISDICAO'] == "COMBUSTIVEIS"]

In [None]:
# Frequency of each DESCRICAO_CLG value
df_CLG.DESCRICAO_CLG.value_counts()

In [None]:
# Remove consumable items outside the scope of work (aviation fuel and gasoline)
clg_fora_do_escopo = df_CLG['DESCRICAO_CLG'].isin(['COMBUSTÍVEL AVIAÇÃO - QAV-5', 'GASOLINA COMUM'])
df_CLG = df_CLG.drop(index=df_CLG.index[clg_fora_do_escopo])

In [None]:
# Frequency of each DESCRICAO_CLG value after the removal
df_CLG.DESCRICAO_CLG.value_counts()

In [None]:
# Standardization of the description of CLG types: both diesel variants
# are relabelled as COMBUSTIVEL
df_CLG['DESCRICAO_CLG'] = df_CLG['DESCRICAO_CLG'].replace({
    'ÓLEO DIESEL ESPECIAL - OCMT': 'COMBUSTIVEL',
    'ÓLEO DIESEL MARÍTIMO': 'COMBUSTIVEL',
})

In [None]:
# Frequency of each DESCRICAO_CLG value after the relabelling
df_CLG.DESCRICAO_CLG.value_counts()

In [None]:
import warnings

# Silence FutureWarning messages for the rest of the session
warnings.simplefilter("ignore", category=FutureWarning)

# Scatter plot of the requested quantity over time, coloured by CLG type
fig1 = px.scatter(
    df_CLG,
    x=df_CLG['PERIODO'],
    y=df_CLG['QTDE_ITENS_RM'],
    color=df_CLG['DESCRICAO_CLG'],
)
layout_clg = {
    'title': 'Visualização do consumo Combustível, Lubrificante e Graxa',
    'xaxis_title': 'Período',
    'yaxis_title': 'Quantidade',
    'legend_title': 'Categoria',
}
fig1.update_layout(**layout_clg)
fig1.show(renderer='png', width=1600, height=500)

In [None]:
# Group the CLG records by type and show the first record of each group
grupoCLG = df_CLG.groupby(by='DESCRICAO_CLG')
grupoCLG.first()

In [None]:
# Records belonging to the COMBUSTIVEL group
grupoCLG.get_group(name='COMBUSTIVEL')

In [None]:
# Turn the CLG types into columns: one row per PERIODO, holding the summed
# quantity requested for each type
pivot_df_CLG = pd.pivot_table(
    df_CLG,
    values='QTDE_ITENS_RM',
    index='PERIODO',
    columns='DESCRICAO_CLG',
    aggfunc='sum',
)

# Display the resulting DataFrame
# print(pivot_df_CLG)

In [None]:
# Keep only the records of the subsistence jurisdiction group (SUBSISTENCIA)
df_GENEROS = df_Singra[df_Singra['GRUPO_JURISDICAO'] == "SUBSISTENCIA"]

In [None]:
# Frequency of each food item name
df_GENEROS.NOME_PORTUGUES.value_counts()

Grupos de alimentos, baseados na pirâmide alimentar brasileira (PHILIPPI et al., 1999).
A pirâmide é composta de 8 grupos:

1.   Arroz, pão, massa, batata e mandioca;
2.   Vegetais;
3.   Frutas;
4.   Carnes;
5.   Leite, queijo e iogurte;
6.   Feijões e oleaginosas;
7.   Óleos e gorduras;
8.   Açúcar e doces.

In [None]:
# Two extra categories were included, BEBIDAS (tea, coffee, etc.) and TEMPEROS (pepper, salt, etc.).
# Even though they are not part of the food pyramid, there were records of consumption of these items.

# Food group of every known item name. Any item that is not listed here is
# labelled TEMPEROS (the fallback category).
GRUPO_POR_ITEM = {
    # Sugars and sweets
    'AÇÚCAR REFINADO': 'ACUCARES',
    'GELÉIA': 'ACUCARES',
    'CACAU EM PÓ PARA BEBIDA': 'ACUCARES',
    # Oils and fats
    'ÓLEO VEGETAL': 'OLEOS_GORDURAS',
    'AZEITE DE OLIVA': 'OLEOS_GORDURAS',
    # Rice, bread, pasta
    'ARROZ DESCASCADO': 'ARROZ_MASSA',
    'FARINHA MANDIOCA': 'ARROZ_MASSA',
    'ESPAGUETE': 'ARROZ_MASSA',
    'BISCOITO CRACKER DE TRIGO': 'ARROZ_MASSA',
    'BISCOITO DE AÇÚCAR': 'ARROZ_MASSA',
    'ARROZ PARBOILIZADO': 'ARROZ_MASSA',
    'ARROZ MARROM': 'ARROZ_MASSA',
    # Pasta belongs with ESPAGUETE; it was previously labelled CARNES by mistake
    'MACARRÃO': 'ARROZ_MASSA',
    # Meats
    'COXA DE FRANGO': 'CARNES',
    'FILE PEITO FRANGO': 'CARNES',
    'PATINHO ESPECIAL BOVINO': 'CARNES',
    'BIFE DO ALCATRA COM PICANHA BOVINO SEM OSSO': 'CARNES',
    'LOMBO DESOSSADO': 'CARNES',
    'LOMBO EM PEDAÇOS': 'CARNES',
    'CHÃ DESOSSADO SEM MÚSCULO OU PONTA BOVINO': 'CARNES',
    'PERNIL DESOSSADO E AMARRADO': 'CARNES',
    'BIFE DE CONTRA-FILÉ DO LOMBO BOVINO DESOSSADO': 'CARNES',
    'FILÉ MIGNON COMPLETO DE CARNE BOVINA': 'CARNES',
    'FILEZINHO PEITO FRANGO': 'CARNES',
    'ACÉM BOVINO': 'CARNES',
    'BIFES DE CARNE BOVINA': 'CARNES',
    'BIFE DE FILÉ DE CARNE BOVINA SEM OSSO ESPECIAL': 'CARNES',
    'BIFE BOVINO EM CUBOS ESPECIAL': 'CARNES',
    # Milk, cheese and yogurt
    'LEITE INTEGRAL EM PÓ': 'LEITE_QUEIJO',
    'CREME DE LEITE': 'LEITE_QUEIJO',
    # Beans
    'FEIJÕES PRETOS SECOS': 'FEIJOES',
    'FEIJÃO-PINTO SECO': 'FEIJOES',
    # Fruits
    'SUCO CAJU': 'FRUTAS',
    'SUCO MARACUJA': 'FRUTAS',
    'SUCO DE UVA ENLATADO': 'FRUTAS',
    'BEBIDA COM SABOR DE FRUTA': 'FRUTAS',
    # Vegetables
    'MOLHO DE TOMATE': 'LEGUMES_VERDURAS',
    # Drinks
    'CAFÉ TORRADO': 'BEBIDAS',
}

# One label per row, in row order; unknown (or missing) names fall back to TEMPEROS
GRUPO_ALIMENTO = df_GENEROS['NOME_PORTUGUES'].map(GRUPO_POR_ITEM).fillna('TEMPEROS').tolist()

# Add Food Group column
df_GENEROS.insert(14, 'GRUPO_ALIMENTO', GRUPO_ALIMENTO)

df_GENEROS.head()

In [None]:
# Number of records per food group
df_GENEROS.GRUPO_ALIMENTO.value_counts()

In [None]:
# Scatter plot of the requested quantity over time, coloured by food group
fig1 = px.scatter(
    df_GENEROS,
    x=df_GENEROS['PERIODO'],
    y=df_GENEROS['QTDE_ITENS_RM'],
    color=df_GENEROS['GRUPO_ALIMENTO'],
)
layout_generos = {
    'title': 'Visualização do consumo de Alimentos',
    'xaxis_title': 'Período',
    'yaxis_title': 'Quantidade',
    'legend_title': 'Categoria',
}
fig1.update_layout(**layout_generos)
fig1.show(renderer='png', width=1600, height=500)

In [None]:
# Turn the food groups into columns: one row per PERIODO, holding the summed
# quantity requested for each group
pivot_df_GENEROS = pd.pivot_table(
    df_GENEROS,
    values='QTDE_ITENS_RM',
    index='PERIODO',
    columns='GRUPO_ALIMENTO',
    aggfunc='sum',
)

# Display the resulting DataFrame
# print(pivot_df_GENEROS)

In [None]:
# Group the food records by food group and show the first record of each group
grupoGENEROS = df_GENEROS.groupby(by='GRUPO_ALIMENTO')
grupoGENEROS.first()

In [None]:
# Records belonging to the FRUTAS group
grupoGENEROS.get_group(name='FRUTAS')

In [None]:
# Place the CLG and GENEROS pivot tables side by side (aligned on PERIODO)
df_Singra_T = pd.concat((pivot_df_CLG, pivot_df_GENEROS), axis='columns')
# Impute each column's mean for its unknown values
medias_por_coluna = df_Singra_T.mean()
df_Singra_T = df_Singra_T.fillna(value=medias_por_coluna)

In [None]:
# Name the index PERIODO and move it into a regular dataframe column
df_Singra_T = df_Singra_T.rename_axis(index='PERIODO')
df_Singra_T = df_Singra_T.reset_index(drop=False)

In [None]:
# Preview the first rows
df_Singra_T.head(n=5)

In [None]:
# Add the ANO column (calendar year of each PERIODO) right after PERIODO
ano = [periodo.year for periodo in df_Singra_T['PERIODO']]

df_Singra_T.insert(loc=1, column='ANO', value=ano)

In [None]:
# Add the TRIMESTRE column: months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4
trimestre = [(periodo.month - 1) // 3 + 1 for periodo in df_Singra_T['PERIODO']]

df_Singra_T.insert(loc=2, column='TRIMESTRE', value=trimestre)

In [None]:
# Preview the first ten rows
df_Singra_T.head(n=10)

In [None]:
# Sum the consumption columns for each (ANO, TRIMESTRE) pair
colunas_consumo = [
    'COMBUSTIVEL', 'GRAXA', 'LUBRIFICANTE',
    'ACUCARES', 'ARROZ_MASSA', 'BEBIDAS', 'CARNES', 'FEIJOES',
    'FRUTAS', 'LEGUMES_VERDURAS', 'LEITE_QUEIJO', 'OLEOS_GORDURAS', 'TEMPEROS',
]
df_SINGRA_TRIM = df_Singra_T.groupby(['ANO', 'TRIMESTRE'])[colunas_consumo].sum()
df_SINGRA_TRIM.head()

In [None]:
# Cleaning of the RFC dataset
# Drop the columns that are not carried into the concatenation
df_RFC = df_RFC.drop(columns=['TV', 'DT_INICIO', 'DT_FIM', 'OPERACAO', 'PORTOS'])

In [None]:
# Preview the first rows
df_RFC.head(n=5)

In [None]:
# Drop every row that contains a NaN and renumber the remaining rows from 0
df_RFC = df_RFC.dropna().reset_index(drop=True)

In [None]:
# Preview the first rows
df_RFC.head(n=5)

In [None]:
# Cast the ANO and TRIMESTRE columns to int64
df_RFC = df_RFC.astype({'ANO': 'int64', 'TRIMESTRE': 'int64'})

In [None]:
# Preview the first rows
df_RFC.head(n=5)

In [None]:
# Use the (ANO, TRIMESTRE) pair as the index
df_RFC = df_RFC.set_index(keys=['ANO', 'TRIMESTRE'])

In [None]:
# Preview the first rows
df_RFC.head(n=5)

In [None]:
# Place the SINGRA_TRIM and RFC dataframes side by side, aligned on their index
df_ATOp = pd.concat(objs=[df_SINGRA_TRIM, df_RFC], axis='columns')

In [None]:
# Preview the first rows
df_ATOp.head(n=5)

In [None]:
# Name the index levels ANO and TRIMESTRE and move them into regular columns
df_ATOp = df_ATOp.rename_axis(index=['ANO', 'TRIMESTRE'])
df_ATOp = df_ATOp.reset_index(drop=False)

In [None]:
# Copy of the dataset rounded to two decimal places (used for display)
df_ATOp_rounder = df_ATOp.round(decimals=2)

In [None]:
# Preview the first ten rounded rows
df_ATOp_rounder.head(n=10)

In [None]:
# Remove the records after 2017 (the study period is 2013 to 2017) and
# renumber the remaining rows from 0
apos_2017 = df_ATOp.index[df_ATOp['ANO'] > 2017]
df_ATOp = df_ATOp.drop(index=apos_2017).reset_index(drop=True)

In [None]:
# Spearman correlation matrix, displayed with a coolwarm colour gradient
matriz_correlacao = df_ATOp.corr(method="spearman")
estilo_correlacao = matriz_correlacao.style
estilo_correlacao.background_gradient(cmap="coolwarm")

In [None]:
# Spearman correlation of every column with DIAS_MAR, in ascending order
df_ATOp.corr(method="spearman")['DIAS_MAR'].sort_values()

In [None]:
# Pairwise scatter plots of the RFC-derived columns
colunas_pairplot = ['DIAS_MAR', 'MILHAS', 'DIAS_PORTO_SEDE', 'DIAS_PORTO_FORA_SEDE', 'MILITARES']
sns.pairplot(df_ATOp[colunas_pairplot])

### **Preparação dos Dados**

In [None]:
# Box plot of the target attribute (DIAS_MAR)
dias_mar = df_ATOp['DIAS_MAR']
sns.boxplot(dias_mar)

In [None]:
# Apply the over-sampling technique to the real data

# Manually specified phi relevance control points, one row per DIAS_MAR value
# (the call below documents the expected layout as a 2d array "[x, y]")
rg_mtrx_iblr = [
    [42, 1, 0],  # over-sample ("minority")
    [30, 0, 0],  # under-sample ("majority")
    [20, 0, 0],  # under-sample
    [31, 0, 0],  # under-sample
]

# NOTE(review): the original comment called this step "Random Over-sampling",
# but the function called is iblr.gn with a Gaussian perturbation argument --
# confirm which technique is intended.
argumentos_gn = dict(
    data=df_ATOp,               # pandas dataframe
    y='DIAS_MAR',               # name of the target column
    pert=0.02,                  # perturbation / noise percentage (positive real)
    samp_method='extreme',      # 'balance' or 'extreme'
    drop_na_col=True,
    drop_na_row=True,
    replace=True,
    manual_perc=True,           # the user sets the under/over-sampling percentages
    perc_u=0.9,                 # percentage of under-sampling
    perc_o=100,                 # percentage of over-sampling

    # phi relevance arguments
    rel_thres=0.8,              # real number (0 < R < 1)
    rel_method='manual',        # 'auto' or 'manual'
    # rel_xtrm_type and rel_coef are unused when rel_method = 'manual'
    rel_ctrl_pts_rg=rg_mtrx_iblr,
)
df_ATOp_RO = iblr.gn(**argumentos_gn)

In [None]:
# Datasets

# Columns that are not used as predictors (DIAS_MAR is the target)
colunas_excluidas = ['ANO', 'TRIMESTRE', 'MILHAS', 'DIAS_MAR']

# Real set: target attribute and predictive attributes
y = df_ATOp['DIAS_MAR']
X = df_ATOp.drop(columns=colunas_excluidas)

# Derived (resampled) set: target attribute and predictive attributes
y_s = df_ATOp_RO['DIAS_MAR']
X_s = df_ATOp_RO.drop(columns=colunas_excluidas)

## **Treinamento, Teste e Validação**

In [None]:
# Global parameters

RANDOM_STATE = 0
N_SPLITS = 10   # number of folds of the cross-validation splitters below
N_TRIALS = 50   # number of Optuna trials per study
N_RUNS = 5      # number of times the model runs with shuffled columns
N_COLS = 2      # number of attribute subsets

# Cross-validation splitters shared by the experiments
KF = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
CV = RepeatedKFold(n_splits=N_SPLITS, n_repeats=5, random_state=RANDOM_STATE)

### **Bloco Experimental 1 - Modelos Individuais**

In [None]:
# Função unificada de otimização de hiperparâmetros com Optuna
def otimizar_modelos_individuais(model_name, X_train, y_train):
    """Tune one regressor with Optuna, maximising the mean cross-validated R².

    LinearRegression and KNeighbors are scored behind a ``StandardScaler``,
    which is how ``executar_pipeline_modelos_individuais`` builds them after
    the search. Without it, the KNN hyperparameters would be tuned for an
    unscaled model and then applied to a scaled one.

    Args:
        model_name: One of "LinearRegression", "DecisionTree", "RandomForest",
            "GradientBoosting" or "KNeighbors".
        X_train: Predictive attributes passed to ``cross_val_score``.
        y_train: Target attribute passed to ``cross_val_score``.

    Returns:
        A ``(best_params, study)`` tuple with the best hyperparameters found
        and the Optuna study itself.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    modelos_suportados = ("LinearRegression", "DecisionTree", "RandomForest",
                          "GradientBoosting", "KNeighbors")
    # Fail before any study is created instead of inside a worker thread
    if model_name not in modelos_suportados:
        raise ValueError(f"Modelo desconhecido: {model_name}")

    def objective(trial):
        if model_name == "LinearRegression":
            model = make_pipeline(StandardScaler(), LinearRegression())

        elif model_name == "DecisionTree":
            params = {
                'max_depth': trial.suggest_int('max_depth', 3, 10),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5)
            }
            model = DecisionTreeRegressor(**params, random_state=RANDOM_STATE)

        elif model_name == "RandomForest":
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 50, 150),
                'max_depth': trial.suggest_int('max_depth', 3, 10),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5)
            }
            model = RandomForestRegressor(**params, random_state=RANDOM_STATE)

        elif model_name == "GradientBoosting":
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 100, 200),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
                'max_depth': trial.suggest_int('max_depth', 3, 10)
            }
            model = GradientBoostingRegressor(**params, random_state=RANDOM_STATE)

        elif model_name == "KNeighbors":
            params = {
                'n_neighbors': trial.suggest_int('n_neighbors', 1, 10),
                'weights': trial.suggest_categorical('weights', ['uniform', 'distance'])
            }
            model = make_pipeline(StandardScaler(), KNeighborsRegressor(**params))

        else:
            raise ValueError(f"Modelo desconhecido: {model_name}")

        return cross_val_score(model, X_train, y_train, cv=KF, scoring='r2', n_jobs=-1).mean()

    # LinearRegression has no hyperparameters: every trial would score the
    # same model, so a single trial is enough.
    n_trials = 1 if model_name == "LinearRegression" else N_TRIALS

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials, n_jobs=-1)

    return study.best_params, study

# Execução do Pipeline de otimização dos modelos
def executar_pipeline_modelos_individuais(X, y, model_names=None):
    """Tune, rebuild and cross-validate each individual model.

    For every name, the hyperparameters are optimised with
    ``otimizar_modelos_individuais``, the estimator is rebuilt with the best
    values, and its R² and RMSE are measured with ``cross_val_score`` on the
    ``KF`` folds. A summary is printed at the end.

    Args:
        X: Predictive attributes.
        y: Target attribute.
        model_names: Names of the models to run. Defaults to all five
            supported models.

    Returns:
        A tuple ``(models_params, study_results, tempos_execucao_total,
        melhor_modelo)``:
        - ``models_params``: per model, the best hyperparameters plus the keys
          "R2 Média", "R2 Std", "RMSE Média" and "RMSE Std";
        - ``study_results``: per model, the Optuna study;
        - ``tempos_execucao_total``: per model, the seconds spent on
          optimisation plus evaluation;
        - ``melhor_modelo``: the estimator with the highest mean R². It is
          returned unfitted, because ``cross_val_score`` fits clones.

    Raises:
        ValueError: If a name in ``model_names`` is not supported.
    """
    if model_names is None:
        model_names = ["LinearRegression", "DecisionTree", "RandomForest", "GradientBoosting", "KNeighbors"]

    # Builders of the final estimator, given the best hyperparameters
    construtores = {
        "LinearRegression": lambda p: make_pipeline(StandardScaler(), LinearRegression()),
        "DecisionTree": lambda p: DecisionTreeRegressor(random_state=RANDOM_STATE, **p),
        "RandomForest": lambda p: RandomForestRegressor(random_state=RANDOM_STATE, **p),
        "GradientBoosting": lambda p: GradientBoostingRegressor(random_state=RANDOM_STATE, **p),
        "KNeighbors": lambda p: make_pipeline(StandardScaler(), KNeighborsRegressor(**p)),
    }

    # Reject unknown names before spending time on any optimisation
    for name in model_names:
        if name not in construtores:
            raise ValueError(f"Modelo desconhecido: {name}")

    models_params = {}
    study_results = {}
    tempos_execucao_total = {}

    melhor_modelo = None
    melhor_score = -np.inf

    for name in model_names:
        print(f"\nOtimizando modelo: {name}")
        inicio = time.perf_counter()

        # Optimisation
        params, study = otimizar_modelos_individuais(name, X, y)
        models_params[name] = params
        study_results[name] = study

        # Model with the best hyperparameters. It must be built before the
        # metrics are added to ``params``, which is the same dict object.
        modelo = construtores[name](params)

        # Evaluation
        r2_scores = cross_val_score(modelo, X, y, cv=KF, scoring='r2', n_jobs=-1)
        rmse_scores = np.sqrt(-cross_val_score(modelo, X, y, cv=KF, scoring='neg_mean_squared_error', n_jobs=-1))

        tempos_execucao_total[name] = time.perf_counter() - inicio

        # Storage
        media_r2 = np.mean(r2_scores)
        models_params[name]["R2 Média"] = media_r2
        models_params[name]["R2 Std"] = np.std(r2_scores)
        models_params[name]["RMSE Média"] = np.mean(rmse_scores)
        models_params[name]["RMSE Std"] = np.std(rmse_scores)

        # Keep track of the best model so far
        if media_r2 > melhor_score:
            melhor_score = media_r2
            melhor_modelo = modelo

    # Report
    print("\nMelhores hiperparâmetros encontrados:")
    for model, study in study_results.items():
        print(f"{model}: {study.best_params}")
    print("\nResultados e Tempos de Treinamento:")
    for name in model_names:
        print(f"\n{name}")
        print(f"Tempo Total: {tempos_execucao_total[name]:.2f} segundos")
        print(f"Média do R²: {models_params[name]['R2 Média']:.4f} ± {models_params[name]['R2 Std']:.4f}")
        print(f"Média do RMSE: {models_params[name]['RMSE Média']:.4f} ± {models_params[name]['RMSE Std']:.4f}")

    return models_params, study_results, tempos_execucao_total, melhor_modelo

### **Bloco Experimental 2 - Comitês de Modelos sem Direcionamento Semântico**

#### Treinamento de Comitês de Modelos sem Direcionamento Semântico com divisão em dois subconjuntos

In [None]:
# ================================================
# # Classe principal
# ================================================
class ComiteModelosStacking(BaseEstimator, RegressorMixin):
    """Committee of stacked regressors, each trained on its own column subset.

    On every run the columns of ``X`` are shuffled and split into ``n_cols``
    consecutive subsets. The i-th model in ``model_names`` is paired with the
    i-th subset (``zip`` semantics: surplus names or surplus subsets are
    ignored) and the paired models are combined by a ``StackingRegressor``.
    The prediction is the mean over the ``n_runs`` ensembles.

    ``StackingRegressor`` clones and refits its estimators on the data passed
    to ``fit``, so each base model is wrapped in a pipeline that first selects
    its own columns. Without that wrapper every model would be refitted on
    all columns and the split would have no effect.

    Args:
        model_names: Sequence of model names accepted by ``_get_model``.
        model_params: Mapping ``model name -> {"<i>_<param>": value}``, where
            ``<i>`` is the position of the model in ``model_names``. ``None``
            means no hyperparameters.
        meta_model: Final estimator of the stacking. ``None`` means ``Ridge()``.
        n_runs: Number of shuffled column partitions (ensembles) to build.
        n_cols: Number of column subsets per partition.
        random_state: Seed of the column shuffling; also handed to the
            tree-based base models unless their params set ``random_state``.

    ``X`` must be a pandas DataFrame with string column labels.
    """

    def __init__(self, model_names=("Linear Regression", "Decision Tree", "Random Forest", "Gradient Boosting", "K-Nearest Neighbors"),
                 model_params=None, meta_model=None, n_runs=N_RUNS, n_cols=N_COLS, random_state=None):
        # scikit-learn convention: store the arguments untouched so that
        # get_params()/clone() round-trip; defaults are resolved in fit().
        self.model_names = model_names
        self.model_params = model_params
        self.meta_model = meta_model
        self.n_runs = n_runs
        self.n_cols = n_cols
        self.random_state = random_state

    def _get_model(self, name, params):
        """Instantiate the base regressor called ``name`` with ``params``."""
        # Explicit params win over the committee-level seed
        seeded = {"random_state": self.random_state, **params}
        if name == "Linear Regression":
            return LinearRegression(**params)
        elif name == "Decision Tree":
            return DecisionTreeRegressor(**seeded)
        elif name == "Random Forest":
            return RandomForestRegressor(**seeded)
        elif name == "Gradient Boosting":
            return GradientBoostingRegressor(**seeded)
        elif name == "K-Nearest Neighbors":
            return KNeighborsRegressor(**params)
        else:
            raise ValueError(f"Modelo desconhecido: {name}")

    @staticmethod
    def _restrict_to_columns(model, cols):
        """Wrap ``model`` so it only ever sees the columns in ``cols``."""
        from sklearn.compose import ColumnTransformer

        selector = ColumnTransformer([("cols", "passthrough", list(cols))], remainder="drop")
        return make_pipeline(selector, model)

    def fit(self, X, y):
        """Build ``n_runs`` stacking ensembles over shuffled column partitions.

        Sets ``ensembles_`` (list of ``(ensemble, column subsets)``) and
        ``cols_used_`` (the column subsets of every run). Returns ``self``.
        """
        model_params = self.model_params if self.model_params is not None else {}
        meta_model = self.meta_model if self.meta_model is not None else Ridge()

        self.ensembles_ = []
        self.cols_used_ = []
        rng = np.random.RandomState(self.random_state)
        all_cols = list(X.columns)

        for _ in range(self.n_runs):
            rng.shuffle(all_cols)
            # Split positions, not labels, so the labels keep their original type
            parts = np.array_split(np.arange(len(all_cols)), self.n_cols)
            cols_list = [[all_cols[i] for i in part] for part in parts]
            self.cols_used_.append([list(cols) for cols in cols_list])

            estimators = []
            for i, (model_name, cols) in enumerate(zip(self.model_names, cols_list)):
                params = extract_params(model_params.get(model_name, {}), f"{i}_")
                base_model = self._get_model(model_name, params)
                estimators.append((f"model{i+1}", self._restrict_to_columns(base_model, cols)))

            ensemble = StackingRegressor(
                estimators=estimators,
                final_estimator=clone(meta_model),
                passthrough=False
            )
            ensemble.fit(X[all_cols], y)
            self.ensembles_.append((ensemble, [list(cols) for cols in cols_list]))

        return self

    def predict(self, X):
        """Return the mean prediction of all fitted ensembles for ``X``."""
        preds = []
        for ensemble, cols_list in self.ensembles_:
            # Same column order as the one the ensemble was fitted with
            used_cols = [col for cols in cols_list for col in cols]
            preds.append(ensemble.predict(X[used_cols]))
        return np.mean(preds, axis=0)

# ====================================
# Sugestão de hiperparâmetros por modelo
# ====================================
def suggest_params(trial, model_name, prefix=""):
    """Ask ``trial`` for the hyperparameters of ``model_name``.

    Every hyperparameter is registered in the trial, and keyed in the returned
    dict, as ``prefix + <parameter name>``. "Linear Regression" has no
    hyperparameters and yields an empty dict.

    Raises:
        ValueError: If ``model_name`` is not a known model.
    """
    def chave(nome):
        return f'{prefix}{nome}'

    if model_name == "Linear Regression":
        return {}

    sugestoes = {}
    if model_name == "Decision Tree":
        sugestoes[chave('max_depth')] = trial.suggest_int(chave('max_depth'), 3, 10)
        sugestoes[chave('min_samples_split')] = trial.suggest_int(chave('min_samples_split'), 2, 10)
        sugestoes[chave('min_samples_leaf')] = trial.suggest_int(chave('min_samples_leaf'), 1, 5)
        return sugestoes

    if model_name == "Random Forest":
        sugestoes[chave('n_estimators')] = trial.suggest_int(chave('n_estimators'), 50, 150)
        sugestoes[chave('max_depth')] = trial.suggest_int(chave('max_depth'), 3, 10)
        sugestoes[chave('min_samples_split')] = trial.suggest_int(chave('min_samples_split'), 2, 10)
        sugestoes[chave('min_samples_leaf')] = trial.suggest_int(chave('min_samples_leaf'), 1, 5)
        return sugestoes

    if model_name == "Gradient Boosting":
        sugestoes[chave('n_estimators')] = trial.suggest_int(chave('n_estimators'), 100, 200)
        sugestoes[chave('learning_rate')] = trial.suggest_float(chave('learning_rate'), 0.01, 0.2)
        sugestoes[chave('max_depth')] = trial.suggest_int(chave('max_depth'), 3, 10)
        return sugestoes

    if model_name == "K-Nearest Neighbors":
        sugestoes[chave('n_neighbors')] = trial.suggest_int(chave('n_neighbors'), 1, 10)
        sugestoes[chave('weights')] = trial.suggest_categorical(chave('weights'), ['uniform', 'distance'])
        return sugestoes

    raise ValueError(f"Modelo desconhecido: {model_name}")

def extract_params(params_dict, prefix):
    """Return the entries of ``params_dict`` whose key starts with ``prefix``.

    Only the leading ``prefix`` is removed from each selected key; later
    occurrences of the same text inside the key are kept. Entries whose key
    does not start with ``prefix`` are left out.
    """
    corte = len(prefix)
    return {k[corte:]: v for k, v in params_dict.items() if k.startswith(prefix)}

# ====================================
# Treinar dois modelos específicos
# ====================================
class ComiteModelos2Stacking(BaseEstimator, RegressorMixin):
    """Stacked committee of exactly two models, each on one half of the columns.

    On every run the columns of ``X`` are shuffled and cut in two halves (the
    second half gets the extra column when the count is odd). The first model
    is trained on the first half and the second model on the second half, and
    both are combined by a ``StackingRegressor``. The prediction is the mean
    over the ``n_runs`` ensembles.

    ``StackingRegressor`` clones and refits its estimators on the data passed
    to ``fit``, so each base model is wrapped in a pipeline that first selects
    its own columns. Without that wrapper both models would be refitted on
    all columns and the split would have no effect.

    Args:
        model_names: The two model names accepted by ``_get_model``.
        model_params: Mapping ``model name -> {"0_<param>" / "1_<param>": value}``
            (prefix "0_" for the first model, "1_" for the second). ``None``
            means no hyperparameters.
        meta_model: Final estimator of the stacking. ``None`` means ``Ridge()``.
        n_runs: Number of shuffled column partitions (ensembles) to build.
        random_state: Seed of the column shuffling; also handed to the
            tree-based base models unless their params set ``random_state``.

    ``X`` must be a pandas DataFrame with string column labels.
    """

    def __init__(self, model_names=("Gradient Boosting", "Linear Regression"),
                 model_params=None, meta_model=None, n_runs=1, random_state=None):
        # scikit-learn convention: store the arguments untouched so that
        # get_params()/clone() round-trip; defaults are resolved in fit().
        self.model_names = model_names
        self.model_params = model_params
        self.meta_model = meta_model
        self.n_runs = n_runs
        self.random_state = random_state

    def _get_model(self, name, params):
        """Instantiate the base regressor called ``name`` with ``params``."""
        # Explicit params win over the committee-level seed
        seeded = {"random_state": self.random_state, **params}
        if name == "Linear Regression":
            return LinearRegression(**params)
        elif name == "Decision Tree":
            return DecisionTreeRegressor(**seeded)
        elif name == "Random Forest":
            return RandomForestRegressor(**seeded)
        elif name == "Gradient Boosting":
            return GradientBoostingRegressor(**seeded)
        elif name == "K-Nearest Neighbors":
            return KNeighborsRegressor(**params)
        else:
            raise ValueError(f"Modelo desconhecido: {name}")

    @staticmethod
    def _restrict_to_columns(model, cols):
        """Wrap ``model`` so it only ever sees the columns in ``cols``."""
        from sklearn.compose import ColumnTransformer

        selector = ColumnTransformer([("cols", "passthrough", list(cols))], remainder="drop")
        return make_pipeline(selector, model)

    def fit(self, X, y):
        """Build ``n_runs`` two-model stacking ensembles.

        Sets ``ensembles_`` (list of ``(ensemble, cols1, cols2)``) and resets
        ``cols_used_`` to an empty list. Returns ``self``.
        """
        model_params = self.model_params if self.model_params is not None else {}
        meta_model = self.meta_model if self.meta_model is not None else Ridge()

        self.ensembles_ = []
        self.cols_used_ = []
        rng = np.random.RandomState(self.random_state)
        all_cols = list(X.columns)

        for _ in range(self.n_runs):
            rng.shuffle(all_cols)
            split_idx = len(all_cols) // 2
            cols1, cols2 = all_cols[:split_idx], all_cols[split_idx:]

            params1 = extract_params(model_params.get(self.model_names[0], {}), "0_")
            params2 = extract_params(model_params.get(self.model_names[1], {}), "1_")

            model1 = self._restrict_to_columns(self._get_model(self.model_names[0], params1), cols1)
            model2 = self._restrict_to_columns(self._get_model(self.model_names[1], params2), cols2)

            ensemble = StackingRegressor(
                estimators=[
                    ("model1", model1),
                    ("model2", model2)
                ],
                final_estimator=clone(meta_model),
                passthrough=False
            )
            ensemble.fit(X, y)

            self.ensembles_.append((ensemble, cols1, cols2))

        return self

    def predict(self, X):
        """Return the mean prediction of all fitted ensembles for ``X``."""
        preds = [ensemble.predict(X) for ensemble, _cols1, _cols2 in self.ensembles_]
        return np.mean(preds, axis=0)

# ====================================
# Pipeline de execução para todas as combinações de 5 modelos, tomados 2 a 2
# ====================================
def executar_pipeline_comite_stacking_5modelos(X, y, n_trials=N_TRIALS, n_splits=N_SPLITS, n_runs=N_RUNS, n_cols=N_COLS):
    """Tune and evaluate a stacking committee for every ordered model tuple.

    For each permutation of ``n_cols`` models out of the five supported ones,
    the hyperparameters of a ``ComiteModelosStacking`` are optimised with
    Optuna (mean R² over a shuffled K-fold), and the committee rebuilt with
    the best values is evaluated on the same folds.

    Args:
        X: Predictive attributes (pandas DataFrame).
        y: Target attribute (pandas Series).
        n_trials: Number of Optuna trials per model tuple.
        n_splits: Number of folds of the K-fold used for tuning and evaluation.
        n_runs: Number of shuffled column partitions per committee.
        n_cols: Number of column subsets, and of models per committee.

    Returns:
        DataFrame indexed by "<model> + <model>", sorted by "R2 Média" in
        descending order, with the columns "R2 Média", "R2 Std", "RMSE Média",
        "RMSE Std" and "Tempo Total (s)" (seconds spent on the optimisation).
    """
    all_model_names = [
        "Linear Regression",
        "Decision Tree",
        "Random Forest",
        "Gradient Boosting",
        "K-Nearest Neighbors"
    ]

    # Honour the n_splits argument; with the default it reproduces the
    # configuration of the module-level KF splitter.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)

    results = {}
    for model_names in permutations(all_model_names, n_cols):
        print(f"\n Otimizando conjunto com Stacking e 2 divisões de colunas: {model_names}")

        def objective(trial):
            params = {
                name: suggest_params(trial, name, prefix=f"{i}_")
                for i, name in enumerate(model_names)
            }
            model = ComiteModelosStacking(
                model_names=model_names,
                model_params=params,
                meta_model=Ridge(),
                n_runs=n_runs,
                n_cols=n_cols,
                random_state=RANDOM_STATE
            )

            scores = []
            for train_idx, val_idx in kf.split(X):
                model.fit(X.iloc[train_idx], y.iloc[train_idx])
                pred = model.predict(X.iloc[val_idx])
                scores.append(r2_score(y.iloc[val_idx], pred))

            return np.mean(scores)

        # Time the whole optimisation, not just the best trial
        inicio_total = time.perf_counter()
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=n_trials)
        tempo_total = time.perf_counter() - inicio_total

        # Keep the "<i>_" prefix: ComiteModelosStacking selects each model's
        # parameters by that prefix, so stripping it here would make the
        # committee silently fall back to default hyperparameters.
        best_params = {
            name: {k: v for k, v in study.best_params.items() if k.startswith(f"{i}_")}
            for i, name in enumerate(model_names)
        }

        model = ComiteModelosStacking(
            model_names=model_names,
            model_params=best_params,
            meta_model=Ridge(),
            n_runs=n_runs,
            n_cols=n_cols,
            random_state=RANDOM_STATE
        )

        r2_scores, rmse_scores = [], []
        for train_idx, test_idx in kf.split(X):
            model.fit(X.iloc[train_idx], y.iloc[train_idx])
            pred = model.predict(X.iloc[test_idx])
            r2_scores.append(r2_score(y.iloc[test_idx], pred))
            rmse_scores.append(root_mean_squared_error(y.iloc[test_idx], pred))

        results[" + ".join(model_names)] = {
            "R2 Média": np.mean(r2_scores),
            "R2 Std": np.std(r2_scores),
            "RMSE Média": np.mean(rmse_scores),
            "RMSE Std": np.std(rmse_scores),
            "Tempo Total (s)": tempo_total
        }

    df_results = pd.DataFrame(results).T.sort_values("R2 Média", ascending=False)
    print("\n Resultados finais com Stacking e 2 divisões de colunas:")
    print(df_results.round(4))
    return df_results

#### Treinamento de Comitês de Modelos sem Direcionamento Semântico com formações completamente aleatórias

In [None]:
# Função para divisão aleatória e desbalanceada de colunas
def dividir_colunas_aleatorio_desbalanceado(colunas, n_modelos, rng):
    """Randomly split ``colunas`` into up to ``n_modelos`` non-empty groups.

    ``colunas`` is shuffled in place with ``rng``. Its first ``n_modelos``
    items seed one group each, and every remaining item is appended to a
    group drawn uniformly with ``rng.randint``, so group sizes may differ.

    Returns:
        A list of lists with the column groups.
    """
    rng.shuffle(colunas)
    sementes, sobras = colunas[:n_modelos], colunas[n_modelos:]
    grupos = [[semente] for semente in sementes]
    for sobra in sobras:
        grupos[rng.randint(0, n_modelos)].append(sobra)
    return grupos

# Main pipeline
def executar_pipeline_execucoes_aleatorias(X, y, n_trials=N_TRIALS, n_splits=N_SPLITS):
    """Tune and score stacking committees built from randomly drawn model sets.

    For each of the ``N_EXECUCOES`` rounds a committee size (2 to 5) and that
    many distinct model names are drawn from a generator seeded with
    ``RANDOM_STATE``. Optuna then maximises the mean R² over the folds of the
    module-level ``KF`` splitter, and the best hyperparameters are re-scored
    (R² and RMSE) over the same folds.

    Args:
        X: Feature table, indexed positionally with ``.iloc``.
        y: Target, indexed positionally with ``.iloc``.
        n_trials: Number of Optuna trials per round.
        n_splits: Accepted but not read by this function; the folds come from
            the module-level ``KF``.

    Returns:
        pandas.DataFrame: One row per round, sorted by "R2 Média" descending.

    NOTE(review): model names here contain spaces ("Linear Regression"), while
    the ``suggest_params`` defined later in this file only accepts the
    unspaced spellings -- confirm which ``suggest_params`` is in scope when
    this function runs.
    """
    catalogo = [
        "Linear Regression",
        "Decision Tree",
        "Random Forest",
        "Gradient Boosting",
        "K-Nearest Neighbors"
    ]

    sorteador = np.random.RandomState(RANDOM_STATE)
    tabela = {}

    for execucao in range(N_EXECUCOES):
        # Draw the committee size first, then its members (order of the draws
        # matters for reproducibility).
        n_modelos = sorteador.choice([2, 3, 4, 5])
        escolhidos = sorteador.choice(catalogo, size=n_modelos, replace=False)
        print(f"\nExecução {execucao+1}/{N_EXECUCOES}: otimizando conjunto de {n_modelos} modelos -> {escolhidos}")

        def objective(trial):
            # One hyperparameter namespace per committee member: "0_", "1_", ...
            hiperparametros = {}
            for posicao, nome in enumerate(escolhidos):
                hiperparametros[nome] = suggest_params(trial, nome, prefix=f"{posicao}_")

            comite = ComiteModelosStacking(
                model_names=escolhidos,
                model_params=hiperparametros,
                meta_model=Ridge(),
                n_runs=1,
                random_state=sorteador.randint(0, 10000)
            )

            r2_por_fold = []
            for treino, validacao in list(KF.split(X)):
                comite.fit(X.iloc[treino], y.iloc[treino])
                previsto = comite.predict(X.iloc[validacao])
                r2_por_fold.append(r2_score(y.iloc[validacao], previsto))
            return np.mean(r2_por_fold)

        # Only the hyperparameter search is timed.
        inicio_total = time.perf_counter()
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=n_trials)
        tempo_total = time.perf_counter() - inicio_total

        melhores = {}
        for posicao, nome in enumerate(escolhidos):
            melhores[nome] = extract_params(study.best_params, f"{posicao}_")

        comite_final = ComiteModelosStacking(
            model_names=escolhidos,
            model_params={nome: melhores[nome] for nome in escolhidos},
            meta_model=Ridge(),
            n_runs=1,
            random_state=sorteador.randint(0, 10000)
        )

        r2_scores = []
        rmse_scores = []
        for treino, teste in KF.split(X):
            comite_final.fit(X.iloc[treino], y.iloc[treino])
            previsto = comite_final.predict(X.iloc[teste])
            r2_scores.append(r2_score(y.iloc[teste], previsto))
            rmse_scores.append(root_mean_squared_error(y.iloc[teste], previsto))

        descricao = " + ".join(escolhidos)
        tabela[f"Execução {execucao+1}: {descricao}"] = {
            "N Modelos": n_modelos,
            "Modelos": descricao,
            "R2 Média": np.mean(r2_scores),
            "R2 Std": np.std(r2_scores),
            "RMSE Média": np.mean(rmse_scores),
            "RMSE Std": np.std(rmse_scores),
            "Tempo Total (s)": tempo_total
        }

    df_results = pd.DataFrame(tabela).T.sort_values("R2 Média", ascending=False)
    print("\nResultados finais (ordem decrescente de R²):")
    print(df_results.round(4))
    return df_results

### **Bloco Experimental 3 - ATOp-Predictive-Model**

In [None]:
# ============================
# ATOpPredictiveModel class
# ============================
class ATOpPredictiveModel(BaseEstimator, RegressorMixin):
    """Two-branch regression ensemble with one column block per base model.

    Branch 1 is trained on the columns from 'COMBUSTIVEL' to 'LUBRIFICANTE'
    and branch 2 on the columns from 'ACUCARES' to 'DIAS_PORTO_FORA_SEDE'
    (label slices of the training DataFrame). The two branches are combined
    by a ``VotingRegressor`` or a ``StackingRegressor``.

    Args:
        model1_name: Base model of branch 1. One of "LinearRegression",
            "DecisionTree", "RandomForest", "GradientBoosting", "KNeighbors".
        model2_name: Base model of branch 2 (same choices).
        model1_params: Keyword arguments for the branch-1 model; ``None``
            means no extra arguments.
        model2_params: Keyword arguments for the branch-2 model; ``None``
            means no extra arguments.
        ensemble_type: "voting" or "stacking".
        final_estimator: Meta-regressor used for stacking, either an estimator
            instance or a zero-argument callable returning one. ``None``
            selects ``LinearRegression()``.

    Attributes set by ``fit``:
        cols1, cols2: Column labels of each branch.
        ensemble_: The fitted voting/stacking ensemble.
        model1_, model2_: The fitted base models of each branch.
    """

    def __init__(self, model1_name="DecisionTree", model2_name="GradientBoosting",
                 model1_params=None, model2_params=None,
                 ensemble_type="stacking", final_estimator=None):
        # scikit-learn's clone()/get_params() require constructor arguments to
        # be stored exactly as received; defaults are resolved in fit().
        self.model1_name = model1_name
        self.model2_name = model2_name
        self.model1_params = model1_params
        self.model2_params = model2_params
        self.ensemble_type = ensemble_type  # 'voting' or 'stacking'
        self.final_estimator = final_estimator

    @staticmethod
    def _select_columns(X, columns, positions):
        """Return one branch's columns from ``X``.

        DataFrames are sliced by label; anything else is treated as a 2-D
        array and sliced by the column positions recorded at fit time.
        """
        if hasattr(X, "loc"):
            return X[columns]
        return np.asarray(X)[:, positions]

    def _get_model(self, name, params):
        """Instantiate the unfitted base regressor called ``name``.

        Raises:
            ValueError: If ``name`` is not one of the supported model names.
        """
        if name == "LinearRegression":
            return LinearRegression(**params)
        elif name == "DecisionTree":
            return DecisionTreeRegressor(**params)
        elif name == "RandomForest":
            return RandomForestRegressor(**params, random_state=RANDOM_STATE)
        elif name == "GradientBoosting":
            return GradientBoostingRegressor(**params, random_state=RANDOM_STATE)
        elif name == "KNeighbors":
            return KNeighborsRegressor(**params)
        else:
            raise ValueError(f"Modelo desconhecido: {name}")

    def _build_branch(self, name, params, X, columns):
        """Return an unfitted pipeline: column selector followed by the base model."""
        labels = list(columns)
        wanted = set(labels)
        selector = preprocessing.FunctionTransformer(
            func=ATOpPredictiveModel._select_columns,
            kw_args={
                "columns": labels,
                "positions": [i for i, label in enumerate(X.columns) if label in wanted],
            },
        )
        return make_pipeline(selector, self._get_model(name, dict(params or {})))

    def fit(self, X, y):
        """Fit both branches and the combining ensemble.

        Args:
            X: Training DataFrame containing the four boundary columns named
                in the class docstring.
            y: Training target.

        Returns:
            ATOpPredictiveModel: ``self``.

        Raises:
            ValueError: On an unknown model name or ensemble type.
        """
        self.cols1 = X.loc[:, 'COMBUSTIVEL':'LUBRIFICANTE'].columns
        self.cols2 = X.loc[:, 'ACUCARES':'DIAS_PORTO_FORA_SEDE'].columns

        # VotingRegressor and StackingRegressor fit *clones* of the estimators
        # on the X they are given. The column restriction therefore has to
        # live inside each estimator; pre-fitting the base models on a column
        # subset and passing them in has no effect on the ensemble.
        estimators = [
            ('model1', self._build_branch(self.model1_name, self.model1_params, X, self.cols1)),
            ('model2', self._build_branch(self.model2_name, self.model2_params, X, self.cols2)),
        ]

        if self.ensemble_type == "voting":
            self.ensemble_ = VotingRegressor(estimators)
        elif self.ensemble_type == "stacking":
            meta = LinearRegression() if self.final_estimator is None else self.final_estimator
            final_estimator = clone(meta()) if callable(meta) else clone(meta)
            self.ensemble_ = StackingRegressor(
                estimators=estimators,
                final_estimator=final_estimator,
                passthrough=False,
                cv=KF
            )
        else:
            raise ValueError(f"Tipo de ensemble desconhecido: {self.ensemble_type}")

        self.ensemble_.fit(X, y)

        # Expose the fitted base models (last pipeline step of each branch);
        # each one was trained on the full training rows of its column block.
        self.model1_ = self.ensemble_.named_estimators_['model1'][-1]
        self.model2_ = self.ensemble_.named_estimators_['model2'][-1]
        return self

    def predict(self, X):
        """Return the ensemble prediction for ``X``."""
        return self.ensemble_.predict(X)

# ========================
# Funções auxiliares
# ========================
def suggest_params(trial, model_name, prefix=""):
    """Sample the hyperparameters of ``model_name`` from an Optuna-style trial.

    Each value is requested from ``trial`` under the name ``prefix`` followed
    by the hyperparameter name, while the returned dict is keyed by the bare
    hyperparameter name.

    Args:
        trial: Object exposing ``suggest_int``, ``suggest_float`` and
            ``suggest_categorical``.
        model_name: "DecisionTree", "RandomForest", "GradientBoosting",
            "KNeighbors" or "LinearRegression" (which has nothing to tune).
        prefix: Namespace prepended to every name registered with ``trial``.

    Returns:
        dict: Hyperparameter name -> sampled value.

    Raises:
        ValueError: If ``model_name`` is not one of the names above.
    """
    def inteiro(campo, minimo, maximo):
        return trial.suggest_int(f'{prefix}{campo}', minimo, maximo)

    def real(campo, minimo, maximo):
        return trial.suggest_float(f'{prefix}{campo}', minimo, maximo)

    def categoria(campo, opcoes):
        return trial.suggest_categorical(f'{prefix}{campo}', opcoes)

    if model_name == "DecisionTree":
        amostra = {}
        amostra['max_depth'] = inteiro('max_depth', 3, 10)
        amostra['min_samples_split'] = inteiro('min_samples_split', 2, 10)
        amostra['min_samples_leaf'] = inteiro('min_samples_leaf', 1, 5)
        return amostra

    if model_name == "RandomForest":
        amostra = {}
        amostra['n_estimators'] = inteiro('n_estimators', 50, 150)
        amostra['max_depth'] = inteiro('max_depth', 3, 10)
        amostra['min_samples_split'] = inteiro('min_samples_split', 2, 10)
        amostra['min_samples_leaf'] = inteiro('min_samples_leaf', 1, 5)
        return amostra

    if model_name == "GradientBoosting":
        amostra = {}
        amostra['n_estimators'] = inteiro('n_estimators', 100, 200)
        amostra['learning_rate'] = real('learning_rate', 0.01, 0.2)
        amostra['max_depth'] = inteiro('max_depth', 3, 10)
        return amostra

    if model_name == "KNeighbors":
        amostra = {}
        amostra['n_neighbors'] = inteiro('n_neighbors', 1, 10)
        amostra['weights'] = categoria('weights', ['uniform', 'distance'])
        return amostra

    if model_name == "LinearRegression":
        return {}

    raise ValueError(f"Modelo desconhecido: {model_name}")

def extract_params(params_dict, prefix):
    """Return the entries of ``params_dict`` under ``prefix``, with it removed.

    Only the leading occurrence of ``prefix`` is stripped, so a hyperparameter
    whose own name happens to contain the prefix text (for example
    ``"1_l1_ratio"`` under prefix ``"1_"``) keeps its name intact.

    Args:
        params_dict: Mapping of prefixed parameter name -> value.
        prefix: Namespace to select and strip.

    Returns:
        dict: Bare parameter name -> value, for keys starting with ``prefix``.
    """
    corte = len(prefix)
    return {k[corte:]: v for k, v in params_dict.items() if k.startswith(prefix)}

# =============================================
# Pipeline de execução com todas as combinações
# =============================================
def executar_ATOp_permutacoes(X, y, n_trials=N_TRIALS, n_splits=N_SPLITS, ensemble_type="stacking", final_estimator=None):
    """Tune and score ATOpPredictiveModel for every ordered pair of base models.

    For each ordered pair drawn from the five supported model names, Optuna
    maximises the mean R² over the folds of the module-level ``KF`` splitter.
    The best hyperparameters are then re-scored (R² and RMSE) over the same
    folds. Only the Optuna search is timed.

    Args:
        X: Feature table, indexed positionally with ``.iloc``.
        y: Target, indexed positionally with ``.iloc``.
        n_trials: Number of Optuna trials per pair.
        n_splits: Accepted but not read by this function; the folds come from
            the module-level ``KF``.
        ensemble_type: Forwarded to ``ATOpPredictiveModel``.
        final_estimator: Forwarded to ``ATOpPredictiveModel``.

    Returns:
        pandas.DataFrame: One row per pair, sorted by "R2 Média" descending.
    """
    candidatos = ["LinearRegression", "DecisionTree", "RandomForest", "GradientBoosting", "KNeighbors"]
    tabela = {}

    for name1, name2 in permutations(candidatos, 2):
        print(f"\nOtimizando ATOp com: {name1} + {name2} ({ensemble_type})")

        def montar(params1, params2):
            # Same construction for the tuning candidates and the final model.
            return ATOpPredictiveModel(
                model1_name=name1,
                model2_name=name2,
                model1_params=params1,
                model2_params=params2,
                ensemble_type=ensemble_type,
                final_estimator=final_estimator
            )

        def objective(trial):
            params1 = suggest_params(trial, name1, prefix="m1_")
            params2 = suggest_params(trial, name2, prefix="m2_")
            candidato = montar(params1, params2)
            r2_por_fold = []
            for treino, validacao in KF.split(X):
                candidato.fit(X.iloc[treino], y.iloc[treino])
                previsto = candidato.predict(X.iloc[validacao])
                r2_por_fold.append(r2_score(y.iloc[validacao], previsto))
            return np.mean(r2_por_fold)

        inicio_total = time.perf_counter()
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=n_trials)
        tempo_total = time.perf_counter() - inicio_total

        melhores1 = extract_params(study.best_params, "m1_")
        melhores2 = extract_params(study.best_params, "m2_")
        vencedor = montar(melhores1, melhores2)

        r2_scores, rmse_scores = [], []
        for treino, teste in KF.split(X):
            vencedor.fit(X.iloc[treino], y.iloc[treino])
            previsto = vencedor.predict(X.iloc[teste])
            r2_scores.append(r2_score(y.iloc[teste], previsto))
            rmse_scores.append(root_mean_squared_error(y.iloc[teste], previsto))

        tabela[f"{name1} + {name2} ({ensemble_type})"] = {
            "R2 Média": np.mean(r2_scores),
            "R2 Std": np.std(r2_scores),
            "RMSE Média": np.mean(rmse_scores),
            "RMSE Std": np.std(rmse_scores),
            "Tempo Total (s)": tempo_total
        }

    df_results = pd.DataFrame(tabela).T.sort_values("R2 Média", ascending=False)
    print("\nResultados dos pares no ATOpPredictiveModel:")
    print(df_results.round(4))
    return df_results

# =============================================
# Pipeline de execução para um par específico
# =============================================
def treinar_ATOp_par(model1_name, model2_name, X, y, n_trials=N_TRIALS, n_splits=N_SPLITS,
                     ensemble_type="stacking", final_estimator=None):
    """Tune, score and return an ATOpPredictiveModel for one pair of base models.

    Optuna maximises the mean R² over the folds of the module-level ``KF``
    splitter. The best hyperparameters are then re-scored (R² and RMSE) over
    the same folds and the summary is printed.

    Args:
        model1_name: Base model of branch 1.
        model2_name: Base model of branch 2.
        X: Feature table, indexed positionally with ``.iloc``.
        y: Target, indexed positionally with ``.iloc``.
        n_trials: Number of Optuna trials.
        n_splits: Accepted but not read by this function; the folds come from
            the module-level ``KF``.
        ensemble_type: Forwarded to ``ATOpPredictiveModel``.
        final_estimator: Forwarded to ``ATOpPredictiveModel``.

    Returns:
        tuple: ``(modelo_final, best_params, tempo_total)`` where
        ``best_params`` maps each model name to its tuned hyperparameters and
        ``tempo_total`` is the duration of the Optuna search in seconds.
    """
    print(f"\nOtimizando ATOp com: {model1_name} + {model2_name} ({ensemble_type})")

    def objective(trial):
        model1_params = suggest_params(trial, model1_name, prefix="m1_")
        model2_params = suggest_params(trial, model2_name, prefix="m2_")
        model = ATOpPredictiveModel(
            model1_name=model1_name,
            model2_name=model2_name,
            model1_params=model1_params,
            model2_params=model2_params,
            ensemble_type=ensemble_type,
            final_estimator=final_estimator
        )
        scores = []
        for train_idx, val_idx in KF.split(X):
            model.fit(X.iloc[train_idx], y.iloc[train_idx])
            preds = model.predict(X.iloc[val_idx])
            scores.append(r2_score(y.iloc[val_idx], preds))
        return np.mean(scores)

    # tempo_total is printed and returned below, so it must be measured here.
    # The hyperparameter search alone is timed, mirroring
    # executar_ATOp_permutacoes.
    inicio_total = time.perf_counter()
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)
    tempo_total = time.perf_counter() - inicio_total

    best_params = {
        model1_name: extract_params(study.best_params, "m1_"),
        model2_name: extract_params(study.best_params, "m2_")
    }

    print("\nMelhores hiperparâmetros:")
    print(f"{model1_name}: {best_params[model1_name]}")
    print(f"{model2_name}: {best_params[model2_name]}")

    modelo_final = ATOpPredictiveModel(
        model1_name=model1_name,
        model2_name=model2_name,
        model1_params=best_params[model1_name],
        model2_params=best_params[model2_name],
        ensemble_type=ensemble_type,
        final_estimator=final_estimator
    )

    # NOTE(review): the returned model is left fitted on the training part of
    # the last fold only, not on all of X -- confirm whether callers expect a
    # refit on the full data before using it for prediction.
    r2_scores = []
    rmse_scores = []
    for train_idx, test_idx in KF.split(X):
        modelo_final.fit(X.iloc[train_idx], y.iloc[train_idx])
        pred = modelo_final.predict(X.iloc[test_idx])
        r2_scores.append(r2_score(y.iloc[test_idx], pred))
        rmse_scores.append(root_mean_squared_error(y.iloc[test_idx], pred))

    print(f"\nResultados com {model1_name} + {model2_name} ({ensemble_type}):")
    print(f"Média do R²: {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}")
    print(f"Média do RMSE: {np.mean(rmse_scores):.4f} ± {np.std(rmse_scores):.4f}")
    print(f"Tempo Total: {tempo_total:.2f} segundos")

    return modelo_final, best_params, tempo_total

### **Teste Estatístico**

In [None]:
# Create the output folder
os.makedirs('outputs', exist_ok=True)

# ======= MODEL DEFINITIONS =======
# One fixed configuration per experimental block; the three are compared
# fold by fold below.
modelos = {
    'Exp1_GB': GradientBoostingRegressor(
        random_state=RANDOM_STATE,
        n_estimators=100,
        learning_rate=0.01,
        max_depth=3
    ),
    'Exp2_Comite': ComiteModelos2Stacking(
        model_names=("Gradient Boosting", "Decision Tree"),
        model_params={
            "Gradient Boosting": {"n_estimators": 100, "learning_rate": 0.1, "max_depth": 3},
            "Decision Tree": {"max_depth": None, "min_samples_split": 2, "min_samples_leaf": 1},
        },
        meta_model=Ridge(),
        n_runs=1,
        random_state=RANDOM_STATE
    ),
    'Exp3_ATOp-PM': ATOpPredictiveModel(
        model1_name="DecisionTree",
        model2_name="GradientBoosting",
        model1_params={"max_depth": None, "min_samples_split": 2, "min_samples_leaf": 1},
        model2_params={'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 3},
        ensemble_type="stacking",
        final_estimator=Ridge()
    )
}

resultados = []

# ======= CROSS-VALIDATION =======
# Every model is fitted and scored on the same folds so the per-fold metrics
# are paired across models.
for fold, (train_idx, test_idx) in enumerate(KF.split(X), 1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    for nome, modelo in modelos.items():
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # and is what the rest of this notebook uses to measure durations.
        start = time.perf_counter()
        modelo.fit(X_train, y_train)
        y_pred = modelo.predict(X_test)
        end = time.perf_counter()

        r2 = r2_score(y_test, y_pred)
        rmse = root_mean_squared_error(y_test, y_pred)
        tempo = end - start  # fit + predict, in seconds

        resultados.append({
            'modelo': nome,
            'fold': fold,
            'r2': r2,
            'rmse': rmse,
            'tempo': tempo
        })

# ======= SAVE RESULTS =======
df_resultados = pd.DataFrame(resultados)
df_resultados.to_csv('outputs/metricas_por_modelo.csv', index=False)

# ======= MATRICES (fold x model) =======
r2_matrix = df_resultados.pivot(index="fold", columns="modelo", values="r2")
r2_matrix.to_csv('outputs/r2_matrix.csv')

rmse_matrix = df_resultados.pivot(index="fold", columns="modelo", values="rmse")
rmse_matrix.to_csv('outputs/rmse_matrix.csv')

tempo_matrix = df_resultados.pivot(index="fold", columns="modelo", values="tempo")
tempo_matrix.to_csv('outputs/tempo_matrix.csv')

print("\nProcesso concluído. Arquivos gerados na pasta 'outputs'.")