# Treinamento e inferência do regressor

Este notebook demonstra como executar o pipeline DS em modo de regressão para prever a quantidade produzida por m³ e reaproveitar os artefatos resultantes durante a inferência.


## 1. Imports
Dependências principais para orquestrar o pipeline e persistir o modelo.


In [1]:
# # === CONFIGURAÇÃO DO AMBIENTE ===
# === CONFIGURAÇÃO DO AMBIENTE ===

import sys
from pathlib import Path
import pandas as pd
import numpy as np
from pprint import pprint


# Encontrar raiz do projeto
def find_project_root(marker="pyproject.toml"):
    """Retorna o diretório do projeto procurando pelo arquivo marcador."""
    current = Path.cwd().resolve()
    for path in (current,) + tuple(current.parents):
        if (path / marker).exists():
            return path
    return current


PROJECT_ROOT = find_project_root()
SRC_DIR = PROJECT_ROOT / "src"
DATA_DIR = PROJECT_ROOT / "data"

# if str(SRC_DIR) not in sys.path:
#     sys.path.insert(0, str(SRC_DIR))

# # Importar módulos do pipeline
# import pipelines as ds_pipelines
# from model import save_model_artifacts, load_model_artifacts

# print(f"✓ Projeto: {PROJECT_ROOT}")
# print(f"✓ Source: {SRC_DIR}")
# print(f"✓ Data: {DATA_DIR}")

In [4]:
from pathlib import Path
import pandas as pd

from pipelines.DS import pipelines as ds_pipelines
from model import save_model_artifacts, load_model_artifacts

ModuleNotFoundError: No module named 'pipelines.DS'; 'pipelines' is not a package

## 2. Configurações


In [2]:
PROJECT_ROOT = Path('.')
MACHINE_TYPE = 'flexo'  # ou 'cv'
MODEL_TYPE = 'catboost'
TASK_TYPE = 'regression'  # altere para 'classification' se necessário
SAVE_MODEL = True
MODEL_NAME = f'regressor_m3_{MODEL_TYPE}'
RANDOM_STATE = 42
CLASSIFICATION_THRESHOLD = 0.7

PROJECT_ROOT.resolve()


PosixPath('/home/adami/Documentos/Projeto_IA_AMCOM/project_data_science/src/pipelines/DS')

## 3. Treinamento
Executa o pipeline completo em modo regressão.


In [3]:
results = ds_pipelines.run_pipeline(
    machine_type=MACHINE_TYPE,
    task_type=TASK_TYPE,
    model_type=MODEL_TYPE,
    random_state=RANDOM_STATE,
    shap_sample_size=0,
    classification_threshold=CLASSIFICATION_THRESHOLD,
)
metrics = results['metrics']
metrics


NameError: name 'ds_pipelines' is not defined

In [None]:
metrics_df = pd.Series(metrics).to_frame('value')
metrics_df


Unnamed: 0,value
mae,45315.393545
rmse,115498.67519
r2,0.986687


### Principais features


In [None]:
feature_importance = results['feature_importance']
feature_importance.head(15)


## 4. Salvar artefatos


In [None]:
if SAVE_MODEL:
    model_dir = PROJECT_ROOT / 'src' / 'model'
    model_dir.mkdir(parents=True, exist_ok=True)
    save_path = save_model_artifacts(
        results,
        save_dir=model_dir,
        machine_type=MACHINE_TYPE,
        model_name=MODEL_NAME,
    )
else:
    save_path = None
save_path


## 5. Inferência rápida com o modelo em memória


In [None]:
regressor = results.get('regressor')
selected_features = results.get('selected_features', [])
df_model = results['df'].copy()

sample_ops = df_model.sample(n=5, random_state=21).reset_index(drop=True)
X_infer = sample_ops.reindex(columns=selected_features, fill_value=0)
sample_ops['pred_qt_por_m3'] = regressor.predict(X_infer)

sample_ops[['CD_OP', 'QT_PRODUZIDA', 'y_quantidade_por_m3', 'pred_qt_por_m3']]


## 6. Inferência usando o modelo salvo


In [None]:
if save_path is not None:
    persisted_artifacts = load_model_artifacts(save_path)
    persisted_regressor = persisted_artifacts.get('regressor', regressor)
    persisted_features = persisted_artifacts.get('selected_features', selected_features)
else:
    persisted_artifacts = None
    persisted_regressor = regressor
    persisted_features = selected_features

new_sample = results['df'].sample(n=5, random_state=7).reset_index(drop=True)
X_new = new_sample.reindex(columns=persisted_features, fill_value=0)
new_sample['pred_qt_por_m3_from_disk'] = persisted_regressor.predict(X_new)

new_sample[['CD_OP', 'pred_qt_por_m3_from_disk']]
