# Intro

Módulo de Previsão de Vendas (Forecast) - One-Click Order

Este módulo contém uma solução completa e robusta para prever a quantidade
semanal de vendas por Ponto de Venda (PDV) e SKU. O objetivo é apoiar o processo
de reposição de estoque para as primeiras semanas de 2023, com base no histórico
de vendas de 2022.

O pipeline foi reestruturado para seguir as melhores práticas de engenharia
de software e machine learning, incluindo:
- Estrutura orientada a objetos para manutenibilidade.
- Engenharia de features avançada (lags e janelas móveis).
- Utilização do LightGBM, um modelo de alta performance.
- Validação hold-out temporal (com early stopping) e tuning de hiperparâmetros com Optuna.
- Logging profissional e documentação completa.

Autor: BSB Data 01

Data da Versão: 2025-09-12

# Modelo de Previsão
Versão 2: 16.09.2025

### Setup

In [None]:
# Mount Google Drive at /content/drive so the project files used by the
# following cells can be reached through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# List the project folder to confirm the Drive mount exposes the expected files.
!ls "/content/drive/MyDrive/Hackathon_Forecast/"

artifacts  data  Forecast_Model_Notebook3.ipynb


### Bibliotecas

In [None]:
# Install the two third-party packages imported below (quiet mode).
!pip install lightgbm optuna -q

In [None]:
import logging
import os
import warnings
from datetime import datetime
from typing import Dict, List, Tuple

import optuna
import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder

### Logging

In [None]:
# Send pipeline messages through the logging module instead of print().
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Notebook-wide reproducibility, display and plotting defaults.
np.random.seed(42)
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-whitegrid')

### Versão 2 - 16.09.2025

    A versão 2.0 do Forecaster contém:
    - Engenharia de features avançada (mais lags, janelas e features dimensionais).
    - Validação Hold-Out e Early Stopping (treinamento melhor).
    - Tratamento nativo de features categóricas pelo LightGBM.
    - Tuning de hiperparâmetros com Optuna.


In [None]:
# ==============================================================================
# Classe Final do Modelo de Previsão (Versão 2.1 - 17.09.2025)
# ==============================================================================

class SalesForecasterV2:
    """Weekly sales forecaster per point of sale (PDV) and SKU, backed by LightGBM.

    The class covers the whole pipeline: loading and aggregating the raw
    parquet files, building lag / rolling-window features, training (optionally
    tuned with Optuna), iterative multi-week forecasting and model persistence.
    """

    # Row offsets used for the lag features. They are offsets between
    # consecutive *records* of a (pdv, sku) pair, so they only equal calendar
    # weeks when that pair has a row for every week.
    LAGS: Tuple[int, ...] = (1, 2, 3, 4, 12, 52)
    # Window sizes (in records) of the rolling statistics.
    WINDOWS: Tuple[int, ...] = (4, 12, 52)
    # Statistics computed for each rolling window, in feature order.
    _ROLLING_STATS: Tuple[str, ...] = ('mean', 'std', 'max')

    def __init__(self):
        """Initialise an untrained forecaster with empty metadata."""
        self.model = None
        self.feature_names: List[str] = []
        self.categorical_features: List[str] = []
        self.performance_metrics: Dict[str, float] = {}

    def load_data(self, file_paths: Dict[str, str]) -> pd.DataFrame:
        """Load the sales fact table and aggregate it per ISO week, PDV and SKU.

        Args:
            file_paths: Mapping with the keys ``'vendas'``, ``'pdvs'`` and
                ``'produtos'`` pointing to the respective parquet files.

        Returns:
            DataFrame with the columns ``ano``, ``semana``, ``pdv``, ``sku``
            and ``quantidade`` (summed quantity of the week).

        Raises:
            FileNotFoundError: If one of the parquet files does not exist.
            KeyError: If ``file_paths`` lacks one of the expected keys.
        """
        logging.info("Iniciando o carregamento dos dados normalizados.")
        try:
            df_vendas = pd.read_parquet(file_paths['vendas'])
            df_pdvs = pd.read_parquet(file_paths['pdvs'])
            df_produtos = pd.read_parquet(file_paths['produtos'])
            logging.info("Arquivos de vendas, pdvs e produtos carregados com sucesso.")
        except (FileNotFoundError, KeyError) as e:
            logging.error(f"Erro ao carregar os arquivos. Erro: {e}")
            raise

        # The dimension tables are only used to keep sales whose store and
        # product are registered. Joining on de-duplicated keys avoids carrying
        # unused columns and prevents a duplicated dimension row from
        # multiplying (and therefore inflating) the sales quantities.
        sales_columns = ['internal_store_id', 'internal_product_id', 'transaction_date', 'quantity']
        df_merged = pd.merge(
            df_vendas[sales_columns],
            df_pdvs[['pdv']].drop_duplicates(),
            left_on='internal_store_id', right_on='pdv', how='inner',
        )
        df_merged = pd.merge(
            df_merged,
            df_produtos[['produto']].drop_duplicates(),
            left_on='internal_product_id', right_on='produto', how='inner',
        )

        # ISO calendar: the first days of January can belong to the last ISO
        # week of the previous ISO year.
        iso_calendar = pd.to_datetime(df_merged['transaction_date']).dt.isocalendar()
        df_merged['ano'] = iso_calendar['year']
        df_merged['semana'] = iso_calendar['week']

        logging.info("Agregando dados de vendas por semana/pdv/produto.")
        agg_vendas = df_merged.groupby(['ano', 'semana', 'pdv', 'produto']).agg(
            total_quantity=('quantity', 'sum')
        ).reset_index()

        df_aggregated = agg_vendas.rename(columns={'produto': 'sku', 'total_quantity': 'quantidade'})
        logging.info(f"Dados agregados e enriquecidos. DataFrame final com {df_aggregated.shape[0]} registros.")

        return df_aggregated

    def feature_engineering(self, df: pd.DataFrame) -> pd.DataFrame:
        """Build calendar, lag and rolling-window features for every row.

        Args:
            df: Frame with at least ``pdv``, ``sku``, ``ano``, ``semana`` and
                ``quantidade``.

        Returns:
            A new frame sorted by ``pdv``, ``sku``, ``ano``, ``semana`` (the
            input is not modified) with the feature columns added and every
            missing value replaced by 0. The original index labels are kept.
        """
        logging.info("Iniciando a engenharia de features avançada.")
        group_keys = ['pdv', 'sku']
        df_featured = df.sort_values(group_keys + ['ano', 'semana'])

        # Work on a unique positional index so the per-group rolling results
        # can be aligned back unambiguously; the labels are restored at the end.
        original_index = df_featured.index
        df_featured = df_featured.reset_index(drop=True)

        # 1. Calendar features.
        df_featured['trimestre'] = (df_featured['semana'] - 1) // 13 + 1
        df_featured['seno_semana'] = np.sin(2 * np.pi * df_featured['semana'] / 52)
        df_featured['cosseno_semana'] = np.cos(2 * np.pi * df_featured['semana'] / 52)

        # 2. Lags: previous records of the same (pdv, sku) pair.
        qty_by_pair = df_featured.groupby(group_keys)['quantidade']
        for lag in self.LAGS:
            df_featured[f'lag_{lag}_semanas'] = qty_by_pair.shift(lag)

        # 3. Rolling statistics over the *previous* records (shift(1) keeps the
        # current quantity out of its own features). The rolling window must
        # also be evaluated per pair: rolling over the plain shifted Series
        # would mix the tail of one pair into the head of the next one.
        previous_qty = qty_by_pair.shift(1)
        previous_by_pair = previous_qty.groupby(
            [df_featured[key] for key in group_keys], sort=False, observed=True
        )
        for window in self.WINDOWS:
            rolling = previous_by_pair.rolling(window=window, min_periods=1)
            for stat in self._ROLLING_STATS:
                values = getattr(rolling, stat)()
                # Drop the group levels so only the row index is left for alignment.
                df_featured[f'rolling_{stat}_{window}_semanas'] = values.reset_index(
                    level=list(range(len(group_keys))), drop=True
                )

        df_featured.index = original_index
        df_featured.fillna(0, inplace=True)
        logging.info("Engenharia de features avançada concluída.")
        return df_featured

    def _build_feature_names(self) -> List[str]:
        """Return the model's feature columns in their fixed training order."""
        names = ['semana', 'trimestre', 'seno_semana', 'cosseno_semana', 'pdv', 'sku']
        names.extend(f'lag_{lag}_semanas' for lag in self.LAGS)
        for window in self.WINDOWS:
            names.extend(f'rolling_{stat}_{window}_semanas' for stat in self._ROLLING_STATS)
        return names

    def _prepare_data_for_model(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
        """Split a featured frame into the LightGBM inputs ``X`` and target ``y``.

        Also (re)sets ``self.categorical_features`` and ``self.feature_names``.

        Args:
            df: Output of :meth:`feature_engineering`.

        Returns:
            ``(X, y)`` where ``X`` is an independent copy holding the feature
            columns (``pdv`` and ``sku`` as ``category``) and ``y`` is the
            ``quantidade`` column.
        """
        logging.info("Preparando dados para modelagem (tratamento de categóricas).")
        self.categorical_features = ['pdv', 'sku']
        self.feature_names = self._build_feature_names()

        # Copy so callers can safely re-assign columns on X without touching
        # (or chained-assigning into) the source frame.
        X = df[self.feature_names].copy()
        for col in self.categorical_features:
            X[col] = X[col].astype('category')
        y = df['quantidade']
        return X, y

    def train(self, df: pd.DataFrame, validation_split_week: int = 48, use_optuna: bool = True, n_trials: int = 200):
        """Train the LightGBM model on the 2022 history with a temporal hold-out.

        Rows with ``semana < validation_split_week`` are used for fitting and
        the remaining ones for early stopping and for the reported MAE (the
        same hold-out also scores the Optuna trials, so the logged MAE is
        optimistic).

        Args:
            df: Aggregated history as returned by :meth:`load_data`.
            validation_split_week: First week that belongs to the hold-out set.
            use_optuna: Tune the hyper-parameters with Optuna when ``True``;
                otherwise fit LightGBM with its default hyper-parameters.
            n_trials: Number of Optuna trials (ignored when ``use_optuna`` is
                ``False``). Each trial fits a full model, so this drives the
                total run time.

        Raises:
            ValueError: If there is no 2022 data, or if the split leaves the
                training or the validation set empty.
        """
        logging.info("Iniciando o processo de treinamento avançado.")
        df_train_raw = df[df['ano'] == 2022].copy()
        if df_train_raw.empty:
            raise ValueError("Não há dados históricos de 2022 para treinar o modelo.")

        df_featured = self.feature_engineering(df_train_raw)
        train_set = df_featured[df_featured['semana'] < validation_split_week]
        val_set = df_featured[df_featured['semana'] >= validation_split_week]
        if train_set.empty or val_set.empty:
            # Early stopping needs both sets; fail with a clear message instead
            # of an obscure LightGBM error.
            raise ValueError(
                f"validation_split_week={validation_split_week} deixa o conjunto de treino "
                "ou de validação vazio."
            )

        X_train, y_train = self._prepare_data_for_model(train_set)
        X_val, y_val = self._prepare_data_for_model(val_set)

        # Both frames must share one category list per column, otherwise the
        # integer codes seen by LightGBM would differ between train and validation.
        for col in self.categorical_features:
            all_categories = pd.concat([X_train[col], X_val[col]]).astype('category').cat.categories
            X_train[col] = pd.Categorical(X_train[col], categories=all_categories)
            X_val[col] = pd.Categorical(X_val[col], categories=all_categories)

        logging.info(f"Dados de treino: {len(X_train)} registros. Dados de validação: {len(X_val)} registros.")

        fit_params = {
            "eval_set": [(X_val, y_val)],
            "eval_metric": "mae",
            "callbacks": [lgb.early_stopping(10, verbose=False)]
        }

        if use_optuna:
            logging.info(f"Iniciando o tuning de hiperparâmetros com Optuna ({n_trials} trials).")

            def objective(trial):
                """Fit one candidate model and return its hold-out MAE."""
                params = {
                    'objective': 'regression_l1', 'metric': 'mae', 'n_estimators': 1000,
                    'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
                    'num_leaves': trial.suggest_int('num_leaves', 20, 300),
                    'max_depth': trial.suggest_int('max_depth', 3, 12),
                    'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
                    'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
                    'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
                    'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
                    'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
                    'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
                    'random_state': 42, 'n_jobs': -1
                }
                model = lgb.LGBMRegressor(**params)
                model.fit(X_train, y_train, **fit_params, categorical_feature=self.categorical_features)
                preds = model.predict(X_val)
                return mean_absolute_error(y_val, preds)

            # Seed the sampler so the search is reproducible, matching the
            # fixed random_state used for the models themselves.
            study = optuna.create_study(
                direction='minimize',
                sampler=optuna.samplers.TPESampler(seed=42),
            )
            study.optimize(objective, n_trials=n_trials)

            logging.info(f"Melhores hiperparâmetros encontrados: {study.best_params}")
            self.model = lgb.LGBMRegressor(objective='regression_l1', random_state=42, n_estimators=1000, **study.best_params)
        else:
            logging.info("Treinando modelo LightGBM com hiperparâmetros padrão e early stopping.")
            self.model = lgb.LGBMRegressor(objective='regression_l1', random_state=42, n_estimators=1000)

        self.model.fit(X_train, y_train, **fit_params, categorical_feature=self.categorical_features)

        val_preds = self.model.predict(X_val)
        mae = mean_absolute_error(y_val, val_preds)
        self.performance_metrics['validation_mae'] = mae
        logging.info(f"Treinamento concluído. MAE no set de validação: {mae:.4f}")

    def generate_forecasts(self, df_historical: pd.DataFrame, weeks_to_forecast: int, forecast_year: int = 2023) -> pd.DataFrame:
        """Forecast the next weeks iteratively, feeding each prediction back as history.

        For every target week a placeholder row (unknown quantity) is appended
        for each (pdv, sku) pair and the features are engineered on history +
        placeholders. This way the lags / rolling statistics of the target row
        refer to the most recent known quantities and the calendar features
        refer to the target week itself.

        Args:
            df_historical: Aggregated history (``ano``, ``semana``, ``pdv``,
                ``sku``, ``quantidade``). ``forecast_year`` must sort after
                every ``ano`` present here.
            weeks_to_forecast: Number of weeks to predict, starting at week 1.
            forecast_year: Year assigned to the forecast weeks.

        Returns:
            DataFrame with ``pdv``, ``sku``, ``semana`` and
            ``quantidade_prevista`` (non-negative integers); an empty
            DataFrame when nothing could be predicted.

        Raises:
            RuntimeError: If no model has been trained or loaded.
        """
        if self.model is None:
            raise RuntimeError("O modelo não foi treinado. Execute o método 'train' primeiro.")

        logging.info(f"Iniciando a geração de previsões para {weeks_to_forecast} semanas.")
        history = df_historical.copy()
        all_forecasts = []

        for current_week in range(1, weeks_to_forecast + 1):
            logging.info(f"Processando previsões para a semana {current_week} de {forecast_year}.")

            targets = history[['pdv', 'sku']].drop_duplicates()
            if targets.empty:
                logging.warning(f"Não há dados base para prever a semana {current_week}.")
                continue

            # Placeholder rows for the week being predicted (quantity unknown).
            targets = targets.assign(ano=forecast_year, semana=current_week, quantidade=np.nan)
            features = self.feature_engineering(pd.concat([history, targets], ignore_index=True))
            is_target = (features['ano'] == forecast_year) & (features['semana'] == current_week)
            X_pred = features[is_target].copy()

            # Keep the plain key values: they go back into the history, while
            # X_pred gets the categorical encoding the model was trained with.
            raw_keys = X_pred[['pdv', 'sku']].copy()
            for position, col in enumerate(self.categorical_features):
                model_categories = self.model.booster_.pandas_categorical[position]
                X_pred[col] = pd.Categorical(X_pred[col], categories=model_categories)

            # Values unknown to the model became NaN above; they cannot be scored.
            X_pred.dropna(subset=self.categorical_features, inplace=True)
            if X_pred.empty:
                logging.warning(f"Nenhum PDV/SKU conhecido para prever na semana {current_week} após filtrar categorias.")
                continue

            predictions = self.model.predict(X_pred[self.feature_names])
            predictions = np.maximum(0, np.round(predictions)).astype(int)

            week_forecast = X_pred[['pdv', 'sku']].copy()
            week_forecast['semana'] = current_week
            week_forecast['quantidade_prevista'] = predictions
            all_forecasts.append(week_forecast)

            # Feed the predictions back so the next week can use them as lags.
            new_data = raw_keys.loc[X_pred.index].copy()
            new_data['semana'] = current_week
            new_data['quantidade'] = predictions
            new_data['ano'] = forecast_year
            history = pd.concat([history, new_data], ignore_index=True)

        return pd.concat(all_forecasts, ignore_index=True) if all_forecasts else pd.DataFrame()

    def save_model(self, path: str):
        """Persist the model and its feature metadata with joblib.

        Args:
            path: Destination file. Missing parent directories are created.

        Raises:
            RuntimeError: If there is no trained model to save.
        """
        if self.model is None:
            raise RuntimeError("Nenhum modelo treinado para salvar.")
        # dirname is '' for a bare file name and os.makedirs('') would fail.
        target_dir = os.path.dirname(path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        artifacts = {
            "model": self.model,
            "feature_names": self.feature_names,
            "categorical_features": self.categorical_features
        }
        joblib.dump(artifacts, path)
        logging.info(f"Modelo e artefatos V2 salvos em: '{path}'")

### Treinamento + Previsão

Execução do pipeline de treinamento e geração dos artefatos.


> Artefatos gerados:
>*   sales_forecaster_v2_final.joblib (modelo treinado)
>*   previsoes_janeiro_2023_sorted.csv (arquivo com as previsões)

In [None]:
# =======================================================================================
# MAIN TRAINING CELL (produces the COMPLETE forecast file)
# =======================================================================================

# --- 1. CONFIGURATION ---
logging.info("Iniciando o Pipeline de Treinamento Principal.")
PROJECT_PATH = '/content/drive/MyDrive/Hackathon_Forecast'
model_output_path = os.path.join(PROJECT_PATH, 'artifacts/sales_forecaster_v2_final.joblib')
forecast_output_dir = os.path.join(PROJECT_PATH, 'data/processed')
file_paths = {
    'vendas': os.path.join(PROJECT_PATH, 'data/raw/fato_vendas.parquet'),
    'pdvs': os.path.join(PROJECT_PATH, 'data/raw/dim_pdvs.parquet'),
    'produtos': os.path.join(PROJECT_PATH, 'data/raw/dim_produtos.parquet')
}

# --- 2. EXECUTION ---
forecaster_v2 = SalesForecasterV2()
try:
    df_full_data = forecaster_v2.load_data(file_paths)

    # Train with 100 Optuna trials; raise n_trials (e.g. 200) for a longer search.
    forecaster_v2.train(df_full_data, validation_split_week=48, use_optuna=True, n_trials=100)

    # Persist the trained model.
    forecaster_v2.save_model(path=model_output_path)

    # Forecast every (pdv, sku) pair of the 2022 history, without any filter.
    forecasts_completos = forecaster_v2.generate_forecasts(df_full_data[df_full_data['ano'] == 2022], weeks_to_forecast=5)

    # Format and store the complete result for later analysis.
    if not forecasts_completos.empty:
        df_completo_formatado = forecasts_completos.rename(columns={'sku': 'produto', 'quantidade_prevista': 'quantidade'})
        df_completo_formatado = df_completo_formatado[['semana', 'pdv', 'produto', 'quantidade']]
        df_completo_sorted = df_completo_formatado.sort_values(by=['semana', 'quantidade'], ascending=[True, False])

        # The output folder is not guaranteed to exist yet.
        os.makedirs(forecast_output_dir, exist_ok=True)
        # "%Y%m%d": the previous "%Ym%d" dropped the month and emitted a literal "m".
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        full_filename = os.path.join(forecast_output_dir, f"previsao_COMPLETA_{timestamp}.parquet")
        df_completo_sorted.to_parquet(full_filename, index=False)
        logging.info(f"Arquivo de previsão COMPLETO salvo para análise em: {full_filename}")

except Exception as e:
    logging.error(f"O pipeline de treinamento falhou com o erro: {e}")
    # Bare raise re-raises the active exception with its original traceback.
    raise

logging.info("Pipeline de Treinamento Principal finalizado com sucesso!")

[I 2025-09-17 17:59:50,688] A new study created in memory with name: no-name-fb921546-bbb9-4405-bcd1-1e0fee471a25


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.140599 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14112
[LightGBM] [Info] Number of data points in the train set: 5593046, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000


[I 2025-09-17 18:02:18,780] Trial 0 finished with value: 2.331379579200746 and parameters: {'learning_rate': 0.1487384583600943, 'num_leaves': 172, 'max_depth': 8, 'min_child_samples': 53, 'feature_fraction': 0.5055646119257365, 'bagging_fraction': 0.6909365805682885, 'bagging_freq': 5, 'lambda_l1': 0.11087066071980395, 'lambda_l2': 0.0016340807667377766}. Best is trial 0 with value: 2.331379579200746.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.240116 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14112
[LightGBM] [Info] Number of data points in the train set: 5593046, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000


[I 2025-09-17 18:05:02,279] Trial 1 finished with value: 2.331731931207058 and parameters: {'learning_rate': 0.17082180787606827, 'num_leaves': 247, 'max_depth': 11, 'min_child_samples': 55, 'feature_fraction': 0.9065330602939888, 'bagging_fraction': 0.6006976257608396, 'bagging_freq': 5, 'lambda_l1': 0.23702925151611254, 'lambda_l2': 1.938636508952572e-06}. Best is trial 0 with value: 2.331379579200746.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.266011 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14112
[LightGBM] [Info] Number of data points in the train set: 5593046, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000


[I 2025-09-17 18:07:20,495] Trial 2 finished with value: 2.3105670926565054 and parameters: {'learning_rate': 0.25870189541423955, 'num_leaves': 287, 'max_depth': 11, 'min_child_samples': 16, 'feature_fraction': 0.9857165817236052, 'bagging_fraction': 0.5866274270800762, 'bagging_freq': 6, 'lambda_l1': 0.07778745394093718, 'lambda_l2': 2.1561858856081173e-07}. Best is trial 2 with value: 2.3105670926565054.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.869736 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14112
[LightGBM] [Info] Number of data points in the train set: 5593046, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000


[I 2025-09-17 18:12:55,811] Trial 3 finished with value: 2.2958369855719587 and parameters: {'learning_rate': 0.06029801336932907, 'num_leaves': 243, 'max_depth': 12, 'min_child_samples': 75, 'feature_fraction': 0.6283010869945898, 'bagging_fraction': 0.5272094647990557, 'bagging_freq': 7, 'lambda_l1': 0.0032096605990790616, 'lambda_l2': 0.49035598563638666}. Best is trial 3 with value: 2.2958369855719587.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.208045 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14112
[LightGBM] [Info] Number of data points in the train set: 5593046, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000


[I 2025-09-17 18:14:17,872] Trial 4 finished with value: 2.3804504720876105 and parameters: {'learning_rate': 0.20091788800514473, 'num_leaves': 100, 'max_depth': 7, 'min_child_samples': 34, 'feature_fraction': 0.6956699979930852, 'bagging_fraction': 0.6672667443656144, 'bagging_freq': 3, 'lambda_l1': 0.001438539903512441, 'lambda_l2': 0.0139586518368246}. Best is trial 3 with value: 2.2958369855719587.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.074334 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14112
[LightGBM] [Info] Number of data points in the train set: 5593046, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000


[I 2025-09-17 18:15:24,717] Trial 5 finished with value: 2.425698398770536 and parameters: {'learning_rate': 0.23482865432055489, 'num_leaves': 179, 'max_depth': 5, 'min_child_samples': 17, 'feature_fraction': 0.8865395367751503, 'bagging_fraction': 0.7647966242439271, 'bagging_freq': 6, 'lambda_l1': 0.00018966227168241482, 'lambda_l2': 0.9110138280131996}. Best is trial 3 with value: 2.2958369855719587.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.231367 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14112
[LightGBM] [Info] Number of data points in the train set: 5593046, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000


[I 2025-09-17 18:16:53,656] Trial 6 finished with value: 2.3597645633309523 and parameters: {'learning_rate': 0.18704085894950404, 'num_leaves': 187, 'max_depth': 7, 'min_child_samples': 60, 'feature_fraction': 0.928351005170741, 'bagging_fraction': 0.5014417911591407, 'bagging_freq': 7, 'lambda_l1': 3.8352536864011446e-05, 'lambda_l2': 0.013325602077859601}. Best is trial 3 with value: 2.2958369855719587.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.236884 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14112
[LightGBM] [Info] Number of data points in the train set: 5593046, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000


[I 2025-09-17 18:17:28,866] Trial 7 finished with value: 2.5004519236222293 and parameters: {'learning_rate': 0.13410343455416607, 'num_leaves': 191, 'max_depth': 4, 'min_child_samples': 9, 'feature_fraction': 0.9223791880724402, 'bagging_fraction': 0.4231808179515211, 'bagging_freq': 5, 'lambda_l1': 1.104402413145248e-08, 'lambda_l2': 0.03771028069770705}. Best is trial 3 with value: 2.2958369855719587.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.851809 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14112
[LightGBM] [Info] Number of data points in the train set: 5593046, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000


[I 2025-09-17 18:18:44,700] Trial 8 finished with value: 2.4390207071717525 and parameters: {'learning_rate': 0.27394426349678386, 'num_leaves': 219, 'max_depth': 6, 'min_child_samples': 63, 'feature_fraction': 0.6426260453040933, 'bagging_fraction': 0.9527210666023909, 'bagging_freq': 5, 'lambda_l1': 0.2146115788908294, 'lambda_l2': 7.054880521040053e-05}. Best is trial 3 with value: 2.2958369855719587.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.859613 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14112
[LightGBM] [Info] Number of data points in the train set: 5593046, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000


[I 2025-09-17 18:19:44,215] Trial 9 finished with value: 2.37329576736592 and parameters: {'learning_rate': 0.2866664259386261, 'num_leaves': 250, 'max_depth': 7, 'min_child_samples': 60, 'feature_fraction': 0.6424308387556475, 'bagging_fraction': 0.8224417931526498, 'bagging_freq': 5, 'lambda_l1': 1.2571313378197092e-08, 'lambda_l2': 0.007403243647746331}. Best is trial 3 with value: 2.2958369855719587.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.100104 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14112
[LightGBM] [Info] Number of data points in the train set: 5593046, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000


[I 2025-09-17 18:22:15,961] Trial 10 finished with value: 2.4261610691974154 and parameters: {'learning_rate': 0.037130455873499287, 'num_leaves': 49, 'max_depth': 9, 'min_child_samples': 98, 'feature_fraction': 0.42359389944233705, 'bagging_fraction': 0.4140985079616788, 'bagging_freq': 1, 'lambda_l1': 3.5506899232771286e-06, 'lambda_l2': 8.220997258408957}. Best is trial 3 with value: 2.2958369855719587.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.228965 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14112
[LightGBM] [Info] Number of data points in the train set: 5593046, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000


[I 2025-09-17 18:29:13,552] Trial 11 finished with value: 2.2809283485080725 and parameters: {'learning_rate': 0.06360647515153547, 'num_leaves': 294, 'max_depth': 12, 'min_child_samples': 89, 'feature_fraction': 0.7517469414137573, 'bagging_fraction': 0.5586614297996942, 'bagging_freq': 7, 'lambda_l1': 7.33865396368911, 'lambda_l2': 1.2957330850013946e-08}. Best is trial 11 with value: 2.2809283485080725.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.927534 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14112
[LightGBM] [Info] Number of data points in the train set: 5593046, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000


[I 2025-09-17 18:35:59,277] Trial 12 finished with value: 2.2864755385490536 and parameters: {'learning_rate': 0.05464902130622239, 'num_leaves': 277, 'max_depth': 12, 'min_child_samples': 91, 'feature_fraction': 0.7819237778376699, 'bagging_fraction': 0.5385174054395786, 'bagging_freq': 7, 'lambda_l1': 6.078489274292199, 'lambda_l2': 2.013144868506054e-08}. Best is trial 11 with value: 2.2809283485080725.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.079419 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14112
[LightGBM] [Info] Number of data points in the train set: 5593046, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000


[I 2025-09-17 18:39:55,306] Trial 13 finished with value: 2.30274546590035 and parameters: {'learning_rate': 0.08573329922107892, 'num_leaves': 295, 'max_depth': 10, 'min_child_samples': 99, 'feature_fraction': 0.8018327933130864, 'bagging_fraction': 0.5482848909721101, 'bagging_freq': 3, 'lambda_l1': 2.761205430958837, 'lambda_l2': 1.0289148186956048e-08}. Best is trial 11 with value: 2.2809283485080725.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.985632 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14112
[LightGBM] [Info] Number of data points in the train set: 5593046, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000


[I 2025-09-17 18:41:24,469] Trial 14 finished with value: 2.37795118347425 and parameters: {'learning_rate': 0.10304457121312355, 'num_leaves': 108, 'max_depth': 12, 'min_child_samples': 78, 'feature_fraction': 0.7851437797094905, 'bagging_fraction': 0.4801828497520139, 'bagging_freq': 7, 'lambda_l1': 2.144982492430736, 'lambda_l2': 1.5412978364014133e-08}. Best is trial 11 with value: 2.2809283485080725.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.962872 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14112
[LightGBM] [Info] Number of data points in the train set: 5593046, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000


[I 2025-09-17 19:02:19,407] Trial 15 finished with value: 2.2774215122612667 and parameters: {'learning_rate': 0.015454060269412408, 'num_leaves': 294, 'max_depth': 10, 'min_child_samples': 84, 'feature_fraction': 0.7801222989088611, 'bagging_fraction': 0.6399229058803267, 'bagging_freq': 6, 'lambda_l1': 4.9195689700624134, 'lambda_l2': 1.120443125318692e-05}. Best is trial 15 with value: 2.2774215122612667.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.203266 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14112
[LightGBM] [Info] Number of data points in the train set: 5593046, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000


[I 2025-09-17 19:22:30,537] Trial 16 finished with value: 2.3011480830994304 and parameters: {'learning_rate': 0.011580827163954435, 'num_leaves': 137, 'max_depth': 9, 'min_child_samples': 77, 'feature_fraction': 0.7338324414229547, 'bagging_fraction': 0.7845096011096998, 'bagging_freq': 6, 'lambda_l1': 0.009598812901741407, 'lambda_l2': 3.218682294194452e-05}. Best is trial 15 with value: 2.2774215122612667.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.245136 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14112
[LightGBM] [Info] Number of data points in the train set: 5593046, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000


[I 2025-09-17 19:46:21,882] Trial 17 finished with value: 2.297666425313927 and parameters: {'learning_rate': 0.011061834178803659, 'num_leaves': 228, 'max_depth': 10, 'min_child_samples': 85, 'feature_fraction': 0.8379028019086, 'bagging_fraction': 0.6382487053283268, 'bagging_freq': 3, 'lambda_l1': 2.0137075484843946e-06, 'lambda_l2': 2.589962932875727e-06}. Best is trial 15 with value: 2.2774215122612667.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.150555 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14112
[LightGBM] [Info] Number of data points in the train set: 5593046, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000


[I 2025-09-17 19:49:02,400] Trial 18 finished with value: 2.329147629962095 and parameters: {'learning_rate': 0.1002035199986284, 'num_leaves': 267, 'max_depth': 10, 'min_child_samples': 37, 'feature_fraction': 0.5563931917226668, 'bagging_fraction': 0.8626306517897404, 'bagging_freq': 1, 'lambda_l1': 6.639471877124665, 'lambda_l2': 5.457017319750457e-07}. Best is trial 15 with value: 2.2774215122612667.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.832655 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14112
[LightGBM] [Info] Number of data points in the train set: 5593046, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000


[I 2025-09-17 19:50:43,919] Trial 19 finished with value: 2.3962975214757685 and parameters: {'learning_rate': 0.07283204717632087, 'num_leaves': 59, 'max_depth': 11, 'min_child_samples': 88, 'feature_fraction': 0.7186497752480048, 'bagging_fraction': 0.7432887496666384, 'bagging_freq': 4, 'lambda_l1': 0.6979636335645336, 'lambda_l2': 1.1450723009129355e-05}. Best is trial 15 with value: 2.2774215122612667.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.875560 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14112
[LightGBM] [Info] Number of data points in the train set: 5593046, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000


[I 2025-09-17 19:51:16,273] Trial 20 finished with value: 2.6351393662925693 and parameters: {'learning_rate': 0.12170603690913018, 'num_leaves': 215, 'max_depth': 3, 'min_child_samples': 71, 'feature_fraction': 0.8524400204928112, 'bagging_fraction': 0.6232902101708627, 'bagging_freq': 6, 'lambda_l1': 0.03194790642399923, 'lambda_l2': 0.0002853709050224803}. Best is trial 15 with value: 2.2774215122612667.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.841229 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14112
[LightGBM] [Info] Number of data points in the train set: 5593046, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000


In [None]:
# =======================================================================================
# SUBMISSION FILE GENERATION CELL (capped at 1.5M rows)
# =======================================================================================
#
# Loads the trained model artifacts, restricts the 2022 history to the
# best-selling (pdv, sku) combinations, asks the predictor for forecasts and
# writes them to a timestamped parquet file in the submission column layout
# (semana, pdv, produto, quantidade).
#
# Depends on names created by earlier cells: `model_output_path` and the
# `SalesForecasterV2` class.

import joblib
import os
import pandas as pd
import numpy as np
import logging
from datetime import datetime

logging.info("Iniciando pipeline de geração do arquivo de submissão (COM LIMITE DE LINHAS).")

try:
    # --- 1. CONFIGURATION ---
    PROJECT_PATH = '/content/drive/MyDrive/Hackathon_Forecast'

    # Row budget: 300,000 combinations x 5 weeks = 1.5M rows.
    # NOTE(review): this assumes generate_forecasts returns one row per
    # (pdv, sku) combination per forecast week - confirm against the class.
    TOP_N_COMBINATIONS = 300000
    WEEKS_TO_FORECAST = 5

    # Reuse the model path defined by the previous (training) cell.
    model_input_path = model_output_path

    forecast_output_dir = os.path.join(PROJECT_PATH, 'data/processed')
    file_paths = {
        'vendas': os.path.join(PROJECT_PATH, 'data/raw/fato_vendas.parquet'),
        'pdvs': os.path.join(PROJECT_PATH, 'data/raw/dim_pdvs.parquet'),
        'produtos': os.path.join(PROJECT_PATH, 'data/raw/dim_produtos.parquet')
    }

    # --- 2. LOAD MODEL AND DATA ---
    logging.info(f"Carregando modelo de: {model_input_path}")
    # NOTE: joblib.load unpickles arbitrary objects; only load artifact files
    # that were produced by this notebook / a trusted source.
    artifacts = joblib.load(model_input_path)
    predictor = SalesForecasterV2()
    predictor.model = artifacts['model']
    predictor.feature_names = artifacts['feature_names']
    predictor.categorical_features = artifacts['categorical_features']
    logging.info("Modelo e artefatos carregados com sucesso.")

    df_full_data = predictor.load_data(file_paths)
    df_historical_2022 = df_full_data[df_full_data['ano'] == 2022].copy()

    # --- 3. FILTERING LOGIC (THE HEART OF THIS CELL) ---
    # Render the count with '.' as thousands separator (300000 -> "300.000")
    # so the log message always matches the constant above.
    top_n_label = f"{TOP_N_COMBINATIONS:,}".replace(",", ".")
    logging.info(f"Selecionando as Top {top_n_label} combinações (PDV, SKU) com base nas vendas de 2022.")
    vendas_totais_2022 = df_historical_2022.groupby(['pdv', 'sku'])['quantidade'].sum().reset_index()
    top_combinacoes = vendas_totais_2022.nlargest(TOP_N_COMBINATIONS, 'quantidade')
    # Inner merge keeps only history rows whose (pdv, sku) pair is in the top list.
    df_historical_filtrado = pd.merge(df_historical_2022, top_combinacoes[['pdv', 'sku']], on=['pdv', 'sku'], how='inner')
    n_combinacoes = df_historical_filtrado[['pdv', 'sku']].drop_duplicates().shape[0]
    logging.info(f"Dados históricos filtrados para as {n_combinacoes} combinações mais relevantes.")

    # --- 4. GENERATE FILTERED FORECASTS ---
    logging.info("Gerando previsões para as combinações mais relevantes...")
    forecasts = predictor.generate_forecasts(df_historical_filtrado, weeks_to_forecast=WEEKS_TO_FORECAST)

    # --- 5. FORMAT AND SAVE THE SUBMISSION FILE ---
    if not forecasts.empty:
        df_submission = forecasts.rename(columns={'sku': 'produto', 'quantidade_prevista': 'quantidade'})
        df_submission = df_submission[['semana', 'pdv', 'produto', 'quantidade']]
        # Week ascending, then largest forecast first within each week.
        df_submission_sorted = df_submission.sort_values(by=['semana', 'quantidade'], ascending=[True, False])

        logging.info(f"Previsão final para submissão gerada com {len(df_submission_sorted)} linhas.")

        # Make sure the target folder exists; to_parquet does not create it.
        os.makedirs(forecast_output_dir, exist_ok=True)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        submission_filename = os.path.join(forecast_output_dir, f"previsao_SUBMISSAO_{timestamp}.parquet")
        df_submission_sorted.to_parquet(submission_filename, index=False)
        logging.info(f"ARQUIVO DE SUBMISSÃO salvo em: {submission_filename}")

        # Only report success when a file was actually written.
        logging.info("Processo de geração do arquivo de submissão finalizado com sucesso!")

    else:
        logging.warning("Nenhuma previsão foi gerada.")

except Exception as e:
    # Log with traceback, then re-raise the original exception unchanged so
    # the notebook cell still fails visibly.
    logging.exception(f"O pipeline falhou. Erro: {e}")
    raise

#### EXTRA: RESUMO GERENCIAL

In [None]:
# Management summary: the 10 largest forecast rows of every week, saved as CSV.
#
# NOTE(review): this cell reads `forecasts_sorted`, `forecast_output_dir` and
# `timestamp` from the notebook session. The submission cell visible above
# produces `forecasts` / `df_submission_sorted`, not `forecasts_sorted` -
# confirm that an earlier cell still defines it; otherwise the NameError is
# caught below and only a warning is logged.
try:
    if not forecasts_sorted.empty:
        logging.info("Criando um resumo com o Top 10 produtos por semana.")

        # Take the 10 largest rows of each week by iterating over the groups
        # explicitly. Unlike groupby(...).apply(...), every per-week frame here
        # always keeps its 'semana' column, regardless of how the installed
        # pandas version treats grouping columns inside apply.
        top_10_per_week = pd.concat(
            [
                week_rows.nlargest(10, 'quantidade_prevista')
                for _, week_rows in forecasts_sorted.groupby('semana')
            ],
            ignore_index=True,
        )

        # Save to a separate file
        summary_filename = os.path.join(forecast_output_dir, f"resumo_top10_produtos_{timestamp}.csv")
        top_10_per_week.to_csv(summary_filename, index=False)

        logging.info(f"Resumo Top 10 salvo em: {summary_filename}")
except Exception as e:
    # Best effort: the summary is optional, so failures are only reported.
    logging.warning(f"Não foi possível gerar o resumo Top 10. Erro: {e}")

# Gráficos para visualização

## Setup

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Global chart styling for every figure drawn below.
sns.set_style("whitegrid")
sns.set_palette("viridis")
plt.rcParams.update({
    'figure.figsize': [12, 6],
    'font.size': 12,
})

# Forecast file written after training (the timestamp is part of the name).
caminho_previsoes_ordenadas = '/content/drive/MyDrive/Hackathon_Forecast/data/processed/previsoes_janeiro_2023_sorted_20250917_002513.csv'

try:
    df_forecast = pd.read_csv(caminho_previsoes_ordenadas)
except FileNotFoundError:
    # Only a missing file is handled here; df_forecast is left unassigned.
    print(f"ERRO: Arquivo não encontrado em '{caminho_previsoes_ordenadas}'")
    print("Verifique se o nome do arquivo e o timestamp estão corretos.")
else:
    print("Arquivo de previsões carregado com sucesso!")
    # Cast 'sku' to string to avoid plotting problems.
    df_forecast['sku'] = df_forecast['sku'].astype(str)

## Visualização

In [None]:
# TOP 10 PRODUCTS PER WEEK
#
# Draws one horizontal bar chart per forecast week with the 10 largest
# 'quantidade_prevista' rows of `df_forecast` (loaded in the setup cell).
# NOTE(review): rows are presumably (pdv, sku) pairs, so the same SKU may
# appear more than once in a week's top 10; seaborn then aggregates those rows
# into a single bar. Confirm whether a per-SKU sum is wanted instead.

# Unique weeks for which a forecast exists, in ascending order
semanas_previstas = sorted(df_forecast['semana'].unique())

for semana in semanas_previstas:
    # Pick the 10 largest forecasts of this week explicitly instead of relying
    # on the CSV already being sorted by quantity.
    df_semana = df_forecast[df_forecast['semana'] == semana].nlargest(10, 'quantidade_prevista')
    if df_semana.empty:
        # Nothing plottable for this week (e.g. only missing quantities).
        continue

    max_previsto = df_semana['quantidade_prevista'].max()

    # Chart
    plt.figure(figsize=(14, 7))
    # `palette` without `hue` is deprecated since seaborn 0.13; assigning the
    # y variable to `hue` and hiding the legend is the documented replacement
    # (requires seaborn >= 0.13).
    ax = sns.barplot(
        data=df_semana,
        x='quantidade_prevista',
        y='sku',
        hue='sku',
        palette='viridis',
        legend=False
    )

    # Write the value at the end of each bar. The gap is relative to the
    # largest bar so labels stay inside the axes for small quantities too.
    label_offset = max_previsto * 0.01
    for patch in ax.patches:
        width = patch.get_width()

        # Vertical centre of the bar
        y = patch.get_y() + patch.get_height() / 2

        ax.text(x=width + label_offset,
                y=y,
                s=f'{int(width)}',
                ha='left',
                va='center')

    # Titles and labels
    plt.title(f'Top 10 Produtos Previstos para a Semana {semana} de 2023', fontsize=16, weight='bold')
    plt.xlabel('Quantidade Prevista', fontsize=12)
    plt.ylabel('SKU do Produto', fontsize=12)
    # Leave 15% headroom on the right for the value labels.
    plt.xlim(right=max_previsto * 1.15)
    plt.tight_layout()


    plt.show()