In [1]:
import pandas as pd
import numpy as np
import torch

In [None]:
# Location of the semicolon-separated CSV holding the dataset.
# NOTE(review): the path is an empty string in the notebook as saved, so this
# cell cannot run on a fresh kernel -- set it before "Restart & Run All".
DATA_PATH = ''

# Fail early with an actionable message instead of pandas' generic
# "No such file or directory: ''" error (same exception type as before).
if not DATA_PATH:
    raise FileNotFoundError("DATA_PATH is empty: set it to the CSV file to load")

df = pd.read_csv(DATA_PATH, sep=';')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499543 entries, 0 to 499542
Data columns (total 21 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   ear_reservatorio_percentual        488167 non-null  float64
 1   ear_total_mwmes                    499543 non-null  float64
 2   val_volmax                         499543 non-null  float64
 3   id_reservatorio                    499543 non-null  object 
 4   val_volumeutilcon                  499543 non-null  float64
 5   ear_reservatorio_percentual_lag1   488164 non-null  float64
 6   ear_reservatorio_percentual_lag7   488146 non-null  float64
 7   ear_reservatorio_percentual_lag14  488125 non-null  float64
 8   ear_reservatorio_percentual_roll7  488143 non-null  float64
 9   ear_reservatorio_percentual_diff1  488163 non-null  float64
 10  ear_total_mwmes_lag1               499542 non-null  float64
 11  ear_total_mwmes_lag7               4995

In [3]:
import warnings

# Silences every warning for the rest of the session (kept from the original cell).
# NOTE(review): this also hides pandas deprecation notices; consider narrowing it.
warnings.filterwarnings("ignore")

# Definitions
target = "val_volumeutilcon"
cat_col = "id_reservatorio"
date_cols = ["ano", "mes", "dia"]  # sort keys that give chronological order
train_fraction = 0.7               # earliest share of each reservoir used for training


def split_and_encode(group, target_col, order_cols, train_frac):
    """Split one reservoir's rows chronologically and add the ``id_encoded`` feature.

    ``id_encoded`` is the expanding mean of ``target_col`` over the strictly
    earlier rows of the same group (expanding mean shifted by one row), so a
    row never sees its own target. Rows without any usable history are filled
    with the mean target of the group's training slice.

    For test rows the history contains the training targets *and* the earlier
    test targets (up to t-1).
    NOTE(review): that is only leak-free if the previous rows' actual targets
    are available at prediction time -- confirm.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single reservoir.
    target_col : str
        Name of the target column.
    order_cols : list of str
        Columns to sort by to obtain chronological order.
    train_frac : float
        Fraction of the (sorted) rows assigned to the training slice.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Independent copies ``(train_part, test_part)``, each with an
        ``id_encoded`` column added.
    """
    ordered = group.sort_values(list(order_cols))  # guarantees temporal order
    split_idx = int(len(ordered) * train_frac)

    train_part = ordered.iloc[:split_idx].copy()
    test_part = ordered.iloc[split_idx:].copy()

    # Fallback for rows with no history: mean target of the training slice.
    # This is NaN when the training slice is empty (very small groups); such
    # rows keep a NaN encoding, exactly as before.
    fallback = train_part[target_col].mean()

    # One pass over the whole ordered series: the expanding mean is
    # prefix-based, so the training rows get the same values as a train-only
    # computation would give them.
    encoded = ordered[target_col].expanding().mean().shift(1).fillna(fallback)

    # Assign positionally and by plain assignment. The original used
    # `frame["id_encoded"].fillna(..., inplace=True)`, a chained in-place
    # update that does not write back under pandas Copy-on-Write.
    train_part["id_encoded"] = encoded.iloc[:split_idx].to_numpy()
    test_part["id_encoded"] = encoded.iloc[split_idx:].to_numpy()

    return train_part, test_part


train_list, test_list = [], []

# Split reservoir by reservoir so every reservoir contributes to both sets
for rid, group in df.groupby(cat_col):
    train_part, test_part = split_and_encode(group, target, date_cols, train_fraction)
    train_list.append(train_part)
    test_list.append(test_part)

# Stack all reservoirs back together
train_df = pd.concat(train_list)
test_df = pd.concat(test_list)

# Feature columns fed to the models
features = [
    "id_encoded",
    "val_volmax",
    "ear_reservatorio_percentual_lag1",
    "ear_reservatorio_percentual_lag7",
    "ear_reservatorio_percentual_roll7",
    "dia",
    "mes",
    "ano"
]

X_train = train_df[features]
y_train = train_df[[target]]  # double brackets: single-column DataFrame (2-D)

X_test = test_df[features]
y_test = test_df[[target]]


In [None]:
from torch.utils.data import TensorDataset, DataLoader

# np.asarray accepts both the DataFrames built by the split cell and the
# float32 arrays this cell leaves behind, so the cell can be re-run safely.
# (The original `.values` call raised AttributeError on a second run because
# the names had already been rebound to ndarrays.)
X_train = np.asarray(X_train, dtype=np.float32)
y_train = np.asarray(y_train, dtype=np.float32)
X_test = np.asarray(X_test, dtype=np.float32)
y_test = np.asarray(y_test, dtype=np.float32)

# Insert a length-1 middle axis: (n_samples, n_features) -> (n_samples, 1, n_features).
X_train_np = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_np = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# NOTE(review): df.info() reports nulls in the lag/rolling feature columns;
# those NaNs are carried into the tensors unchanged -- confirm the model that
# consumes these loaders can cope with them.
train_ds = TensorDataset(torch.tensor(X_train_np), torch.tensor(y_train))
test_ds = TensorDataset(torch.tensor(X_test_np), torch.tensor(y_test))

# Shuffle only the training batches; keep the test order fixed for evaluation.
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=64, shuffle=False)

In [None]:
import lightgbm as lgb

# LightGBM expects a 1-D label vector; the targets were built from a
# single-column DataFrame, so flatten them instead of relying on an implicit
# column-vector conversion.
train_data = lgb.Dataset(X_train, label=np.ravel(y_train))
# Tie the validation set to the training set so both share the same binning.
valid_data = lgb.Dataset(X_test, label=np.ravel(y_test), reference=train_data)

regressor = lgb.train(
    params={
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        # NOTE(review): max_depth=1 limits every tree to a single split, so
        # num_leaves=31 below has no effect. Use -1 (no depth limit) if
        # stumps were not intended.
        'max_depth': 1,
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        # bagging_fraction is ignored unless bagging_freq is non-zero.
        'bagging_freq': 1,
        # Fixed seed so the feature/bagging sub-sampling is reproducible.
        'seed': 42
    },
    train_set=train_data,
    num_boost_round=1000,
    # Used for monitoring only; no early stopping is configured.
    valid_sets=[valid_data]
)

In [None]:
y_pred = regressor.predict(X_test)

# y_test was built from a single-column DataFrame, so it is 2-D (n, 1), while
# the predictions are used here as a flat (n,) vector. Subtracting them
# directly broadcasts to an (n, n) matrix, which gives a wrong RMSE and a huge
# temporary array. Flatten both sides so the difference is element-wise.
y_true = np.asarray(y_test).ravel()
rmse = np.sqrt(np.mean((y_true - np.asarray(y_pred).ravel()) ** 2))
print("RMSE:", rmse)