In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the semicolon-delimited CSV into `df` and print its column summary.
# NOTE(review): this is an absolute path on one specific machine; the notebook
# only runs elsewhere after it is pointed at a local copy of the file.
csv_path = 'C:/Users/luis2/OneDrive/Documentos/challenge-ML/dados/resultado_unificado_normalized_clean.csv'
df = pd.read_csv(csv_path, sep=';')
df.info()

In [None]:
import warnings

# Silences every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas FutureWarnings (it was
# hiding the chained-assignment warning fixed below); consider narrowing it.
warnings.filterwarnings("ignore")

# Settings
target = "val_volumeutilcon"  # column to predict
cat_col = "id_reservatorio"   # reservoir identifier used for grouping

train_list, test_list = [], []

# Chronological 70/30 split, done separately for each reservoir
for rid, group in df.groupby(cat_col):
    group = group.sort_values(["ano", "mes", "dia"])  # enforce time order

    # Target encoding: mean of every target value strictly before the row
    # (expanding mean shifted by one step). An expanding mean at row i only
    # depends on rows 0..i, so computing it once on the whole ordered group
    # gives the same values as computing it on train and on train+test
    # separately.
    # NOTE(review): for test rows this history includes earlier *test*
    # targets, not only the training period. That matches a one-step-ahead
    # setting where past actuals are known; if the encoding must be frozen at
    # training time, use the training mean for all test rows instead.
    group["id_encoded"] = group[target].expanding().mean().shift(1)

    split_idx = int(len(group) * 0.7)
    train_part = group.iloc[:split_idx].copy()
    test_part = group.iloc[split_idx:].copy()

    # Rows without any history fall back to this reservoir's training mean
    # (despite the name, it is per reservoir, not global across the dataset).
    global_mean = train_part[target].mean()

    # Assign the filled column back instead of `col.fillna(..., inplace=True)`:
    # the in-place form acts on a temporary column selection and is a silent
    # no-op under pandas Copy-on-Write, leaving the NaN in place.
    train_part["id_encoded"] = train_part["id_encoded"].fillna(global_mean)
    test_part["id_encoded"] = test_part["id_encoded"].fillna(global_mean)

    train_list.append(train_part)
    test_list.append(test_part)

# Stack all reservoirs back together
train_df = pd.concat(train_list)
test_df = pd.concat(test_list)

# Columns fed to the model: the per-reservoir target encoding, the volume
# column, the lag/rolling columns already present in the data, and the date parts.
features = [
    "id_encoded", "val_volmax",
    "ear_reservatorio_percentual_lag1",
    "ear_reservatorio_percentual_lag7",
    "ear_reservatorio_percentual_roll7",
    "dia", "mes", "ano",
]

# Targets are kept as single-column DataFrames (double brackets), which the
# later cells rely on when they call `.values.flatten()`.
X_train, y_train = train_df[features], train_df[[target]]
X_test, y_test = test_df[features], test_df[[target]]



In [None]:
# Print a summary of the test target, then display how often each value occurs.
# `value_counts` must be called: without the parentheses the cell only shows
# the bound method's repr, not the counts.
y_test.info()
y_test.value_counts()

In [None]:
import lightgbm as lgb

# Gradient-boosted regressor (kept under the name `clf` because later cells use it).
clf = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    colsample_bytree=0.8,
    subsample=0.8,
    # LightGBM ignores `subsample` unless bagging is enabled with a non-zero
    # frequency; resample the rows at every boosting iteration.
    subsample_freq=1,
    random_state=42
)

# 1-D label arrays (works whether the targets are a single-column frame or a
# series) so no column-vector conversion is needed inside the library.
y_train_1d = y_train.to_numpy().ravel()
y_test_1d = y_test.to_numpy().ravel()

# Record the metric on both splits while training: `lgb.plot_metric(clf)`
# needs this history and raises when no `eval_set` was supplied. No early
# stopping is configured, so the test split is only monitored, never used to
# pick the number of trees.
clf.fit(
    X_train,
    y_train_1d,
    eval_set=[(X_train, y_train_1d), (X_test, y_test_1d)],
    eval_names=["train", "test"],
    callbacks=[lgb.log_evaluation(period=0)],  # period=0 keeps per-iteration logs off
)

In [None]:
# Plot the metric curves recorded during training. LightGBM only records them
# when `fit` received an `eval_set`; with an empty history `plot_metric`
# raises, so report the situation instead of breaking the run.
if clf.evals_result_:
    lgb.plot_metric(clf)
else:
    print("No evaluation history recorded: pass eval_set=... to clf.fit() to plot training metrics.")

In [None]:
# Predict the target for the test rows; `y_pred` is reused by the metric and
# plotting cells further down.
y_pred = clf.predict(X_test)

In [None]:
# `score` on a scikit-learn-style regressor is the coefficient of
# determination (R²); a large train/test gap points to overfitting.
train_score = clf.score(X_train, y_train)
print(f'Training set score: {train_score:.7f}')

test_score = clf.score(X_test, y_test)
print(f'Test set score: {test_score:.7f}')

In [None]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Flatten the single-column target once so every formula below works on
# 1-D arrays of the same shape as `y_pred`.
y_true = y_test.values.flatten()

# Core metrics
mae = mean_absolute_error(y_true, y_pred)
# `mean_squared_error` returns the MSE; take the square root so the value
# printed under the "RMSE" label really is the root mean squared error.
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2 = r2_score(y_true, y_pred)

# MAPE with zero targets guarded: where the actual value is 0 the denominator
# is replaced by 1, so those rows contribute their absolute error.
mape = np.mean(
    np.abs((y_true - y_pred) / np.where(y_true == 0, 1, y_true))
) * 100

# SMAPE, a more robust alternative. When actual and prediction are both 0 the
# ratio is 0/0 (NaN, which would poison the mean); the numerator is 0 there,
# so swapping the denominator for 1 makes that row count as zero error.
smape_denom = np.abs(y_true) + np.abs(y_pred)
smape = 100 * np.mean(
    2 * np.abs(y_pred - y_true) / np.where(smape_denom == 0, 1, smape_denom)
)

print(f"MAE:   {mae:.7f}")
print(f"RMSE:  {rmse:.7f}")
print(f"MAPE:  {mape:.4f}%")
print(f"SMAPE: {smape:.4f}%")
print(f"R²:    {r2:.7f}")


In [None]:
# Pair each feature name with the importance the fitted model reports for it,
# order from most to least important, and print the ten highest.
feature_importances = pd.DataFrame(
    {"feature": clf.feature_name_, "importance": clf.feature_importances_}
)
feature_importances = feature_importances.sort_values(by="importance", ascending=False)

print(feature_importances.head(10))

In [None]:
import matplotlib.pyplot as plt

# Actual-versus-predicted scatter on a square canvas; points on the dashed
# diagonal are perfect predictions.
fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(y_test, y_pred, alpha=0.5, label="Previsões")

# Reference line y = x spanning the observed range of the test target
y_low, y_high = y_test.min(), y_test.max()
ax.plot([y_low, y_high], [y_low, y_high], 'r--', lw=2, label="Ideal (y=x)")

ax.set_xlabel("Valores Reais (y_test)")
ax.set_ylabel("Valores Previstos (y_pred)")
ax.set_title("Dispersão: Real vs Previsto")
ax.legend()
ax.grid(True)
plt.show()
