<a href="https://colab.research.google.com/github/ThalesF01/Desafio_Instituto_Tecgraf/blob/main/Desafio_Instituto_Tecgraf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importação de bibliotecas

In [63]:
# === IMPORTAÇÕES ===
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, ParameterSampler

from lightgbm import LGBMRegressor
import lightgbm as lgb

from tqdm import tqdm
import matplotlib.pyplot as plt

import os
import sys
from contextlib import contextmanager


# Carregamento de dados e separação de features e targets


In [None]:
# === CARREGAMENTO DO HISTÓRICO ===

df = pd.read_csv("HISTORICO.csv")

# Convertendo coluna time
df["time"] = pd.to_datetime(df["time"])

df.head()


In [65]:
# === SEPARAÇÃO DE FEATURES E TARGETS ===

target_temp = "temperature_2m (°C)"
target_rain = "rain (mm)"

feature_cols = [c for c in df.columns if c not in ["time", target_temp, target_rain]]

X = df[feature_cols].copy()
y_temp = df[target_temp].copy()
y_rain = df[target_rain].copy()

X.head()


Unnamed: 0,relative_humidity_2m (%),pressure_msl (hPa),surface_pressure (hPa),cloud_cover (%),wind_speed_10m (km/h),wind_speed_100m (km/h),wind_direction_10m (°),wind_direction_100m (°),wind_gusts_10m (km/h),soil_temperature_0_to_7cm (°C),soil_moisture_0_to_7cm (m³/m³)
0,28,1014.7,839.5,65,5.7,10.1,252,253,20.2,23.4,0.099
1,35,1015.6,839.5,55,3.1,5.8,249,277,10.1,21.2,0.099
2,46,1016.0,839.1,49,1.8,1.8,191,307,7.9,19.6,0.099
3,44,1017.0,839.1,47,4.0,2.5,185,225,8.3,18.2,0.099
4,48,1017.5,838.8,46,5.0,4.5,180,194,9.4,17.1,0.099


# Normalização

In [None]:
# === NORMALIZAÇÃO DAS FEATURES ===

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

X_scaled.head()


# Divisão de treino / teste e validação interna

In [67]:
# === SPLIT TEMPORAL (80% TREINO / 20% TESTE) ===

train_size = int(len(X_scaled) * 0.8)

X_train = X_scaled.iloc[:train_size]
X_test  = X_scaled.iloc[train_size:]

y_temp_train = y_temp.iloc[:train_size]
y_temp_test  = y_temp.iloc[train_size:]

y_rain_train = y_rain.iloc[:train_size]
y_rain_test  = y_rain.iloc[train_size:]


In [68]:
# === VALIDAÇÃO INTERNA (20% DO TREINO) ===

valid_size = int(len(X_train) * 0.2)

X_train_internal = X_train.iloc[:-valid_size]
X_valid          = X_train.iloc[-valid_size:]

y_temp_train_internal = y_temp_train.iloc[:-valid_size]
y_temp_valid          = y_temp_train.iloc[-valid_size:]

y_rain_train_internal = y_rain_train.iloc[:-valid_size]
y_rain_valid          = y_rain_train.iloc[-valid_size:]


# Metricas

In [69]:
# === FUNÇÃO GLOBAL DE AVALIAÇÃO ===

def avaliar(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return mae, rmse


# Modelo para temperatura

In [70]:
# === MODELO BASE — TEMPERATURA ===

modelo_temp = LGBMRegressor(
    n_estimators=5000,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

modelo_temp.fit(
    X_train_internal,
    y_temp_train_internal,
    eval_set=[(X_valid, y_temp_valid)],
    eval_metric="rmse",
    callbacks=[lgb.early_stopping(stopping_rounds=100)]
)

y_pred_train = modelo_temp.predict(X_train_internal)
y_pred_valid = modelo_temp.predict(X_valid)

mae_tr, rmse_tr = avaliar(y_temp_train_internal, y_pred_train)
mae_va, rmse_va = avaliar(y_temp_valid, y_pred_valid)

print("Treino:", mae_tr, rmse_tr)
print("Validação:", mae_va, rmse_va)


Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[4999]	valid_0's rmse: 0.729524	valid_0's l2: 0.532206
Treino: 0.1597159470101384 0.207500892751219
Validação: 0.482117474788955 0.7295243005368167


In [None]:
# === CROSS-VALIDATION TEMPORAL — TEMPERATURA ===

tscv = TimeSeriesSplit(n_splits=5)
rmse_scores = []

print("Executando TSCV...\n")

for fold, (train_idx, test_idx) in tqdm(
    enumerate(tscv.split(X_train), start=1),
    total=5
):
    X_tr, X_te = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_tr, y_te = y_temp_train.iloc[train_idx], y_temp_train.iloc[test_idx]

    modelo = LGBMRegressor(
        n_estimators=3000,
        learning_rate=0.02,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )

    modelo.fit(
        X_tr, y_tr,
        eval_set=[(X_te, y_te)],
        eval_metric="rmse",
        callbacks=[lgb.early_stopping(stopping_rounds=80)]
    )

    preds = modelo.predict(X_te)
    rmse_scores.append(np.sqrt(mean_squared_error(y_te, preds)))

print("RMSE médio:", np.mean(rmse_scores))
print("Desvio:", np.std(rmse_scores))


In [None]:
# === RANDOM SEARCH — TEMPERATURA ===

param_dist = {
    "num_leaves": [31, 63, 127],
    "learning_rate": [0.01, 0.03, 0.05, 0.1],
    "min_data_in_leaf": [20, 40, 60],
    "feature_fraction": [0.7, 0.8, 0.9],
    "bagging_fraction": [0.7, 0.8, 0.9],
    "bagging_freq": [1, 3],
    "max_depth": [-1, 5, 8],
    "lambda_l1": [0, 0.1, 0.3],
    "lambda_l2": [0, 0.1, 0.3],
}

@contextmanager
def suppress_stdout():
    with open(os.devnull, "w") as devnull:
        old = sys.stdout
        sys.stdout = devnull
        yield
        sys.stdout = old

param_list = list(ParameterSampler(param_dist, n_iter=20, random_state=42))

best_score = float("inf")
best_params = None

for params in tqdm(param_list):
    scores = []
    for train_idx, test_idx in TimeSeriesSplit(n_splits=5).split(X_train):
        X_tr, X_te = X_train.iloc[train_idx], X_train.iloc[test_idx]
        y_tr, y_te = y_temp_train.iloc[train_idx], y_temp_train.iloc[test_idx]

        model = LGBMRegressor(**params, random_state=42, n_jobs=-1, verbosity=-1)

        with suppress_stdout():
            model.fit(
                X_tr, y_tr,
                eval_set=[(X_te, y_te)],
                eval_metric="rmse",
                callbacks=[lgb.early_stopping(stopping_rounds=80, verbose=False)]
            )

        rmse = np.sqrt(mean_squared_error(y_te, model.predict(X_te)))
        scores.append(rmse)

    mean_rmse = np.mean(scores)

    if mean_rmse < best_score:
        best_score = mean_rmse
        best_params = params

print("Melhores hiperparâmetros:", best_params)
print("Melhor RMSE:", best_score)


In [None]:
# === MODELO FINAL — TEMPERATURA ===

modelo_temp_final = LGBMRegressor(
    **best_params,
    random_state=42,
    n_jobs=-1
)

modelo_temp_final.fit(
    X_train,
    y_temp_train,
    eval_set=[(X_test, y_temp_test)],
    eval_metric="rmse",
    callbacks=[lgb.early_stopping(stopping_rounds=80, verbose=False)]
)

y_pred_train_final = modelo_temp_final.predict(X_train)
y_pred_valid_final = modelo_temp_final.predict(X_valid)
y_pred_test_final  = modelo_temp_final.predict(X_test)

print("Treino:", avaliar(y_temp_train, y_pred_train_final))
print("Validação:", avaliar(y_temp_valid, y_pred_valid_final))
print("Teste:", avaliar(y_temp_test, y_pred_test_final))


# Modelo para chuva

In [75]:
# === MODELO BASE — CHUVA (RAIN) ===

modelo_rain = LGBMRegressor(
    n_estimators=5000,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

modelo_rain.fit(
    X_train_internal,
    y_rain_train_internal,
    eval_set=[(X_valid, y_rain_valid)],
    eval_metric="rmse",
    callbacks=[lgb.early_stopping(stopping_rounds=100)]
)

y_rain_pred_train  = modelo_rain.predict(X_train_internal)
y_rain_pred_valid  = modelo_rain.predict(X_valid)

mae_tr_rain, rmse_tr_rain = avaliar(y_rain_train_internal, y_rain_pred_train)
mae_va_rain, rmse_va_rain = avaliar(y_rain_valid, y_rain_pred_valid)

print("RAIN — Treino:")
print(f"MAE = {mae_tr_rain:.3f}   RMSE = {rmse_tr_rain:.3f}")
print("\nRAIN — Validação Interna:")
print(f"MAE = {mae_va_rain:.3f}   RMSE = {rmse_va_rain:.3f}")


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[363]	valid_0's rmse: 0.374946	valid_0's l2: 0.140584
RAIN — Treino:
MAE = 0.094   RMSE = 0.264

RAIN — Validação Interna:
MAE = 0.116   RMSE = 0.375


In [None]:
# === CROSS-VALIDATION TEMPORAL — RAIN ===

tscv_rain = TimeSeriesSplit(n_splits=5)
rmse_scores_rain = []

print("Executando TSCV para RAIN...\n")

for fold, (train_idx, test_idx) in tqdm(
    enumerate(tscv_rain.split(X_train), start=1),
    total=5
):
    X_tr, X_te = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_tr, y_te = y_rain_train.iloc[train_idx], y_rain_train.iloc[test_idx]

    modelo = LGBMRegressor(
        n_estimators=3000,
        learning_rate=0.02,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )

    modelo.fit(
        X_tr, y_tr,
        eval_set=[(X_te, y_te)],
        eval_metric="rmse",
        callbacks=[lgb.early_stopping(stopping_rounds=80)]
    )

    preds = modelo.predict(X_te)
    rmse = np.sqrt(mean_squared_error(y_te, preds))
    rmse_scores_rain.append(rmse)

    tqdm.write(f"Fold {fold}/5 — RMSE RAIN: {rmse:.3f}")

print("\n===== RESULTADOS CV - RAIN =====")
print(f"RMSE MÉDIO: {np.mean(rmse_scores_rain):.4f}")
print(f"DESVIO:     {np.std(rmse_scores_rain):.4f}")


In [None]:
# === RANDOM SEARCH — RAIN ===

param_dist_rain = {
    "num_leaves": [31, 63, 127],
    "learning_rate": [0.01, 0.03, 0.05, 0.1],
    "min_data_in_leaf": [20, 40, 60],
    "feature_fraction": [0.7, 0.8, 0.9],
    "bagging_fraction": [0.7, 0.8, 0.9],
    "bagging_freq": [1, 3],
    "max_depth": [-1, 5, 8],
    "lambda_l1": [0, 0.1, 0.3],
    "lambda_l2": [0, 0.1, 0.3],
}

param_list_rain = list(ParameterSampler(param_dist_rain, n_iter=20, random_state=42))

best_score_rain = float("inf")
best_params_rain = None

print("Iniciando busca de hiperparâmetros para RAIN...\n")

for params in tqdm(param_list_rain, desc="Random Search RAIN"):
    fold_scores = []

    for train_idx, test_idx in TimeSeriesSplit(n_splits=5).split(X_train):
        X_tr, X_te = X_train.iloc[train_idx], X_train.iloc[test_idx]
        y_tr, y_te = y_rain_train.iloc[train_idx], y_rain_train.iloc[test_idx]

        model_rain = LGBMRegressor(
            **params,
            random_state=42,
            n_jobs=-1,
            verbosity=-1
        )

        with suppress_stdout():
            model_rain.fit(
                X_tr, y_tr,
                eval_set=[(X_te, y_te)],
                eval_metric="rmse",
                callbacks=[lgb.early_stopping(stopping_rounds=80, verbose=False)]
            )

        preds = model_rain.predict(X_te)
        rmse = np.sqrt(mean_squared_error(y_te, preds))
        fold_scores.append(rmse)

    mean_rmse = np.mean(fold_scores)

    if mean_rmse < best_score_rain:
        best_score_rain = mean_rmse
        best_params_rain = params

print("\n===== MELHORES HIPERPARÂMETROS RAIN =====")
print(best_params_rain)
print("Melhor RMSE encontrado:", best_score_rain)


In [None]:
# === MODELO FINAL — CHUVA (RAIN) ===

modelo_rain_final = LGBMRegressor(
    **best_params_rain,
    random_state=42,
    n_jobs=-1
)

modelo_rain_final.fit(
    X_train,
    y_rain_train,
    eval_set=[(X_test, y_rain_test)],
    eval_metric="rmse",
    callbacks=[lgb.early_stopping(stopping_rounds=80, verbose=False)]
)

# Previsões finais
y_rain_pred_train_final = modelo_rain_final.predict(X_train)
y_rain_pred_valid_final = modelo_rain_final.predict(X_valid)
y_rain_pred_test_final  = modelo_rain_final.predict(X_test)

# Métricas finais
print("\n===== RESULTADOS FINAIS - RAIN =====\n")

print("Treino:")
print("MAE  =", round(mean_absolute_error(y_rain_train, y_rain_pred_train_final), 3))
print("RMSE =", round(np.sqrt(mean_squared_error(y_rain_train, y_rain_pred_train_final)), 3))

print("\nValidação Interna:")
print("MAE  =", round(mean_absolute_error(y_rain_valid, y_rain_pred_valid_final), 3))
print("RMSE =", round(np.sqrt(mean_squared_error(y_rain_valid, y_rain_pred_valid_final)), 3))

print("\nTeste Final:")
print("MAE  =", round(mean_absolute_error(y_rain_test, y_rain_pred_test_final), 3))
print("RMSE =", round(np.sqrt(mean_squared_error(y_rain_test, y_rain_pred_test_final)), 3))


# Previsão próximo ano

In [82]:
# === PREPARO PARA PREVISÃO DO PRÓXIMO ANO ===

# df_features_scaled terá: time + todas as features escaladas
df_features_scaled = pd.concat([df["time"], X_scaled], axis=1)

# Datas futuras
future_times = pd.date_range(
    start=df["time"].max() + pd.Timedelta(hours=1),
    periods=24 * 365,
    freq="H"
)

future_df = pd.DataFrame({"time": future_times})
future_df["time_ref"] = future_df["time"] - pd.Timedelta(days=365)

# Merge com o ano anterior
future_merged = future_df.merge(
    df_features_scaled,
    left_on="time_ref",
    right_on="time",
    how="left",
    suffixes=("_future", "_past")
).drop(columns=["time_past"]).rename(columns={"time_future": "time"})


  future_times = pd.date_range(


In [None]:
# === PREVISÃO FINAL ===

X_future = future_merged[feature_cols].fillna(0)

temp_future = np.round(modelo_temp_final.predict(X_future), 1)
rain_future = np.round(np.clip(modelo_rain_final.predict(X_future), 0, None), 1)

df_prev = pd.DataFrame({
    "time": future_merged["time"],
    "temperature": temp_future,
    "rain": rain_future
})

df_prev.head()


In [None]:
df_prev.to_csv("PREVISAO.csv", index=False)
print("PREVISAO.csv gerado com sucesso!")

# Gráficos

In [None]:
# === GRÁFICO — Temperatura ===
plt.figure(figsize=(14,4))
plt.plot(y_temp_test.values[-500:], label="Real", linewidth=2)
plt.plot(y_pred_test_final[-500:], label="Previsto", linewidth=2)
plt.title("Temperatura — Últimas 500 horas do teste", fontsize=14)
plt.legend()
plt.grid(alpha=0.3)
plt.show()

# === GRÁFICO — Chuva ===
plt.figure(figsize=(14,4))
plt.plot(y_rain_test.values[-500:], label="Real", linewidth=2)
plt.plot(y_rain_pred_test_final[-500:], label="Previsto", linewidth=2)
plt.title("Chuva — Últimas 500 horas do teste", fontsize=14)
plt.legend()
plt.grid(alpha=0.3)
plt.show()
