In [1]:
import pandas as pd
import numpy as np
import math

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
# Load the historical per-province weather table (path is relative to the notebook).
df = pd.read_csv(filepath_or_buffer="tem_historico_groupby_sincos_confecha.csv")


In [3]:
# Parse 'fecha'; with errors='coerce' any unparseable value becomes NaT
# instead of raising.
df['fecha'] = pd.to_datetime(df['fecha'], errors='coerce')

# Drop rows whose date could not be parsed before sorting. NaT sorts last, so
# such rows would otherwise fail every `fecha <= split_date` comparison (and
# end up in the test set through `~train_mask`) and could be picked as a
# province's "last" record when forecasting.
df = (
    df.dropna(subset=['fecha'])
    .sort_values(by='fecha')
    .reset_index(drop=True)
)

In [4]:
# Numeric weather variables (despite the `_cat` suffix): they are used both as
# the regression targets and as the source columns for the lag features.
columnas_cat = [
    "velmedia",
    "velmax",
    "tmed",
    "tmax",
    "tmin",
]

def create_lag_features(df, group_col, date_col, columnas_cat, lag=1):
    """Return a sorted copy of ``df`` with per-group lagged columns appended.

    The frame is ordered by ``(group_col, date_col)`` and, for every column in
    ``columnas_cat``, a new column ``"<col>_lag<lag>"`` is added holding the
    value found ``lag`` rows earlier within the same group. The first ``lag``
    rows of each group have no predecessor and therefore contain NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    group_col : str
        Column that identifies each independent series.
    date_col : str
        Column used to order the rows inside each group.
    columnas_cat : iterable of str
        Columns to lag.
    lag : int, default 1
        Number of rows to shift by. The shift is positional, not calendar
        based, so gaps in ``date_col`` are not accounted for.

    Returns
    -------
    pandas.DataFrame
        New frame sorted by ``(group_col, date_col)`` with the lag columns
        appended after the original ones; original index labels are kept.
    """
    ordered = df.sort_values(by=[group_col, date_col])
    grouped = ordered.groupby(group_col)
    # The suffix follows the requested lag so that lag=2 is not mislabelled
    # as "_lag1"; for the default lag=1 the names are unchanged.
    suffix = f"_lag{lag}"
    lagged = {col + suffix: grouped[col].shift(lag) for col in columnas_cat}
    return ordered.assign(**lagged)

# Names of the lag-1 columns produced by create_lag_features.
lag_cols = [f"{col}_lag1" for col in columnas_cat]

# Build the lagged table and discard the rows for which a lag could not be
# computed (at least the first record of every province).
df_lag = create_lag_features(
    df, "provincia", "fecha", columnas_cat, lag=1
).dropna(subset=lag_cols)

In [5]:
# Predictors: province, seasonal sin/cos encoding and the previous record's
# values. Targets: the current record's values for the same variables.
features = ["provincia", "sin_day_of_year", "cos_day_of_year", *lag_cols]
X = df_lag[features].copy()
y = df_lag[columnas_cat].copy()



In [6]:
# Chronological split: the 80th percentile of 'fecha' is the cutoff date.
split_date = df_lag["fecha"].quantile(0.8)

# Build the masks straight from df_lag (X and y were sliced from it, so they
# share its index) instead of temporarily adding 'fecha' to X, which left the
# extra column on X after the cell ran.
# The test mask is an explicit `>` comparison rather than `~train_mask`, so a
# row with a missing date (NaT compares False both ways) is left out of both
# sets instead of silently landing in the test set.
train_mask = df_lag["fecha"] <= split_date
test_mask = df_lag["fecha"] > split_date

X_train = X[train_mask]
y_train = y[train_mask]
X_test = X[test_mask]
y_test = y[test_mask]

print("Tamaño train:", X_train.shape)
print("Tamaño test :", X_test.shape)

Tamaño train: (67184, 8)
Tamaño test : (16796, 8)


In [7]:
# Preprocessing: one-hot encode 'provincia' (categories unseen during fit are
# ignored at predict time); every other column, i.e. the numeric ones, is
# passed through unchanged.
col_transformer = ColumnTransformer(
    [("ohe_prov", OneHotEncoder(handle_unknown="ignore"), ["provincia"])],
    remainder="passthrough",
)

In [8]:
# Base learner: a depth-limited random forest with a fixed seed, using all cores.
base_rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    n_jobs=-1,
    random_state=42,
)
# Wrap the forest so it can be fitted against the multi-column target frame.
multi_rf = MultiOutputRegressor(estimator=base_rf)

# Full model: preprocessing followed by the multi-output regressor.
pipeline = Pipeline(
    steps=[
        ("preprocessor", col_transformer),
        ("rf_multi", multi_rf),
    ]
)

In [9]:
# Fit on the training window, then predict the held-out window.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Re-attach column names and the test index so predictions line up with y_test.
y_pred_df = pd.DataFrame(data=y_pred, index=y_test.index, columns=columnas_cat)

# Per-target error report.
for col in columnas_cat:
    actual, predicted = y_test[col], y_pred_df[col]
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actual, predicted)
    r2  = r2_score(actual, predicted)
    print(f"Variable: {col}")
    print(f"  RMSE: {rmse:.3f}")
    print(f"  MAE : {mae:.3f}")
    print(f"  R²  : {r2:.3f}")
    print("-"*30)

Variable: velmedia
  RMSE: 1.080
  MAE : 0.791
  R²  : 0.396
------------------------------
Variable: velmax
  RMSE: 2.985
  MAE : 2.224
  R²  : 0.335
------------------------------
Variable: tmed
  RMSE: 1.880
  MAE : 1.408
  R²  : 0.905
------------------------------
Variable: tmax
  RMSE: 2.446
  MAE : 1.863
  R²  : 0.870
------------------------------
Variable: tmin
  RMSE: 2.115
  MAE : 1.581
  R²  : 0.874
------------------------------


In [10]:
# a) Last record per province.
# groupby(...).tail(1) selects the final row of each group directly. It
# replaces groupby(...).apply(lambda d: d.iloc[-1]), which emits a pandas
# warning because the lambda also receives the grouping column, and which
# passes every row through an intermediate Series.
# The explicit sort keeps the rows ordered by province and makes "last" mean
# the latest 'fecha' regardless of how df_lag happens to be ordered.
last_data = (
    df_lag.sort_values(["provincia", "fecha"])
    .groupby("provincia")
    .tail(1)
    .reset_index(drop=True)
)

# b) Date to forecast: the day after each province's last record.
last_data['next_date'] = last_data['fecha'] + pd.Timedelta(days=1)

# c) Day of year of next_date and its sin/cos encoding.
# NOTE(review): the 366-day period is assumed to match how the
# 'sin_day_of_year' / 'cos_day_of_year' columns of the input CSV were built;
# confirm, otherwise the forecast features will not match the training ones.
last_data['day_of_year_next'] = last_data['next_date'].dt.dayofyear
last_data['sin_day_of_year_next'] = np.sin(2 * np.pi * last_data['day_of_year_next'] / 366)
last_data['cos_day_of_year_next'] = np.cos(2 * np.pi * last_data['day_of_year_next'] / 366)


  last_data = df_lag.groupby("provincia").apply(lambda d: d.iloc[-1]).reset_index(drop=True)


In [11]:
# Feature frame for the day after each province's last record: the seasonal
# terms come from next_date, and the last observed values take the place of
# the lag-1 inputs. Column order matches the training feature list.
X_future = pd.DataFrame(
    {
        "provincia": last_data["provincia"],
        "sin_day_of_year": last_data["sin_day_of_year_next"],
        "cos_day_of_year": last_data["cos_day_of_year_next"],
        **{f"{col}_lag1": last_data[col] for col in columnas_cat},
    }
)

In [12]:
# Predict the next day for every province and label the output columns,
# keeping last_data's index so the rows can be joined back to it.
future_pred = pipeline.predict(X_future)
future_pred_df = pd.DataFrame(
    data=future_pred,
    index=last_data.index,
    columns=columnas_cat,
)


In [14]:
# Put province and forecast date side by side with the predicted values.
result_future = (
    pd.concat([last_data[["provincia", "next_date"]], future_pred_df], axis=1)
    .rename(columns={"next_date": "predicted_date"})
)

print("\nPredicción para el día siguiente (por provincia):")
result_future


Predicción para el día siguiente (por provincia):


Unnamed: 0,provincia,predicted_date,velmedia,velmax,tmed,tmax,tmin
0,A CORUÑA,2025-02-12,3.906105,11.127575,11.077692,13.73367,7.940188
1,ALBACETE,2025-02-12,2.417502,9.107374,10.600645,17.005713,3.898731
2,ALICANTE,2025-02-12,2.012608,7.695911,11.704792,17.33547,5.860023
3,ALMERIA,2025-02-12,2.64039,9.057944,11.733017,16.445222,7.07658
4,ARABA/ALAVA,2025-02-12,2.959329,9.955004,9.993606,13.572201,6.005671
5,ASTURIAS,2025-02-12,2.900603,10.93708,10.492251,14.852841,5.690919
6,AVILA,2025-02-12,2.962145,10.205015,7.930213,11.968868,3.439147
7,BADAJOZ,2025-02-12,3.565269,12.45161,10.056576,13.562665,6.199748
8,BALEARES,2025-02-12,2.695772,8.10554,11.725856,16.954357,6.314088
9,BARCELONA,2025-02-12,2.053689,7.72527,9.935676,13.663985,5.65578
