In [1]:
import pandas as pd
import numpy as np
import math

from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
df = pd.read_csv("/Users/pedrohd/Documents/GIT_METEOROLOGÍA/weather_forecast/Data/transformed/data_transformed_open_meteo.csv")

# Aseguramos que 'date' sea datetime y ordenamos por provincia y fecha
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.sort_values(['provincia', 'date']).reset_index(drop=True)

In [3]:
df["precip_lag1"] = df.groupby("provincia")["precipitation_sum"].shift(1)
df["wind_lag1"]   = df.groupby("provincia")["windspeed_10m_max"].shift(1)

# Rolling 3 días (media), aplicamos groupby y rolling, luego reset_index
df["precip_rolling_3"] = (
    df.groupby("provincia")["precipitation_sum"]
      .rolling(window=3, min_periods=1).mean()
      .reset_index(level=0, drop=True)
)
df["wind_rolling_3"] = (
    df.groupby("provincia")["windspeed_10m_max"]
      .rolling(window=3, min_periods=1).mean()
      .reset_index(level=0, drop=True)
)
df

Unnamed: 0,date,temperature_2m_max,temperature_2m_min,temperature_2m_mean,precipitation_sum,windspeed_10m_max,provincia,sin_day_of_year,cos_day_of_year,precip_lag1,wind_lag1,precip_rolling_3,wind_rolling_3
0,2013-01-01,12.3,9.3,10.8,1.2,19.8,A Coruña,0.017166,0.999853,,,1.200000,19.800000
1,2013-01-02,12.4,6.0,9.1,0.0,20.4,A Coruña,0.034328,0.999411,1.2,19.8,0.600000,20.100000
2,2013-01-03,16.1,6.4,10.1,0.0,17.1,A Coruña,0.051479,0.998674,0.0,20.4,0.400000,19.100000
3,2013-01-04,15.2,5.0,9.0,0.0,11.3,A Coruña,0.068615,0.997643,0.0,17.1,0.000000,16.266667
4,2013-01-05,14.2,4.1,8.2,0.0,9.7,A Coruña,0.085731,0.996318,0.0,11.3,0.000000,12.700000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
230791,2025-02-20,15.6,1.5,7.5,0.0,7.1,Ávila,0.767880,0.640593,0.8,8.7,0.266667,13.733333
230792,2025-02-21,11.1,4.2,7.3,5.1,28.2,Ávila,0.778764,0.627317,0.0,7.1,1.966667,14.666667
230793,2025-02-22,8.9,1.6,4.9,0.4,13.4,Ávila,0.789418,0.613856,5.1,28.2,1.833333,16.233333
230794,2025-02-23,1.4,0.9,4.9,0.4,4.6,Ávila,0.799839,0.600214,0.4,13.4,1.966667,15.400000


In [4]:
target_cols = [
    "temperature_2m_max",
    "temperature_2m_min",
    "temperature_2m_mean",
    "precipitation_sum",
    "windspeed_10m_max"
]

In [5]:
for col in target_cols:
    df["target_" + col] = df.groupby("provincia")[col].shift(-1)

df

Unnamed: 0,date,temperature_2m_max,temperature_2m_min,temperature_2m_mean,precipitation_sum,windspeed_10m_max,provincia,sin_day_of_year,cos_day_of_year,precip_lag1,wind_lag1,precip_rolling_3,wind_rolling_3,target_temperature_2m_max,target_temperature_2m_min,target_temperature_2m_mean,target_precipitation_sum,target_windspeed_10m_max
0,2013-01-01,12.3,9.3,10.8,1.2,19.8,A Coruña,0.017166,0.999853,,,1.200000,19.800000,12.4,6.0,9.1,0.0,20.4
1,2013-01-02,12.4,6.0,9.1,0.0,20.4,A Coruña,0.034328,0.999411,1.2,19.8,0.600000,20.100000,16.1,6.4,10.1,0.0,17.1
2,2013-01-03,16.1,6.4,10.1,0.0,17.1,A Coruña,0.051479,0.998674,0.0,20.4,0.400000,19.100000,15.2,5.0,9.0,0.0,11.3
3,2013-01-04,15.2,5.0,9.0,0.0,11.3,A Coruña,0.068615,0.997643,0.0,17.1,0.000000,16.266667,14.2,4.1,8.2,0.0,9.7
4,2013-01-05,14.2,4.1,8.2,0.0,9.7,A Coruña,0.085731,0.996318,0.0,11.3,0.000000,12.700000,13.5,2.6,7.2,0.0,8.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230791,2025-02-20,15.6,1.5,7.5,0.0,7.1,Ávila,0.767880,0.640593,0.8,8.7,0.266667,13.733333,11.1,4.2,7.3,5.1,28.2
230792,2025-02-21,11.1,4.2,7.3,5.1,28.2,Ávila,0.778764,0.627317,0.0,7.1,1.966667,14.666667,8.9,1.6,4.9,0.4,13.4
230793,2025-02-22,8.9,1.6,4.9,0.4,13.4,Ávila,0.789418,0.613856,5.1,28.2,1.833333,16.233333,1.4,0.9,4.9,0.4,4.6
230794,2025-02-23,1.4,0.9,4.9,0.4,4.6,Ávila,0.799839,0.600214,0.4,13.4,1.966667,15.400000,1.4,0.9,4.9,0.4,4.6


In [6]:
df["target_date"] = df.groupby("provincia")["date"].shift(-1)
df

Unnamed: 0,date,temperature_2m_max,temperature_2m_min,temperature_2m_mean,precipitation_sum,windspeed_10m_max,provincia,sin_day_of_year,cos_day_of_year,precip_lag1,wind_lag1,precip_rolling_3,wind_rolling_3,target_temperature_2m_max,target_temperature_2m_min,target_temperature_2m_mean,target_precipitation_sum,target_windspeed_10m_max,target_date
0,2013-01-01,12.3,9.3,10.8,1.2,19.8,A Coruña,0.017166,0.999853,,,1.200000,19.800000,12.4,6.0,9.1,0.0,20.4,2013-01-02
1,2013-01-02,12.4,6.0,9.1,0.0,20.4,A Coruña,0.034328,0.999411,1.2,19.8,0.600000,20.100000,16.1,6.4,10.1,0.0,17.1,2013-01-03
2,2013-01-03,16.1,6.4,10.1,0.0,17.1,A Coruña,0.051479,0.998674,0.0,20.4,0.400000,19.100000,15.2,5.0,9.0,0.0,11.3,2013-01-04
3,2013-01-04,15.2,5.0,9.0,0.0,11.3,A Coruña,0.068615,0.997643,0.0,17.1,0.000000,16.266667,14.2,4.1,8.2,0.0,9.7,2013-01-05
4,2013-01-05,14.2,4.1,8.2,0.0,9.7,A Coruña,0.085731,0.996318,0.0,11.3,0.000000,12.700000,13.5,2.6,7.2,0.0,8.4,2013-01-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230791,2025-02-20,15.6,1.5,7.5,0.0,7.1,Ávila,0.767880,0.640593,0.8,8.7,0.266667,13.733333,11.1,4.2,7.3,5.1,28.2,2025-02-21
230792,2025-02-21,11.1,4.2,7.3,5.1,28.2,Ávila,0.778764,0.627317,0.0,7.1,1.966667,14.666667,8.9,1.6,4.9,0.4,13.4,2025-02-22
230793,2025-02-22,8.9,1.6,4.9,0.4,13.4,Ávila,0.789418,0.613856,5.1,28.2,1.833333,16.233333,1.4,0.9,4.9,0.4,4.6,2025-02-23
230794,2025-02-23,1.4,0.9,4.9,0.4,4.6,Ávila,0.799839,0.600214,0.4,13.4,1.966667,15.400000,1.4,0.9,4.9,0.4,4.6,2025-02-24


In [7]:
df["target_day_of_year"] = df["target_date"].dt.dayofyear
df["target_sin_day_of_year"] = np.sin(2 * np.pi * df["target_day_of_year"] / 366)
df["target_cos_day_of_year"] = np.cos(2 * np.pi * df["target_day_of_year"] / 366)


In [8]:
df_model = df.dropna(subset=["target_" + col for col in target_cols]).copy()
df_model

Unnamed: 0,date,temperature_2m_max,temperature_2m_min,temperature_2m_mean,precipitation_sum,windspeed_10m_max,provincia,sin_day_of_year,cos_day_of_year,precip_lag1,...,wind_rolling_3,target_temperature_2m_max,target_temperature_2m_min,target_temperature_2m_mean,target_precipitation_sum,target_windspeed_10m_max,target_date,target_day_of_year,target_sin_day_of_year,target_cos_day_of_year
0,2013-01-01,12.3,9.3,10.8,1.2,19.8,A Coruña,0.017166,0.999853,,...,19.800000,12.4,6.0,9.1,0.0,20.4,2013-01-02,2.0,0.034328,0.999411
1,2013-01-02,12.4,6.0,9.1,0.0,20.4,A Coruña,0.034328,0.999411,1.2,...,20.100000,16.1,6.4,10.1,0.0,17.1,2013-01-03,3.0,0.051479,0.998674
2,2013-01-03,16.1,6.4,10.1,0.0,17.1,A Coruña,0.051479,0.998674,0.0,...,19.100000,15.2,5.0,9.0,0.0,11.3,2013-01-04,4.0,0.068615,0.997643
3,2013-01-04,15.2,5.0,9.0,0.0,11.3,A Coruña,0.068615,0.997643,0.0,...,16.266667,14.2,4.1,8.2,0.0,9.7,2013-01-05,5.0,0.085731,0.996318
4,2013-01-05,14.2,4.1,8.2,0.0,9.7,A Coruña,0.085731,0.996318,0.0,...,12.700000,13.5,2.6,7.2,0.0,8.4,2013-01-06,6.0,0.102821,0.994700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230790,2025-02-19,12.4,2.9,6.7,0.8,8.7,Ávila,0.756771,0.653680,0.0,...,16.566667,15.6,1.5,7.5,0.0,7.1,2025-02-20,51.0,0.767880,0.640593
230791,2025-02-20,15.6,1.5,7.5,0.0,7.1,Ávila,0.767880,0.640593,0.8,...,13.733333,11.1,4.2,7.3,5.1,28.2,2025-02-21,52.0,0.778764,0.627317
230792,2025-02-21,11.1,4.2,7.3,5.1,28.2,Ávila,0.778764,0.627317,0.0,...,14.666667,8.9,1.6,4.9,0.4,13.4,2025-02-22,53.0,0.789418,0.613856
230793,2025-02-22,8.9,1.6,4.9,0.4,13.4,Ávila,0.789418,0.613856,5.1,...,16.233333,1.4,0.9,4.9,0.4,4.6,2025-02-23,54.0,0.799839,0.600214


In [9]:
df_model.isna().sum()

date                           0
temperature_2m_max             0
temperature_2m_min             0
temperature_2m_mean            0
precipitation_sum              0
windspeed_10m_max              0
provincia                      0
sin_day_of_year                0
cos_day_of_year                0
precip_lag1                   52
wind_lag1                     52
precip_rolling_3               0
wind_rolling_3                 0
target_temperature_2m_max      0
target_temperature_2m_min      0
target_temperature_2m_mean     0
target_precipitation_sum       0
target_windspeed_10m_max       0
target_date                    0
target_day_of_year             0
target_sin_day_of_year         0
target_cos_day_of_year         0
dtype: int64

In [10]:
df_model = df_model.dropna(subset=["precip_lag1", "wind_lag1", "precip_rolling_3", "wind_rolling_3"])
df_model.isna().sum()

date                          0
temperature_2m_max            0
temperature_2m_min            0
temperature_2m_mean           0
precipitation_sum             0
windspeed_10m_max             0
provincia                     0
sin_day_of_year               0
cos_day_of_year               0
precip_lag1                   0
wind_lag1                     0
precip_rolling_3              0
wind_rolling_3                0
target_temperature_2m_max     0
target_temperature_2m_min     0
target_temperature_2m_mean    0
target_precipitation_sum      0
target_windspeed_10m_max      0
target_date                   0
target_day_of_year            0
target_sin_day_of_year        0
target_cos_day_of_year        0
dtype: int64

In [11]:
feature_cols = [
    "provincia",
    "temperature_2m_max",
    "temperature_2m_min",
    "temperature_2m_mean",
    "precipitation_sum",
    "windspeed_10m_max",
    "precip_lag1",
    "wind_lag1",
    "precip_rolling_3",
    "wind_rolling_3",
    "target_sin_day_of_year",
    "target_cos_day_of_year"
]

In [12]:
target_output_cols = ["target_" + col for col in target_cols]


In [13]:
X = df_model[feature_cols].copy()
y = df_model[target_output_cols].copy()

In [14]:
split_date = df_model["date"].quantile(0.8)
mask_train = df_model["date"] <= split_date
X_train = X[mask_train]
y_train = y[mask_train]
X_test = X[~mask_train]
y_test = y[~mask_train]

print("Tamaño train:", X_train.shape, y_train.shape)
print("Tamaño test :", X_test.shape, y_test.shape)

Tamaño train: (184600, 12) (184600, 5)
Tamaño test : (46092, 12) (46092, 5)


In [15]:
col_transformer = ColumnTransformer(
    transformers=[
        ("ohe_prov", OneHotEncoder(handle_unknown="ignore"), ["provincia"])
    ],
    remainder="passthrough"
)


In [16]:
base_rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)
multi_rf = MultiOutputRegressor(base_rf)

pipeline = Pipeline([
    ("preprocessor", col_transformer),
    ("rf_multi", multi_rf)
])

In [17]:
tscv = TimeSeriesSplit(n_splits=2)

param_distributions = {
    "rf_multi__estimator__n_estimators": [50, 100, 200],
    "rf_multi__estimator__max_depth": [5, 10],
    "rf_multi__estimator__min_samples_leaf": [1, 2, 4]
}

random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=5,             # Prueba 5 combinaciones aleatorias
    cv=tscv,
    scoring="neg_mean_absolute_error",
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

print("Mejores parámetros:", random_search.best_params_)
print("Mejor score (CV):", random_search.best_score_)

Mejores parámetros: {'rf_multi__estimator__n_estimators': 200, 'rf_multi__estimator__min_samples_leaf': 4, 'rf_multi__estimator__max_depth': 5}
Mejor score (CV): -2.172608042018108


In [18]:
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred_df = pd.DataFrame(y_pred, columns=target_output_cols, index=y_test.index)

In [19]:
print("\nMétricas en el test set:")
for col in target_output_cols:
    mse = mean_squared_error(y_test[col], y_pred_df[col])
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test[col], y_pred_df[col])
    r2  = r2_score(y_test[col], y_pred_df[col])
    print(f"{col} -> RMSE: {rmse:.3f}, MAE: {mae:.3f}, R²: {r2:.3f}")


Métricas en el test set:
target_temperature_2m_max -> RMSE: 2.497, MAE: 1.883, R²: 0.891
target_temperature_2m_min -> RMSE: 2.049, MAE: 1.580, R²: 0.902
target_temperature_2m_mean -> RMSE: 1.769, MAE: 1.335, R²: 0.934
target_precipitation_sum -> RMSE: 5.016, MAE: 2.300, R²: 0.149
target_windspeed_10m_max -> RMSE: 5.973, MAE: 4.533, R²: 0.340


In [21]:
# =============================================================================
# 9. Predicción para el día siguiente (por provincia)
# =============================================================================
# Tomamos el último registro de cada provincia (día t)
last_data = df.groupby("provincia").apply(lambda g: g.iloc[-1]).reset_index(drop=True)

# Calculamos la fecha del día siguiente y su estacionalidad
last_data["next_date"] = last_data["date"] + pd.Timedelta(days=1)
last_data["next_day_of_year"] = last_data["next_date"].dt.dayofyear
last_data["target_sin_day_of_year"] = np.sin(2 * np.pi * last_data["next_day_of_year"] / 366)
last_data["target_cos_day_of_year"] = np.cos(2 * np.pi * last_data["next_day_of_year"] / 366)

# Para las lags y rolling, usamos ya los valores del último registro (día t)
X_future = pd.DataFrame({
    "provincia": last_data["provincia"],
    "temperature_2m_max": last_data["temperature_2m_max"],
    "temperature_2m_min": last_data["temperature_2m_min"],
    "temperature_2m_mean": last_data["temperature_2m_mean"],
    "precipitation_sum": last_data["precipitation_sum"],
    "windspeed_10m_max": last_data["windspeed_10m_max"],
    "precip_lag1": last_data["precip_lag1"],
    "wind_lag1": last_data["wind_lag1"],
    "precip_rolling_3": last_data["precip_rolling_3"],
    "wind_rolling_3": last_data["wind_rolling_3"],
    "target_sin_day_of_year": last_data["target_sin_day_of_year"],
    "target_cos_day_of_year": last_data["target_cos_day_of_year"]
})

future_pred = best_model.predict(X_future)
future_pred_df = pd.DataFrame(future_pred, columns=target_output_cols, index=last_data.index)

result_future = pd.concat([last_data[["provincia", "next_date"]], future_pred_df], axis=1)
result_future.rename(columns={"next_date": "predicted_date"}, inplace=True)

print("\nPredicción para el día siguiente (por provincia):")
result_future

  last_data = df.groupby("provincia").apply(lambda g: g.iloc[-1]).reset_index(drop=True)



Predicción para el día siguiente (por provincia):


Unnamed: 0,provincia,predicted_date,target_temperature_2m_max,target_temperature_2m_min,target_temperature_2m_mean,target_precipitation_sum,target_windspeed_10m_max
0,A Coruña,2025-02-25,8.726633,6.000671,10.750108,4.770912,13.162629
1,Albacete,2025-02-25,7.011783,5.927516,9.966162,3.910019,12.529829
2,Alicante,2025-02-25,12.100187,11.525761,13.928928,4.761724,12.094419
3,Almería,2025-02-25,11.541212,9.798335,13.125407,5.801811,14.073541
4,Asturias,2025-02-25,7.011783,4.514935,8.363764,4.627758,10.111258
5,Badajoz,2025-02-25,9.083773,6.578337,10.937188,3.250134,12.189498
6,Baleares,2025-02-25,12.212319,11.525761,14.62745,2.299545,9.366058
7,Barcelona,2025-02-25,11.449282,9.798335,13.125407,5.814373,9.358095
8,Burgos,2025-02-25,4.52228,2.806096,7.155515,3.734371,15.160339
9,Cantabria,2025-02-26,8.726633,6.035239,9.97363,0.46808,9.418208


In [None]:
##CROSS VALIDATION