In [1]:
import pandas as pd
import numpy as np
import math

from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, classification_report, accuracy_score


## Lectura del csv y tratamiento de algunas columnas y variables

In [2]:
df = pd.read_csv("/Users/pedrohd/Documents/GIT_METEOROLOGÍA/weather_forecast/Data/transformed/transformed_open_meteo.csv")

df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.sort_values(['provincia', 'date']).reset_index(drop=True)

if 'windspeed_10m_max' in df.columns:
    df = df.drop(columns=['windspeed_10m_max'])

df['precipitation_sum'] = df['precipitation_sum'].apply(lambda x: False if x <= 0.2 else True)

## Creamos lags y ventanas con la media de los 3 días anteriores

In [3]:
temp_vars = ["temperature_2m_max", "temperature_2m_min", "temperature_2m_mean"]
for var in temp_vars:
    df[var + "_lag1"] = df.groupby("provincia")[var].shift(1)
    df[var + "_rolling_3"] = df.groupby("provincia")[var] \
                               .rolling(window=3, min_periods=1).mean() \
                               .reset_index(level=0, drop=True)

df["precip_lag1"] = df.groupby("provincia")["precipitation_sum"].shift(1)
df["precip_rolling_3"] = df.groupby("provincia")["precipitation_sum"] \
                            .rolling(window=3, min_periods=1) \
                            .apply(lambda x: np.mean(x.astype(int))) \
                            .reset_index(level=0, drop=True)

In [4]:
df

Unnamed: 0,date,temperature_2m_max,temperature_2m_min,temperature_2m_mean,precipitation_sum,provincia,sin_day_of_year,cos_day_of_year,temperature_2m_max_lag1,temperature_2m_max_rolling_3,temperature_2m_min_lag1,temperature_2m_min_rolling_3,temperature_2m_mean_lag1,temperature_2m_mean_rolling_3,precip_lag1,precip_rolling_3
0,2013-01-01,12.3,9.3,10.8,True,A Coruña,0.017166,0.999853,,12.300000,,9.300000,,10.800000,,1.000000
1,2013-01-02,12.4,6.0,9.1,False,A Coruña,0.034328,0.999411,12.3,12.350000,9.3,7.650000,10.8,9.950000,True,0.500000
2,2013-01-03,16.1,6.4,10.1,False,A Coruña,0.051479,0.998674,12.4,13.600000,6.0,7.233333,9.1,10.000000,False,0.333333
3,2013-01-04,15.2,5.0,9.0,False,A Coruña,0.068615,0.997643,16.1,14.566667,6.4,5.800000,10.1,9.400000,False,0.000000
4,2013-01-05,14.2,4.1,8.2,False,A Coruña,0.085731,0.996318,15.2,15.166667,5.0,5.166667,9.0,9.100000,False,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230791,2025-02-20,15.6,1.5,7.5,False,Ávila,0.767880,0.640593,12.4,13.233333,2.9,2.666667,6.7,7.133333,True,0.333333
230792,2025-02-21,11.1,4.2,7.3,True,Ávila,0.778764,0.627317,15.6,13.033333,1.5,2.866667,7.5,7.166667,False,0.666667
230793,2025-02-22,8.9,1.6,4.9,True,Ávila,0.789418,0.613856,11.1,11.866667,4.2,2.433333,7.3,6.566667,True,0.666667
230794,2025-02-23,1.4,0.9,4.9,True,Ávila,0.799839,0.600214,8.9,7.133333,1.6,2.233333,4.9,5.700000,True,1.000000


## Creación de targets para predicción diaria (qué pasará mañana)

In [5]:
target_temp_cols = ["temperature_2m_max", "temperature_2m_min", "temperature_2m_mean"]
for col in target_temp_cols:
    df["target_" + col] = df.groupby("provincia")[col].shift(-1)

df["target_precipitation_sum"] = df.groupby("provincia")["precipitation_sum"].shift(-1)

df["target_date"] = df.groupby("provincia")["date"].shift(-1)
df["target_day_of_year"] = df["target_date"].dt.dayofyear
df["target_sin_day_of_year"] = np.sin(2 * np.pi * df["target_day_of_year"] / 366)
df["target_cos_day_of_year"] = np.cos(2 * np.pi * df["target_day_of_year"] / 366)

In [6]:
required_targets = ["target_" + col for col in target_temp_cols] + ["target_precipitation_sum"]
df_model = df.dropna(subset=required_targets).copy()

features_required = ["precip_lag1", "precip_rolling_3"] \
    + [var + "_lag1" for var in temp_vars] \
    + [var + "_rolling_3" for var in temp_vars]
df_model = df_model.dropna(subset=features_required)
print("Dataset para modelado:", df_model.shape)

Dataset para modelado: (230692, 24)


In [7]:
df_model

Unnamed: 0,date,temperature_2m_max,temperature_2m_min,temperature_2m_mean,precipitation_sum,provincia,sin_day_of_year,cos_day_of_year,temperature_2m_max_lag1,temperature_2m_max_rolling_3,...,precip_lag1,precip_rolling_3,target_temperature_2m_max,target_temperature_2m_min,target_temperature_2m_mean,target_precipitation_sum,target_date,target_day_of_year,target_sin_day_of_year,target_cos_day_of_year
1,2013-01-02,12.4,6.0,9.1,False,A Coruña,0.034328,0.999411,12.3,12.350000,...,True,0.500000,16.1,6.4,10.1,False,2013-01-03,3.0,0.051479,0.998674
2,2013-01-03,16.1,6.4,10.1,False,A Coruña,0.051479,0.998674,12.4,13.600000,...,False,0.333333,15.2,5.0,9.0,False,2013-01-04,4.0,0.068615,0.997643
3,2013-01-04,15.2,5.0,9.0,False,A Coruña,0.068615,0.997643,16.1,14.566667,...,False,0.000000,14.2,4.1,8.2,False,2013-01-05,5.0,0.085731,0.996318
4,2013-01-05,14.2,4.1,8.2,False,A Coruña,0.085731,0.996318,15.2,15.166667,...,False,0.000000,13.5,2.6,7.2,False,2013-01-06,6.0,0.102821,0.994700
5,2013-01-06,13.5,2.6,7.2,False,A Coruña,0.102821,0.994700,14.2,14.300000,...,False,0.000000,13.0,3.2,7.2,False,2013-01-07,7.0,0.119881,0.992788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230790,2025-02-19,12.4,2.9,6.7,True,Ávila,0.756771,0.653680,11.7,12.400000,...,False,0.333333,15.6,1.5,7.5,False,2025-02-20,51.0,0.767880,0.640593
230791,2025-02-20,15.6,1.5,7.5,False,Ávila,0.767880,0.640593,12.4,13.233333,...,True,0.333333,11.1,4.2,7.3,True,2025-02-21,52.0,0.778764,0.627317
230792,2025-02-21,11.1,4.2,7.3,True,Ávila,0.778764,0.627317,15.6,13.033333,...,False,0.666667,8.9,1.6,4.9,True,2025-02-22,53.0,0.789418,0.613856
230793,2025-02-22,8.9,1.6,4.9,True,Ávila,0.789418,0.613856,11.1,11.866667,...,True,0.666667,1.4,0.9,4.9,True,2025-02-23,54.0,0.799839,0.600214


## Preparamos y definimos los datos que vamos a utilizar para la predicción diaria

In [8]:
feature_cols_reg = [
    "provincia",
    "temperature_2m_max", "temperature_2m_min", "temperature_2m_mean",
    "temperature_2m_max_lag1", "temperature_2m_min_lag1", "temperature_2m_mean_lag1",
    "temperature_2m_max_rolling_3", "temperature_2m_min_rolling_3", "temperature_2m_mean_rolling_3",
    "precip_lag1", "precip_rolling_3",
    "target_sin_day_of_year", "target_cos_day_of_year"
]
target_cols_reg = ["target_" + col for col in target_temp_cols]

X_reg = df_model[feature_cols_reg].copy()
y_reg = df_model[target_cols_reg].copy()

feature_cols_clf = [
    "provincia",
    "precipitation_sum",      
    "precip_lag1", "precip_rolling_3",
    "target_sin_day_of_year", "target_cos_day_of_year"
]
target_col_clf = "target_precipitation_sum"

X_clf = df_model[feature_cols_clf].copy()
y_clf = df_model[target_col_clf].copy().astype(int)


## Dividimos los datos para el Random Forest Regressor de las temperaturas

In [9]:
split_date = df_model["date"].quantile(0.8)
mask_train = df_model["date"] <= split_date

X_train_reg = X_reg[mask_train]
y_train_reg = y_reg[mask_train]
X_test_reg = X_reg[~mask_train]
y_test_reg = y_reg[~mask_train]

print("Tamaño train (regresión):", X_train_reg.shape, y_train_reg.shape)
print("Tamaño test (regresión):", X_test_reg.shape, y_test_reg.shape)

Tamaño train (regresión): (184600, 14) (184600, 3)
Tamaño test (regresión): (46092, 14) (46092, 3)


## Entrenamiento del modelo multioutput con randomized search para las diferentes variables de temperatura, Encoding y Time Series Split

In [10]:
col_transformer_reg = ColumnTransformer(
    transformers=[
        ("ohe_prov", OneHotEncoder(handle_unknown="ignore"), ["provincia"])
    ],
    remainder="passthrough"
)

base_rf = RandomForestRegressor(random_state=42, n_jobs=-1)
multi_rf = MultiOutputRegressor(base_rf)

pipeline_reg = Pipeline([
    ("preprocessor", col_transformer_reg),
    ("rf_multi", multi_rf)
])

tscv = TimeSeriesSplit(n_splits=2)
param_distributions = {
    "rf_multi__estimator__n_estimators": [50, 100, 200],
    "rf_multi__estimator__max_depth": [5, 10],
    "rf_multi__estimator__min_samples_leaf": [1, 2, 4]
}

random_search_reg = RandomizedSearchCV(
    pipeline_reg,
    param_distributions=param_distributions,
    n_iter=3,  
    cv=tscv,
    scoring="neg_mean_absolute_error",
    random_state=42,
    n_jobs=-1
)

random_search_reg.fit(X_train_reg, y_train_reg)
print("Mejores parámetros (regresión):", random_search_reg.best_params_)
print("Mejor score (CV) (regresión):", random_search_reg.best_score_)

best_model_reg = random_search_reg.best_estimator_

Mejores parámetros (regresión): {'rf_multi__estimator__n_estimators': 200, 'rf_multi__estimator__min_samples_leaf': 4, 'rf_multi__estimator__max_depth': 5}
Mejor score (CV) (regresión): -1.5670998408791632


## Vemos y valoramos las métricas de nuestro modelo

In [11]:
y_pred_reg = best_model_reg.predict(X_test_reg)
y_pred_reg_df = pd.DataFrame(y_pred_reg, columns=target_cols_reg, index=y_test_reg.index)

print("\nMétricas en el test set para temperaturas:")
for col in target_cols_reg:
    mse = mean_squared_error(y_test_reg[col], y_pred_reg_df[col])
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test_reg[col], y_pred_reg_df[col])
    r2  = r2_score(y_test_reg[col], y_pred_reg_df[col])
    print(f"{col} -> RMSE: {rmse:.3f}, MAE: {mae:.3f}, R²: {r2:.3f}")


Métricas en el test set para temperaturas:
target_temperature_2m_max -> RMSE: 2.499, MAE: 1.884, R²: 0.891
target_temperature_2m_min -> RMSE: 2.048, MAE: 1.579, R²: 0.902
target_temperature_2m_mean -> RMSE: 1.769, MAE: 1.335, R²: 0.934


## División de datos del Random Forest Classifier para la predicción de si llueve o no al día siguiente

In [12]:
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=42, stratify=y_clf
)

print("Tamaño train (clasificación):", X_train_clf.shape)
print("Tamaño test (clasificación):", X_test_clf.shape)

Tamaño train (clasificación): (184553, 6)
Tamaño test (clasificación): (46139, 6)


## One Hot Encoding a provincia y entrenamiento del modelo classifier

In [13]:
col_transformer_clf = ColumnTransformer(
    transformers=[
        ("ohe_prov", OneHotEncoder(handle_unknown="ignore"), ["provincia"])
    ],
    remainder="passthrough"
)

clf_precip = RandomForestClassifier(random_state=42, n_jobs=-1)

pipeline_clf = Pipeline([
    ("preprocessor", col_transformer_clf),
    ("clf", clf_precip)
])

pipeline_clf.fit(X_train_clf, y_train_clf)


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



## Métricas del modelo

In [14]:
y_pred_clf = pipeline_clf.predict(X_test_clf)
print("Métricas para precipitación:")
print(classification_report(y_test_clf, y_pred_clf))
print("Accuracy:", accuracy_score(y_test_clf, y_pred_clf))


Métricas para precipitación:
              precision    recall  f1-score   support

           0       0.77      0.81      0.79     31003
           1       0.56      0.49      0.52     15136

    accuracy                           0.71     46139
   macro avg       0.66      0.65      0.66     46139
weighted avg       0.70      0.71      0.70     46139

Accuracy: 0.7056503175188019


In [15]:
df['precipitation_sum'].value_counts()

precipitation_sum
False    155073
True      75723
Name: count, dtype: int64

## Extracción de las últimas fechas y conversiones, variables futuras y predicción de las variables para cada provincia al día siguiente de la última fecha de los datos. Visualización de tabla de predicciones

In [16]:
last_data = df.groupby("provincia").apply(lambda g: g.iloc[-1]).reset_index(drop=True)

last_data["next_date"] = last_data["date"] + pd.Timedelta(days=1)
last_data["next_day_of_year"] = last_data["next_date"].dt.dayofyear
last_data["target_sin_day_of_year"] = np.sin(2 * np.pi * last_data["next_day_of_year"] / 366)
last_data["target_cos_day_of_year"] = np.cos(2 * np.pi * last_data["next_day_of_year"] / 366)

X_future_reg = pd.DataFrame({
    "provincia": last_data["provincia"],
    "temperature_2m_max": last_data["temperature_2m_max"],
    "temperature_2m_min": last_data["temperature_2m_min"],
    "temperature_2m_mean": last_data["temperature_2m_mean"],
    "temperature_2m_max_lag1": last_data["temperature_2m_max"],    
    "temperature_2m_min_lag1": last_data["temperature_2m_min"],
    "temperature_2m_mean_lag1": last_data["temperature_2m_mean"],
    "temperature_2m_max_rolling_3": last_data["temperature_2m_max_rolling_3"],
    "temperature_2m_min_rolling_3": last_data["temperature_2m_min_rolling_3"],
    "temperature_2m_mean_rolling_3": last_data["temperature_2m_mean_rolling_3"],
    "precip_lag1": last_data["precipitation_sum"],  
    "precip_rolling_3": last_data["precip_rolling_3"],
    "target_sin_day_of_year": last_data["target_sin_day_of_year"],
    "target_cos_day_of_year": last_data["target_cos_day_of_year"]
})

future_pred_reg = best_model_reg.predict(X_future_reg)
future_pred_reg_df = pd.DataFrame(future_pred_reg, columns=target_cols_reg)

X_future_clf = pd.DataFrame({
    "provincia": last_data["provincia"],
    "precipitation_sum": last_data["precipitation_sum"],
    "precip_lag1": last_data["precipitation_sum"],  
    "precip_rolling_3": last_data["precip_rolling_3"],
    "target_sin_day_of_year": last_data["target_sin_day_of_year"],
    "target_cos_day_of_year": last_data["target_cos_day_of_year"]
})

future_pred_clf = pipeline_clf.predict(X_future_clf)

result_future = last_data[["provincia", "next_date"]].copy()
result_future = result_future.join(future_pred_reg_df)
result_future["target_precipitation_sum_pred"] = future_pred_clf
result_future.rename(columns={"next_date": "predicted_date"}, inplace=True)

print("\nPredicción para el día siguiente (por provincia):")
result_future



Predicción para el día siguiente (por provincia):


  last_data = df.groupby("provincia").apply(lambda g: g.iloc[-1]).reset_index(drop=True)


Unnamed: 0,provincia,predicted_date,target_temperature_2m_max,target_temperature_2m_min,target_temperature_2m_mean,target_precipitation_sum_pred
0,A Coruña,2025-02-25,8.726633,6.453513,10.750108,1
1,Albacete,2025-02-25,7.011783,6.327021,9.966162,0
2,Alicante,2025-02-25,12.109534,11.525761,13.928928,0
3,Almería,2025-02-25,11.545445,9.775246,13.125407,0
4,Asturias,2025-02-25,7.011783,4.518711,8.363764,1
5,Badajoz,2025-02-25,9.083773,6.729268,10.937188,1
6,Baleares,2025-02-25,12.234232,11.525761,14.62745,0
7,Barcelona,2025-02-25,11.453515,9.775246,13.125407,1
8,Burgos,2025-02-25,4.52228,2.806096,7.155515,1
9,Cantabria,2025-02-26,8.726633,6.481549,9.97363,0


In [17]:
df_prediccion_temp_precip = result_future
df_prediccion_temp_precip

Unnamed: 0,provincia,predicted_date,target_temperature_2m_max,target_temperature_2m_min,target_temperature_2m_mean,target_precipitation_sum_pred
0,A Coruña,2025-02-25,8.726633,6.453513,10.750108,1
1,Albacete,2025-02-25,7.011783,6.327021,9.966162,0
2,Alicante,2025-02-25,12.109534,11.525761,13.928928,0
3,Almería,2025-02-25,11.545445,9.775246,13.125407,0
4,Asturias,2025-02-25,7.011783,4.518711,8.363764,1
5,Badajoz,2025-02-25,9.083773,6.729268,10.937188,1
6,Baleares,2025-02-25,12.234232,11.525761,14.62745,0
7,Barcelona,2025-02-25,11.453515,9.775246,13.125407,1
8,Burgos,2025-02-25,4.52228,2.806096,7.155515,1
9,Cantabria,2025-02-26,8.726633,6.481549,9.97363,0


## CSV para visualización en tableau

In [18]:
df_prediccion_temp_precip.to_csv("predicciones_temperatura_precip_diaria.csv", index=False)


## Preparación de datos para predicción a 7 días y creamos horizontes para las variables de 7 días para los dos modelos. En este caso la temperatura media la dejamos de lado

In [17]:
horizons = range(1, 8)  # Días 1 a 7

for col in ["temperature_2m_max", "temperature_2m_min"]:
    for h in horizons:
        df_model[f"target_{col}_day{h}"] = df_model.groupby("provincia")[col].shift(-h)

for h in horizons:
    df_model[f"target_precipitation_sum_day{h}"] = df_model.groupby("provincia")["precipitation_sum"].shift(-h)

target_cols_reg_7 = [f"target_temperature_2m_max_day{h}" for h in horizons] + \
                    [f"target_temperature_2m_min_day{h}" for h in horizons]
target_cols_clf_7 = [f"target_precipitation_sum_day{h}" for h in horizons]


In [18]:
required_targets_7 = target_cols_reg_7 + target_cols_clf_7
df_model_7 = df_model.dropna(subset=required_targets_7).copy()
print("Dataset para modelado 7 días:", df_model_7.shape)

X_reg_7 = df_model_7[feature_cols_reg].copy()  
y_reg_7 = df_model_7[target_cols_reg_7].copy()

X_clf_7 = df_model_7[feature_cols_clf].copy()  
y_clf_7 = df_model_7[target_cols_clf_7].copy().astype(int)  

Dataset para modelado 7 días: (230328, 45)


## División de datos

In [19]:
split_date_7 = df_model_7["date"].quantile(0.8)
mask_train_7 = df_model_7["date"] <= split_date_7

X_train_reg_7 = X_reg_7[mask_train_7]
y_train_reg_7 = y_reg_7[mask_train_7]
X_test_reg_7 = X_reg_7[~mask_train_7]
y_test_reg_7 = y_reg_7[~mask_train_7]

print("Train (regresión 7 días):", X_train_reg_7.shape, y_train_reg_7.shape)
print("Test (regresión 7 días):", X_test_reg_7.shape, y_test_reg_7.shape)

X_train_clf_7, X_test_clf_7, y_train_clf_7, y_test_clf_7 = train_test_split(
    X_clf_7, y_clf_7, test_size=0.2, random_state=42
)
print("Train (clasificación 7 días):", X_train_clf_7.shape)
print("Test (clasificación 7 días):", X_test_clf_7.shape)


Train (regresión 7 días): (184288, 14) (184288, 14)
Test (regresión 7 días): (46040, 14) (46040, 14)
Train (clasificación 7 días): (184262, 6)
Test (clasificación 7 días): (46066, 6)


## Modelo Regressor multisalida para la predicción de la temperatura máxima y mínima

In [20]:
pipeline_reg_7 = Pipeline([
    ("preprocessor", col_transformer_reg),  
    ("rf_multi", MultiOutputRegressor(RandomForestRegressor(
        n_estimators=50,
        max_depth=10,
        random_state=42,
        n_jobs=-1
    )))
])
pipeline_reg_7.fit(X_train_reg_7, y_train_reg_7)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



## Métricas del modelo

In [21]:
y_pred_reg_7 = pipeline_reg_7.predict(X_test_reg_7)

print("Métricas para cada target en el test set (regresión):")
for i, col in enumerate(target_cols_reg_7):
    mse = mean_squared_error(y_test_reg_7[col], y_pred_reg_7[:, i])
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test_reg_7[col], y_pred_reg_7[:, i])
    r2 = r2_score(y_test_reg_7[col], y_pred_reg_7[:, i])
    print(f"{col}: RMSE = {rmse:.3f}, MAE = {mae:.3f}, R² = {r2:.3f}")


Métricas para cada target en el test set (regresión):
target_temperature_2m_max_day1: RMSE = 2.418, MAE = 1.838, R² = 0.898
target_temperature_2m_max_day2: RMSE = 3.151, MAE = 2.440, R² = 0.826
target_temperature_2m_max_day3: RMSE = 3.443, MAE = 2.669, R² = 0.792
target_temperature_2m_max_day4: RMSE = 3.580, MAE = 2.788, R² = 0.775
target_temperature_2m_max_day5: RMSE = 3.662, MAE = 2.865, R² = 0.765
target_temperature_2m_max_day6: RMSE = 3.722, MAE = 2.913, R² = 0.757
target_temperature_2m_max_day7: RMSE = 3.777, MAE = 2.953, R² = 0.751
target_temperature_2m_min_day1: RMSE = 1.888, MAE = 1.441, R² = 0.917
target_temperature_2m_min_day2: RMSE = 2.534, MAE = 1.942, R² = 0.850
target_temperature_2m_min_day3: RMSE = 2.825, MAE = 2.175, R² = 0.814
target_temperature_2m_min_day4: RMSE = 2.973, MAE = 2.307, R² = 0.793
target_temperature_2m_min_day5: RMSE = 3.047, MAE = 2.379, R² = 0.783
target_temperature_2m_min_day6: RMSE = 3.094, MAE = 2.418, R² = 0.776
target_temperature_2m_min_day7: RMSE

## Modelo Classifier multisalida para los 7 días. Llueve o no.

In [22]:
from sklearn.multioutput import MultiOutputClassifier

pipeline_clf_7 = Pipeline([
    ("preprocessor", col_transformer_clf),  
    ("multi_clf", MultiOutputClassifier(RandomForestClassifier(
        random_state=42,
        n_jobs=-1
    )))
])
pipeline_clf_7.fit(X_train_clf_7, y_train_clf_7)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



## Métricas del último modelo. Observamos bajada de rendimiento en la predicción de 7 días

In [23]:
y_pred_clf_7 = pipeline_clf_7.predict(X_test_clf_7)
print("Métricas para precipitación (7 días):")
print(classification_report(y_test_clf_7, y_pred_clf_7))
acc_7 = accuracy_score(y_test_clf_7, y_pred_clf_7)
print("Accuracy global (7 días):", acc_7)

Métricas para precipitación (7 días):
              precision    recall  f1-score   support

           0       0.57      0.49      0.52     15163
           1       0.48      0.40      0.44     14958
           2       0.47      0.38      0.42     15145
           3       0.46      0.38      0.41     15107
           4       0.45      0.37      0.40     15002
           5       0.44      0.37      0.40     15155
           6       0.44      0.35      0.39     15041

   micro avg       0.47      0.39      0.43    105571
   macro avg       0.47      0.39      0.43    105571
weighted avg       0.47      0.39      0.43    105571
 samples avg       0.27      0.25      0.23    105571

Accuracy global (7 días): 0.1706247557851778


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Predicción final 7 días

#### Última fecha, predicción de ambos modelos de 7 días. Unión de las predicciones en dataset

In [27]:
last_data = df.groupby("provincia").apply(lambda g: g.iloc[-1]).reset_index(drop=True)

last_data["next_date"] = last_data["date"] + pd.Timedelta(days=1)
last_data["next_day_of_year"] = last_data["next_date"].dt.dayofyear
last_data["target_sin_day_of_year"] = np.sin(2 * np.pi * last_data["next_day_of_year"] / 366)
last_data["target_cos_day_of_year"] = np.cos(2 * np.pi * last_data["next_day_of_year"] / 366)

future_pred_reg_7 = pipeline_reg_7.predict(last_data[feature_cols_reg])
future_pred_reg_7_df = pd.DataFrame(future_pred_reg_7, columns=target_cols_reg_7, index=last_data.index)

future_pred_reg_7_df = future_pred_reg_7_df.reset_index().rename(columns={"index": "provincia_idx"})
future_pred_reg_7_df = future_pred_reg_7_df.melt(id_vars="provincia_idx", var_name="variable_day", value_name="prediccion_temp")
future_pred_reg_7_df[["variable", "day"]] = future_pred_reg_7_df["variable_day"].str.extract(r"target_(.+)_day(\d)")
future_pred_reg_7_df["day"] = future_pred_reg_7_df["day"].astype(int)
future_pred_reg_7_df["provincia"] = future_pred_reg_7_df["provincia_idx"].map(last_data["provincia"])

future_pred_clf_7 = pipeline_clf_7.predict(last_data[feature_cols_clf])
future_pred_clf_7_df = pd.DataFrame(future_pred_clf_7, columns=target_cols_clf_7, index=last_data.index)
future_pred_clf_7_df = future_pred_clf_7_df.reset_index().rename(columns={"index": "provincia_idx"})
future_pred_clf_7_df = future_pred_clf_7_df.melt(id_vars="provincia_idx", var_name="horizon", value_name="prediccion_precip")
future_pred_clf_7_df["day"] = future_pred_clf_7_df["horizon"].str.extract(r"day(\d)").astype(int)
future_pred_clf_7_df["provincia"] = future_pred_clf_7_df["provincia_idx"].map(last_data["provincia"])

result_future_7 = pd.merge(future_pred_reg_7_df, future_pred_clf_7_df, on=["provincia", "day"])
last_dates = last_data.set_index("provincia")["date"].to_dict()
result_future_7["predicted_date"] = result_future_7.apply(
    lambda row: pd.to_datetime(last_dates[row["provincia"]]) + pd.Timedelta(days=row["day"]),
    axis=1
)

result_future_7 = result_future_7.sort_values(["provincia", "variable", "day"])
print("\nPredicción para los próximos 7 días (temp max & temp min y precipitación):")
result_future_7.head(20)


  last_data = df.groupby("provincia").apply(lambda g: g.iloc[-1]).reset_index(drop=True)



Predicción para los próximos 7 días (temp max & temp min y precipitación):


Unnamed: 0,provincia_idx_x,variable_day,prediccion_temp,variable,day,provincia,provincia_idx_y,horizon,prediccion_precip,predicted_date
0,0,target_temperature_2m_max_day1,9.268603,temperature_2m_max,1,A Coruña,0,target_precipitation_sum_day1,1,2025-02-25
52,0,target_temperature_2m_max_day2,9.690016,temperature_2m_max,2,A Coruña,0,target_precipitation_sum_day2,1,2025-02-26
104,0,target_temperature_2m_max_day3,9.094605,temperature_2m_max,3,A Coruña,0,target_precipitation_sum_day3,0,2025-02-27
156,0,target_temperature_2m_max_day4,9.599657,temperature_2m_max,4,A Coruña,0,target_precipitation_sum_day4,1,2025-02-28
208,0,target_temperature_2m_max_day5,11.509094,temperature_2m_max,5,A Coruña,0,target_precipitation_sum_day5,1,2025-03-01
260,0,target_temperature_2m_max_day6,12.178502,temperature_2m_max,6,A Coruña,0,target_precipitation_sum_day6,1,2025-03-02
312,0,target_temperature_2m_max_day7,12.891598,temperature_2m_max,7,A Coruña,0,target_precipitation_sum_day7,0,2025-03-03
364,0,target_temperature_2m_min_day1,7.299239,temperature_2m_min,1,A Coruña,0,target_precipitation_sum_day1,1,2025-02-25
416,0,target_temperature_2m_min_day2,6.723551,temperature_2m_min,2,A Coruña,0,target_precipitation_sum_day2,1,2025-02-26
468,0,target_temperature_2m_min_day3,7.075287,temperature_2m_min,3,A Coruña,0,target_precipitation_sum_day3,0,2025-02-27


## Dividimos los resultados en temperatura y precipitación

In [28]:
df_temp = result_future_7[['provincia', 'variable', 'day', 'prediccion_temp', 'predicted_date']].copy()

df_precip = result_future_7[['provincia', 'day', 'prediccion_precip', 'predicted_date']].copy()

In [30]:
df_temp.head(20)

Unnamed: 0,provincia,variable,day,prediccion_temp,predicted_date
0,A Coruña,temperature_2m_max,1,9.268603,2025-02-25
52,A Coruña,temperature_2m_max,2,9.690016,2025-02-26
104,A Coruña,temperature_2m_max,3,9.094605,2025-02-27
156,A Coruña,temperature_2m_max,4,9.599657,2025-02-28
208,A Coruña,temperature_2m_max,5,11.509094,2025-03-01
260,A Coruña,temperature_2m_max,6,12.178502,2025-03-02
312,A Coruña,temperature_2m_max,7,12.891598,2025-03-03
364,A Coruña,temperature_2m_min,1,7.299239,2025-02-25
416,A Coruña,temperature_2m_min,2,6.723551,2025-02-26
468,A Coruña,temperature_2m_min,3,7.075287,2025-02-27


In [31]:
df_precip.head(20)

Unnamed: 0,provincia,day,prediccion_precip,predicted_date
0,A Coruña,1,1,2025-02-25
52,A Coruña,2,1,2025-02-26
104,A Coruña,3,0,2025-02-27
156,A Coruña,4,1,2025-02-28
208,A Coruña,5,1,2025-03-01
260,A Coruña,6,1,2025-03-02
312,A Coruña,7,0,2025-03-03
364,A Coruña,1,1,2025-02-25
416,A Coruña,2,1,2025-02-26
468,A Coruña,3,0,2025-02-27


In [32]:
df_precip.duplicated().sum()

364

In [33]:
df_precip = df_precip.drop_duplicates()
df_precip.head(20)

Unnamed: 0,provincia,day,prediccion_precip,predicted_date
0,A Coruña,1,1,2025-02-25
52,A Coruña,2,1,2025-02-26
104,A Coruña,3,0,2025-02-27
156,A Coruña,4,1,2025-02-28
208,A Coruña,5,1,2025-03-01
260,A Coruña,6,1,2025-03-02
312,A Coruña,7,0,2025-03-03
1,Albacete,1,1,2025-02-25
53,Albacete,2,0,2025-02-26
105,Albacete,3,0,2025-02-27


In [36]:
df_temp_final = df_temp.pivot(index=['provincia', 'day', 'predicted_date'], 
                              columns='variable', 
                              values='prediccion_temp').reset_index()

df_temp_final.columns.name = None
df_temp_final = df_temp_final.rename(columns={
    'temperature_2m_max': 'prediccion_temp_max',
    'temperature_2m_min': 'prediccion_temp_min'
})

df_temp_final.head(20)


Unnamed: 0,provincia,day,predicted_date,prediccion_temp_max,prediccion_temp_min
0,A Coruña,1,2025-02-25,9.268603,7.299239
1,A Coruña,2,2025-02-26,9.690016,6.723551
2,A Coruña,3,2025-02-27,9.094605,7.075287
3,A Coruña,4,2025-02-28,9.599657,7.091826
4,A Coruña,5,2025-03-01,11.509094,6.981632
5,A Coruña,6,2025-03-02,12.178502,6.94061
6,A Coruña,7,2025-03-03,12.891598,7.005422
7,Albacete,1,2025-02-25,7.967654,6.46065
8,Albacete,2,2025-02-26,9.584852,5.870532
9,Albacete,3,2025-02-27,8.694428,6.555001


## CSV para visualización en tableau

In [37]:
df_temp_final.to_csv("predicciones_temperatura_minmax_7dias.csv", index=False)
df_precip.to_csv("predicciones_precipitacion_7dias.csv", index=False)
