In [1]:
import pandas as pd
import numpy as np
import math

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

df = pd.read_csv('../Data/raw/madrid/madrid.csv')
df

Unnamed: 0,ESTACION_NOMBRE,HUMEDAD RELATIVA,TEMPERATURA,PRECIPITACIÓN,sin_day,cos_day,sin_hour,cos_hour
0,Casa de Campo,85.0,-0.4,0.0,0.017213,0.999852,0.000000e+00,1.000000
1,Casa de Campo,86.0,-0.8,0.0,0.017213,0.999852,2.588190e-01,0.965926
2,Casa de Campo,87.0,-1.3,0.0,0.017213,0.999852,5.000000e-01,0.866025
3,Casa de Campo,89.0,-1.7,0.0,0.017213,0.999852,7.071068e-01,0.707107
4,Casa de Campo,91.0,-1.9,0.0,0.017213,0.999852,8.660254e-01,0.500000
...,...,...,...,...,...,...,...,...
211769,Plaza Elíptica,34.0,27.4,0.0,-0.119881,-0.992788,2.588190e-01,-0.965926
211770,Plaza Elíptica,25.0,29.0,0.0,-0.119881,-0.992788,1.224647e-16,-1.000000
211771,Plaza Elíptica,32.0,19.7,0.0,-0.271234,-0.962513,8.660254e-01,0.500000
211772,Plaza Elíptica,44.0,26.7,0.0,-0.924291,-0.381689,-2.588190e-01,-0.965926


In [2]:
def generate_lags(df, columns, lag):
    df_aux = df.copy()
    for column in columns:
        for i in range(1, lag + 1):
            df_aux[f"{column}_lag_{i}"] = df[column].shift(i)
            
    for i in range(1, lag + 1):
        df_aux[f"TEMPERATURA_mean_lag_{i}"] = df.groupby(['sin_day','cos_day','sin_hour','cos_hour'])['TEMPERATURA'].transform('mean').shift(i)
    
    return df_aux.tail(-lag)


def generate_target(df, column):
    df_aux = df.copy()
    df_aux['Y'] = df.shift(-1)[column]
    return(df_aux.head(-1))

cols_to_lag = ['HUMEDAD RELATIVA', 'TEMPERATURA', 'PRECIPITACIÓN']
df_lag_temp = generate_target(generate_lags(df,cols_to_lag, 10),'TEMPERATURA')
df_lag_humedad = generate_target(generate_lags(df,cols_to_lag, 10),'HUMEDAD RELATIVA')
df_lag_precipitacion = generate_target(generate_lags(df,cols_to_lag, 10),'PRECIPITACIÓN')

df_lag_precipitacion.drop(columns=['HUMEDAD RELATIVA', 'TEMPERATURA', 'PRECIPITACIÓN'], inplace=True)
df_lag_temp.drop(columns=['HUMEDAD RELATIVA', 'TEMPERATURA', 'PRECIPITACIÓN'], inplace=True)
df_lag_humedad.drop(columns=['HUMEDAD RELATIVA', 'TEMPERATURA', 'PRECIPITACIÓN'], inplace=True)

df_lag_humedad

Unnamed: 0,ESTACION_NOMBRE,sin_day,cos_day,sin_hour,cos_hour,HUMEDAD RELATIVA_lag_1,HUMEDAD RELATIVA_lag_2,HUMEDAD RELATIVA_lag_3,HUMEDAD RELATIVA_lag_4,HUMEDAD RELATIVA_lag_5,...,TEMPERATURA_mean_lag_2,TEMPERATURA_mean_lag_3,TEMPERATURA_mean_lag_4,TEMPERATURA_mean_lag_5,TEMPERATURA_mean_lag_6,TEMPERATURA_mean_lag_7,TEMPERATURA_mean_lag_8,TEMPERATURA_mean_lag_9,TEMPERATURA_mean_lag_10,Y
10,Casa de Campo,0.017213,0.999852,5.000000e-01,-0.866025,86.0,93.0,92.0,93.0,92.0,...,2.0750,2.2400,2.4150,2.6500,3.1150,3.40500,3.68500,3.77500,4.00000,51.0
11,Casa de Campo,0.017213,0.999852,2.588190e-01,-0.965926,74.0,86.0,93.0,92.0,93.0,...,3.2350,2.0750,2.2400,2.4150,2.6500,3.11500,3.40500,3.68500,3.77500,40.0
12,Casa de Campo,0.017213,0.999852,1.224647e-16,-1.000000,51.0,74.0,86.0,93.0,92.0,...,5.7800,3.2350,2.0750,2.2400,2.4150,2.65000,3.11500,3.40500,3.68500,34.0
13,Casa de Campo,0.017213,0.999852,-2.588190e-01,-0.965926,40.0,51.0,74.0,86.0,93.0,...,8.2400,5.7800,3.2350,2.0750,2.2400,2.41500,2.65000,3.11500,3.40500,29.0
14,Casa de Campo,0.017213,0.999852,-5.000000e-01,-0.866025,34.0,40.0,51.0,74.0,86.0,...,10.6050,8.2400,5.7800,3.2350,2.0750,2.24000,2.41500,2.65000,3.11500,28.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211768,Plaza Elíptica,0.969178,-0.246361,-7.071068e-01,-0.707107,43.0,45.0,90.0,92.0,91.0,...,19.4000,7.4250,6.1625,6.3500,21.3875,19.61875,20.67500,21.55000,22.85625,34.0
211769,Plaza Elíptica,-0.119881,-0.992788,2.588190e-01,-0.965926,40.0,43.0,45.0,90.0,92.0,...,20.9625,19.4000,7.4250,6.1625,6.3500,21.38750,19.61875,20.67500,21.55000,25.0
211770,Plaza Elíptica,-0.119881,-0.992788,1.224647e-16,-1.000000,34.0,40.0,43.0,45.0,90.0,...,21.0000,20.9625,19.4000,7.4250,6.1625,6.35000,21.38750,19.61875,20.67500,32.0
211771,Plaza Elíptica,-0.271234,-0.962513,8.660254e-01,0.500000,25.0,34.0,40.0,43.0,45.0,...,28.8000,21.0000,20.9625,19.4000,7.4250,6.16250,6.35000,21.38750,19.61875,44.0


In [3]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_columns = encoder.fit_transform(df[['ESTACION_NOMBRE']])

encoded_df = pd.DataFrame(
    encoded_columns, 
    columns=encoder.get_feature_names_out(['ESTACION_NOMBRE'])
)

df_encoded_temp = pd.concat([df_lag_temp.reset_index(drop=True), encoded_df], axis=1).drop(columns=['ESTACION_NOMBRE'])
df_encoded_temp.dropna(inplace=True)

df_encoded_humedad = pd.concat([df_lag_humedad.reset_index(drop=True), encoded_df], axis=1).drop(columns=['ESTACION_NOMBRE'])
df_encoded_humedad.dropna(inplace=True)

df_encoded_precipitacion = pd.concat([df_lag_precipitacion.reset_index(drop=True), encoded_df], axis=1).drop(columns=['ESTACION_NOMBRE'])
df_encoded_precipitacion.dropna(inplace=True)
df_encoded_temp

Unnamed: 0,sin_day,cos_day,sin_hour,cos_hour,HUMEDAD RELATIVA_lag_1,HUMEDAD RELATIVA_lag_2,HUMEDAD RELATIVA_lag_3,HUMEDAD RELATIVA_lag_4,HUMEDAD RELATIVA_lag_5,HUMEDAD RELATIVA_lag_6,...,TEMPERATURA_mean_lag_6,TEMPERATURA_mean_lag_7,TEMPERATURA_mean_lag_8,TEMPERATURA_mean_lag_9,TEMPERATURA_mean_lag_10,Y,ESTACION_NOMBRE_Casa de Campo,ESTACION_NOMBRE_Ensanche de Vallecas,ESTACION_NOMBRE_Juan Carlos I,ESTACION_NOMBRE_Plaza Elíptica
0,0.017213,0.999852,5.000000e-01,-0.866025,86.0,93.0,92.0,93.0,92.0,91.0,...,3.1150,3.40500,3.68500,3.77500,4.00000,7.3,1.0,0.0,0.0,0.0
1,0.017213,0.999852,2.588190e-01,-0.965926,74.0,86.0,93.0,92.0,93.0,92.0,...,2.6500,3.11500,3.40500,3.68500,3.77500,10.5,1.0,0.0,0.0,0.0
2,0.017213,0.999852,1.224647e-16,-1.000000,51.0,74.0,86.0,93.0,92.0,93.0,...,2.4150,2.65000,3.11500,3.40500,3.68500,13.2,1.0,0.0,0.0,0.0
3,0.017213,0.999852,-2.588190e-01,-0.965926,40.0,51.0,74.0,86.0,93.0,92.0,...,2.2400,2.41500,2.65000,3.11500,3.40500,15.6,1.0,0.0,0.0,0.0
4,0.017213,0.999852,-5.000000e-01,-0.866025,34.0,40.0,51.0,74.0,86.0,93.0,...,2.0750,2.24000,2.41500,2.65000,3.11500,16.7,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211758,0.969178,-0.246361,-7.071068e-01,-0.707107,43.0,45.0,90.0,92.0,91.0,57.0,...,21.3875,19.61875,20.67500,21.55000,22.85625,27.4,0.0,0.0,0.0,1.0
211759,-0.119881,-0.992788,2.588190e-01,-0.965926,40.0,43.0,45.0,90.0,92.0,91.0,...,6.3500,21.38750,19.61875,20.67500,21.55000,29.0,0.0,0.0,0.0,1.0
211760,-0.119881,-0.992788,1.224647e-16,-1.000000,34.0,40.0,43.0,45.0,90.0,92.0,...,6.1625,6.35000,21.38750,19.61875,20.67500,19.7,0.0,0.0,0.0,1.0
211761,-0.271234,-0.962513,8.660254e-01,0.500000,25.0,34.0,40.0,43.0,45.0,90.0,...,7.4250,6.16250,6.35000,21.38750,19.61875,26.7,0.0,0.0,0.0,1.0


In [100]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
print('TEMPERATURA: ')
tscv = TimeSeriesSplit(n_splits=5)
X = df_encoded_temp.drop(columns=['Y'])
y = df_encoded_temp['Y']
for fold, (train_index, test_index) in enumerate(tscv.split(X)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Estandarizar SOLO con datos de entrenamiento
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Entrenar modelo
    model = RandomForestRegressor()
    model.fit(X_train_scaled, y_train)

    # Predecir
    preds = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, preds)

    print(f"Fold {fold + 1}: MSE = {mse:.4f}")

print('HUMEDAD: ')
tscv = TimeSeriesSplit(n_splits=5)
X = df_encoded_humedad.drop(columns=['Y'])
y = df_encoded_humedad['Y']
for fold, (train_index, test_index) in enumerate(tscv.split(X)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Estandarizar SOLO con datos de entrenamiento
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Entrenar modelo
    model = RandomForestRegressor()
    model.fit(X_train_scaled, y_train)

    # Predecir
    preds = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, preds)

    print(f"Fold {fold + 1}: MSE = {mse:.4f}")

print('PRECIPITACIÓN: ')
tscv = TimeSeriesSplit(n_splits=5)
X = df_encoded_precipitacion.drop(columns=['Y'])
y = df_encoded_precipitacion['Y']
for fold, (train_index, test_index) in enumerate(tscv.split(X)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Estandarizar SOLO con datos de entrenamiento
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Entrenar modelo
    model = RandomForestRegressor()
    model.fit(X_train_scaled, y_train)

    # Predecir
    preds = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, preds)

    print(f"Fold {fold + 1}: MSE = {mse:.4f}")



TEMPERATURA: 
Fold 1: MSE = 1.3507
Fold 2: MSE = 0.9747
Fold 3: MSE = 0.8734
Fold 4: MSE = 1.4436
Fold 5: MSE = 0.9470
HUMEDAD: 
Fold 1: MSE = 29.6188
Fold 2: MSE = 24.5758
Fold 3: MSE = 23.8854
Fold 4: MSE = 21.8741
Fold 5: MSE = 19.8242
PRECIPITACIÓN: 
Fold 1: MSE = 0.1184
Fold 2: MSE = 0.1827
Fold 3: MSE = 0.0614
Fold 4: MSE = 0.1053
Fold 5: MSE = 0.1299


In [4]:
from sklearn.ensemble import RandomForestRegressor

cols_to_lag = ['HUMEDAD RELATIVA', 'TEMPERATURA', 'PRECIPITACIÓN']

# Se asume que df_encoded contiene los datos con lags y target "Y"
X = df_encoded_temp.drop(columns=['Y'])
y = df_encoded_temp['Y']

model_temp = RandomForestRegressor(random_state=42)
model_temp.fit(X, y)



In [5]:
cols_to_lag = ['HUMEDAD RELATIVA', 'TEMPERATURA', 'PRECIPITACIÓN']

# Se asume que df_encoded contiene los datos con lags y target "Y"
X = df_encoded_humedad.drop(columns=['Y'])
y = df_encoded_humedad['Y']

model_humedad = RandomForestRegressor(random_state=42)
model_humedad.fit(X, y)



In [6]:
cols_to_lag = ['HUMEDAD RELATIVA', 'TEMPERATURA', 'PRECIPITACIÓN']

# Se asume que df_encoded contiene los datos con lags y target "Y"
X = df_encoded_precipitacion.drop(columns=['Y'])
y = df_encoded_precipitacion['Y']

model_precipitacion = RandomForestRegressor(random_state=42)
model_precipitacion.fit(X, y)


In [14]:
estacion = 'Casa de Campo'
n_lags = 10

# Últimas observaciones reales
last_obs = df[df['ESTACION_NOMBRE'] == estacion].copy().reset_index(drop=True).tail(n_lags)

# Fecha inicial
current_datetime = pd.Timestamp('2025-02-01 00:00')
days_in_year = 366 if current_datetime.is_leap_year else 365

# Variables objetivo
variables = ['HUMEDAD RELATIVA', 'TEMPERATURA', 'PRECIPITACIÓN']

# Almacenar predicciones
predicciones = []

for hour in range(24):
    future_row = {}

    # Características temporales
    future_row['sin_day'] = np.sin(2 * np.pi * current_datetime.dayofyear / days_in_year)
    future_row['cos_day'] = np.cos(2 * np.pi * current_datetime.dayofyear / days_in_year)
    future_row['sin_hour'] = np.sin(2 * np.pi * current_datetime.hour / 24)
    future_row['cos_hour'] = np.cos(2 * np.pi * current_datetime.hour / 24)

    # Generar lags para cada variable
    for var in variables:
        for lag in range(1, n_lags + 1):
            future_row[f'{var}_lag_{lag}'] = last_obs[var].iloc[-lag]

    # Generar lags de las medias del día anterior para cada variable
    sin_day_prev = np.sin(2 * np.pi * (current_datetime.dayofyear - 1) / days_in_year)
    cos_day_prev = np.cos(2 * np.pi * (current_datetime.dayofyear - 1) / days_in_year)
    previous_day_data = df[(df['sin_day'] == sin_day_prev) & (df['cos_day'] == cos_day_prev)]

    
    mean_previous_day = previous_day_data['TEMPERATURA'].mean()
    for lag in range(1, n_lags + 1):
        future_row[f'TEMPERATURA_mean_lag_{lag}'] = mean_previous_day

    # Codificación de estaciones
    future_row[f'ESTACION_NOMBRE_{estacion}'] = 1
    otras_estaciones = ['Ensanche de Vallecas', 'Juan Carlos I', 'Plaza Elíptica']
    for est in otras_estaciones:
        future_row[f'ESTACION_NOMBRE_{est}'] = 0

    future_df = pd.DataFrame([future_row])

    # Predecir
    temp_predict = model_temp.predict(future_df)[0]
    humedad_predict = model_humedad.predict(future_df)[0]
    precipitacion_predict = model_precipitacion.predict(future_df)[0]

    # Guardar predicciones
    predicciones.append({
        'datetime': current_datetime,
        'TEMPERATURA': temp_predict,
        'HUMEDAD RELATIVA': humedad_predict,
        'PRECIPITACIÓN': precipitacion_predict
    })

    # Actualizar observaciones con predicciones
    new_obs = {
        'ESTACION_NOMBRE': estacion,
        'TEMPERATURA': temp_predict,
        'HUMEDAD RELATIVA': humedad_predict,
        'PRECIPITACIÓN': precipitacion_predict
    }

    last_obs = pd.concat([last_obs, pd.DataFrame([new_obs])], ignore_index=True).tail(n_lags)

    # Avanzar una hora
    current_datetime += pd.Timedelta(hours=1)

# Resultados finales
predicciones_df = pd.DataFrame(predicciones)
predicciones_df

Unnamed: 0,datetime,TEMPERATURA,HUMEDAD RELATIVA,PRECIPITACIÓN
0,2025-02-01 00:00:00,4.892,50.48,0.2555
1,2025-02-01 01:00:00,2.784,62.53,1.587
2,2025-02-01 02:00:00,0.537,79.42,1.6855
3,2025-02-01 03:00:00,-0.416,82.31,1.812
4,2025-02-01 04:00:00,-0.997,81.23,1.7315
5,2025-02-01 05:00:00,-1.986,83.89,1.9885
6,2025-02-01 06:00:00,-2.201,77.06,1.653
7,2025-02-01 07:00:00,-1.77,72.42,1.7045
8,2025-02-01 08:00:00,0.272,55.72,1.3975
9,2025-02-01 09:00:00,5.076,45.82,1.5335


In [12]:
temp_predict = model_temp.predict(future_df.iloc[0].values.reshape(1, -1))
humedad_predict = model_humedad.predict(future_df.iloc[0].values.reshape(1, -1))
precipitacion_predict = model_precipitacion.predict(future_df.iloc[0].values.reshape(1, -1))
print(temp_predict, humedad_predict, precipitacion_predict)

[-0.994] [74.86] [1.9455]


