In [1]:

import os, sys
REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.insert(0, REPO_ROOT)


from scripts.data_acquisition.main import load_dataset as load_raw_dataset
from scripts.preprocessing.main import preprocess_dataset, save_outputs
from scripts.feature_selection.main import (
    load_processed,
    select_via_lasso,
    select_via_forest,
    select_via_mutual_info,
    hybrid_selection,
    save_selected,)

from scripts.training.main import available_specs, run_model_search

raw_df = load_raw_dataset(
    input_path=os.path.join(REPO_ROOT, "data.csv"),
    gdrive_id="1tYfm5wJXRHZGa5h3fsRA7tnyFUlWESpa",)

print(raw_df.shape)
raw_df.head()


                  date        p     T    Tpot  Tdew    rh  VPmax  VPact  \
0  2020-01-01 00:10:00  1008.89  0.71  273.18 -1.33  86.1   6.43   5.54   
1  2020-01-01 00:20:00  1008.76  0.75  273.22 -1.44  85.2   6.45   5.49   
2  2020-01-01 00:30:00  1008.66  0.73  273.21 -1.48  85.1   6.44   5.48   
3  2020-01-01 00:40:00  1008.64  0.37  272.86 -1.64  86.3   6.27   5.41   
4  2020-01-01 00:50:00  1008.61  0.33  272.82 -1.50  87.4   6.26   5.47   

   VPdef    sh  ...      rho    wv  max. wv     wd  rain  raining  SWDR  PAR  \
0   0.89  3.42  ...  1280.62  1.02     1.60  224.3   0.0      0.0   0.0  0.0   
1   0.95  3.39  ...  1280.33  0.43     0.84  206.8   0.0      0.0   0.0  0.0   
2   0.96  3.39  ...  1280.29  0.61     1.48  197.1   0.0      0.0   0.0  0.0   
3   0.86  3.35  ...  1281.97  1.11     1.48  206.4   0.0      0.0   0.0  0.0   
4   0.79  3.38  ...  1282.08  0.49     1.40  209.6   0.0      0.0   0.0  0.0   

   max. PAR   Tlog  
0       0.0  11.45  
1       0.0  11.51  
2    

Unnamed: 0,date,p,T,Tpot,Tdew,rh,VPmax,VPact,VPdef,sh,...,rho,wv,max. wv,wd,rain,raining,SWDR,PAR,max. PAR,Tlog
0,2020-01-01 00:10:00,1008.89,0.71,273.18,-1.33,86.1,6.43,5.54,0.89,3.42,...,1280.62,1.02,1.6,224.3,0.0,0.0,0.0,0.0,0.0,11.45
1,2020-01-01 00:20:00,1008.76,0.75,273.22,-1.44,85.2,6.45,5.49,0.95,3.39,...,1280.33,0.43,0.84,206.8,0.0,0.0,0.0,0.0,0.0,11.51
2,2020-01-01 00:30:00,1008.66,0.73,273.21,-1.48,85.1,6.44,5.48,0.96,3.39,...,1280.29,0.61,1.48,197.1,0.0,0.0,0.0,0.0,0.0,11.6
3,2020-01-01 00:40:00,1008.64,0.37,272.86,-1.64,86.3,6.27,5.41,0.86,3.35,...,1281.97,1.11,1.48,206.4,0.0,0.0,0.0,0.0,0.0,11.7
4,2020-01-01 00:50:00,1008.61,0.33,272.82,-1.5,87.4,6.26,5.47,0.79,3.38,...,1282.08,0.49,1.4,209.6,0.0,0.0,0.0,0.0,0.0,11.81


In [2]:
TARGET = "Tlog"
TEST_SIZE = 0.2
RANDOM_STATE = 42
POLY_DEGREE = 2       
WINSOR_LIMITS = (0.01, 0.99)
DROP_DATETIME = True  

artifacts = preprocess_dataset(
    df=raw_df,
    target=TARGET,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    poly_degree=POLY_DEGREE,
    winsor_limits=WINSOR_LIMITS,
    drop_datetime=DROP_DATETIME)

X_train, X_test = artifacts.X_train, artifacts.X_test
y_train, y_test = artifacts.y_train, artifacts.y_test

print("X_train:", X_train.shape, "X_test:", X_test.shape)

PROCESSED_DIR = os.path.join(REPO_ROOT, "data", "processed")
save_outputs(artifacts, PROCESSED_DIR)

X_train: (42156, 209) X_test: (10540, 209)


In [3]:
X_train_fs, X_test_fs, y_train_fs, y_test_fs = load_processed(PROCESSED_DIR)

selected_cols = hybrid_selection(
    X_train_fs,
    y_train_fs,
    random_state=77,
    k_mi=20,
    top_frac_forest=0.3,)


X_train_sel = X_train_fs[selected_cols]
X_test_sel = X_test_fs[selected_cols]

print(f"Features seleccionadas: {len(selected_cols)}")
print(selected_cols)

SELECTED_DIR = os.path.join(REPO_ROOT, "data", "selected")

save_selected(
    X_train=X_train_fs,
    X_test=X_test_fs,
    y_train=y_train_fs,
    y_test=y_test_fs,
    selected_cols=selected_cols,
    output_dir=SELECTED_DIR,)

display(X_train_sel.describe().T)

[Lasso] Seleccionadas 40 características (alpha=0.0076).
[MutualInfo] Seleccionadas 20 características (top-k=20).
[Forest] Seleccionadas 62 características (top 30%).
[Hybrid] Total combinadas: 90.
Features seleccionadas: 90
['num__T', 'num__rh', 'num__VPmax', 'num__VPdef', 'num__PAR', 'num__max. PAR', 'num__p T', 'num__p Tpot', 'num__p wv', 'num__T rain', 'num__Tpot^2', 'num__Tdew wv', 'num__Tdew max. wv', 'num__Tdew wd', 'num__Tdew raining', 'num__Tdew SWDR', 'num__rh VPdef', 'num__rh wv', 'num__rh max. wv', 'num__rh SWDR', 'num__VPmax VPact', 'num__VPmax sh', 'num__VPmax rain', 'num__VPdef^2', 'num__VPdef wv', 'num__VPdef wd', 'num__VPdef rain', 'num__VPdef raining', 'num__rho wv', 'num__rho wd', 'num__rho PAR', 'num__rho max. PAR', 'num__wv^2', 'num__wv raining', 'num__max. wv raining', 'num__rain raining', 'num__rain SWDR', 'num__raining SWDR', 'num__SWDR max. PAR', 'num__max. PAR^2', 'num__VPmax rho', 'num__T rho', 'num__p VPmax', 'num__T Tpot', 'num__T VPmax', 'num__VPmax^2', '

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
num__T,42156.0,-7.403257e-15,1.000012,-1.898403,-0.841734,-0.074972,0.725658,2.389234
num__rh,42156.0,-2.568883e-14,1.000012,-2.242454,-0.714957,0.145949,0.802176,1.432361
num__VPmax,42156.0,-1.126846e-14,1.000012,-1.299300,-0.804845,-0.260404,0.536667,3.300750
num__VPdef,42156.0,-7.297322e-15,1.000012,-0.900463,-0.679248,-0.380512,0.317167,3.761890
num__PAR,42156.0,-6.415886e-15,1.000012,-0.617021,-0.617021,-0.594114,0.252373,3.380673
...,...,...,...,...,...,...,...,...
num__VPdef PAR,42156.0,1.898482e-15,1.000012,-0.428905,-0.428905,-0.427964,-0.192263,6.556660
num__rh rho,42156.0,9.378166e-15,1.000012,-2.248115,-0.723058,0.139866,0.802610,1.693262
num__T max. wv,42156.0,9.531548e-17,1.000012,-1.304163,-0.706538,-0.335887,0.392184,6.621536
num__VPdef max. PAR,42156.0,8.977015e-16,1.000012,-0.437261,-0.437261,-0.436050,-0.197536,6.754748


In [4]:
import pandas as pd
y_train_sel = y_train_fs
y_test_sel = y_test_fs

n_train = min(len(X_train_sel), 5000)
n_test = min(len(X_test_sel), 2000)
idx_train = X_train_sel.sample(n=n_train, random_state=77).index
idx_test = X_test_sel.sample(n=n_test, random_state=77).index

X_train_small = X_train_sel.loc[idx_train]
y_train_small = y_train_sel.loc[idx_train]
X_test_small = X_test_sel.loc[idx_test]
y_test_small = y_test_sel.loc[idx_test]

OUTPUT_DIR = os.path.join(REPO_ROOT, "models_fast")

catalog = available_specs(include_optional=True)
model_names = [
    "linear_predictive",
    "lasso",
    "ridge",
    "random_forest",
    "gradient_boosting",
]

if "xgboost" in catalog:
    model_names.append("xgboost")

selected_specs = [catalog[m] for m in model_names if m in catalog]

results = run_model_search(
    specs=selected_specs,
    X_train=X_train_small,
    y_train=y_train_small,
    X_test=X_test_small,
    y_test=y_test_small,
    n_iter=5,  
    cv=3,      
    random_state=77,
    output_dir=OUTPUT_DIR)

results_df = pd.DataFrame(results).sort_values("test_r2", ascending=False)
display(results_df)

[xgboost] Dependencia no instalada: No module named 'xgboost'. Se omite.
[lightgbm] Dependencia no instalada: No module named 'lightgbm'. Se omite.
[catboost] Dependencia no instalada: No module named 'catboost'. Se omite.

>>> Modelo: linear_predictive

>>> Modelo: lasso

>>> Modelo: ridge

>>> Modelo: random_forest

>>> Modelo: gradient_boosting


Unnamed: 0,model,val_r2,test_r2,test_mse,test_mae
4,gradient_boosting,0.978793,0.979942,1.173117,0.817624
3,random_forest,0.977622,0.978551,1.254463,0.84789
0,linear_predictive,0.975716,0.976441,1.377835,0.900319
2,ridge,0.975833,0.976441,1.377857,0.899422
1,lasso,0.975495,0.976174,1.393486,0.906887


In [11]:
import numpy as np 

y_train_sel = y_train_fs
y_test_sel = y_test_fs

n_sample = min(len(X_train_sel), 10_000)

train_idx = np.random.RandomState(77).choice(X_train_sel.index, size=n_sample, replace=False)
X_train_econ = X_train_sel.loc[train_idx]
y_train_econ = y_train_sel.loc[train_idx]


import importlib, pandas as pd
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from scripts.training.main import available_specs

catalog = available_specs(include_optional=False)
econ_model = catalog["linear_econometric"].build_estimator()

econ_model.fit(X_train_econ, y_train_econ )
preds = econ_model.predict(X_train_econ )

print("R2:", r2_score(y_train_econ, preds))
print("MSE:", mean_squared_error(y_train_econ, preds))
print("MAE:", mean_absolute_error(y_train_econ, preds))


res = econ_model.result
conf = res.conf_int()
coefs_df = pd.DataFrame({
    "feature": res.params.index,
    "p_value": res.pvalues.values,
    "coef": res.params.values,
    "ci_low": conf[0].values,
    "ci_high": conf[1].values,})


print("\nFeatures retenidas tras VIF:")
print(econ_model.selected_features_)
print("\nVIF finales:")
print(econ_model.vif_values_)
display(coefs_df)


print(res.summary())


R2: 0.9738960110203473
MSE: 1.5803607383937146
MAE: 0.968334414565599

Features retenidas tras VIF:
['num__p Tpot', 'num__T rain', 'num__Tdew wv', 'num__Tdew wd', 'num__Tdew raining', 'num__Tdew SWDR', 'num__rh wv', 'num__rh max. wv', 'num__rh SWDR', 'num__VPdef^2', 'num__VPdef wv', 'num__VPdef wd', 'num__VPdef rain', 'num__VPdef raining', 'num__rho wd', 'num__rho max. PAR', 'num__wv^2', 'num__wv raining', 'num__rain raining', 'num__rain SWDR', 'num__raining SWDR', 'num__SWDR max. PAR', 'num__VPact VPdef', 'num__p rho', 'num__VPact^2', 'num__rh^2', 'num__VPdef PAR']

VIF finales:
[np.float64(12.535666173399813), np.float64(7.865299889052172), np.float64(5.060198379915188), np.float64(6.221682853050935), np.float64(4.465493846068515), np.float64(5.630823730407745), np.float64(20.356647531406118), np.float64(13.039857800255723), np.float64(10.698577376226837), np.float64(19.41245121919513), np.float64(8.879366136699), np.float64(5.04808206146027), np.float64(2.4245162609680855), np.float

Unnamed: 0,feature,p_value,coef,ci_low,ci_high
0,const,0.0,21.525289,21.500373,21.550204
1,num__p Tpot,0.0,5.018824,4.921947,5.1157
2,num__T rain,1.283003e-06,0.197717,0.11769,0.277743
3,num__Tdew wv,1.358515e-19,0.258147,0.202274,0.314019
4,num__Tdew wd,1.804209e-05,-0.158046,-0.230288,-0.085803
5,num__Tdew raining,1.142888e-11,-0.166768,-0.214926,-0.11861
6,num__Tdew SWDR,2.827012e-10,-0.223253,-0.29262,-0.153886
7,num__rh wv,2.241855e-10,-0.335207,-0.438772,-0.231642
8,num__rh max. wv,8.644802999999999e-19,-0.380019,-0.464167,-0.295871
9,num__rh SWDR,0.06077612,0.078902,-0.00357,0.161374


                            OLS Regression Results                            
Dep. Variable:                   Tlog   R-squared:                       0.974
Model:                            OLS   Adj. R-squared:                  0.974
Method:                 Least Squares   F-statistic:                 1.239e+04
Date:               jue, 04 dic. 2025   Prob (F-statistic):               0.00
Time:                        21:56:31   Log-Likelihood:                -16478.
No. Observations:               10000   AIC:                         3.301e+04
Df Residuals:                    9972   BIC:                         3.321e+04
Df Model:                          27                                         
Covariance Type:                  HC3                                         
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 21.5253      0

### Prophet

In [None]:
import pandas as pd
from prophet import Prophet
from prophet.serialize import model_to_json, model_from_json
import json

df = raw_df.sort_values("date")
prophet_df = df.rename(columns={"date": "ds", "Tlog": "y"})
prophet_df["ds"] = pd.to_datetime(prophet_df["ds"])

In [None]:
m = Prophet()
m.fit(prophet_df)

future = m.make_future_dataframe(periods=60, freq="D")
forecast = m.predict(future)

print(forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]].tail())

In [None]:
import matplotlib.pyplot as plt

fig1 = m.plot(forecast)
fig2 = m.plot_components(forecast)
plt.show()

In [None]:
with open("models_fast/prophet_model.json", "w") as f:
    json.dump(model_to_json(m), f)