In [1]:

import os, sys
# Repository root, taken as the parent of the current working directory.
# NOTE: this depends on where the kernel was started; it is only correct when
# the working directory is a direct sub-folder of the repository.
REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Make the repo importable so the `scripts.*` packages can be resolved.
# The membership check keeps the cell idempotent: re-running it in a live
# kernel no longer stacks duplicate entries on sys.path.
if REPO_ROOT not in sys.path:
    sys.path.insert(0, REPO_ROOT)


from scripts.data_acquisition.main import load_dataset as load_raw_dataset
from scripts.preprocessing.main import preprocess_dataset, save_outputs
from scripts.feature_selection.main import (
    load_processed,
    select_via_lasso,
    select_via_forest,
    select_via_mutual_info,
    hybrid_selection,
    save_selected,)

from scripts.training.model_registry import available_specs
from scripts.training.training_pipeline import run_model_search
from scripts.training.data_utils import build_fast_subset

# Arguments handed to the raw-data loader: a CSV path under the repo root and
# what is presumably a Google Drive file id (how the loader chooses between
# the two is decided inside load_dataset, which is not visible here).
raw_data_source = {
    "input_path": os.path.join(REPO_ROOT, "data.csv"),
    "gdrive_id": "1tYfm5wJXRHZGa5h3fsRA7tnyFUlWESpa",
}
raw_df = load_raw_dataset(**raw_data_source)

# Sanity check: dimensions on stdout, then the first rows as the cell output.
print(raw_df.shape)
raw_df.head()

                  date        p     T    Tpot  Tdew    rh  VPmax  VPact  \
0  2020-01-01 00:10:00  1008.89  0.71  273.18 -1.33  86.1   6.43   5.54   
1  2020-01-01 00:20:00  1008.76  0.75  273.22 -1.44  85.2   6.45   5.49   
2  2020-01-01 00:30:00  1008.66  0.73  273.21 -1.48  85.1   6.44   5.48   
3  2020-01-01 00:40:00  1008.64  0.37  272.86 -1.64  86.3   6.27   5.41   
4  2020-01-01 00:50:00  1008.61  0.33  272.82 -1.50  87.4   6.26   5.47   

   VPdef    sh  ...      rho    wv  max. wv     wd  rain  raining  SWDR  PAR  \
0   0.89  3.42  ...  1280.62  1.02     1.60  224.3   0.0      0.0   0.0  0.0   
1   0.95  3.39  ...  1280.33  0.43     0.84  206.8   0.0      0.0   0.0  0.0   
2   0.96  3.39  ...  1280.29  0.61     1.48  197.1   0.0      0.0   0.0  0.0   
3   0.86  3.35  ...  1281.97  1.11     1.48  206.4   0.0      0.0   0.0  0.0   
4   0.79  3.38  ...  1282.08  0.49     1.40  209.6   0.0      0.0   0.0  0.0   

   max. PAR   Tlog  
0       0.0  11.45  
1       0.0  11.51  
2    

Unnamed: 0,date,p,T,Tpot,Tdew,rh,VPmax,VPact,VPdef,sh,...,rho,wv,max. wv,wd,rain,raining,SWDR,PAR,max. PAR,Tlog
0,2020-01-01 00:10:00,1008.89,0.71,273.18,-1.33,86.1,6.43,5.54,0.89,3.42,...,1280.62,1.02,1.6,224.3,0.0,0.0,0.0,0.0,0.0,11.45
1,2020-01-01 00:20:00,1008.76,0.75,273.22,-1.44,85.2,6.45,5.49,0.95,3.39,...,1280.33,0.43,0.84,206.8,0.0,0.0,0.0,0.0,0.0,11.51
2,2020-01-01 00:30:00,1008.66,0.73,273.21,-1.48,85.1,6.44,5.48,0.96,3.39,...,1280.29,0.61,1.48,197.1,0.0,0.0,0.0,0.0,0.0,11.6
3,2020-01-01 00:40:00,1008.64,0.37,272.86,-1.64,86.3,6.27,5.41,0.86,3.35,...,1281.97,1.11,1.48,206.4,0.0,0.0,0.0,0.0,0.0,11.7
4,2020-01-01 00:50:00,1008.61,0.33,272.82,-1.5,87.4,6.26,5.47,0.79,3.38,...,1282.08,0.49,1.4,209.6,0.0,0.0,0.0,0.0,0.0,11.81


In [2]:
# --- Preprocessing configuration --------------------------------------------
# Each constant is forwarded unchanged to the same-named keyword argument of
# preprocess_dataset; their exact effect is defined in scripts.preprocessing.
TARGET = "Tlog"
TEST_SIZE = 0.2
RANDOM_STATE = 42
POLY_DEGREE = 2
WINSOR_LIMITS = (0.01, 0.99)  # presumably lower/upper quantiles - TODO confirm
DROP_DATETIME = True

# Bundle the options once so the call site stays short.
preprocess_options = dict(
    target=TARGET,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    poly_degree=POLY_DEGREE,
    winsor_limits=WINSOR_LIMITS,
    drop_datetime=DROP_DATETIME,
)
artifacts = preprocess_dataset(df=raw_df, **preprocess_options)

# Pull the train/test splits out of the returned artifacts object.
X_train = artifacts.X_train
X_test = artifacts.X_test
y_train = artifacts.y_train
y_test = artifacts.y_test

print("X_train:", X_train.shape, "X_test:", X_test.shape)

# Persist the artifacts under <repo>/data/processed.
PROCESSED_DIR = os.path.join(REPO_ROOT, "data", "processed")
save_outputs(artifacts, PROCESSED_DIR)

X_train: (42156, 209) X_test: (10540, 209)


In [3]:
# Load the train/test splits back from PROCESSED_DIR (presumably the files
# written by the preprocessing step - confirm against load_processed).
X_train_fs, X_test_fs, y_train_fs, y_test_fs = load_processed(PROCESSED_DIR)

# Hybrid feature selection on the training split only. The options are
# forwarded as keyword arguments; see hybrid_selection for their meaning.
hybrid_options = {"random_state": 77, "k_mi": 20, "top_frac_forest": 0.3}
selected_cols = hybrid_selection(X_train_fs, y_train_fs, **hybrid_options)

# Restrict both splits to the selected columns.
X_train_sel, X_test_sel = X_train_fs[selected_cols], X_test_fs[selected_cols]

print(f"Features seleccionadas: {len(selected_cols)}")
print(selected_cols)

# Persist the selection under <repo>/data/selected. The unfiltered frames are
# passed together with the column list; filtering presumably happens inside
# save_selected.
SELECTED_DIR = os.path.join(REPO_ROOT, "data", "selected")
save_selected(
    X_train=X_train_fs,
    X_test=X_test_fs,
    y_train=y_train_fs,
    y_test=y_test_fs,
    selected_cols=selected_cols,
    output_dir=SELECTED_DIR,
)

# Summary statistics of the selected training features, one row per feature.
display(X_train_sel.describe().T)

[Lasso] Seleccionadas 40 características (alpha=0.0076).
[MutualInfo] Seleccionadas 20 características (top-k=20).
[Forest] Seleccionadas 62 características (top 30%).
[Hybrid] Total combinadas: 90.
Features seleccionadas: 90
['num__T', 'num__rh', 'num__VPmax', 'num__VPdef', 'num__PAR', 'num__max. PAR', 'num__p T', 'num__p Tpot', 'num__p wv', 'num__T rain', 'num__Tpot^2', 'num__Tdew wv', 'num__Tdew max. wv', 'num__Tdew wd', 'num__Tdew raining', 'num__Tdew SWDR', 'num__rh VPdef', 'num__rh wv', 'num__rh max. wv', 'num__rh SWDR', 'num__VPmax VPact', 'num__VPmax sh', 'num__VPmax rain', 'num__VPdef^2', 'num__VPdef wv', 'num__VPdef wd', 'num__VPdef rain', 'num__VPdef raining', 'num__rho wv', 'num__rho wd', 'num__rho PAR', 'num__rho max. PAR', 'num__wv^2', 'num__wv raining', 'num__max. wv raining', 'num__rain raining', 'num__rain SWDR', 'num__raining SWDR', 'num__SWDR max. PAR', 'num__max. PAR^2', 'num__VPmax rho', 'num__T rho', 'num__p VPmax', 'num__T Tpot', 'num__T VPmax', 'num__VPmax^2', '

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
num__T,42156.0,-7.403257e-15,1.000012,-1.898403,-0.841734,-0.074972,0.725658,2.389234
num__rh,42156.0,-2.568883e-14,1.000012,-2.242454,-0.714957,0.145949,0.802176,1.432361
num__VPmax,42156.0,-1.126846e-14,1.000012,-1.299300,-0.804845,-0.260404,0.536667,3.300750
num__VPdef,42156.0,-7.297322e-15,1.000012,-0.900463,-0.679248,-0.380512,0.317167,3.761890
num__PAR,42156.0,-6.415886e-15,1.000012,-0.617021,-0.617021,-0.594114,0.252373,3.380673
...,...,...,...,...,...,...,...,...
num__VPdef PAR,42156.0,1.898482e-15,1.000012,-0.428905,-0.428905,-0.427964,-0.192263,6.556660
num__rh rho,42156.0,9.378166e-15,1.000012,-2.248115,-0.723058,0.139866,0.802610,1.693262
num__T max. wv,42156.0,9.531548e-17,1.000012,-1.304163,-0.706538,-0.335887,0.392184,6.621536
num__VPdef max. PAR,42156.0,8.977015e-16,1.000012,-0.437261,-0.437261,-0.436050,-0.197536,6.754748


In [4]:
import pandas as pd
# Fast-iteration subset: cap the number of rows used for the model search.
SUBSET_SEED = 77
TRAIN_CAP, TEST_CAP = 5000, 2000

# Targets are unchanged by feature selection; alias them for symmetry with X.
y_train_sel, y_test_sel = y_train_fs, y_test_fs

# Never ask for more rows than the split actually has.
n_train, n_test = min(len(X_train_sel), TRAIN_CAP), min(len(X_test_sel), TEST_CAP)

# Draw the row labels once per split so X and y are subset with the same labels.
# NOTE(review): this relies on X and y sharing the same unique index - confirm.
idx_train = X_train_sel.sample(n=n_train, random_state=SUBSET_SEED).index
idx_test = X_test_sel.sample(n=n_test, random_state=SUBSET_SEED).index

X_train_small, y_train_small = X_train_sel.loc[idx_train], y_train_sel.loc[idx_train]
X_test_small, y_test_small = X_test_sel.loc[idx_test], y_test_sel.loc[idx_test]

# Output directory handed to the model search in the next cell.
OUTPUT_DIR = os.path.join(REPO_ROOT, "models_mlflow")

In [None]:
from scripts.training.model_registry import available_specs
from scripts.mlflow_utils.tracking import run_model_search_with_mlflow
from pathlib import Path

# Catalogue of model specs, looked up by model name. Optional back-ends are
# requested too; the run log shows missing optional dependencies being
# reported and skipped.
catalog = available_specs(include_optional=True)

# Models to search. xgboost is added only when the catalogue provides it.
model_names = ["linear_predictive", "lasso", "ridge", "random_forest", "gradient_boosting"]
if "xgboost" in catalog:
    model_names.append("xgboost")

# Report requested models the catalogue does not provide instead of dropping
# them silently, so a renamed or removed spec does not shrink the experiment
# unnoticed.
missing_models = [name for name in model_names if name not in catalog]
if missing_models:
    print("Skipping models not found in catalog:", missing_models)

specs = [catalog[name] for name in model_names if name in catalog]

# Local file-based MLflow store under <repo>/mlruns, expressed as a file:// URI.
tracking_uri = Path(REPO_ROOT, "mlruns").resolve().as_uri()

# Run the search on the reduced subset. The function returns the per-model
# results and the tracking URI, which rebinds `tracking_uri`.
results, tracking_uri = run_model_search_with_mlflow(
    specs=specs,
    X_train=X_train_small,
    y_train=y_train_small,
    X_test=X_test_small,
    y_test=y_test_small,
    n_iter=5,
    cv=3,
    random_state=77,
    output_dir=OUTPUT_DIR,
    experiment_name="weather_training_fast",
    run_name="fast_sample",
    log_models=True,
    tracking_uri=tracking_uri)

# NOTE: the URI is an absolute file:// path, so the saved output embeds the
# local user directory. Clear this cell's output before sharing the notebook.
print("Tracking URI:", tracking_uri)
pd.DataFrame(results)

[xgboost] Dependencia no instalada: No module named 'xgboost'. Se omite.
[lightgbm] Dependencia no instalada: No module named 'lightgbm'. Se omite.
[catboost] Dependencia no instalada: No module named 'catboost'. Se omite.

>>> Modelo: linear_predictive

>>> Modelo: lasso

>>> Modelo: ridge

>>> Modelo: random_forest

>>> Modelo: gradient_boosting
Tracking URI: file:///C:/Users/alejo/OneDrive/Escritorio/Universidaad/Diplomado%20ML%20y%20Data%20Science%20Avanzado/Modulo%203/tdsp_template-1/mlruns


Unnamed: 0,model,val_r2,test_r2,test_mse,test_mae
0,linear_predictive,0.975716,0.976441,1.377835,0.900319
1,lasso,0.975495,0.976174,1.393486,0.906887
2,ridge,0.975833,0.976441,1.377857,0.899422
3,random_forest,0.977622,0.978551,1.254463,0.84789
4,gradient_boosting,0.978793,0.979942,1.173117,0.817624
