# Explore here

In [14]:
import os

# Limitar uso de CPU para evitar saturar Codespaces
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import pickle

In [16]:
df = pd.read_csv("../data/Aquifer_Petrignano.csv")
df.head()

Unnamed: 0,Date,Rainfall_Bastia_Umbra,Depth_to_Groundwater_P24,Depth_to_Groundwater_P25,Temperature_Bastia_Umbra,Temperature_Petrignano,Volume_C10_Petrignano,Hydrometry_Fiume_Chiascio_Petrignano
0,14/03/2006,,-22.48,-22.18,,,,
1,15/03/2006,,-22.38,-22.14,,,,
2,16/03/2006,,-22.25,-22.04,,,,
3,17/03/2006,,-22.38,-22.04,,,,
4,18/03/2006,,-22.6,-22.04,,,,


In [17]:
df["Date"] = pd.to_datetime(df["Date"], dayfirst=True)

df["Year"] = df["Date"].dt.year
df["Month"] = df["Date"].dt.month
df["Day"] = df["Date"].dt.day
df["WeekOfYear"] = df["Date"].dt.isocalendar().week.astype(int)
df["DayOfYear"] = df["Date"].dt.dayofyear
df["IsWeekend"] = (df["Date"].dt.weekday >= 5).astype(int)

df.head()

Unnamed: 0,Date,Rainfall_Bastia_Umbra,Depth_to_Groundwater_P24,Depth_to_Groundwater_P25,Temperature_Bastia_Umbra,Temperature_Petrignano,Volume_C10_Petrignano,Hydrometry_Fiume_Chiascio_Petrignano,Year,Month,Day,WeekOfYear,DayOfYear,IsWeekend
0,2006-03-14,,-22.48,-22.18,,,,,2006,3,14,11,73,0
1,2006-03-15,,-22.38,-22.14,,,,,2006,3,15,11,74,0
2,2006-03-16,,-22.25,-22.04,,,,,2006,3,16,11,75,0
3,2006-03-17,,-22.38,-22.04,,,,,2006,3,17,11,76,0
4,2006-03-18,,-22.6,-22.04,,,,,2006,3,18,11,77,1


In [18]:
df["Depth_to_Groundwater_P24"] = (
    df["Depth_to_Groundwater_P24"]
        .interpolate(method="linear")
        .fillna(method="bfill")
        .fillna(method="ffill")
)

df["Depth_to_Groundwater_P25"] = (
    df["Depth_to_Groundwater_P25"]
        .interpolate(method="linear")
        .fillna(method="bfill")
        .fillna(method="ffill")
)

  .fillna(method="bfill")
  .fillna(method="ffill")
  .fillna(method="bfill")
  .fillna(method="ffill")


In [19]:
# P24
df["P24_Roll_7"] = df["Depth_to_Groundwater_P24"].rolling(7, min_periods=1).mean()
df["P24_Roll_30"] = df["Depth_to_Groundwater_P24"].rolling(30, min_periods=1).mean()

# P25
df["P25_Roll_7"] = df["Depth_to_Groundwater_P25"].rolling(7, min_periods=1).mean()
df["P25_Roll_30"] = df["Depth_to_Groundwater_P25"].rolling(30, min_periods=1).mean()

In [20]:
for lag in [1, 7, 30]:   # antes incluÃ­a 5 valores â†’ ahora solo 3
    df[f"P24_Lag_{lag}"] = df["Depth_to_Groundwater_P24"].shift(lag)
    df[f"P25_Lag_{lag}"] = df["Depth_to_Groundwater_P25"].shift(lag)

In [21]:
df["P24_Diff_1"] = df["Depth_to_Groundwater_P24"].diff(1)
df["P24_Diff_7"] = df["Depth_to_Groundwater_P24"].diff(7)

df["P25_Diff_1"] = df["Depth_to_Groundwater_P25"].diff(1)
df["P25_Diff_7"] = df["Depth_to_Groundwater_P25"].diff(7)

In [22]:
df["P24_RollStd_30"] = df["Depth_to_Groundwater_P24"].rolling(30, min_periods=1).std()
df["P25_RollStd_30"] = df["Depth_to_Groundwater_P25"].rolling(30, min_periods=1).std()

In [23]:
def fourier_features(df, prefix, period=365, K=2):
    t = np.arange(len(df))
    for k in range(1, K+1):
        df[f"{prefix}_sin_{k}"] = np.sin(2 * np.pi * k * t / period)
        df[f"{prefix}_cos_{k}"] = np.cos(2 * np.pi * k * t / period)
    return df

df = fourier_features(df, "P24")
df = fourier_features(df, "P25")

In [24]:
df = df.fillna(method="bfill").fillna(method="ffill")

  df = df.fillna(method="bfill").fillna(method="ffill")


In [25]:
df = df.drop(columns=["Date"])
df_numeric = df.select_dtypes(include=["float64", "int64"])
df_numeric.head()

Unnamed: 0,Rainfall_Bastia_Umbra,Depth_to_Groundwater_P24,Depth_to_Groundwater_P25,Temperature_Bastia_Umbra,Temperature_Petrignano,Volume_C10_Petrignano,Hydrometry_Fiume_Chiascio_Petrignano,WeekOfYear,IsWeekend,P24_Roll_7,...,P24_RollStd_30,P25_RollStd_30,P24_sin_1,P24_cos_1,P24_sin_2,P24_cos_2,P25_sin_1,P25_cos_1,P25_sin_2,P25_cos_2
0,0.0,-22.48,-22.18,5.2,4.9,-29281.824,2.4,11,0,-22.48,...,0.070711,0.028284,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
1,0.0,-22.38,-22.14,5.2,4.9,-29281.824,2.4,11,0,-22.43,...,0.070711,0.028284,0.017213,0.999852,0.034422,0.999407,0.017213,0.999852,0.034422,0.999407
2,0.0,-22.25,-22.04,5.2,4.9,-29281.824,2.4,11,0,-22.37,...,0.115326,0.072111,0.034422,0.999407,0.068802,0.99763,0.034422,0.999407,0.068802,0.99763
3,0.0,-22.38,-22.04,5.2,4.9,-29281.824,2.4,11,0,-22.3725,...,0.094296,0.071181,0.05162,0.998667,0.103102,0.994671,0.05162,0.998667,0.103102,0.994671
4,0.0,-22.6,-22.04,5.2,4.9,-29281.824,2.4,11,1,-22.418,...,0.130461,0.067231,0.068802,0.99763,0.137279,0.990532,0.068802,0.99763,0.137279,0.990532


In [28]:
def train_and_optimize(target, df_numeric):
    
    print(f"\nðŸ”µ OPTIMIZANDO MODELOS PARA: {target}\n")

    y = df_numeric[target]
    X = df_numeric.drop(columns=["Depth_to_Groundwater_P24", "Depth_to_Groundwater_P25"])

    # imputaciÃ³n
    imputer = SimpleImputer(strategy="median")
    X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

    # escalado
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_imputed)

    # split
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_scaled, y, test_size=0.2, random_state=42
    )

    # ---------------------------
    #       Random Forest
    # ---------------------------
    rf = RandomForestRegressor(random_state=42)

    rf_params = {
        "n_estimators": [200, 300],  # reducido
        "max_depth": [10, 15],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2]
    }

    rf_search = RandomizedSearchCV(
        rf, rf_params, n_iter=4, scoring="neg_root_mean_squared_error",
        cv=3, random_state=42, n_jobs=-1
    )

    rf_search.fit(X_train, y_train)
    rf_best = rf_search.best_estimator_
    rf_rmse = mean_squared_error(y_valid, rf_best.predict(X_valid)) ** 0.5

    # ---------------------------
    #       Gradient Boosting 
    # ---------------------------
    gb = GradientBoostingRegressor(random_state=42)

    gb_params = {
        "n_estimators": [200, 300],
        "learning_rate": [0.05, 0.1],
        "max_depth": [3, 4]
    }

    gb_search = RandomizedSearchCV(
        gb, gb_params, n_iter=4, scoring="neg_root_mean_squared_error",
        cv=3, random_state=42, n_jobs=-1
    )

    gb_search.fit(X_train, y_train)
    gb_best = gb_search.best_estimator_
    gb_rmse = mean_squared_error(y_valid, gb_best.predict(X_valid)) ** 0.5

    print("ðŸ”¹ RF RMSE:", rf_rmse)
    print("ðŸ”¹ GB RMSE:", gb_rmse)

    # SelecciÃ³n del mejor
    if rf_rmse < gb_rmse:
        best_model = rf_best
        best_rmse = rf_rmse
        best_name = "RandomForest"
    else:
        best_model = gb_best
        best_rmse = gb_rmse
        best_name = "GradientBoosting"

    best_model.fit(X_scaled, y)

    return best_model, best_name, best_rmse

In [29]:
model_P24, name_P24, rmse_P24 = train_and_optimize("Depth_to_Groundwater_P24", df_numeric)


ðŸ”µ OPTIMIZANDO MODELOS PARA: Depth_to_Groundwater_P24

ðŸ”¹ RF RMSE: 0.05871249078024511
ðŸ”¹ GB RMSE: 0.04324738864756801


In [30]:
model_P25, name_P25, rmse_P25 = train_and_optimize("Depth_to_Groundwater_P25", df_numeric)


ðŸ”µ OPTIMIZANDO MODELOS PARA: Depth_to_Groundwater_P25

ðŸ”¹ RF RMSE: 0.03951585260930461
ðŸ”¹ GB RMSE: 0.02868326895232739


In [31]:
print("\nðŸ“Œ RESULTADOS FINALES OPTIMIZADOS\n")
print("P24 â†’", name_P24, " | RMSE:", rmse_P24)
print("P25 â†’", name_P25, " | RMSE:", rmse_P25)


ðŸ“Œ RESULTADOS FINALES OPTIMIZADOS

P24 â†’ GradientBoosting  | RMSE: 0.04324738864756801
P25 â†’ GradientBoosting  | RMSE: 0.02868326895232739


In [32]:
with open("model_P24_optimized.pkl", "wb") as f:
    pickle.dump(model_P24, f)

with open("model_P25_optimized.pkl", "wb") as f:
    pickle.dump(model_P25, f)

print("Modelos optimizados guardados correctamente.")

Modelos optimizados guardados correctamente.
