# Imporación datasets

In [2]:
import pandas as pd

In [3]:
df_procesado = pd.read_csv('df_procesado2.csv')

In [6]:
# Asegurar fecha
df_procesado['FL_DATE'] = pd.to_datetime(df_procesado['FL_DATE'])

# Columnas necesarias
features_num = [
    'temperatura',
    'humedad',
    'presion',
    'viento_velocidad',
    'MONTH'
]

features_cat = ['condicion']
target = 'visibilidad'

# Dataset de entrenamiento SOLO con visibilidad real
df_train = df_procesado[
    df_procesado[target].notna()
][features_num + features_cat + [target]].copy()


In [7]:
df_train = df_train.dropna(subset=features_num + features_cat)

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), features_num),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), features_cat)
    ]
)

model = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)

pipeline = Pipeline(steps=[
    ('prep', preprocessor),
    ('model', model)
])


In [10]:
X = df_train[features_num + features_cat]
y = df_train[target]

pipeline.fit(X, y)


KeyboardInterrupt: 

In [None]:
from sklearn.model_selection import train_test_split

X = df_train[features_num + features_cat]
y = df_train[target]

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)


In [None]:
pipeline.fit(X_train, y_train)


In [None]:
y_pred = pipeline.predict(X_val)


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

mae = mean_absolute_error(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

print(f"MAE  : {mae:.2f}")
print(f"RMSE : {rmse:.2f}")


In [None]:
baseline = y_val.median()
rmse_baseline = np.sqrt(mean_squared_error(y_val, [baseline]*len(y_val)))

print(f"RMSE baseline (mediana): {rmse_baseline:.2f}")


In [None]:
import matplotlib.pyplot as plt

plt.hist(y_val, bins=30, alpha=0.6, label='Real')
plt.hist(y_pred, bins=30, alpha=0.6, label='Predicho')
plt.legend()
plt.title("Visibilidad real vs predicha")
plt.show()


In [None]:
plt.scatter(y_val, y_pred, alpha=0.3)
plt.xlabel("Real")
plt.ylabel("Predicho")
plt.title("Predicción vs Real")
plt.show()


In [None]:
import joblib
joblib.dump(pipeline, "modelo_visibilidad.joblib")
