In [8]:
# --- Configuración y preparación de datos (VSCode Jupyter, Py 3.10) ---

import re
import pandas as pd
import numpy as np
from pathlib import Path

# ======================
# 1) Absolute paths
# ======================
# Point this at the folder that actually holds the Data directory.
base_path = Path(r"C:\Users\Jander\Documents\Proyectos-de-Ingenieria\Proyecto_Regresion\Data")

# Yearly input files and the consolidated output, all under base_path.
p2022 = base_path.joinpath("CO_Nebraska_2022.csv")
p2023 = base_path.joinpath("CO_Nebraska_2023.csv")
out_file = base_path.joinpath("CO_Nebraska_2022_2023_daily_state_mean.csv")

# Echo the resolved locations so a wrong base_path is noticed early.
print(
    f"Ruta 2022: {p2022}",
    f"Ruta 2023: {p2023}",
    f"Salida  : {out_file}",
    sep="\n",
)

# =================================
# 2) Funciones auxiliares robustas
# =================================
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with normalized column labels.

    Each label is stripped of surrounding whitespace, lower-cased, purged of
    every character that is not a word character, whitespace, ``-`` or ``/``,
    and finally has every run of whitespace replaced by a single underscore.
    The input frame itself is not modified.
    """
    normalized = df.copy()

    labels = normalized.columns.str.strip().str.lower()
    # Drop punctuation such as parentheses or dots, but keep '-' and '/'.
    labels = labels.str.replace(r"[^\w\s\-\/]", "", regex=True)
    # Collapse inner whitespace into underscores.
    labels = labels.str.replace(r"\s+", "_", regex=True)

    normalized.columns = labels
    return normalized

def find_date_column(cols) -> str:
    """Pick the name of the date column from ``cols``.

    ``"date_local"`` wins outright when present. Otherwise the shortest name
    containing ``"date"`` is returned (the earliest one on ties).

    Raises:
        ValueError: if no name contains ``"date"``.
    """
    if "date_local" not in cols:
        dated = [name for name in cols if "date" in name]
        if not dated:
            raise ValueError("No se encontró columna de fecha")
        # min() keeps the first of equally short names, like a stable sort.
        return min(dated, key=len)
    return "date_local"

def find_value_column(cols) -> str:
    """Pick the name of the CO concentration column from ``cols``.

    A ranked list of regular expressions is tried from most to least
    specific; the first expression that matches any name decides, and the
    first matching name (in ``cols`` order) is returned. If none of the
    expressions match, the first name containing the substring ``"co"`` is
    used as a last resort.

    Raises:
        ValueError: if no name matches any expression or contains ``"co"``.
    """
    ranked_regexes = (
        r"daily.*8.*hour.*co.*concentration",
        r"8.*hour.*co.*concentration",
        r"co.*concentration",
        r"daily.*co",
        r"arithmetic_mean",
        r"value",
    )
    # One predicate per ranked regex, then the loose substring fallback.
    matchers = [re.compile(regex).search for regex in ranked_regexes]
    matchers.append(lambda name: "co" in name)

    for is_match in matchers:
        hits = [name for name in cols if is_match(name)]
        if hits:
            return hits[0]
    raise ValueError("No se encontró columna de concentración de CO")

def read_and_clean(path: Path) -> pd.DataFrame:
    """Load one CSV and expose its date and CO value as ``date`` / ``co_value``.

    The column labels are normalized with ``normalize_columns``; the date and
    value columns are then located with ``find_date_column`` and
    ``find_value_column``. The date column is parsed to datetimes and the
    value column to numbers, with unparseable entries coerced to missing.
    Rows where either of the two is missing are dropped. All other columns
    are kept as read.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The cleaned frame, with the detected columns renamed to ``"date"``
        and ``"co_value"``.

    Raises:
        ValueError: propagated from the column finders when no suitable date
            or value column exists.
    """
    df = pd.read_csv(path)
    df = normalize_columns(df)
    date_col = find_date_column(df.columns)
    val_col = find_value_column(df.columns)

    # `infer_datetime_format` is deprecated in pandas 2.x (strict format
    # inference is now the default), so passing it only emits a warning on
    # every call. It is omitted here; parsing results are unchanged.
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
    df[val_col] = pd.to_numeric(df[val_col], errors="coerce")

    # Anything that failed to parse above is now NaT/NaN and is discarded.
    df = df.dropna(subset=[date_col, val_col])
    return df.rename(columns={date_col: "date", val_col: "co_value"})

def state_daily_mean(df: pd.DataFrame) -> pd.DataFrame:
    """Average ``co_value`` over all rows sharing the same ``date``.

    Returns a two-column frame (``date``, ``co_value``) with one row per
    distinct date, ordered by date and carrying a fresh 0..n-1 index.
    """
    per_day = df.groupby("date", as_index=False)["co_value"].mean()
    chronological = per_day.sort_values("date")
    return chronological.reset_index(drop=True)

# ==================================
# 3) Read and concatenate both years
# ==================================
# Files are read in order: 2022 first, then 2023.
df22, df23 = (read_and_clean(csv_path) for csv_path in (p2022, p2023))

df_all = pd.concat([df22, df23], ignore_index=True)
df_all = df_all.sort_values("date").reset_index(drop=True)

print(f"Filas 2022: {len(df22):,} | Filas 2023: {len(df23):,} | Total: {len(df_all):,}")

# ===============================================
# 4) Aggregate to a state-wide daily series (mean)
# ===============================================
daily_state_mean = state_daily_mean(df_all)

# ============================
# 5) Basic calendar features
# ============================
# Keyword order fixes the column order: year, month, dayofweek, dayofyear.
daily_state_mean = daily_state_mean.assign(
    year=daily_state_mean["date"].dt.year,
    month=daily_state_mean["date"].dt.month,
    dayofweek=daily_state_mean["date"].dt.dayofweek,
    dayofyear=daily_state_mean["date"].dt.dayofyear,
)

# ==========================
# 6) Save the final dataset
# ==========================
daily_state_mean.to_csv(out_file, index=False)
print(f"CSV procesado guardado en: {out_file}")

# ==========================
# 7) Quick preview
# ==========================
# `display` is supplied by the IPython/Jupyter session this cell runs in.
display(daily_state_mean.head(12))
print(daily_state_mean.shape, list(daily_state_mean.columns))


Ruta 2022: C:\Users\Jander\Documents\Proyectos-de-Ingenieria\Proyecto_Regresion\Data\CO_Nebraska_2022.csv
Ruta 2023: C:\Users\Jander\Documents\Proyectos-de-Ingenieria\Proyecto_Regresion\Data\CO_Nebraska_2023.csv
Salida  : C:\Users\Jander\Documents\Proyectos-de-Ingenieria\Proyecto_Regresion\Data\CO_Nebraska_2022_2023_daily_state_mean.csv
Filas 2022: 711 | Filas 2023: 687 | Total: 1,398
CSV procesado guardado en: C:\Users\Jander\Documents\Proyectos-de-Ingenieria\Proyecto_Regresion\Data\CO_Nebraska_2022_2023_daily_state_mean.csv


  df[date_col] = pd.to_datetime(df[date_col], errors="coerce", infer_datetime_format=True)
  df[date_col] = pd.to_datetime(df[date_col], errors="coerce", infer_datetime_format=True)


Unnamed: 0,date,co_value,year,month,dayofweek,dayofyear
0,2022-01-01,0.15,2022,1,5,1
1,2022-01-02,0.2,2022,1,6,2
2,2022-01-03,0.3,2022,1,0,3
3,2022-01-04,0.3,2022,1,1,4
4,2022-01-05,0.3,2022,1,2,5
5,2022-01-06,0.3,2022,1,3,6
6,2022-01-07,0.3,2022,1,4,7
7,2022-01-08,0.3,2022,1,5,8
8,2022-01-09,0.3,2022,1,6,9
9,2022-01-10,0.3,2022,1,0,10


(725, 6) ['date', 'co_value', 'year', 'month', 'dayofweek', 'dayofyear']
