# Modelo de Regresión Supervisada

In [62]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
from calendar import monthrange

In [3]:
# Read the two yearly CSV exports (2023 and 2024) into separate DataFrames.
df_2023, df_2024 = map(pd.read_csv, ("CO_Florida_2023.csv", "CO_Florida_2024.csv"))

In [34]:
# Stack both yearly frames into one table.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# yearly frame keeps its own 0-based row labels and the combined index
# contains duplicate labels, which makes label-based selection ambiguous.
df = pd.concat([df_2023, df_2024], ignore_index=True)
df.head()

Unnamed: 0,Date,Source,Site ID,POC,Daily Max 8-hour CO Concentration,Units,Daily AQI Value,Local Site Name,Daily Obs Count,Percent Complete,...,AQS Parameter Description,Method Code,CBSA Code,CBSA Name,State FIPS Code,State,County FIPS Code,County,Site Latitude,Site Longitude
0,01/01/2023,AQS,120110034,1,0.6,ppm,7,Daniela Banu NCORE,19,79.0,...,Carbon monoxide,554,33100,"Miami-Fort Lauderdale-West Palm Beach, FL",12,Florida,11,Broward,26.053889,-80.256944
1,01/02/2023,AQS,120110034,1,0.4,ppm,5,Daniela Banu NCORE,24,100.0,...,Carbon monoxide,554,33100,"Miami-Fort Lauderdale-West Palm Beach, FL",12,Florida,11,Broward,26.053889,-80.256944
2,01/03/2023,AQS,120110034,1,0.2,ppm,2,Daniela Banu NCORE,18,75.0,...,Carbon monoxide,554,33100,"Miami-Fort Lauderdale-West Palm Beach, FL",12,Florida,11,Broward,26.053889,-80.256944
3,01/04/2023,AQS,120110034,1,0.2,ppm,2,Daniela Banu NCORE,24,100.0,...,Carbon monoxide,554,33100,"Miami-Fort Lauderdale-West Palm Beach, FL",12,Florida,11,Broward,26.053889,-80.256944
4,01/05/2023,AQS,120110034,1,0.3,ppm,3,Daniela Banu NCORE,24,100.0,...,Carbon monoxide,554,33100,"Miami-Fort Lauderdale-West Palm Beach, FL",12,Florida,11,Broward,26.053889,-80.256944


In [68]:
# ----- Parameters -----
# Column names read from the raw daily table.
date_col = "Date"
site_col = "Local Site Name"
co_col   = "Daily Max 8-hour CO Concentration"
# Month number -> Spanish month name, used to fill the "Mes" output column.
mes_es = {
    1:"enero", 2:"febrero", 3:"marzo", 4:"abril", 5:"mayo", 6:"junio",
    7:"julio", 8:"agosto", 9:"septiembre", 10:"octubre", 11:"noviembre", 12:"diciembre"
}
# The three columns this cell reads (kept for reference; df is not subset to them here).
usecols = [date_col, site_col, co_col]

# 1) Clean-up and dtypes
# The CSVs are read without date parsing, so the date column holds
# "MM/DD/YYYY" strings. Parse it explicitly: otherwise the sort below is
# lexicographic and the calendar step (`.to_period("M")` on the min/max index
# value) fails on a freshly restarted kernel. A column that is already
# datetime passes through pd.to_datetime unchanged, so re-running is safe.
# `assign` works on a copy, which avoids setting columns on a dropna() slice.
df = (
    df.dropna(subset=[date_col])
      .assign(**{
          date_col: lambda d: pd.to_datetime(d[date_col], format="%m/%d/%Y"),
          site_col: lambda d: d[site_col].astype("category"),
      })
      .sort_values(date_col)
)

# 2) Keep only the 9 sites with the most rows
top9_sites = df[site_col].value_counts().index[:9].tolist()
df = df[df[site_col].isin(top9_sites)]

# 3) Pivot to one row per date and one column per site
#    (duplicate date/site readings are averaged)
pivot = (df.pivot_table(index=date_col, columns=site_col, values=co_col, aggfunc="mean")
           .sort_index())

# Guarantee exactly those 9 columns, in ranking order; a site missing from the
# pivot gets an all-NaN column.
for s in top9_sites:
    if s not in pivot.columns:
        pivot[s] = np.nan
pivot = pivot[top9_sites]

# 4) Fill in the complete daily calendar without loops
#    (from the first day of the earliest month to the last day of the latest month)
start = pivot.index.min().to_period("M").start_time
# end_time is the last nanosecond of the month; normalize() snaps it to
# midnight of the month's last day so it lines up with the daily index.
end   = pivot.index.max().to_period("M").end_time.normalize()
full_idx = pd.date_range(start, end, freq="D")
pivot = pivot.reindex(full_idx)  # days with no reading become all-NaN rows

# 5) Add year / month name (Spanish) / day and the per-row mean
out = pivot.copy()
out["Año"] = out.index.year
out["Mes"] = out.index.month.map(mes_es)
out["Día"] = out.index.day
# Mean over the sites that reported on that day (NaN readings are skipped).
out["Media CO"] = out[top9_sites].mean(axis=1, skipna=True)

# 6) Final column order; the date index is dropped because Año/Mes/Día carry it
final_cols = top9_sites + ["Año", "Mes", "Día", "Media CO"]
out = out[final_cols].reset_index(drop=True)

print("Forma final:", out.shape)
print("Columnas:", list(out.columns))
# Optional export, left disabled:
# out.to_csv("CO_formato_profesor.csv", index=False)


Forma final: (731, 13)
Columnas: ['Near Road - Fort Lauderdale', 'Daniela Banu NCORE', 'Sawgrass Lake Park (Near-Road)', 'SYDNEY', 'Pepsi Place', 'Munro Street (Near-Road)', 'WINTER PARK', 'Perimeter Road', 'St. Marks Wildlife Refuge', 'Año', 'Mes', 'Día', 'Media CO']


In [69]:
# Preview the first five rows of the reshaped table.
out.head(5)

Local Site Name,Near Road - Fort Lauderdale,Daniela Banu NCORE,Sawgrass Lake Park (Near-Road),SYDNEY,Pepsi Place,Munro Street (Near-Road),WINTER PARK,Perimeter Road,St. Marks Wildlife Refuge,Año,Mes,Día,Media CO
0,0.9,0.6,0.3,0.3,1.0,0.6,0.5,0.3,0.1,2023,enero,1,0.511111
1,0.8,0.4,0.4,0.3,1.2,0.7,0.4,0.4,0.1,2023,enero,2,0.522222
2,0.6,0.2,0.3,0.2,0.7,0.3,0.4,0.1,0.1,2023,enero,3,0.322222
3,0.6,0.2,0.3,0.2,0.6,0.3,0.4,0.1,,2023,enero,4,0.3375
4,0.6,0.3,0.2,0.1,0.7,0.4,0.4,0.2,0.2,2023,enero,5,0.344444


In [70]:
# Preview the last five rows of the reshaped table.
out.tail(5)

Local Site Name,Near Road - Fort Lauderdale,Daniela Banu NCORE,Sawgrass Lake Park (Near-Road),SYDNEY,Pepsi Place,Munro Street (Near-Road),WINTER PARK,Perimeter Road,St. Marks Wildlife Refuge,Año,Mes,Día,Media CO
726,0.6,0.2,0.5,0.2,0.4,0.3,0.7,0.1,0.2,2024,diciembre,27,0.355556
727,0.5,0.2,0.4,0.2,0.6,0.3,0.7,0.1,0.2,2024,diciembre,28,0.355556
728,0.6,0.2,0.4,0.1,0.6,0.2,0.7,0.3,0.1,2024,diciembre,29,0.355556
729,1.0,0.4,0.4,0.2,0.6,0.3,0.9,0.3,0.1,2024,diciembre,30,0.466667
730,0.7,0.3,0.4,0.2,0.2,0.2,0.9,0.2,0.1,2024,diciembre,31,0.355556
