# Data Cleaning - Filling Missing Dates

In [49]:
import pandas as pd 
import polars as pl 
import datetime
import numpy as np

In [50]:
# Inclusive bounds of the daily calendar built below
start_date = "2022-01-01"
end_date = "2025-04-04"

# One entry per calendar day. `.date` converts the DatetimeIndex into an
# object array of plain `datetime.date` values (not datetime64).
all_dates = pd.date_range(start_date, end_date, freq="D").date
all_dates

array([datetime.date(2022, 1, 1), datetime.date(2022, 1, 2),
       datetime.date(2022, 1, 3), ..., datetime.date(2025, 4, 2),
       datetime.date(2025, 4, 3), datetime.date(2025, 4, 4)], dtype=object)

In [51]:
# Read the raw CSV with polars, convert to pandas, parse the `Dia` timestamp,
# derive the calendar day (`Fecha`) and drop the sensors excluded from this
# analysis.
df = (
    pl.read_csv(r"../WeatherData.csv")
    .to_pandas()
    .assign(Dia=lambda frame: pd.to_datetime(frame["Dia"], format="%Y-%m-%d %H:%M:%S"))
    .assign(Fecha=lambda frame: frame["Dia"].dt.date)
    .loc[lambda frame: ~frame["Sensor_id"].isin(["ANL10", "ANL7", "ANL15", "ANL16"])]
)

In [52]:
# Summary statistics of the filtered frame, used as a quick sanity check of
# value ranges and non-null counts per column
df.describe()

Unnamed: 0,Dia,Registros_id,PM10,PM25,O3,CO,NO1,NO2,NOx,SO2,TEMPERATURA,LLUVIA,PRESIONATM,HUMEDAD,RS,VIENTOVEL,Year,hour,month
count,286652,286652.0,286652.0,286652.0,286652.0,286652.0,286652.0,286652.0,286652.0,286652.0,286652.0,286622.0,286652.0,286652.0,286652.0,286652.0,286652.0,286652.0,286652.0
mean,2023-09-03 09:37:24.894855424,2243097.0,59.492172,19.32329,0.026183,1.336474,0.007159,0.01515,0.022717,0.004072,22.155049,0.025695,714.797176,51.80842,0.111183,7.668613,2023.193681,11.481361,6.262625
min,2022-01-01 00:00:00,2048929.0,0.0,0.0,0.0,0.0005,0.0005,0.0,0.0005,1e-05,-8.75,0.0,690.2,0.0,0.0,0.0,2022.0,0.0,1.0
25%,2022-11-09 07:00:00,2145896.0,37.0,10.0,0.012125,0.76,0.0031,0.0073,0.0116,0.0029,17.09,0.0,709.55,35.0,0.0,4.3,2022.0,5.0,3.0
50%,2023-09-23 08:00:00,2242050.0,53.0,16.44,0.023,1.21,0.0047,0.0126,0.0179,0.0038,23.29,0.0,714.6,55.0,0.001,7.1,2023.0,11.0,6.0
75%,2024-06-26 00:00:00,2340835.0,77.0,26.0,0.037,1.78,0.0085,0.0211,0.0298,0.005,27.93,0.0,721.2,73.0,0.184,10.6,2024.0,17.0,9.0
max,2025-04-04 13:00:00,2440719.0,154.0,67.0,0.079,4.57,0.063222,0.0561,0.1105,0.01077,44.94,61.0,742.633333,116.0,0.712,26.4,2025.0,23.0,12.0
std,,113173.8,30.10589,12.221897,0.017152,0.743106,0.00729,0.009957,0.015834,0.001603,8.282676,0.662094,9.277771,25.493862,0.175953,4.427186,0.956874,6.930938,3.604814


In [53]:
# Calendar days for which the frame has no row at all, in chronological order
expected_days = set(all_dates)
missing_dates = sorted(expected_days.difference(df["Fecha"]))

print("Missing dates:", missing_dates)
len(missing_dates)

Missing dates: [datetime.date(2022, 3, 21), datetime.date(2022, 5, 1), datetime.date(2022, 7, 14), datetime.date(2022, 7, 15), datetime.date(2022, 7, 16), datetime.date(2022, 7, 17), datetime.date(2022, 7, 18), datetime.date(2022, 7, 19), datetime.date(2022, 7, 20), datetime.date(2022, 7, 21), datetime.date(2022, 7, 22), datetime.date(2022, 7, 23), datetime.date(2022, 7, 24), datetime.date(2022, 7, 26), datetime.date(2022, 7, 27), datetime.date(2023, 5, 6), datetime.date(2023, 5, 7), datetime.date(2023, 5, 8), datetime.date(2023, 5, 9), datetime.date(2023, 5, 10), datetime.date(2023, 5, 11), datetime.date(2023, 5, 12), datetime.date(2023, 5, 13), datetime.date(2023, 5, 14), datetime.date(2023, 5, 15), datetime.date(2023, 5, 16), datetime.date(2023, 5, 17), datetime.date(2023, 5, 18), datetime.date(2023, 5, 19), datetime.date(2023, 5, 20), datetime.date(2023, 5, 21), datetime.date(2023, 5, 22), datetime.date(2023, 6, 17), datetime.date(2023, 6, 18), datetime.date(2023, 6, 19), datetime.

48

In [71]:
# Count, for every sensor, how many calendar days have no reading at all.
#
# The per-sensor work uses its own local names so the notebook-level
# `missing_dates` list (computed from the whole frame) is not overwritten by
# whichever sensor happens to be processed last.
expected_dates = set(all_dates)  # built once instead of once per sensor

# `sort=False` keeps sensors in order of first appearance, matching the order
# that `df["Sensor_id"].unique()` would produce.
missing_dates_per_sensor = {
    sensor: len(expected_dates.difference(sensor_fechas))
    for sensor, sensor_fechas in df.groupby("Sensor_id", sort=False)["Fecha"]
}

print(missing_dates_per_sensor)

{'ANL1': 48, 'ANL11': 48, 'ANL12': 48, 'ANL13': 48, 'ANL2': 48, 'ANL3': 48, 'ANL4': 48, 'ANL5': 48, 'ANL6': 48, 'ANL8': 48, 'ANL9': 48}


In [54]:
# List the available columns before choosing which ones to aggregate
df.columns

Index(['Dia', 'Registros_id', 'PM10', 'PM25', 'O3', 'CO', 'NO1', 'NO2', 'NOx',
       'SO2', 'Sensor_id', 'TEMPERATURA', 'LLUVIA', 'PRESIONATM', 'HUMEDAD',
       'RS', 'VIENTOVEL', 'Year', 'period_signature', 'key', 'hour', 'month',
       'Fecha'],
      dtype='object')

In [55]:
def circular_mean(series):
    """
    Compute the circular mean of angles (in radians).

    NaN entries are ignored, regardless of whether the input is a pandas
    Series, a NumPy array or a plain sequence.

    Parameters
    ----------
    series : array-like of float
        Angles in radians.

    Returns
    -------
    float
        The mean direction in the interval [-pi, pi], or NaN when the input
        is empty or contains only NaN values.
    """
    angles = np.asarray(series, dtype=float)
    angles = angles[~np.isnan(angles)]

    # Without any valid angle the sums below would both be 0 and
    # arctan2(0, 0) would return 0.0, which is indistinguishable from a real
    # measurement; report "no data" explicitly instead.
    if angles.size == 0:
        return np.nan

    sin_sum = np.sin(angles).sum()
    cos_sum = np.cos(angles).sum()
    return np.arctan2(sin_sum, cos_sum)

In [56]:
# Collapse the readings to one row per (day, sensor).
group_keys = ["Fecha", "Sensor_id"]
mean_columns = [
    "PM10",
    "PM25",
    "O3",
    "TEMPERATURA",
    "LLUVIA",
    "PRESIONATM",
    "HUMEDAD",
    "VIENTOVEL",
]
daily_groups = df.groupby(group_keys)

# Plain arithmetic mean for the linear quantities
daily_means = daily_groups[mean_columns].mean().reset_index()

# RS is averaged with the circular mean instead.
# NOTE(review): circular_mean expects angles in radians; nothing in this
# notebook shows that RS is an angular quantity -- confirm that this is the
# intended aggregation for RS.
rs_circular = daily_groups["RS"].apply(circular_mean).reset_index()

# Attach the RS column to the other daily aggregates
test = daily_means.merge(rs_circular, on=group_keys)

In [57]:
# Key columns shared by the calendar grid and the daily aggregates
merge_keys = ["Fecha", "Sensor_id"]

# Sensors that appear in the aggregated data
all_sensors = test["Sensor_id"].unique()

# Every (day, sensor) combination, whether or not it was observed
full_index = pd.MultiIndex.from_product(
    [all_dates, all_sensors], names=merge_keys
).to_frame(index=False)

# Left-join the aggregates onto the full grid so that unobserved days show up
# as rows of NaN, then order the rows by sensor and day
df_filled = (
    full_index
    .merge(test, how="left", on=merge_keys)
    .sort_values(["Sensor_id", "Fecha"])
    .reset_index(drop=True)
)

df_filled

Unnamed: 0,Fecha,Sensor_id,PM10,PM25,O3,TEMPERATURA,LLUVIA,PRESIONATM,HUMEDAD,VIENTOVEL,RS
0,2022-01-01,ANL1,78.000000,13.485972,0.023708,15.975208,0.00000,721.343056,54.687500,5.856944,0.066339
1,2022-01-02,ANL1,63.312500,10.752917,0.022917,15.975208,0.00000,724.288889,54.687500,8.354167,0.044434
2,2022-01-03,ANL1,32.625000,8.228333,0.024792,15.975208,0.00000,723.117708,54.687500,4.841667,0.061191
3,2022-01-04,ANL1,46.750000,16.111667,0.025417,15.975208,0.00000,724.248958,54.687500,9.046528,0.064572
4,2022-01-05,ANL1,62.478261,19.128696,0.023000,15.961957,0.00000,722.626812,54.673913,7.321739,0.100207
...,...,...,...,...,...,...,...,...,...,...,...
13085,2025-03-31,ANL9,56.666667,21.977555,0.036458,30.110833,0.00000,712.108333,0.375000,7.925000,0.000000
13086,2025-04-01,ANL9,59.708333,24.281458,0.037000,27.704167,0.00000,711.379167,0.250000,8.266667,0.000000
13087,2025-04-02,ANL9,70.541667,26.411250,0.030196,31.240000,1.68750,705.995833,4.916667,7.866667,0.000000
13088,2025-04-03,ANL9,71.395833,20.867917,0.034262,32.903542,0.85625,706.914583,2.875000,8.029167,0.000000


In [58]:
# Non-null counts per column: the difference from the total row count is the
# number of (day, sensor) rows that still need to be filled
df_filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13090 entries, 0 to 13089
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Fecha        13090 non-null  object 
 1   Sensor_id    13090 non-null  object 
 2   PM10         12562 non-null  float64
 3   PM25         12562 non-null  float64
 4   O3           12562 non-null  float64
 5   TEMPERATURA  12562 non-null  float64
 6   LLUVIA       12562 non-null  float64
 7   PRESIONATM   12562 non-null  float64
 8   HUMEDAD      12562 non-null  float64
 9   VIENTOVEL    12562 non-null  float64
 10  RS           12562 non-null  float64
dtypes: float64(9), object(2)
memory usage: 1.1+ MB


In [60]:
# Measurement columns are the float64 ones (Fecha and Sensor_id are objects)
float_cols = df_filled.select_dtypes(include=["float64"]).columns

# Rows must be in chronological order inside each sensor before interpolating
df_filled = df_filled.sort_values(["Sensor_id", "Fecha"])


def _interpolate_gaps(values):
    """Fill NaN gaps in one sensor's column using pandas' default interpolation."""
    return values.interpolate()


# Interpolate each sensor independently so values never leak across sensors.
# NOTE(review): RS was aggregated with a circular mean but is interpolated
# here like every other column -- confirm this is intended.
per_sensor = df_filled.groupby("Sensor_id")[float_cols]
df_filled[float_cols] = per_sensor.transform(_interpolate_gaps)

df_filled

Unnamed: 0,Fecha,Sensor_id,PM10,PM25,O3,TEMPERATURA,LLUVIA,PRESIONATM,HUMEDAD,VIENTOVEL,RS
0,2022-01-01,ANL1,78.000000,13.485972,0.023708,15.975208,0.00000,721.343056,54.687500,5.856944,0.066339
1,2022-01-02,ANL1,63.312500,10.752917,0.022917,15.975208,0.00000,724.288889,54.687500,8.354167,0.044434
2,2022-01-03,ANL1,32.625000,8.228333,0.024792,15.975208,0.00000,723.117708,54.687500,4.841667,0.061191
3,2022-01-04,ANL1,46.750000,16.111667,0.025417,15.975208,0.00000,724.248958,54.687500,9.046528,0.064572
4,2022-01-05,ANL1,62.478261,19.128696,0.023000,15.961957,0.00000,722.626812,54.673913,7.321739,0.100207
...,...,...,...,...,...,...,...,...,...,...,...
13085,2025-03-31,ANL9,56.666667,21.977555,0.036458,30.110833,0.00000,712.108333,0.375000,7.925000,0.000000
13086,2025-04-01,ANL9,59.708333,24.281458,0.037000,27.704167,0.00000,711.379167,0.250000,8.266667,0.000000
13087,2025-04-02,ANL9,70.541667,26.411250,0.030196,31.240000,1.68750,705.995833,4.916667,7.866667,0.000000
13088,2025-04-03,ANL9,71.395833,20.867917,0.034262,32.903542,0.85625,706.914583,2.875000,8.029167,0.000000


In [63]:
# Persist the gap-filled daily table next to the raw file; the positional
# index carries no information, so it is not written
df_filled.to_csv(r"../DailyWeatherData.csv", index=False)