# 🧠 Air Pollution ML Preprocessing: From Raw to Ready (Baby Style)
This notebook walks you through **every step** of preparing air pollution data for ML modeling.

We'll answer:
- Why are there only 2 dots?
- What does interpolation mean?
- What is 2km × 2km?
- How to convert the real-world map into ML-ready 2D arrays


In [None]:
import pandas as pd
import os
import glob

# Path where your .parquet files are stored (edit this if needed)
folder_path = "C:/Users/elhajjas/Downloads/ParquetFiles/E1a"  # Same folder as notebook
parquet_files = glob.glob(os.path.join(folder_path, "*.parquet"))
print(f"Found {len(parquet_files)} parquet files.")


In [39]:
import pandas as pd

# Load all parquet files into a single DataFrame
df_list = [pd.read_parquet(file) for file in parquet_files]
df = pd.concat(df_list, ignore_index=True)

# Preview data
print(df.columns)
print(df.head())


Index(['Samplingpoint', 'Pollutant', 'Start', 'End', 'Value', 'Unit',
       'AggType', 'Validity', 'Verification', 'ResultTime', 'DataCapture',
       'FkObservationLog'],
      dtype='object')
              Samplingpoint  Pollutant               Start  \
0  LU/SPO-LU0101A_00008_101          8 2022-08-31 00:00:00   
1  LU/SPO-LU0101A_00008_101          8 2022-08-31 01:00:00   
2  LU/SPO-LU0101A_00008_101          8 2022-08-31 02:00:00   
3  LU/SPO-LU0101A_00008_101          8 2022-08-31 03:00:00   
4  LU/SPO-LU0101A_00008_101          8 2022-08-31 04:00:00   

                  End                    Value    Unit AggType  Validity  \
0 2022-08-31 01:00:00    17.400000000000000000  ug.m-3    hour         1   
1 2022-08-31 02:00:00    13.200000000000000000  ug.m-3    hour         1   
2 2022-08-31 03:00:00  -999.000000000000000000  ug.m-3    hour        -1   
3 2022-08-31 04:00:00     9.700000000000000000  ug.m-3    hour         1   
4 2022-08-31 05:00:00    16.100000000000000000  ug.m

In [None]:
print(df.columns)

In [40]:
# Load the lookup table
stations_df = pd.read_csv("C:/Users/elhajjas/Downloads/DataExtract.csv/DataExtract.csv")

# Preview
print(stations_df.columns)
print(stations_df.head())


Index(['Country', 'B-G Namespace', 'Year', 'Air Quality Network',
       'Air Quality Network Name', 'Timezone', 'Air Quality Station EoI Code',
       'Air Quality Station Nat Code', 'Air Quality Station Name',
       'Samplingpoint', 'Air Pollutant', 'Longitude', 'Latitude', 'Altitude',
       'Altitude Unit', 'Air Quality Station Area', 'Air Quality Station Type',
       'Operational Activity Begin', 'Operational Activity End', 'Sample Id',
       'Inlet Height', 'Inlet Height Unit', 'Building Distance',
       'Building Distance Unit', 'Kerb Distance', 'Kerb Distance Unit',
       'Distance Source', 'Distance Source Unit', 'Main Emission Sources',
       'Heating Emissions', 'Heating Emissions Unit', 'Mobile',
       'Traffic Emissions', 'Traffic Emissions Unit', 'Industrial Emissions',
       'Industrial Emissions Unit', 'Municipality', 'Dispersion Local',
       'Dispersion Regional', 'Distance Junction', 'Distance Junction Unit',
       'Heavy Duty Fraction', 'Height Facades', '

In [41]:
# Example (adjust based on actual names)
stations_df = stations_df.rename(columns={
    "Latitude": "latitude",
    "Longitude": "longitude"
})


In [50]:
# Merge coordinates into main NO2 data
df = df.merge(stations_df, on="Samplingpoint", how="left")

# # Drop invalid or missing geolocations
# df = df.dropna(subset=["latitude", "longitude", "Value"])  # 'Value' is your NO2 reading


In [51]:
df

Unnamed: 0,Samplingpoint,Pollutant,Start,End,Value,Unit,AggType,Validity,Verification,ResultTime,...,Detection Limit_y,Detection Limit Unit_y,Documentation_y,QA Report_y,Duration_y,Duration Unit_y,Cadence_y,Cadence Unit_y,Source Data URL_y,Imported_y
0,LU/SPO-LU0101A_00008_101,8,2022-08-31 00:00:00,2022-08-31 01:00:00,17.4,ug.m-3,hour,1,1,2023-09-21 08:59:34,...,,,,,,,,,,
1,LU/SPO-LU0101A_00008_101,8,2022-08-31 01:00:00,2022-08-31 02:00:00,13.2,ug.m-3,hour,1,1,2023-09-21 08:59:34,...,,,,,,,,,,
2,LU/SPO-LU0101A_00008_101,8,2022-08-31 02:00:00,2022-08-31 03:00:00,-999.0,ug.m-3,hour,-1,1,2023-09-21 08:59:34,...,,,,,,,,,,
3,LU/SPO-LU0101A_00008_101,8,2022-08-31 03:00:00,2022-08-31 04:00:00,9.7,ug.m-3,hour,1,1,2023-09-21 08:59:34,...,,,,,,,,,,
4,LU/SPO-LU0101A_00008_101,8,2022-08-31 04:00:00,2022-08-31 05:00:00,16.1,ug.m-3,hour,1,1,2023-09-21 08:59:34,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35134,LU/SPO-LU0114A_00008_100,8,2023-12-31 20:00:00,2023-12-31 21:00:00,5.3,ug.m-3,hour,1,1,2024-09-05 07:00:19,...,,,,,,,,,,
35135,LU/SPO-LU0114A_00008_100,8,2023-12-31 21:00:00,2023-12-31 22:00:00,2.7,ug.m-3,hour,1,1,2024-09-05 07:00:19,...,,,,,,,,,,
35136,LU/SPO-LU0114A_00008_100,8,2023-12-31 22:00:00,2023-12-31 23:00:00,2.5,ug.m-3,hour,1,1,2024-09-05 07:00:19,...,,,,,,,,,,
35137,LU/SPO-LU0114A_00008_100,8,2023-12-31 23:00:00,2024-01-01 00:00:00,2.0,ug.m-3,hour,1,1,2024-09-05 07:00:19,...,,,,,,,,,,


In [None]:
pip install folium

In [44]:
print(df[['latitude', 'longitude']].isna().sum())
print(df.dtypes)


latitude     35139
longitude    35139
dtype: int64
Samplingpoint              object
Pollutant                   int32
Start              datetime64[ns]
End                datetime64[ns]
Value                      object
                        ...      
Duration Unit              object
Cadence                   float64
Cadence Unit               object
Source Data URL            object
Imported                   object
Length: 81, dtype: object


In [46]:
# Convert Value column to numeric NO2 concentrations
df['Value'] = pd.to_numeric(df['Value'], errors='coerce')

# Drop any rows with missing coordinates or NO2 values
df_clean = df.dropna(subset=['latitude', 'longitude', 'Value'])


In [47]:
print(df_clean[['latitude', 'longitude', 'Value']].head())
print("Rows remaining:", len(df_clean))


Empty DataFrame
Columns: [latitude, longitude, Value]
Index: []
Rows remaining: 0


In [48]:
df_clean = df.dropna(subset=['latitude', 'longitude', 'Value'])  # ensure no NaNs

In [49]:
import folium
from folium.plugins import HeatMap

# Create base map centered on mean location
map_center = [df['latitude'].mean(), df['longitude'].mean()]
m = folium.Map(location=map_center, zoom_start=6)

# Create heatmap layer using NO2 concentration as weight
heat_data = [[row['latitude'], row['longitude'], row['NO2']] for index, row in df.iterrows()]
HeatMap(heat_data, radius=8, blur=5).add_to(m)

# Save or display
m.save("NO2_map.html")
m


ValueError: Location values cannot contain NaNs.

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 8))
sc = plt.scatter(df['longitude'], df['latitude'], c=df['NO2'], cmap='inferno', s=10)
plt.colorbar(sc, label='NO₂ concentration')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('NO₂ Concentration Map')
plt.show()


In [None]:
# Load all .parquet files into one DataFrame
dfs = []
for f in parquet_files:
    df = pd.read_parquet(f)
    df['source_file'] = os.path.basename(f)
    dfs.append(df)

combined_df = pd.concat(dfs, ignore_index=True)
combined_df.head()

In [None]:
# Show info and example column names
combined_df.info()
print("Columns:", combined_df.columns.tolist())
combined_df[['Start', 'End', 'Pollutant', 'Samplingpoint', 'Value']].head()

In [None]:
combined_df["Pollutant"].unique()

In [None]:
pollutant_map = {6001: "PM10", 5: "PM2.5"}
combined_df["PollutantName"] = combined_df["Pollutant"].map(pollutant_map)

In [None]:
# Load metadata CSV
metadata = pd.read_csv("C:/Users/elhajjas/Downloads/DataExtract.csv/DataExtract.csv")

# Clean and rename columns
metadata.columns = metadata.columns.str.strip()
metadata = metadata.rename(columns={"Sampling Point Id": "Samplingpoint"})

# Keep only relevant columns
metadata = metadata[["Samplingpoint", "Latitude", "Longitude"]]

# Drop duplicates (in case the same Samplingpoint has multiple pollutants)
metadata = metadata.drop_duplicates(subset=["Samplingpoint"])

# Merge with your measurement data
combined_df = combined_df.merge(metadata, on="Samplingpoint", how="left")


In [None]:
combined_df["PollutantName"] = combined_df["Pollutant"].map(pollutant_map)
combined_df["Start"] = pd.to_datetime(combined_df["Start"])
combined_df["StartHour"] = combined_df["Start"].dt.floor("H")
combined_df["Samplingpoint"] = combined_df["Samplingpoint"].str.replace("LU/", "", regex=False)

In [None]:
import pandas as pd

# Load cleaned measurement data (CSV format)
measurements = combined_df
meta = pd.read_csv("C:/Users/elhajjas/Downloads/DataExtract.csv/DataExtract.csv")

# Clean Samplingpoint ID (match format on both sides)
# Load station metadata (one row per Samplingpoint)
meta = pd.read_csv("C:/Users/elhajjas/Downloads/DataExtract.csv/DataExtract.csv")
meta.columns = meta.columns.str.strip()
meta = meta.rename(columns={"Sampling Point Id": "Samplingpoint"})
meta = meta[["Samplingpoint", "Latitude", "Longitude"]].drop_duplicates()
# Final Samplingpoint cleanup
# Clean Samplingpoint on both sides
combined_df["Samplingpoint"] = combined_df["Samplingpoint"].str.upper().str.strip().str.replace("LU/", "", regex=False)
meta["Samplingpoint"] = meta["Samplingpoint"].str.upper().str.strip()

# Merge metadata (Latitude, Longitude)
df = measurements.merge(meta[["Samplingpoint", "Latitude", "Longitude"]], on="Samplingpoint", how="left")

# Map pollutant codes to names
pollutant_map = {5: "PM2.5", 6001: "PM10"}
df["PollutantName"] = df["Pollutant"].map(pollutant_map)

# Parse datetime
df["Start"] = pd.to_datetime(df["Start"])
df["Hour"] = df["Start"].dt.floor("H")

# Drop missing coordinates
df = df.dropna(subset=["Latitude", "Longitude", "Value"])
print(f"After merge: {df.shape[0]} rows, {df['Samplingpoint'].nunique()} stations")
df.head()

In [None]:
import matplotlib.pyplot as plt

station_count = df.groupby("Hour")["Samplingpoint"].nunique()
plt.figure(figsize=(10, 4))
station_count.plot()
plt.title("🛰 Number of Stations Available per Hour")
plt.ylabel("Number of Stations")
plt.xlabel("Hour")
plt.grid(True)
plt.show()

In [None]:
# Find first hour with at least 2 stations
valid_hours = station_count[station_count >= 2].index
example_time = valid_hours[0]
print("Selected hour:", example_time)

df_hour = df[(df["Hour"] == example_time) & (df["PollutantName"] == "PM2.5")]
df_hour = df_hour.dropna(subset=["Latitude", "Longitude", "Value"])
print(f"Stations used: {len(df_hour)}")
df_hour

## 🎨 What is Interpolation (Baby Style)

Imagine you only have **2 weather stations** that tell you the air pollution.

But your ML model needs a **grid** like pixels in an image.

So we use a trick called **Inverse Distance Weighting (IDW)**:
- Pretend pollution spreads out from each station
- Areas **closer to a station** get more influence
- We "guess" the pollution at each pixel (grid cell) based on the nearest stations

The more stations you have, the better your guess 💡

In [None]:
import numpy as np
from scipy.spatial import cKDTree

def idw(x, y, z, xi, yi, power=2):
    tree = cKDTree(np.c_[x, y])
    k = min(4, len(z))
    dist, idx = tree.query(np.c_[xi.ravel(), yi.ravel()], k=k)
    if k == 1:
        idx = idx[:, np.newaxis]
        dist = dist[:, np.newaxis]
    weights = 1 / (dist**power + 1e-12)
    weights /= weights.sum(axis=1)[:, None]
    zi = np.sum(weights * z[idx], axis=1)
    return zi.reshape(xi.shape)

# Create grid from min/max lat/lon
lon_min, lon_max = df_hour["Longitude"].min(), df_hour["Longitude"].max()
lat_min, lat_max = df_hour["Latitude"].min(), df_hour["Latitude"].max()
grid_x = np.linspace(lon_min, lon_max, 100)
grid_y = np.linspace(lat_min, lat_max, 100)
xi, yi = np.meshgrid(grid_x, grid_y)

# Interpolate
x, y, z = df_hour["Longitude"].values, df_hour["Latitude"].values, df_hour["Value"].values
zi = idw(x, y, z, xi, yi)

In [None]:
plt.figure(figsize=(8, 6))
cs = plt.contourf(xi, yi, zi, cmap="plasma", levels=20)
plt.scatter(x, y, c=z, cmap="plasma", edgecolor="white", s=100, label="Stations")
plt.colorbar(cs, label="PM2.5 (µg/m³)")
plt.title(f"🗺 Interpolated PM2.5 Field @ {example_time}")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Final ML input: 100x100 pixel grid of pollution
ml_input = zi.astype(np.float32)
print("ML input shape:", ml_input.shape)

# Save for model
np.save("pm25_grid_input.npy", ml_input)