# Normalised Wind Data EDA
Manual verification of tall-format wind measurements produced by the normalisation pipeline.


## Setup
Load normalised fact files and station dimension metadata.


In [1]:
from pathlib import Path

import pandas as pd

normalised_dir = Path('../data/normalised')
fact_files = sorted(normalised_dir.glob('wind_*_normalised.parquet'))
station_dim_path = normalised_dir / 'station_dim.parquet'

if not fact_files:
    raise FileNotFoundError(f'No normalised wind fact files found in {normalised_dir.resolve()}')

if not station_dim_path.exists():
    raise FileNotFoundError(f'Station dimension not found at {station_dim_path.resolve()}')

station_dim_df = pd.read_parquet(station_dim_path)
station_dim_df.head()


Unnamed: 0,station_pk,station_code,station_name,location_type,description
0,1,ATLAN001,Atlantis AQM Site,AQM,Air Quality Monitoring site in Atlantis
1,2,BELLV001,Bellville South AQM Site,AQM,Air Quality Monitoring site in Bellville South
2,3,BOTHA001,Bothasig AQM Site,AQM,Air Quality Monitoring site in Bothasig
3,4,GOODW001,Goodwood AQM Station,AQM,Air Quality Monitoring station in Goodwood
4,5,KHAYE001,Khayelitsha AQM Site,AQM,Air Quality Monitoring site in Khayelitsha


## Available normalised files


In [2]:
fact_files


[PosixPath('../data/normalised/wind_2016_normalised.parquet'),
 PosixPath('../data/normalised/wind_2017_normalised.parquet'),
 PosixPath('../data/normalised/wind_2018_normalised.parquet'),
 PosixPath('../data/normalised/wind_2019_normalised.parquet'),
 PosixPath('../data/normalised/wind_2020_normalised.parquet')]

## File-level overview
Row counts, station coverage, metric completeness, and date spans for each normalised file.


In [3]:
overview_records = []
for fact_path in fact_files:
    df = pd.read_parquet(fact_path)
    overview_records.append({
        'file': fact_path.name,
        'rows': len(df),
        'stations': df['station_pk'].nunique(),
        'metrics': df['metric'].nunique(),
        'pct_valid': round((df['quality_flag'] == 'VALID').mean() * 100, 2),
        'date_range': f"{df['datetime'].min()} → {df['datetime'].max()}",
    })
overview_df = pd.DataFrame(overview_records)
overview_df


Unnamed: 0,file,rows,stations,metrics,pct_valid,date_range
0,wind_2016_normalised.parquet,48384,7,2,59.75,2016-01-01 00:00:00 → 2016-12-12 23:00:00
1,wind_2017_normalised.parquet,48384,7,2,70.08,2017-01-01 00:00:00 → 2017-12-12 23:00:00
2,wind_2018_normalised.parquet,48384,7,2,57.59,2018-01-01 00:00:00 → 2018-12-12 23:00:00
3,wind_2019_normalised.parquet,48384,7,2,71.73,2019-01-01 00:00:00 → 2019-12-12 23:00:00
4,wind_2020_normalised.parquet,48384,7,2,72.03,2020-01-01 00:00:00 → 2020-12-12 23:00:00


## Station and metric coverage
Ensure each station has both wind metrics and inspect percentage of valid measurements.


In [4]:
coverage_frames = []
for fact_path in fact_files:
    df = pd.read_parquet(fact_path)
    summary = (
        df.groupby(['station_pk', 'metric'])
        .agg(total_points=('value', 'size'), valid_points=('quality_flag', lambda s: (s == 'VALID').sum()))
        .assign(pct_valid=lambda frame: (frame['valid_points'] / frame['total_points'] * 100).round(2))
        .reset_index()
    )
    summary['file'] = fact_path.name
    coverage_frames.append(summary)

coverage_df = pd.concat(coverage_frames, ignore_index=True)
coverage_df = coverage_df.merge(station_dim_df[['station_pk', 'station_code', 'station_name']], on='station_pk', how='left')
coverage_df.sort_values(['file', 'station_pk', 'metric']).reset_index(drop=True)


Unnamed: 0,station_pk,metric,total_points,valid_points,pct_valid,file,station_code,station_name
0,1,wind_direction,3456,564,16.32,wind_2016_normalised.parquet,ATLAN001,Atlantis AQM Site
1,1,wind_speed,3456,287,8.30,wind_2016_normalised.parquet,ATLAN001,Atlantis AQM Site
2,2,wind_direction,3456,1526,44.16,wind_2016_normalised.parquet,BELLV001,Bellville South AQM Site
3,2,wind_speed,3456,1526,44.16,wind_2016_normalised.parquet,BELLV001,Bellville South AQM Site
4,3,wind_direction,3456,3451,99.86,wind_2016_normalised.parquet,BOTHA001,Bothasig AQM Site
...,...,...,...,...,...,...,...,...
65,5,wind_speed,3456,3015,87.24,wind_2020_normalised.parquet,KHAYE001,Khayelitsha AQM Site
66,6,wind_direction,3456,391,11.31,wind_2020_normalised.parquet,SOMER001,Somerset West AQM Site
67,6,wind_speed,3456,391,11.31,wind_2020_normalised.parquet,SOMER001,Somerset West AQM Site
68,7,wind_direction,3456,3356,97.11,wind_2020_normalised.parquet,TABLE001,Tableview AQM Site


## Sample year drill-down
Inspect the latest normalised file. Change `sample_path` to focus on another year.


In [5]:
sample_path = fact_files[-1]
sample_path


PosixPath('../data/normalised/wind_2020_normalised.parquet')

In [6]:
sample_df = pd.read_parquet(sample_path)
sample_df.head()


Unnamed: 0,datetime,station_pk,station_code,station_name,location_type,metric,unit,value,quality_flag,source,year
0,2020-01-01,1,ATLAN001,Atlantis AQM Site,AQM,wind_direction,degrees,173.0,VALID,wind,2020
1,2020-01-01,1,ATLAN001,Atlantis AQM Site,AQM,wind_speed,m/s,4.1,VALID,wind,2020
2,2020-01-01,2,BELLV001,Bellville South AQM Site,AQM,wind_direction,degrees,191.0,VALID,wind,2020
3,2020-01-01,2,BELLV001,Bellville South AQM Site,AQM,wind_speed,m/s,2.5,VALID,wind,2020
4,2020-01-01,3,BOTHA001,Bothasig AQM Site,AQM,wind_direction,degrees,163.7,VALID,wind,2020


In [7]:
pivot = (
    sample_df.pivot_table(
        index='station_code',
        columns='metric',
        values='value',
        aggfunc='mean'
    )
    .round(2)
    .sort_index()
)
pivot


metric,wind_direction,wind_speed
station_code,Unnamed: 1_level_1,Unnamed: 2_level_1
ATLAN001,187.49,3.78
BELLV001,203.06,2.06
BOTHA001,195.03,3.61
GOODW001,242.0,14.34
KHAYE001,38.36,0.69
SOMER001,210.08,2.42
TABLE001,178.4,3.67


In [8]:
missing_summary = (
    sample_df.groupby(['station_code', 'metric'])['quality_flag']
    .apply(lambda s: (s == 'NODATA').mean() * 100)
    .round(2)
    .reset_index(name='pct_nodata')
)
missing_summary.sort_values(['pct_nodata', 'station_code', 'metric'], ascending=[False, True, True])


Unnamed: 0,station_code,metric,pct_nodata
10,SOMER001,wind_direction,88.69
11,SOMER001,wind_speed,88.69
0,ATLAN001,wind_direction,70.66
1,ATLAN001,wind_speed,70.66
2,BELLV001,wind_direction,15.25
8,KHAYE001,wind_direction,12.76
9,KHAYE001,wind_speed,12.76
7,GOODW001,wind_speed,8.56
4,BOTHA001,wind_direction,5.84
5,BOTHA001,wind_speed,5.84
