# Staged Wind Data EDA
Quick checks on the staged wind parquet datasets to validate staging outputs and station mappings.


## Setup
Load libraries, locate staged parquet files, and read station metadata.


In [11]:
from pathlib import Path

import pandas as pd
import yaml

# Collect the staged wind parquet files (matched by name pattern) in sorted order.
data_dir = Path('../data/staged')
parquet_files = sorted(data_dir.glob('wind_*.parquet'))

# Fail early with the resolved path so a wrong working directory is obvious.
if len(parquet_files) == 0:
    raise FileNotFoundError(f'No staged wind parquet files found under {data_dir.resolve()}')

# Station metadata lives in a YAML config; safe_load avoids constructing arbitrary objects.
station_mapping_path = Path('../src/configs/station_mapping.yaml')
station_mapping_cfg = yaml.safe_load(station_mapping_path.read_text(encoding='utf-8'))

# Cast station_pk to int so it can be joined against the station ids parsed
# (as ints) from the staged column names.
station_mapping_df = pd.DataFrame(
    station_mapping_cfg['wind_stations']['station_mappings']
).astype({'station_pk': int})
station_mapping_df[['station_pk', 'station_code', 'station_name', 'location_type']]


Unnamed: 0,station_pk,station_code,station_name,location_type
0,1,ATLAN001,Atlantis AQM Site,AQM
1,2,BELLV001,Bellville South AQM Site,AQM
2,3,BOTHA001,Bothasig AQM Site,AQM
3,4,GOODW001,Goodwood AQM Station,AQM
4,5,KHAYE001,Khayelitsha AQM Site,AQM
5,6,SOMER001,Somerset West AQM Site,AQM
6,7,TABLE001,Tableview AQM Site,AQM
7,8,FORES001,Foreshore AQM Site,AQM
8,9,MOLTE001,Molteno AQM Site,AQM
9,10,PLATT001,Plattekloof AQM Site,AQM


## Available staged files


In [12]:
# Show the staged parquet paths discovered by the glob in the setup cell (sorted).
parquet_files


[PosixPath('../data/staged/wind_2016.parquet'),
 PosixPath('../data/staged/wind_2017.parquet'),
 PosixPath('../data/staged/wind_2018.parquet'),
 PosixPath('../data/staged/wind_2019.parquet'),
 PosixPath('../data/staged/wind_2020.parquet')]

## File-level overview
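Inspect row counts, exact-duplicate rows, and null presence for each staged file. A row is counted in `rows_with_nulls` when at least one of its columns is missing, so this figure can be high even when most individual stations report well.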


In [13]:
# One summary record per staged file: dimensions, exact-duplicate rows, and
# the number/share of rows that contain at least one null value.
overview_records = []
for file_path in parquet_files:
    df = pd.read_parquet(file_path)
    row_count, column_count = df.shape
    rows_with_nulls = int(df.isna().any(axis=1).sum())
    if row_count:
        pct_rows_with_nulls = round(rows_with_nulls / row_count * 100, 2)
    else:
        # Empty file: report 0.0 rather than dividing by zero.
        pct_rows_with_nulls = 0.0
    overview_records.append(
        dict(
            file=file_path.name,
            rows=row_count,
            columns=column_count,
            duplicate_rows=int(df.duplicated().sum()),
            rows_with_nulls=rows_with_nulls,
            pct_rows_with_nulls=pct_rows_with_nulls,
        )
    )

overview_df = pd.DataFrame(overview_records)
overview_df


Unnamed: 0,file,rows,columns,duplicate_rows,rows_with_nulls,pct_rows_with_nulls
0,wind_2016.parquet,3456,15,0,3456,100.0
1,wind_2017.parquet,3456,15,0,3456,100.0
2,wind_2018.parquet,3456,15,0,3456,100.0
3,wind_2019.parquet,3456,15,0,3430,99.25
4,wind_2020.parquet,3456,15,0,3161,91.46


## Station coverage by file
Summarise non-null coverage per station/metric and flag any unmapped station IDs.


In [14]:
def parse_station_metric(column_name: str):
    """Split a staged wind column name into its station key and metric.

    Only names of the form ``station_<pk>_wind_<metric>`` (exactly four
    underscore-separated parts) are recognised.

    Args:
        column_name: Column label from a staged wind DataFrame.

    Returns:
        ``(station_pk, metric)`` where ``station_pk`` is an ``int`` and
        ``metric`` is ``"wind_<metric>"``; ``(None, None)`` when the name does
        not follow the pattern or the station part is not an integer.
    """
    parts = column_name.split('_')
    if len(parts) != 4 or parts[0] != 'station' or parts[2] != 'wind':
        return None, None
    try:
        station_pk = int(parts[1])
    except ValueError:
        # e.g. 'station_x_wind_speed': report "not a station column" instead of
        # aborting the caller's loop over every column.
        return None, None
    return station_pk, f"wind_{parts[3]}"

# Per-file, per-station, per-metric non-null coverage, plus marker rows for
# stations that appear in a file but are absent from the station mapping.
station_summary_records = []
known_station_pks = set(station_mapping_df['station_pk'])

for file_path in parquet_files:
    df = pd.read_parquet(file_path)
    total_rows = len(df)
    present_station_pks = set()
    for column in df.columns:
        if column == 'datetime':
            continue
        station_pk, metric = parse_station_metric(column)
        if station_pk is None:
            # Not a station_<pk>_wind_<metric> column; nothing to summarise.
            continue
        present_station_pks.add(station_pk)
        non_null = int(df[column].notna().sum())
        station_summary_records.append({
            'file': file_path.name,
            'station_pk': station_pk,
            'metric': metric,
            'non_null_points': non_null,
            'total_rows': total_rows,
            'pct_non_null': round(non_null / total_rows * 100, 2) if total_rows else 0.0,
        })

    # Emit one marker row per unmapped station and keep station_pk an integer.
    # A comma-joined string here would turn the merge/sort key into a mixed
    # int/str object column and could never match a mapping row anyway.
    missing_in_mapping = sorted(present_station_pks - known_station_pks)
    for unmapped_pk in missing_in_mapping:
        station_summary_records.append({
            'file': file_path.name,
            'station_pk': unmapped_pk,
            'metric': 'unmapped_station',
            'non_null_points': None,
            'total_rows': total_rows,
            'pct_non_null': None,
        })

station_summary_df = pd.DataFrame(station_summary_records)
# Left join keeps unmapped stations; their station_code/station_name stay missing.
station_summary_df = station_summary_df.merge(
    station_mapping_df[['station_pk', 'station_code', 'station_name']],
    on='station_pk', how='left'
)
station_summary_df.sort_values(['file', 'station_pk', 'metric']).reset_index(drop=True)


Unnamed: 0,file,station_pk,metric,non_null_points,total_rows,pct_non_null,station_code,station_name
0,wind_2016.parquet,1,wind_direction,564,3456,16.32,ATLAN001,Atlantis AQM Site
1,wind_2016.parquet,1,wind_speed,287,3456,8.30,ATLAN001,Atlantis AQM Site
2,wind_2016.parquet,2,wind_direction,1526,3456,44.16,BELLV001,Bellville South AQM Site
3,wind_2016.parquet,2,wind_speed,1526,3456,44.16,BELLV001,Bellville South AQM Site
4,wind_2016.parquet,3,wind_direction,3451,3456,99.86,BOTHA001,Bothasig AQM Site
...,...,...,...,...,...,...,...,...
65,wind_2020.parquet,5,wind_speed,3015,3456,87.24,KHAYE001,Khayelitsha AQM Site
66,wind_2020.parquet,6,wind_direction,391,3456,11.31,SOMER001,Somerset West AQM Site
67,wind_2020.parquet,6,wind_speed,391,3456,11.31,SOMER001,Somerset West AQM Site
68,wind_2020.parquet,7,wind_direction,3356,3456,97.11,TABLE001,Tableview AQM Site


## Sample file drill-down
Inspect a specific staged file, reshape to long format, and merge with station metadata. Adjust `sample_path` to analyse other years.


In [15]:
# Drill into the last file of the sorted list; pick a different element of
# parquet_files to inspect another staged file.
sample_path = parquet_files[-1]
sample_path


PosixPath('../data/staged/wind_2020.parquet')

In [16]:
# Load the selected staged file and preview its first rows.
sample_df = pd.read_parquet(sample_path)
sample_df.head()


Unnamed: 0,datetime,station_1_wind_direction,station_1_wind_speed,station_2_wind_direction,station_2_wind_speed,station_3_wind_direction,station_3_wind_speed,station_4_wind_direction,station_4_wind_speed,station_5_wind_direction,station_5_wind_speed,station_6_wind_direction,station_6_wind_speed,station_7_wind_direction,station_7_wind_speed
0,2020-01-01 00:00:00,173.0,4.1,191.0,2.5,163.7,5.3,247.8,19.2,34.2,1.3,135.0,3.8,179.8,5.2
1,2020-01-01 01:00:00,177.7,4.0,209.7,1.6,159.0,5.4,247.0,17.9,34.9,1.1,132.7,2.1,177.9,5.2
2,2020-01-01 02:00:00,180.7,2.8,202.5,1.4,148.8,5.5,246.4,17.1,35.5,1.1,128.5,2.4,167.8,4.0
3,2020-01-01 03:00:00,183.7,2.3,224.7,1.2,153.0,4.7,245.1,15.7,35.5,1.0,357.6,1.1,177.3,4.4
4,2020-01-01 04:00:00,170.7,2.4,244.3,1.3,153.4,4.1,249.9,15.8,35.1,0.8,319.5,1.4,178.7,3.8


In [17]:
# Reshape the wide sample (one column per station/metric) into long format:
# one row per datetime, station and metric, with the reading in `value`.
long_frames = []
for column in sample_df.columns:
    if column != 'datetime':
        station_pk, metric = parse_station_metric(column)
        if station_pk is not None:
            long_frames.append(
                sample_df[['datetime', column]]
                .rename(columns={column: 'value'})
                .assign(station_pk=station_pk, metric=metric)
            )

# Stack the per-column frames, then attach station metadata; the left join
# keeps readings for stations that have no mapping entry.
long_sample_df = pd.concat(long_frames, ignore_index=True).merge(
    station_mapping_df[['station_pk', 'station_code', 'station_name']],
    how='left',
    on='station_pk',
)
long_sample_df.head()


Unnamed: 0,datetime,value,station_pk,metric,station_code,station_name
0,2020-01-01 00:00:00,173.0,1,wind_direction,ATLAN001,Atlantis AQM Site
1,2020-01-01 01:00:00,177.7,1,wind_direction,ATLAN001,Atlantis AQM Site
2,2020-01-01 02:00:00,180.7,1,wind_direction,ATLAN001,Atlantis AQM Site
3,2020-01-01 03:00:00,183.7,1,wind_direction,ATLAN001,Atlantis AQM Site
4,2020-01-01 04:00:00,170.7,1,wind_direction,ATLAN001,Atlantis AQM Site


In [18]:
# Coverage per station and metric for the sample file.
# 'size' counts every row in the group; 'count' counts only non-null values.
station_metric_summary = long_sample_df.groupby(
    ['station_pk', 'station_code', 'station_name', 'metric']
).agg(
    total_points=pd.NamedAgg(column='value', aggfunc='size'),
    non_null=pd.NamedAgg(column='value', aggfunc='count'),
)
station_metric_summary['pct_non_null'] = (
    station_metric_summary['non_null'] / station_metric_summary['total_points'] * 100
).round(2)
station_metric_summary = station_metric_summary.reset_index()
station_metric_summary


Unnamed: 0,station_pk,station_code,station_name,metric,total_points,non_null,pct_non_null
0,1,ATLAN001,Atlantis AQM Site,wind_direction,3456,1014,29.34
1,1,ATLAN001,Atlantis AQM Site,wind_speed,3456,1014,29.34
2,2,BELLV001,Bellville South AQM Site,wind_direction,3456,2929,84.75
3,2,BELLV001,Bellville South AQM Site,wind_speed,3456,3363,97.31
4,3,BOTHA001,Bothasig AQM Site,wind_direction,3456,3254,94.16
5,3,BOTHA001,Bothasig AQM Site,wind_speed,3456,3254,94.16
6,4,GOODW001,Goodwood AQM Station,wind_direction,3456,3339,96.61
7,4,GOODW001,Goodwood AQM Station,wind_speed,3456,3160,91.44
8,5,KHAYE001,Khayelitsha AQM Site,wind_direction,3456,3015,87.24
9,5,KHAYE001,Khayelitsha AQM Site,wind_speed,3456,3015,87.24


In [19]:
# Per-column dtype and share of missing values, most incomplete columns first.
column_summary = (
    sample_df.dtypes.astype(str)
    .to_frame(name='dtype')
    .assign(pct_missing=sample_df.isna().mean().mul(100).round(2))
    .sort_values(by='pct_missing', ascending=False)
)
column_summary


Unnamed: 0,dtype,pct_missing
station_6_wind_speed,float64,88.69
station_6_wind_direction,float64,88.69
station_1_wind_direction,float64,70.66
station_1_wind_speed,float64,70.66
station_2_wind_direction,float64,15.25
station_5_wind_direction,float64,12.76
station_5_wind_speed,float64,12.76
station_4_wind_speed,float64,8.56
station_3_wind_speed,float64,5.84
station_3_wind_direction,float64,5.84


In [20]:
# Descriptive statistics for the numeric columns of the sample file.
numeric_cols = sample_df.select_dtypes(include='number')
if numeric_cols.shape[1]:
    numeric_summary = numeric_cols.describe().T
else:
    numeric_summary = pd.DataFrame({'message': ['No numeric columns detected']})

# Jupyter only auto-displays a bare expression that is the cell's final
# top-level statement; expressions nested inside if/else are discarded, so the
# result is bound to a name and echoed here.
numeric_summary
