In [8]:
%autosave 30
%reload_ext autoreload

Autosaving every 30 seconds


# Imports

In [2]:
import xarray as xr
import pandas as pd

import matplotlib.pyplot as plt

# Data Analysis

In [29]:
# Input observation file and the CSV copy that is written next to it.
nc_path = 'data/KMDS__OPER_P___10M_OBS_L2_202402132020.nc'
csv_path = 'data/KMDS__OPER_P___10M_OBS_L2_202402132020.csv'

# Open the NetCDF file with xarray and convert it to a pandas DataFrame.
ds = xr.open_dataset(nc_path)
df = ds.to_dataframe()

# Quick look at the table: first rows, column labels, and the total number
# of elements (DataFrame.size is rows x columns, not the row count).
for summary in (df.head(), df.columns, df.size):
    print(summary)

# Export df into csv
df.to_csv(csv_path)

                                         wsi stationname      lat     lon  \
station time                                                                
06201   2024-02-13 20:20:00  0-20000-0-06201    D15-FA-1  54.3256  2.9358   
06203   2024-02-13 20:20:00  0-20000-0-06203       P11-B  52.3600  3.3417   
06204   2024-02-13 20:20:00  0-20000-0-06204   K14-FA-1C  53.2694  3.6278   
06205   2024-02-13 20:20:00  0-20000-0-06205     A12-CPP  55.3992  3.8103   
06207   2024-02-13 20:20:00  0-20000-0-06207     L9-FF-1  53.6144  4.9603   

                             height  D1H     dd     dn  dr   dsd  ...  W10  \
station time                                                      ...        
06201   2024-02-13 20:20:00   42.70  NaN  215.8  144.8 NaN  19.2  ...  5.0   
06203   2024-02-13 20:20:00   41.84  NaN  184.7  158.9 NaN   8.6  ...  6.0   
06204   2024-02-13 20:20:00   41.80  NaN  179.8  168.8 NaN   3.9  ...  6.0   
06205   2024-02-13 20:20:00   48.35  NaN  215.1  201.1 NaN   2.8  ... 

In [23]:
# Dataset-level (global) attributes, one "name: value" line each.
print("Global Attributes:")
for attr, value in ds.attrs.items():
    print(f"{attr}: {value}")

# Attributes attached to each individual variable, indented under the
# variable's name.
print("\nVariable Attributes:")
for var, variable in ds.variables.items():
    print(f"{var} Attributes:")
    for attr, value in variable.attrs.items():
        print(f"  {attr}: {value}")

# Encoding information per variable (useful for saving back to NetCDF).
print("\nEncoding:")
for var, variable in ds.variables.items():
    print(f"{var} Encoding: {variable.encoding}")

Global Attributes:
featureType: timeSeries
Conventions: CF-1.4
title: KMDS__OPER_P___10M_OBS_L2
institution: Royal Netherlands Meteorological Institute (KNMI)
source: Royal Netherlands Meteorological Institute (KNMI)
history: File created from KNMI's Meteorological Data Distribution System (KMDS).
references: https://dataplatform.knmi.nl
comment: Sensor set-ups at 'Platform/AWS' sites deviate from land AWS sites, resulting in different sensor elevations. Similarly, sensor set-ups at 'Mistposten' serve specific aviation-purposes and they do not meet all WMO recommendations for surface observing stations on land.

Variable Attributes:
station Attributes:
  long_name: Station id
  cf_role: timeseries_id
time Attributes:
  long_name: time of measurement
  standard_name: time
wsi Attributes:
  long_name: Station wsi
stationname Attributes:
  long_name: Station name
lat Attributes:
  long_name: station  latitude
  standard_name: latitude
  units: degrees_north
lon Attributes:
  long_name: st

In [25]:
# Split the columns into numerical and categorical groups by dtype.
# 'number' selects every numeric dtype (float32, int32, ... as well as
# float64/int64), so variables stored at a narrower width are not silently
# left out of the numerical count.
num_vars = df.select_dtypes(include='number').columns
# Object-dtype columns are treated as the categorical (text-like) variables.
cat_vars = df.select_dtypes(include=['object']).columns

print(f"\nNumber of numerical variables: {len(num_vars)}")
print(f"Number of categorical variables: {len(cat_vars)}")

# For each categorical variable, report how many distinct values it has and
# then list those values.
print("\nUnique values for categorical variables:")
for var in cat_vars:
    print(f"{var}: {df[var].nunique()} unique values")
    print(df[var].unique())


Number of numerical variables: 94
Number of categorical variables: 7

Unique values for categorical variables:
wsi: 69 unique values
['0-20000-0-06201' '0-20000-0-06203' '0-20000-0-06204' '0-20000-0-06205'
 '0-20000-0-06207' '0-20000-0-06208' '0-20000-0-06209' '0-20000-0-06211'
 '0-20000-0-06214' '0-20000-0-06215' '0-528-0-06216' '0-20000-0-06225'
 '0-20000-0-06229' '0-528-0-06233' '0-20000-0-06235' '0-528-0-06236'
 '0-528-0-06237' '0-528-0-06238' '0-20000-0-06239' '0-20000-0-06240'
 '0-20000-0-06242' '0-20000-0-06248' '0-20000-0-06249' '0-20000-0-06251'
 '0-20000-0-06252' '0-20000-0-06257' '0-20000-0-06258' '0-20000-0-06260'
 '0-20000-0-06267' '0-20000-0-06269' '0-20000-0-06270' '0-20000-0-06273'
 '0-20000-0-06275' '0-20000-0-06277' '0-20000-0-06278' '0-20000-0-06279'
 '0-20000-0-06280' '0-20000-0-06283' '0-20000-0-06285' '0-20000-0-06286'
 '0-20000-0-06290' '0-20000-0-06308' '0-20000-0-06310' '0-20000-0-06312'
 '0-20000-0-06313' '0-20000-0-06315' '0-20000-0-06316' '0-528-0-06317'
 '

In [27]:
# Keep only the rows whose station name contains "EINDHOVEN".
# regex=False treats the search term as a literal substring, and na=False
# maps missing station names to "no match" so the mask is strictly boolean
# and safe to use for row selection.
is_eindhoven = df['stationname'].str.contains('EINDHOVEN', regex=False, na=False)
eindhoven_df = df[is_eindhoven]

print(eindhoven_df.head())

                                         wsi   stationname      lat     lon  \
station time                                                                  
06370   2024-02-13 20:20:00  0-20000-0-06370  EINDHOVEN AP  51.4497  5.3769   

                             height  D1H     dd     dn   dr   dsd  ...  W10  \
station time                                                       ...        
06370   2024-02-13 20:20:00   20.69  0.0  179.8  149.1  0.0  10.9  ...  0.0   

                             W10-10    ww  ww-10       zm  iso_dataset  \
station time                                                             
06370   2024-02-13 20:20:00     0.0  22.0   22.0  48900.0          b''   

                             product  projection  nhc   za  
station time                                                
06370   2024-02-13 20:20:00      b''         b''  b''  b''  

[1 rows x 101 columns]
