In [128]:
import pandas as pd
import numpy as np
from datetime import datetime

In [139]:
# Load the raw KNMI export, clean it up and up-sample it to 15-minute rows.
knmi = pd.read_csv('../data/raw/knmi_raw.csv')

# strip the whitespace padding from the column headers
col_names = {name: name.strip() for name in knmi.columns}
knmi = knmi.rename(columns=col_names)

# turn blank cells (empty or whitespace-only strings) into NaN so they count
# as missing values; no column is removed here
knmi = knmi.replace(r'^\s*$', np.nan, regex=True)
# T10N contained blanks, so it is cast to float explicitly
knmi['T10N'] = knmi['T10N'].astype('float64')

# build the timestamp column from the date (YYYYMMDD) and the hour number (H);
# H - 1 is used as the hour of day (presumably H runs 1-24 - confirm against
# the KNMI file header). Computed vectorised instead of one strptime per row.
timestamps = (
    pd.to_datetime(knmi['YYYYMMDD'].astype(str).str.strip(), format='%Y%m%d')
    + pd.to_timedelta(knmi['H'] - 1, unit='h')
)
knmi.insert(0, "datetime", timestamps)
# drop the station column and the raw date/hour columns now folded into "datetime"
knmi = knmi.drop(columns=['STN', 'YYYYMMDD', 'H'])

# up-sample to 15 minute intervals
knmi = (
    knmi.set_index('datetime')   # resample needs a datetime index
        .resample('15min')
        .ffill(limit=3)          # copy each row into at most the next three new slots
        .reset_index()           # turn the datetime index back into a column
)
# NOTE(review): the resampled index ends at the last hourly timestamp, so the
# final hour is represented by a single row rather than four - confirm this is
# acceptable for downstream use.

knmi.head(20)

Unnamed: 0,datetime,DD,FH,FF,FX,T,T10N,TD,SQ,Q,...,VV,N,U,WW,IX,M,R,S,O,Y
0,2020-08-31 00:00:00,350,40,40,80,148,,92,0,0,...,80,8,69,,5,0,0,0,0,0
1,2020-08-31 00:15:00,350,40,40,80,148,,92,0,0,...,80,8,69,,5,0,0,0,0,0
2,2020-08-31 00:30:00,350,40,40,80,148,,92,0,0,...,80,8,69,,5,0,0,0,0,0
3,2020-08-31 00:45:00,350,40,40,80,148,,92,0,0,...,80,8,69,,5,0,0,0,0,0
4,2020-08-31 01:00:00,350,40,40,70,145,,87,0,0,...,80,8,68,,5,0,0,0,0,0
5,2020-08-31 01:15:00,350,40,40,70,145,,87,0,0,...,80,8,68,,5,0,0,0,0,0
6,2020-08-31 01:30:00,350,40,40,70,145,,87,0,0,...,80,8,68,,5,0,0,0,0,0
7,2020-08-31 01:45:00,350,40,40,70,145,,87,0,0,...,80,8,68,,5,0,0,0,0,0
8,2020-08-31 02:00:00,360,30,30,50,138,,81,0,0,...,80,8,68,,5,0,0,0,0,0
9,2020-08-31 02:15:00,360,30,30,50,138,,81,0,0,...,80,8,68,,5,0,0,0,0,0


## Transform the data like the gemeente (municipality) does

In [140]:
# give the KNMI column codes descriptive names
knmi = knmi.rename(columns={
    "DD": "wind_direction",
    "FH": "wind_speed",
    "FF": "wind_speed_10m",
    "FX": "wind_gust",
    "T": "temperature",
    "T10N": "temperature_min",
    "TD": "dew_point_temperature",
    "SQ": "radiation_duration",
    "Q": "global_radiation",
    "DR": "precipitation_duration",
    "RH": "precipitation_h",
    "P": "pressure",
    "VV": "sight",
    "N": "cloud_cover",
    "U": "relative_humidity",
    "WW": "weather_code",
    "IX": "weather_index",
    "M": "fog",
    "R": "rain",
    "S": "snow",
    "O": "thunder",
    "Y": "ice",
})

# these columns are stored in tenths of their unit (0.1 degrees C etc.),
# so divide them by ten
# NOTE(review): KNMI appears to use -1 in some of these fields (e.g. RH, SQ) to
# mean "less than 0.05"; such values become -0.1 here - confirm that is intended.
col10 = [
    "wind_speed", "wind_speed_10m", "wind_gust",
    "temperature", "temperature_min", "dew_point_temperature",
    "radiation_duration", "precipitation_duration", "precipitation_h",
    "pressure",
]
knmi[col10] = knmi[col10].div(10)

# write the full renamed and scaled frame (the row index is written as well)
knmi.to_csv('../data/knmi_for_us.csv')

# keep only the wanted columns
knmi = knmi[[
    'datetime',
    'global_radiation',
    'pressure',
    'precipitation_h',
    'relative_humidity',
    'temperature',
    'cloud_cover',
    'sight',
    'wind_direction',
    'wind_speed',
]]

knmi.head()

Unnamed: 0,datetime,global_radiation,pressure,precipitation_h,relative_humidity,temperature,cloud_cover,sight,wind_direction,wind_speed
0,2020-08-31 00:00:00,0,1017.4,0.0,69,14.8,8,80,350,4.0
1,2020-08-31 00:15:00,0,1017.4,0.0,69,14.8,8,80,350,4.0
2,2020-08-31 00:30:00,0,1017.4,0.0,69,14.8,8,80,350,4.0
3,2020-08-31 00:45:00,0,1017.4,0.0,69,14.8,8,80,350,4.0
4,2020-08-31 01:00:00,0,1017.7,0.0,68,14.5,8,80,350,4.0


In [134]:
# Write the current `knmi` frame for the baseline model (the row index is
# written as well).
# NOTE(review): this cell's recorded execution count (134) is lower than that of
# the cells above (139, 140), so the file on disk may predate the latest
# transform - re-run the notebook top to bottom before relying on it.
knmi.to_csv(path_or_buf='../data/knmi_for_baseline.csv')