In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os

---

### **Import the data**

These are the two relevant stations:

In [2]:
# Station ID (the SDO_ID column of the downloaded tables) -> station name
stations = {603: "Bonn", 1303: "Essen"}

Load the .csv files from the `data/cdc_download/data` folder into DataFrames.  
`prd.csv` contains information about the different tables: each table has a `Produkt_Code`.  
Each `Produkt_Code` has a `Produkt_Titel`.  

The `sdo` data was included in the download from [DWD](https://cdc.dwd.de/portal/), but has been removed as it was not useful.

In [3]:
def format_timestamp(ts):
    """
    Convert a timestamp such as 201901010000 (digits laid out as
    YYYYMMDDhhmm) into a datetime object.

    `ts` is converted with str() first, so an integer or a string of
    digits both work.
    """
    digits = str(ts)
    # (start, stop) character positions of year, month, day, hour, minute
    fields = ((0, 4), (4, 6), (6, 8), (8, 10), (10, 12))
    return datetime(*[int(digits[start:stop]) for start, stop in fields])

In [4]:
data = dict()
path = "data/cdc_download/data"
# prd.csv contains names of different "products" (tables)
prd = pd.read_csv(f"{path}/prd.csv")
# Produkt_Code -> Produkt_Titel look-up, built once instead of filtering prd
# for every table. drop_duplicates keeps the first title listed for a code.
titles = prd.drop_duplicates("Produkt_Code").set_index("Produkt_Code")["Produkt_Titel"]

# Load each of the downloaded weather tables into its own DataFrame and clean it.
# os.listdir returns names in arbitrary order; sorting makes the order of
# `data` (and of everything printed from it) the same on every run.
for filename in sorted(os.listdir(path)):
    # Weather tables are named data_<Produkt_Code>.csv. Skip everything else
    # (prd.csv, or any other file that merely contains "data" in its name).
    if not (filename.startswith("data_") and filename.endswith(".csv")):
        continue
    code = filename[len("data_"):-len(".csv")]  # Produkt_Code
    table = pd.read_csv(f"{path}/{filename}")
    # Rename columns: the value column gets the human-readable product title.
    table = table.rename(columns={
        "Wert": titles.loc[code],
        "SDO_ID": "station",
        "Zeitstempel": "timestamp"
    })
    # Replace SDO_ID (station ID) by station name.
    # An ID that is not in `stations` raises a KeyError.
    table["station"] = table["station"].apply(lambda station_id: stations[station_id])
    # Replace timestamp by formatted timestamp:
    table["timestamp"] = table["timestamp"].apply(format_timestamp)
    # Drop Produkt_Code (already the dictionary key) and the quality columns.
    # TODO: decide whether Qualitaet_Niveau / Qualitaet_Byte are needed later.
    table = table.drop(columns=["Produkt_Code", "Qualitaet_Niveau", "Qualitaet_Byte"])
    # Reset the index without keeping the old one as a column.
    data[code] = table.reset_index(drop=True)

In [8]:
# Overview of the available products: code, title and unit (Einheit)
prd.loc[:, ["Produkt_Code", "Produkt_Titel", "Einheit"]]

Unnamed: 0,Produkt_Code,Produkt_Titel,Einheit
0,TT_TU_MN009,Hourly station observations of air temperature...,°C
1,TD_MN008,Hourly station observations of dew point tempe...,°C
2,RF_TU_MN009,Hourly station observations of relative humidi...,%
3,R1_MN008,Hourly station observations of precipitation a...,mm
4,WRTR_MN008,Hourly station observations of form of precipi...,numerical code
5,RS_IND_MN008,Hourly station observations of index whether p...,numerical code
6,FF_MN008,Hourly station observations of wind velocity 1...,m/s
7,DD_MN008,Hourly station observations of wind direction ...,degree
8,F_MN003,Hourly mean of station observations of wind sp...,m/s
9,D_MN003,Hourly mean of station observations of wind di...,degree


In [13]:
# Inspect one cleaned table: TT_TU_MN009 (listed in prd as air temperature)
data["TT_TU_MN009"]

Unnamed: 0,station,timestamp,Hourly station observations of air temperature at 2 m above ground in °C
0,Essen,2019-02-01 00:00:00,-1.5
1,Essen,2019-02-01 01:00:00,-1.3
2,Essen,2019-02-01 02:00:00,-0.9
3,Essen,2019-02-01 03:00:00,-0.5
4,Essen,2019-02-01 04:00:00,-0.4
...,...,...,...
7181,Bonn,2019-06-30 19:00:00,25.5
7182,Bonn,2019-06-30 20:00:00,24.4
7183,Bonn,2019-06-30 21:00:00,22.8
7184,Bonn,2019-06-30 22:00:00,20.3


In [14]:
# Which tables are there? Print the full (untruncated) title of every product.
for title in prd["Produkt_Titel"]:
    print(title)

Hourly station observations of air temperature at 2 m above ground in °C
Hourly station observations of dew point temperature 2 m above ground in °C
Hourly station observations of relative humidity in %
Hourly station observations of precipitation amount in mm
Hourly station observations of form of precipitation (WR code)
Hourly station observations of index whether precipitation has fallen
Hourly station observations of wind velocity 10 m above ground in m/s
Hourly station observations of wind direction 10 m above ground in degree
Hourly mean of station observations of wind speed ca. 10 m above ground in m/s
Hourly mean of station observations of wind direction at ca. 10 m above ground in degree
Hourly station observations of air pressure at station level in hpa
Hourly station observations of air pressure at mean sea level in hpa
Hourly station observations of cloud coverage in eighths
Hourly station observations of soil temperature at the depth of 5 cm in °C
Hourly station observatio

---

### **Missing datapoints**

Not all features have data for all timestamps (especially for Bonn).  
Let's look a bit more closely at this: 

In [40]:
print("Essen")
for product_code, table in data.items():
    # Timestamps this table holds for Essen
    is_essen = table["station"] == "Essen"
    series = table.loc[is_essen, "timestamp"]
    n_found = series.count()
    # !!! means for this feature (in this city), there are datapoints missing!
    annotation = "" if n_found >= 3600 else "!!!"
    print(product_code, "\t", n_found, "/ 3600", annotation)

Essen
TE05_MN002 	 3600 / 3600 
P0_MN008 	 3600 / 3600 
R1_MN008 	 3600 / 3600 
RF_TU_MN009 	 3557 / 3600 !!!
TE10_MN002 	 3600 / 3600 
P_MN008 	 3600 / 3600 
FF_MN008 	 3600 / 3600 
F_MN003 	 3600 / 3600 
TD_MN008 	 3557 / 3600 !!!
RS_IND_MN008 	 3600 / 3600 
N_MN008 	 3594 / 3600 !!!
DD_MN008 	 3600 / 3600 
TT_TU_MN009 	 3600 / 3600 
TE100_MN002 	 3600 / 3600 
TE20_MN002 	 3600 / 3600 
TE50_MN002 	 3600 / 3600 
D_MN003 	 3600 / 3600 
WRTR_MN008 	 2398 / 3600 !!!


In [41]:
print("Bonn")
for product_code, table in data.items():
    # Timestamps this table holds for Bonn
    is_bonn = table["station"] == "Bonn"
    series = table.loc[is_bonn, "timestamp"]
    n_found = series.count()
    # !!! means for this feature (in this city), there are datapoints missing!
    annotation = "" if n_found >= 3600 else "!!!"
    print(product_code, "\t", n_found, "/ 3600", annotation)

Bonn
TE05_MN002 	 3586 / 3600 !!!
P0_MN008 	 3586 / 3600 !!!
R1_MN008 	 3585 / 3600 !!!
RF_TU_MN009 	 3586 / 3600 !!!
TE10_MN002 	 3586 / 3600 !!!
P_MN008 	 3586 / 3600 !!!
FF_MN008 	 3586 / 3600 !!!
F_MN003 	 3586 / 3600 !!!
TD_MN008 	 3586 / 3600 !!!
RS_IND_MN008 	 3585 / 3600 !!!
N_MN008 	 3573 / 3600 !!!
DD_MN008 	 3586 / 3600 !!!
TT_TU_MN009 	 3586 / 3600 !!!
TE100_MN002 	 3586 / 3600 !!!
TE20_MN002 	 3586 / 3600 !!!
TE50_MN002 	 3586 / 3600 !!!
D_MN003 	 3587 / 3600 !!!
WRTR_MN008 	 2390 / 3600 !!!


In [42]:
# Inspect WRTR_MN008 (form of precipitation), the table with the fewest rows per station
data["WRTR_MN008"]

Unnamed: 0,station,timestamp,Hourly station observations of form of precipitation (WR code)
0,Essen,2019-02-01 01:00:00,0
1,Essen,2019-02-01 02:00:00,0
2,Essen,2019-02-01 04:00:00,7
3,Essen,2019-02-01 05:00:00,8
4,Essen,2019-02-01 07:00:00,8
...,...,...,...
4783,Bonn,2019-06-30 17:00:00,0
4784,Bonn,2019-06-30 19:00:00,0
4785,Bonn,2019-06-30 20:00:00,0
4786,Bonn,2019-06-30 22:00:00,0
