In [2]:
import plotly.express as px
from plotly.subplots import make_subplots
from weather_data import WeatherData, ModelBasedOptions, HourlyData
from datetime import datetime
import pandas as pd
import numpy as np

# import seaborn as sns


In [3]:
# Site coordinates in decimal degrees; reused by every weather query and by the map cell.
latitude, longitude = 50.732817, 16.648050

# Query window: 1 January 2022 to 31 January 2022.
start_date, end_date = datetime(2022, 1, 1), datetime(2022, 1, 31)

# Hourly variables requested from the model-based source.
options = ModelBasedOptions(
    hourly=[
        HourlyData.Temperature_2m,
        HourlyData.RelativeHumidity_2m,
        HourlyData.WindDirection_10m,
        HourlyData.WindSpeed_10m,
        HourlyData.Precipitation_rain_showers_snow,
    ]
)

# Each query result is unpacked into a (meta data, daily, hourly) triple.
(meta_data_model, daily_model, hourly_model) = WeatherData.getModelBasedData(
    latitude,
    longitude,
    start_date,
    end_date,
    options,
)

# Station-based query for the same place and window: hourly data only.
# NOTE(review): station "12150" is skipped; the reason is not visible in this notebook — confirm.
(meta_data_station, daily_station, hourly_station) = WeatherData.getStationData(
    latitude,
    longitude,
    start_date,
    end_date,
    skip_stations=["12150"],
    require_hourly=True,
    require_daily=False,
)


In [4]:
# Draw the analysed site as a single marker on an OpenStreetMap base layer.
fig = px.scatter_mapbox(
    pd.DataFrame(dict(lat=[latitude], lng=[longitude])),
    lat="lat",
    lon="lng",
    zoom=5,
    height=600,
)
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},  # let the map fill the whole figure
)
fig.show()


In [5]:
# Pollution readings used as PM10 in the cells below; parse the timestamp column once.
pollution_raw_df = pd.read_csv("../../data/pollution/raw/2022/2022_16_101.csv")
pollution_raw_df["date"] = pd.to_datetime(pollution_raw_df["date"], format="%Y-%m-%d %H:%M:%S")

# January 2022, both bounds inclusive. `.copy()` detaches the selection from the raw
# frame so the rows inserted further down never write into a slice of it.
januar = pollution_raw_df[
    pollution_raw_df["date"].between(
        pd.Timestamp(year=2022, month=1, day=1),
        pd.Timestamp(year=2022, month=1, day=31, hour=23, minute=59),
    )
].copy()

# Daily means, bucketed by the calendar day of each timestamp. Grouping by
# `index // 24` (row position) silently shifts every day after a missing hour
# and only works when the selection happens to start at row label 0.
result_januar = januar.resample("D", on="date")[["value"]].mean().round(5)

# Add the two hours that are missing in January. The fractional labels sort between the
# existing integer labels 194 and 195, so `sort_index` puts the rows in chronological
# position. Real Timestamps (not strings) keep the "date" column free of mixed types.
# NOTE(review): the placeholder value 0.00 enters the correlations as a real reading;
# NaN would be excluded by `DataFrame.corr` instead — confirm which is intended.
januar.loc[194.5] = [pd.Timestamp("2022-01-09 03:00:00"), 0.00]
januar.loc[194.7] = [pd.Timestamp("2022-01-09 04:00:00"), 0.00]

januar = januar.sort_index().reset_index(drop=True)


In [6]:
# Pollution values for January (after the gap rows were inserted).
pm10 = januar["value"]

# Pull the weather columns out of the hourly model frame in one pass.
temp, humidity, winddirection, windspeed, precipitation = (
    hourly_model[column]
    for column in (
        "temperature_2m",
        "relativehumidity_2m",
        "winddirection_10m",
        "windspeed_10m",
        "precipitation",
    )
)

# The series are combined by position in the next cell, so their sizes must be identical.
print(*(series.size for series in (pm10, temp, humidity, winddirection, windspeed, precipitation)))


744 744 744 744 744 744


In [7]:
# One column per variable. `.to_numpy()` drops each series' own index, so the columns
# are lined up purely by row position.
correlation_df = pd.DataFrame(
    {
        column: series.to_numpy()
        for column, series in (
            ("temp", temp),
            ("humidity", humidity),
            ("winddirection", winddirection),
            ("windspeed", windspeed),
            ("precipitation", precipitation),
            ("pm10", pm10),
        )
    }
)


In [8]:
# sns.heatmap(correlation_df.corr(), vmin=-1, vmax=1, annot=True, cmap="rocket_r")


### Korrelation über einen Monat (01.2022)


In [9]:
# Correlation matrix over the whole month. The colour range is pinned to [-1, 1] so the
# diverging "RdBu" scale is centred on zero and comparable with the other heatmaps.
px.imshow(correlation_df.corr().round(2), text_auto=True, color_continuous_scale="RdBu", zmin=-1, zmax=1)


### Korrelation über einen Tag (01.01.2022)


In [10]:
# Row labels 0..23 (`.loc` slices include the end label): the first 24 rows.
correlation_day_df = correlation_df.loc[0:23]
# Colour range pinned to [-1, 1] so the diverging "RdBu" scale is centred on zero and
# comparable with the other heatmaps.
px.imshow(correlation_day_df.corr().round(2), text_auto=True, color_continuous_scale="RdBu", zmin=-1, zmax=1)


### Korrelation über 14 Tage


In [11]:
# 14 days of hourly rows are labels 0 .. 14 * 24 - 1 (`.loc` slices include the end label).
# The previous bound of 167 selected only 7 days, contradicting the "14 Tage" heading.
# NOTE(review): if 7 days was the intended window, restore 167 and fix the heading instead.
correlation_month_df = correlation_df.loc[0 : 14 * 24 - 1]
# Colour range pinned to [-1, 1] so the diverging "RdBu" scale is centred on zero and
# comparable with the other heatmaps.
px.imshow(correlation_month_df.corr().round(2), text_auto=True, color_continuous_scale="RdBu", zmin=-1, zmax=1)


### Korrelation in einem Zeitraum ohne Regen


In [12]:
# Row labels 225..301 inclusive.
# NOTE(review): presumably the hand-picked rain-free stretch named in the heading;
# the bounds are not derived from the precipitation column — confirm.
correlation_rain_df = correlation_df.loc[225:301]
# Colour range pinned to [-1, 1] so the diverging "RdBu" scale is centred on zero and
# comparable with the other heatmaps.
px.imshow(correlation_rain_df.corr().round(2), text_auto=True, color_continuous_scale="RdBu", zmin=-1, zmax=1)


In [13]:
# Re-read the pollution file so this cell does not depend on the January cell's state.
pollution_raw_df = pd.read_csv("../../data/pollution/raw/2022/2022_16_101.csv")
pollution_raw_df["date"] = pd.to_datetime(pollution_raw_df["date"], format="%Y-%m-%d %H:%M:%S")

# March 2022, both bounds inclusive. `reset_index(drop=True)` returns a new frame numbered
# from 0, which avoids resetting a slice of the raw frame in place and makes a separate
# deep copy unnecessary.
march = pollution_raw_df[
    pollution_raw_df["date"].between(
        pd.Timestamp(year=2022, month=3, day=1),
        pd.Timestamp(year=2022, month=3, day=31, hour=23, minute=59),
    )
].reset_index(drop=True)

# Add missing values to March. The fractional labels sort between the neighbouring integer
# labels, so `sort_index` drops the rows into place. Real Timestamps (not strings) keep the
# "date" column free of mixed types.
# NOTE(review): the placeholder value 0.00 enters the correlations as a real reading;
# NaN would be excluded by `DataFrame.corr` instead — confirm which is intended.
march.loc[625.5] = [pd.Timestamp("2022-03-27 02:00:00"), 0.00]
march.loc[707.5] = [pd.Timestamp("2022-03-30 12:00:00"), 0.00]

march = march.sort_index().reset_index(drop=True)


In [14]:
# Query window for March 2022. This rebinds `start_date`, `end_date` and `options`,
# which held the January settings until now.
start_date, end_date = datetime(2022, 3, 1), datetime(2022, 3, 31)

# Hourly variables requested from the model-based source.
options = ModelBasedOptions(
    hourly=[
        HourlyData.Temperature_2m,
        HourlyData.RelativeHumidity_2m,
        HourlyData.WindDirection_10m,
        HourlyData.WindSpeed_10m,
        HourlyData.Precipitation_rain_showers_snow,
    ]
)

# Only the third element of the result (the hourly frame) is kept.
_, _, hourly_model_march = WeatherData.getModelBasedData(
    latitude,
    longitude,
    start_date,
    end_date,
    options,
)


In [15]:
# Pollution values for March without the final row.
# NOTE(review): the last row is presumably dropped to match the weather series' length — confirm.
pm10 = march["value"].iloc[:-1]

# Pull the weather columns out of the hourly March model frame in one pass.
temp, humidity, winddirection, windspeed, precipitation = (
    hourly_model_march[column]
    for column in (
        "temperature_2m",
        "relativehumidity_2m",
        "winddirection_10m",
        "windspeed_10m",
        "precipitation",
    )
)

# The series are combined by position in the next cell, so their sizes must be identical.
print(*(series.size for series in (pm10, temp, humidity, winddirection, windspeed, precipitation)))


744 744 744 744 744 744


In [16]:
# One column per variable. `.to_numpy()` drops each series' own index, so the columns
# are lined up purely by row position.
correlation_march_df = pd.DataFrame(
    {
        column: series.to_numpy()
        for column, series in (
            ("temp", temp),
            ("humidity", humidity),
            ("winddirection", winddirection),
            ("windspeed", windspeed),
            ("precipitation", precipitation),
            ("pm10", pm10),
        )
    }
)


In [17]:
# Keep row labels 384..720 inclusive. The frame is rebound to its own subset, but the
# original labels are kept, so re-running this cell selects the same rows again.
# NOTE(review): the bounds are hand-picked; what they represent is not stated — confirm.
correlation_march_df = correlation_march_df.loc[384:720]
# Colour range pinned to [-1, 1] so the diverging "RdBu" scale is centred on zero and
# comparable with the other heatmaps.
px.imshow(correlation_march_df.corr().round(2), text_auto=True, color_continuous_scale="RdBu", zmin=-1, zmax=1)


In [18]:
def _load_march_pollution(csv_path):
    """Read one station CSV and return its rows from March 2022 (bounds inclusive).

    The timestamps are parsed from the file's OWN "date" column. Previously every
    station frame received `pollution_raw_df["date"]`, i.e. the timestamps of a
    different file matched by row position, so the March filter did not reflect
    the station's real measurement times.
    NOTE(review): assumes the station files carry a "date" column in the same
    format as 2022_16_101.csv — confirm.
    """
    station_df = pd.read_csv(csv_path)
    station_df["date"] = pd.to_datetime(station_df["date"], format="%Y-%m-%d %H:%M:%S")
    in_march = station_df["date"].between(
        pd.Timestamp(year=2022, month=3, day=1),
        pd.Timestamp(year=2022, month=3, day=31, hour=23, minute=59),
    )
    return station_df[in_march]


def _restrict_to_shared_hours(frames):
    """Return the frames reduced to the timestamps that every one of them contains.

    The stations are later combined by row position, so each returned frame is
    de-duplicated on "date", sorted by "date" and re-indexed from 0; row i then
    refers to the same timestamp in every frame.
    """
    shared = pd.Index(frames[0]["date"])
    for frame in frames[1:]:
        shared = shared.intersection(pd.Index(frame["date"]))
    return [
        frame[frame["date"].isin(shared)].drop_duplicates("date").sort_values("date").reset_index(drop=True)
        for frame in frames
    ]


(
    pollution_837_df,
    pollution_813_df,
    pollution_814_df,
    pollution_805_df,
) = _restrict_to_shared_hours(
    [
        _load_march_pollution("../../data/pollution/raw/2022/2022_837_5480.csv"),
        _load_march_pollution("../../data/pollution/raw/2022/2022_813_5349.csv"),
        _load_march_pollution("../../data/pollution/raw/2022/2022_814_5376.csv"),
        _load_march_pollution("../../data/pollution/raw/2022/2022_805_5286.csv"),
    ]
)

# # add missing values to March
# march = march.__deepcopy__()
# march.loc[625.5] = ["2022-03-27 02:00:00", 0.00]
# march.loc[707.5] = ["2022-03-30 12:00:00", 0.00]

# march = march.sort_index().reset_index(drop=True)


In [19]:
# Value column of each station frame.
pm10_837, pm10_813, pm10_814, pm10_805 = (
    station_df["value"]
    for station_df in (pollution_837_df, pollution_813_df, pollution_814_df, pollution_805_df)
)

# The series are combined by position below, so their sizes must be identical.
print(*(series.size for series in (pm10_837, pm10_813, pm10_814, pm10_805)))

# One column per station, labelled with the station number.
correlation_station_df = pd.DataFrame(
    {
        label: series.to_numpy()
        for label, series in (
            ("805", pm10_805),
            ("813", pm10_813),
            ("814", pm10_814),
            ("837", pm10_837),
        )
    }
)


743 743 743 743


In [23]:
# One (lat, lng, label) record per station; the label is shown on hover.
coord_df = pd.DataFrame(
    [
        (50.285956, 19.184399, 837),
        (50.246795, 19.019469, 813),
        (50.264611, 18.975028, 814),
        (50.329111, 19.231222, 805),
    ],
    columns=["lat", "lng", "label"],
)

# Plot the stations on an OpenStreetMap base layer.
fig = px.scatter_mapbox(
    coord_df,
    lat="lat",
    lon="lng",
    hover_name="label",
    zoom=9,
    height=600,
)
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},  # let the map fill the whole figure
)
fig.show()


In [20]:
# Pairwise correlation between the stations, on a fixed [-1, 1] colour range.
px.imshow(
    correlation_station_df.corr().round(2),
    zmin=-1,
    zmax=1,
    color_continuous_scale="RdBu",
    text_auto=True,
)


In [39]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Two sensor files of station 814.
pollution_814_2_df = pd.read_csv("../../data/pollution/raw/2022/2022_814_5377.csv")
pollution_814_1_df = pd.read_csv("../../data/pollution/raw/2022/2022_814_5376.csv")

# Parse each file's OWN timestamps. Previously both frames received
# `pollution_raw_df["date"]`, i.e. the timestamps of a different file matched by row position.
# NOTE(review): assumes the sensor files carry a "date" column in this format — confirm.
pollution_814_2_df["date"] = pd.to_datetime(pollution_814_2_df["date"], format="%Y-%m-%d %H:%M:%S")
pollution_814_1_df["date"] = pd.to_datetime(pollution_814_1_df["date"], format="%Y-%m-%d %H:%M:%S")

# Keep 1–14 January 2022 (bounds inclusive) and renumber the rows from 0.
pollution_814_2_df = pollution_814_2_df[
    pollution_814_2_df["date"].between(
        pd.Timestamp(year=2022, month=1, day=1),
        pd.Timestamp(year=2022, month=1, day=14, hour=23, minute=59),
    )
].reset_index(drop=True)

# The original reset the index of sensor 2 a second time here and never reset sensor 1.
pollution_814_1_df = pollution_814_1_df[
    pollution_814_1_df["date"].between(
        pd.Timestamp(year=2022, month=1, day=1),
        pd.Timestamp(year=2022, month=1, day=14, hour=23, minute=59),
    )
].reset_index(drop=True)

pm10_814_1 = pollution_814_1_df["value"]
pm10_814_2 = pollution_814_2_df["value"]

# Pair the readings by timestamp rather than by row position, so a gap in one sensor
# neither shifts the pairing nor makes the two columns differ in length.
_sensor_pairs_df = pollution_814_1_df.merge(pollution_814_2_df, on="date", suffixes=("_1", "_2"))
correlation_station_df = pd.DataFrame(
    {
        "Sensor 1": _sensor_pairs_df["value_1"].to_numpy(),
        "Sensor 2": _sensor_pairs_df["value_2"].to_numpy(),
    }
)

fig_model = make_subplots(column_titles=["Comparison of the two sensors of station 814"])

# x is the row number within the selected window, not the timestamp.
fig_model.add_trace(go.Scatter(x=pollution_814_1_df.index, y=pollution_814_1_df["value"], name="Sensor 1"))
fig_model.add_trace(go.Scatter(x=pollution_814_2_df.index, y=pollution_814_2_df["value"], name="Sensor 2"))


# px.imshow(correlation_station_df.corr().round(2), text_auto=True, color_continuous_scale="RdBu", zmin=-1, zmax=1)


In [None]:
# Two further sensor files of station 814. This cell also relies on `pollution_814_1_df`,
# `go` and `make_subplots` from the previous cell.
pollution_814_3_df = pd.read_csv("../../data/pollution/raw/2022/2022_814_5378.csv")
pollution_814_4_df = pd.read_csv("../../data/pollution/raw/2022/2022_814_5379.csv")

# Parse each file's OWN timestamps. Previously both frames received
# `pollution_raw_df["date"]`, i.e. the timestamps of a different file matched by row position.
# NOTE(review): assumes the sensor files carry a "date" column in this format — confirm.
pollution_814_3_df["date"] = pd.to_datetime(pollution_814_3_df["date"], format="%Y-%m-%d %H:%M:%S")
pollution_814_4_df["date"] = pd.to_datetime(pollution_814_4_df["date"], format="%Y-%m-%d %H:%M:%S")

# Keep 1–14 January 2022 (bounds inclusive) and renumber the rows from 0.
pollution_814_3_df = pollution_814_3_df[
    pollution_814_3_df["date"].between(
        pd.Timestamp(year=2022, month=1, day=1),
        pd.Timestamp(year=2022, month=1, day=14, hour=23, minute=59),
    )
].reset_index(drop=True)

# NOTE(review): sensor 4 is loaded and filtered but not used below.
pollution_814_4_df = pollution_814_4_df[
    pollution_814_4_df["date"].between(
        pd.Timestamp(year=2022, month=1, day=1),
        pd.Timestamp(year=2022, month=1, day=14, hour=23, minute=59),
    )
].reset_index(drop=True)

pm10_814_1 = pollution_814_1_df["value"]
pm10_814_3 = pollution_814_3_df["value"]

# Pair the readings by timestamp rather than by row position, so a gap in one sensor
# neither shifts the pairing nor makes the two columns differ in length.
_pm_pairs_df = pollution_814_1_df.merge(pollution_814_3_df, on="date", suffixes=("_1", "_3"))
correlation_station_df = pd.DataFrame(
    {
        "pm10": _pm_pairs_df["value_1"].to_numpy(),
        "pm25": _pm_pairs_df["value_3"].to_numpy(),
    }
)

fig_model = make_subplots(column_titles=["Comparison of the two sensors of station 814"])

# Plot the two sensors this cell analyses (1 and 3). The original traces were a stale copy
# of the previous cell and drew sensors 1 and 2 again.
# NOTE(review): the trace names follow the "pm10"/"pm25" column labels above — confirm
# that file 5378 really is the PM2.5 sensor.
fig_model.add_trace(go.Scatter(x=pollution_814_1_df.index, y=pollution_814_1_df["value"], name="pm10"))
fig_model.add_trace(go.Scatter(x=pollution_814_3_df.index, y=pollution_814_3_df["value"], name="pm25"))
