### Correlation analysis of different stations and timeframes


In [1]:
import sys

sys.path.append("../../")

import plotly.express as px
from src.weatherdata.weather_data import (  # pylint: disable=import-error
    HourlyData, ModelBasedOptions, WeatherData)
from datetime import datetime
import pandas as pd


#### Visualize all stations on a map


In [2]:
# Preview the station metadata table shipped with the pollution data set.
station_meta_path = "../../data/pollution/station_meta_data.json"
pd.read_json(station_meta_path)


Unnamed: 0,station_id,station_code,station_code_international,station_name,station_code_old,start_date,start_end,district_type,region_type,station_type,county,city,address,latitude,longitude
0,1,DsBialka,,Białka,,1990-01-03,2005-12-31,industrial,suburban,stationary container,DOLNOŚLĄSKIE,Białka,,51.197783,16.117390
1,2,DsBielGrot,,Bielawa - ul. Grota Roweckiego,,1994-01-02,2003-12-31,ground,city,in building,DOLNOŚLĄSKIE,Bielawa,ul. Grota Roweckiego 6,50.682510,16.617348
2,3,DsBogatFrancMOB,PL0602A,Bogatynia Mobil,DsBogatMob,2015-01-01,2015-12-31,ground,city,mobile,DOLNOŚLĄSKIE,Bogatynia,ul. Francuska/Kręta,50.940998,14.916790
3,4,DsBogChop,PL0315A,Bogatynia - Chopina,,1996-01-01,2013-12-31,industrial,city,stationary container,DOLNOŚLĄSKIE,Bogatynia,ul. Chopina 35,50.905856,14.967175
4,5,DsBogZatonieMob,PL0576A,Bogatynia - Mobil,,2012-01-01,2012-12-31,industrial,city,mobile,DOLNOŚLĄSKIE,Bogatynia,"ul. Konrada, Zatonie",50.943245,14.913327
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1073,1074,ZpSzczPilsud,PL0249A,"Szczecin, ul. Piłsudskiego","ZpSzczecin002, ZpSzczPils02",38352,,komunikacyjna,city,stationary container,ZACHODNIOPOMORSKIE,Szczecin,ul. Piłsudskiego 1,53.432169,14.553900
1074,1075,ZpSzczWSSEEnerg,,Energetyków,,33604,37986,komunikacyjna,city,in building,ZACHODNIOPOMORSKIE,Szczecin,ul. Energetyków 2,53.420475,14.561934
1075,1076,ZpSzczWSSESped6,,Spedytorska,,33604,37987,ground,city,stationary container,ZACHODNIOPOMORSKIE,Szczecin,ul. Spedytorska 6,53.415043,14.555347
1076,1077,ZpWalWalczWSSE,,Wałcz,,33604,38353,ground,city,in building,ZACHODNIOPOMORSKIE,Wałcz,ul. Bydgoska 86,53.263667,16.492596


In [3]:
# Plot every station listed in the 2022 metadata file on an OpenStreetMap base layer.
df = pd.read_json("../../data/pollution/raw/2022/meta_data.json")

# The latitude column was referenced as "latitide", which looks like a typo (the station
# metadata table uses "latitude"). Prefer the correctly spelled column and only fall back to
# the old spelling if the file really uses it.
# NOTE(review): confirm the actual column name in raw/2022/meta_data.json.
lat_column = "latitude" if "latitude" in df.columns else "latitide"

fig = px.scatter_mapbox(df, lat=lat_column, lon="longitude", zoom=5, height=600, hover_name="station_id")
fig.update_layout(mapbox_style="open-street-map", margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()


#### Correlation Analysis for station 16


In [4]:
# Coordinates and time window used for the station 16 analysis (January 2022).
latitude, longitude = 50.732817, 16.648050
start_date, end_date = datetime(2022, 1, 1), datetime(2022, 1, 31)

# Hourly weather variables requested from the model-based source.
hourly_variables = [
    HourlyData.Temperature_2m,
    HourlyData.RelativeHumidity_2m,
    HourlyData.WindDirection_10m,
    HourlyData.WindSpeed_10m,
    HourlyData.Precipitation_rain_showers_snow,
]
options = ModelBasedOptions(hourly=hourly_variables)

# Model-based weather for the coordinates above.
model_result = WeatherData.getModelBasedData(latitude, longitude, start_date, end_date, options)
meta_data_model, daily_model, hourly_model = model_result

# Station-based weather (hourly only); station "12150" is explicitly skipped.
station_result = WeatherData.getStationData(
    latitude,
    longitude,
    start_date,
    end_date,
    require_daily=False,
    require_hourly=True,
    skip_stations=["12150"],
)
meta_data_station, daily_station, hourly_station = station_result


##### Location of station 16


In [5]:
# Single-marker map showing the coordinates selected for this analysis.
station_location = pd.DataFrame({"lat": [latitude], "lng": [longitude]})
fig = px.scatter_mapbox(station_location, lat="lat", lon="lng", zoom=5, height=600)
fig.update_layout(mapbox_style="open-street-map", margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()


In [6]:
# PM10 measurements of station 16 for 2022, with the date column parsed to datetimes.
pollution_raw_df = pd.read_csv("../../data/pollution/raw/2022/pm10/2022_16.csv")
pollution_raw_df["date"] = pd.to_datetime(pollution_raw_df["date"], format="%Y-%m-%d %H:%M:%S")

# January: every reading from 2022-01-01 00:00 through 2022-01-31 23:59 (inclusive).
january_start = pd.Timestamp(year=2022, month=1, day=1)
january_end = pd.Timestamp(year=2022, month=1, day=31, hour=23, minute=59)
in_january = (pollution_raw_df["date"] >= january_start) & (pollution_raw_df["date"] <= january_end)
# Explicit copy so the filler rows added below never touch `pollution_raw_df`.
januar = pollution_raw_df[in_january].copy()

# Mean PM10 per bucket of 24 consecutive row labels, labelled with the 31 days of January.
# NOTE(review): buckets follow row labels, not calendar days, so gaps in the data shift
# the bucket boundaries — confirm this is acceptable.
result_januar = januar.groupby(januar.index // 24).agg({"value": "mean"}).round(5)
result_januar["date"] = pd.date_range("2022-01-01", periods=31)
result_januar = result_januar.set_index("date")

# Add filler rows for the two missing hours. Fractional labels place them between rows 194
# and 195 once the index is sorted. Timestamps (not strings) are inserted so the date column
# is not mixed with plain strings.
# NOTE(review): missing PM10 readings are filled with 0.0, which can bias the correlations;
# consider NaN or interpolation.
januar.loc[194.5] = [pd.Timestamp("2022-01-09 03:00:00"), 0.00]
januar.loc[194.7] = [pd.Timestamp("2022-01-09 04:00:00"), 0.00]

januar = januar.sort_index().reset_index(drop=True)


In [7]:
# January PM10 readings plus the hourly model weather variables as individual series.
pm10 = januar["value"]

temp, humidity, winddirection, windspeed, precipitation = (
    hourly_model[column]
    for column in (
        "temperature_2m",
        "relativehumidity_2m",
        "winddirection_10m",
        "windspeed_10m",
        "precipitation",
    )
)

# Print the length of every series so a size mismatch is visible immediately.
print(*(series.size for series in (pm10, temp, humidity, winddirection, windspeed, precipitation)))


744 744 744 744 744 744


In [8]:
# Combine all series position by position (the underlying arrays are used, so the original
# indexes are ignored) into one frame for the correlation matrices.
correlation_df = pd.DataFrame(
    {
        column: series.to_numpy()
        for column, series in (
            ("temp", temp),
            ("humidity", humidity),
            ("winddirection", winddirection),
            ("windspeed", windspeed),
            ("precipitation", precipitation),
            ("pm10", pm10),
        )
    }
)


##### Korrelation über einen Monat (01.2022)


In [9]:
# Correlation matrix over the whole month (pandas default method), rendered as a heatmap.
month_corr = correlation_df.corr().round(2)
px.imshow(month_corr, text_auto=True, color_continuous_scale="RdBu")


##### Korrelation über einen Tag (01.01.2022)


In [10]:
# Rows labelled 0..23 (label slicing is end-inclusive): the first 24 hourly rows.
correlation_day_df = correlation_df.loc[0:23]
day_corr = correlation_day_df.corr().round(2)
px.imshow(day_corr, text_auto=True, color_continuous_scale="RdBu")


##### Korrelation über 14 Tage


In [11]:
# Correlation over the first 14 days. With one row per hour that is 14 * 24 = 336 rows;
# the previous slice 0..167 covered only 168 rows (7 days). `.loc` slicing is end-inclusive,
# hence the "- 1". The variable name is kept for compatibility with later cells.
hours_in_window = 14 * 24
correlation_month_df = correlation_df.loc[0 : hours_in_window - 1]
px.imshow(correlation_month_df.corr().round(2), text_auto=True, color_continuous_scale="RdBu")


##### Korrelation in einem Zeitraum ohne Regen


In [12]:
# Rows 225..301 (end-inclusive) are the window used for the "period without rain" heatmap.
correlation_rain_df = correlation_df.loc[225:301]
dry_period_corr = correlation_rain_df.corr().round(2)
px.imshow(dry_period_corr, text_auto=True, color_continuous_scale="RdBu")


##### Correlation for March


In [13]:
# PM10 measurements of station 16 for 2022, with the date column parsed to datetimes.
pollution_raw_df = pd.read_csv("../../data/pollution/raw/2022/pm10/2022_16.csv")
pollution_raw_df["date"] = pd.to_datetime(pollution_raw_df["date"], format="%Y-%m-%d %H:%M:%S")

# March: every reading from 2022-03-01 00:00 through 2022-03-31 23:59 (inclusive).
march_start = pd.Timestamp(year=2022, month=3, day=1)
march_end = pd.Timestamp(year=2022, month=3, day=31, hour=23, minute=59)
in_march = (pollution_raw_df["date"] >= march_start) & (pollution_raw_df["date"] <= march_end)
# reset_index returns a new frame, so no in-place mutation of a filtered slice is needed
# and `pollution_raw_df` stays untouched by the filler rows added below.
march = pollution_raw_df[in_march].reset_index(drop=True)

# Add filler rows for the missing hours. Fractional labels place them between their
# neighbouring rows once the index is sorted. Timestamps (not strings) are inserted so the
# date column is not mixed with plain strings.
# NOTE(review): missing PM10 readings are filled with 0.0, which can bias the correlations;
# consider NaN or interpolation.
march.loc[625.5] = [pd.Timestamp("2022-03-27 02:00:00"), 0.00]
march.loc[707.5] = [pd.Timestamp("2022-03-30 12:00:00"), 0.00]

march = march.sort_index().reset_index(drop=True)


In [14]:
# Time window for the March 2022 analysis; the coordinates set earlier are reused.
start_date, end_date = datetime(2022, 3, 1), datetime(2022, 3, 31)

# Same hourly weather variables as requested for January.
march_hourly_variables = [
    HourlyData.Temperature_2m,
    HourlyData.RelativeHumidity_2m,
    HourlyData.WindDirection_10m,
    HourlyData.WindSpeed_10m,
    HourlyData.Precipitation_rain_showers_snow,
]
options = ModelBasedOptions(hourly=march_hourly_variables)

# Only the hourly frame (third element of the result) is needed here.
_, _, hourly_model_march = WeatherData.getModelBasedData(latitude, longitude, start_date, end_date, options)


In [15]:
# March PM10 readings without the final row.
# NOTE(review): presumably the last row is dropped to match the length of the weather
# series — confirm that the remaining rows line up hour by hour.
pm10 = march["value"].iloc[:-1]

temp, humidity, winddirection, windspeed, precipitation = (
    hourly_model_march[column]
    for column in (
        "temperature_2m",
        "relativehumidity_2m",
        "winddirection_10m",
        "windspeed_10m",
        "precipitation",
    )
)

# Print the length of every series so a size mismatch is visible immediately.
print(*(series.size for series in (pm10, temp, humidity, winddirection, windspeed, precipitation)))


744 744 744 744 744 744


In [16]:
# Combine the March series position by position (underlying arrays, original indexes ignored).
correlation_march_df = pd.DataFrame(
    {
        column: series.to_numpy()
        for column, series in (
            ("temp", temp),
            ("humidity", humidity),
            ("winddirection", winddirection),
            ("windspeed", windspeed),
            ("precipitation", precipitation),
            ("pm10", pm10),
        )
    }
)


In [17]:
# Restrict the March frame to the rows labelled 384..720 (end-inclusive) and plot their correlation.
correlation_march_df = correlation_march_df.loc[384:720]
march_window_corr = correlation_march_df.corr().round(2)
px.imshow(march_window_corr, text_auto=True, color_continuous_scale="RdBu")


#### Correlation Analysis for station 814


In [18]:
# Coordinates and time window used for the station 814 analysis (January 2022).
latitude, longitude = 50.264611, 18.975028
start_date, end_date = datetime(2022, 1, 1), datetime(2022, 1, 31)

# Hourly weather variables requested from the model-based source.
options = ModelBasedOptions(
    hourly=[
        HourlyData.Temperature_2m,
        HourlyData.RelativeHumidity_2m,
        HourlyData.WindDirection_10m,
        HourlyData.WindSpeed_10m,
        HourlyData.Precipitation_rain_showers_snow,
    ]
)

# Shared positional arguments of both weather requests.
request_args = (latitude, longitude, start_date, end_date)

# Model-based weather for the coordinates above.
meta_data_model, daily_model, hourly_model = WeatherData.getModelBasedData(*request_args, options)

# Station-based weather (hourly only); station "12150" is explicitly skipped.
meta_data_station, daily_station, hourly_station = WeatherData.getStationData(
    *request_args, require_daily=False, require_hourly=True, skip_stations=["12150"]
)

In [19]:
# PM10 measurements of station 814 for 2022, with the date column parsed to datetimes.
df = pd.read_csv("../../data/pollution/raw/2022/pm10/2022_814.csv")
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d %H:%M:%S")

# January: every reading from 2022-01-01 00:00 through 2022-01-31 23:59 (inclusive).
january_start = pd.Timestamp(year=2022, month=1, day=1)
january_end = pd.Timestamp(year=2022, month=1, day=31, hour=23, minute=59)
januar = df[(df["date"] >= january_start) & (df["date"] <= january_end)]

# Mean PM10 per bucket of 24 consecutive row labels, labelled with the 31 days of January.
index = pd.date_range("2022-01-01", periods=31)
result_januar = januar.groupby(januar.index // 24).agg({"value": "mean"}).round(5)
result_januar["date"] = index
result_januar = result_januar.set_index("date")

# Work on an independent copy, then add two zero-valued filler rows; the fractional labels
# place them between rows 194 and 195 once the index is sorted.
# NOTE(review): these filler rows use the same labels and timestamps as the ones added for
# station 16 — confirm that station 814 is really missing these two hours.
januar = januar.copy(deep=True)
januar.loc[194.5] = ["2022-01-09 03:00:00", 0.00]
januar.loc[194.7] = ["2022-01-09 04:00:00", 0.00]

januar = januar.sort_index().reset_index(drop=True)

In [20]:
# Index the PM10 readings by timestamp under a separate name. The previous in-place
# `set_index` consumed the "date" column of `januar`, so re-running this cell raised a
# KeyError; leaving `januar` untouched makes the cell re-runnable.
januar_by_date = januar.set_index("date")
# NOTE(review): the last two rows are dropped, presumably to match the length of the
# weather series — confirm these are the rows that should be discarded.
pm10 = januar_by_date["value"][:-2]

temp = hourly_model["temperature_2m"]
humidity = hourly_model["relativehumidity_2m"]
winddirection = hourly_model["winddirection_10m"]
windspeed = hourly_model["windspeed_10m"]
precipitation = hourly_model["precipitation"]

# Print the length of every series so a size mismatch is visible immediately.
print(pm10.size, temp.size, humidity.size, winddirection.size, windspeed.size, precipitation.size)

# Attach the PM10 values to the weather frame by index (left join on the index of
# `hourly_model`), then re-derive all series from the joined frame so they share one index.
data = hourly_model.join(pm10)

pm10 = data["value"]
temp = data["temperature_2m"]
humidity = data["relativehumidity_2m"]
winddirection = data["winddirection_10m"]
windspeed = data["windspeed_10m"]
precipitation = data["precipitation"]

744 744 744 744 744 744


In [21]:
# Combine all station 814 series position by position into one frame for the correlation matrices.
correlation_df = pd.DataFrame(
    {
        column: series.to_numpy()
        for column, series in (
            ("temp", temp),
            ("humidity", humidity),
            ("winddirection", winddirection),
            ("windspeed", windspeed),
            ("precipitation", precipitation),
            ("pm10", pm10),
        )
    }
)


##### Location of station 814


In [22]:
# Single-marker map showing the coordinates selected for this analysis.
station_location = pd.DataFrame({"lat": [latitude], "lng": [longitude]})
fig = px.scatter_mapbox(station_location, lat="lat", lon="lng", zoom=5, height=600)
fig.update_layout(mapbox_style="open-street-map", margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()

##### Korrelation über einen Monat (01.2022)


In [23]:
# Pearson correlation over the whole month; the colour range is pinned to [-1, 1].
month_pearson = correlation_df.corr().round(2)
fig = px.imshow(month_pearson, text_auto=True, color_continuous_scale="RdBu", zmin=-1, zmax=1)

# Fixed figure size and a centred title; as the last expression this also displays the figure.
fig.update_layout({"height": 480, "width": 600}, title_text="Correlation, station 814, Jan 22, pearson", title_x=0.5)


In [24]:
# Spearman correlation over the whole month. The heatmap must be bound to `fig`; otherwise
# the layout call below only retitles the figure created by an earlier cell and this matrix
# is never displayed. The colour range is pinned to [-1, 1] like the Pearson heatmap.
fig = px.imshow(
    correlation_df.corr("spearman").round(2),
    text_auto=True,
    color_continuous_scale="RdBu",
    zmin=-1,
    zmax=1,
)
fig.update_layout({"height": 480, "width": 600}, title_text="Correlation, station 814, Jan 22, spearman", title_x=0.5)


##### Korrelation über einen Tag (01.01.2022)


In [25]:
# Rows labelled 0..23 (label slicing is end-inclusive): the first 24 hourly rows.
correlation_day_df = correlation_df.loc[0:23]

# Bind the heatmap to `fig` so the layout call below styles this figure and not a stale one
# from an earlier cell. The colour range is pinned to [-1, 1] for comparability.
fig = px.imshow(
    correlation_day_df.corr().round(2),
    text_auto=True,
    color_continuous_scale="RdBu",
    zmin=-1,
    zmax=1,
)
fig.update_layout({"height": 480, "width": 600}, title_text="Correlation, station 814, 01 Jan 22, pearson", title_x=0.5)


In [26]:
# Spearman correlation for the single-day window. Bind the heatmap to `fig` so the layout
# call below styles this figure and not a stale one from an earlier cell.
fig = px.imshow(
    correlation_day_df.corr("spearman").round(2),
    text_auto=True,
    color_continuous_scale="RdBu",
    zmin=-1,
    zmax=1,
)
fig.update_layout({"height": 480, "width": 600}, title_text="Correlation, station 814, 01 Jan 22, spearman", title_x=0.5)


##### Korrelation über 14 Tage


In [27]:
# Correlation over 01-14 January. With one row per hour that is 14 * 24 = 336 rows; the
# previous slice 0..167 covered only 168 rows (7 days). `.loc` slicing is end-inclusive,
# hence the "- 1". The variable name is kept for compatibility with later cells.
hours_in_window = 14 * 24
correlation_month_df = correlation_df.loc[0 : hours_in_window - 1]

# Bind the heatmap to `fig` so the layout call below styles this figure and not a stale one
# from an earlier cell. The colour range is pinned to [-1, 1] for comparability.
fig = px.imshow(
    correlation_month_df.corr().round(2),
    text_auto=True,
    color_continuous_scale="RdBu",
    zmin=-1,
    zmax=1,
)
fig.update_layout({"height": 480, "width": 600}, title_text="Correlation, station 814, 01-14 Jan 22, pearson", title_x=0.5)


In [28]:
# Spearman correlation for the multi-day window. Bind the heatmap to `fig` so the layout
# call below styles this figure and not a stale one from an earlier cell.
fig = px.imshow(
    correlation_month_df.corr("spearman").round(2),
    text_auto=True,
    color_continuous_scale="RdBu",
    zmin=-1,
    zmax=1,
)
fig.update_layout({"height": 480, "width": 600}, title_text="Correlation, station 814, 01-14 Jan 22, spearman", title_x=0.5)


##### Korrelation in einem Zeitraum ohne Regen


In [29]:
# Rows 225..301 (end-inclusive) are the window used for the "period without rain" heatmap.
correlation_rain_df = correlation_df.loc[225:301]

# Bind the heatmap to `fig` so the layout call below styles this figure and not a stale one
# from an earlier cell. `.corr()` without arguments computes Pearson, so the title now says
# "pearson" (it previously said "spearman").
fig = px.imshow(
    correlation_rain_df.corr().round(2),
    text_auto=True,
    color_continuous_scale="RdBu",
    zmin=-1,
    zmax=1,
)
fig.update_layout({"height": 480, "width": 600}, title_text="Correlation, station 814, timeframe without rain, pearson", title_x=0.5)


In [30]:
# Spearman correlation for the "period without rain" window. Bind the heatmap to `fig` so
# the layout call below styles this figure and not a stale one from an earlier cell.
fig = px.imshow(
    correlation_rain_df.corr("spearman").round(2),
    text_auto=True,
    color_continuous_scale="RdBu",
    zmin=-1,
    zmax=1,
)
fig.update_layout({"height": 480, "width": 600}, title_text="Correlation, station 814, timeframe without rain, spearman", title_x=0.5)


#### Analyse correlation for multiple stations within one city


In [31]:
def _load_march_pm10(station_id):
    """Load one station's 2022 PM10 CSV and return only its March rows.

    Args:
        station_id: Station identifier as used in the file name ``2022_<station_id>.csv``.

    Returns:
        DataFrame with the file's columns, the ``date`` column parsed to datetimes,
        restricted to 2022-03-01 00:00 through 2022-03-31 23:59 (inclusive) and
        re-indexed from 0.
    """
    station_df = pd.read_csv(f"../../data/pollution/raw/2022/pm10/2022_{station_id}.csv")
    # Parse the station's OWN date column. Previously every station received the dates of
    # `pollution_raw_df` (station 16), so rows were labelled and filtered with another
    # station's timestamps.
    station_df["date"] = pd.to_datetime(station_df["date"], format="%Y-%m-%d %H:%M:%S")

    march_start = pd.Timestamp(year=2022, month=3, day=1)
    march_end = pd.Timestamp(year=2022, month=3, day=31, hour=23, minute=59)
    in_march = (station_df["date"] >= march_start) & (station_df["date"] <= march_end)
    return station_df[in_march].reset_index(drop=True)


# NOTE(review): each station is now filtered by its own timestamps, so row counts can differ
# between stations if their data gaps differ — verify before comparing them position by position.
pollution_837_df = _load_march_pm10(837)
pollution_813_df = _load_march_pm10(813)
pollution_814_df = _load_march_pm10(814)
pollution_805_df = _load_march_pm10(805)


In [32]:
# PM10 series of the four stations (kept under their original names).
pm10_837 = pollution_837_df["value"]
pm10_813 = pollution_813_df["value"]
pm10_814 = pollution_814_df["value"]
pm10_805 = pollution_805_df["value"]

# Print the number of readings per station so differing gaps are visible immediately.
print(pm10_837.size, pm10_813.size, pm10_814.size, pm10_805.size)

# Align the stations on their timestamps instead of row position. Positional alignment
# silently compares different hours (or fails on unequal lengths) as soon as the stations
# have different gaps; with a shared date index, missing hours simply become NaN, which
# `DataFrame.corr` excludes pairwise.
# NOTE(review): assumes each station's "date" values are unique — confirm.
correlation_station_df = pd.concat(
    {
        "805": pollution_805_df.set_index("date")["value"],
        "813": pollution_813_df.set_index("date")["value"],
        "814": pollution_814_df.set_index("date")["value"],
        "837": pollution_837_df.set_index("date")["value"],
    },
    axis=1,
)


743 743 743 743


In [33]:
# Coordinates of the four compared stations as (label, lat, lng) records.
station_coordinates = [
    (837, 50.285956, 19.184399),
    (813, 50.246795, 19.019469),
    (814, 50.264611, 18.975028),
    (805, 50.329111, 19.231222),
]
coord_df = pd.DataFrame(
    {
        "lat": [lat for _label, lat, _lng in station_coordinates],
        "lng": [lng for _label, _lat, lng in station_coordinates],
        "label": [label for label, _lat, _lng in station_coordinates],
    }
)

# One marker per station; hovering shows the station label.
fig = px.scatter_mapbox(coord_df, lat="lat", lon="lng", hover_name="label", zoom=9, height=600)
fig.update_layout(mapbox_style="open-street-map", margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()


In [34]:
# Correlation of the PM10 series between the stations; the colour range is pinned to [-1, 1].
station_corr = correlation_station_df.corr().round(2)
px.imshow(station_corr, text_auto=True, color_continuous_scale="RdBu", zmin=-1, zmax=1)


#### Correlation analysis for station 813 with pm10 and pm2.5


In [36]:
def _load_station_813_series(pollutant):
    """Read the 2022 CSV of station 813 for one pollutant.

    Args:
        pollutant: Sub-directory name of the pollutant ("pm10" or "pm25"); it also becomes
            the name of the value column.

    Returns:
        DataFrame indexed by the parsed ``date`` column, with ``value`` renamed to ``pollutant``.
    """
    frame = pd.read_csv(f"../../data/pollution/raw/2022/{pollutant}/2022_813.csv")
    frame["date"] = pd.to_datetime(frame["date"], format="%Y-%m-%d %H:%M:%S")
    return frame.set_index("date").rename(columns={"value": pollutant})


df_year = _load_station_813_series("pm10")
df_year_pm25 = _load_station_813_series("pm25")

# Complete hourly grid for 2022; joining onto it makes missing hours explicit as NaN rows.
data_range = pd.date_range(datetime(2022, 1, 1), datetime(2022, 12, 31, 23, 59, 59), freq="1H")
df_base = pd.DataFrame(data_range, columns=["date"]).set_index("date")
df_base = df_base.join(df_year).join(df_year_pm25)

# Time-weighted interpolation of interior gaps, filling at most 10 consecutive missing values.
df_base = df_base.interpolate(method="time", limit=10, limit_direction="both", limit_area="inside")

# Period lengths in seconds. They are only needed by the cyclical day/year sine-cosine
# features, which are currently disabled, so nothing below uses them.
day = 24 * 60 * 60
year = (365.2425) * day

# Hourly model weather for the station coordinates, covering the full range of `df_base`.
weather_variables = [
    HourlyData.Temperature_2m,
    HourlyData.RelativeHumidity_2m,
    HourlyData.WindDirection_10m,
    HourlyData.WindSpeed_10m,
    HourlyData.Precipitation_rain_showers_snow,
]
options = ModelBasedOptions(hourly=weather_variables)
_, _, weather = WeatherData.getModelBasedData(50.246795, 19.019469, df_base.index[0], df_base.index[-1], options)

# Attach the weather columns and drop the height suffixes from their names.
shorter_names = {
    "temperature_2m": "temperature",
    "relativehumidity_2m": "relativehumidity",
    "winddirection_10m": "winddirection",
    "windspeed_10m": "windspeed",
}
df_base = df_base.join(weather).rename(columns=shorter_names)


##### Correlation for 2022


In [37]:
# Pearson correlation over the whole of 2022; the colour range is pinned to [-1, 1].
pearson_2022 = df_base.corr().round(2)
fig = px.imshow(
    pearson_2022,
    title="Correlation analysis 2022, station 813 (Katowice), pearson",
    text_auto=True,
    color_continuous_scale="RdBu",
    zmin=-1,
    zmax=1,
    width=750,
    height=600,
)
# Centre the title; as the last expression this also displays the figure.
fig.update_layout(title_x=0.5)

In [38]:
# Spearman correlation over the whole of 2022; the colour range is pinned to [-1, 1].
spearman_2022 = df_base.corr("spearman").round(2)
fig = px.imshow(
    spearman_2022,
    title="Correlation analysis 2022, station 813 (Katowice), spearman",
    text_auto=True,
    color_continuous_scale="RdBu",
    zmin=-1,
    zmax=1,
    width=750,
    height=600,
)
# Centre the title; as the last expression this also displays the figure.
fig.update_layout(title_x=0.5)

##### Correlation for pm10 values > threshold


In [39]:
# PM10 level from which a row counts as "extreme"; rows at or above it are kept.
threshold = 50

is_extreme = df_base["pm10"] >= threshold
df_extreme = df_base.loc[is_extreme]

# Pearson correlation restricted to the extreme rows; the colour range is pinned to [-1, 1].
extreme_pearson = df_extreme.corr().round(2)
fig = px.imshow(
    extreme_pearson,
    title=f"Correlation analysis 2022, station 813 (Katowice); PM10 values >{threshold}ug/m^3",
    text_auto=True,
    color_continuous_scale="RdBu",
    zmin=-1,
    zmax=1,
    width=750,
    height=600,
)
# Centre the title; as the last expression this also displays the figure.
fig.update_layout(title_x=0.5)

In [40]:
# Spearman correlation restricted to the extreme rows in `df_extreme`. The title is built
# from `threshold` (it previously hard-coded ">120ug/m^3" although the filter uses
# `threshold`) and names the method so this heatmap can be told apart from the Pearson one.
fig = px.imshow(
    df_extreme.corr("spearman").round(2),
    text_auto=True,
    color_continuous_scale="RdBu",
    zmin=-1,
    zmax=1,
    title=f"Correlation analysis 2022, station 813 (Katowice); PM10 values >{threshold}ug/m^3, spearman",
    height=600,
    width=750,
)
# Centre the title; as the last expression this also displays the figure.
fig.update_layout(title_x=0.5)

In [41]:
# Number of rows above the threshold and number of columns, as a (rows, columns) tuple.
(len(df_extreme.index), len(df_extreme.columns))

(1051, 7)