### Correlation Analysis of Feature Vector

Generates a heatmap of the Spearman correlation matrix of the feature vector, averaged over all stations.


In [1]:
import pandas as pd
import os
import numpy as np
import plotly.express as px

In [10]:
# Directory whose files are each read as one station's data further below.
base_path = "../../data/pollution/processed/interpolated/pm10/"
# os.listdir returns entries in arbitrary order; sort them so that a given
# `station_index` refers to the same file on every run and on every machine.
files = sorted(os.listdir(base_path))

In [11]:
# Periods (in seconds) used for the cyclic time encodings.
day = 24 * 60 * 60
year = (365.2425) * day


def _build_feature_frame(path: str) -> pd.DataFrame:
    """Load one station file and derive the feature vector used for correlation.

    The frame is indexed by ``timestamp``. Four cyclic time features
    (``day_sin``, ``day_cos``, ``year_sin``, ``year_cos``) are appended, and
    ``winddirection_10m`` (treated as degrees) is replaced by its sine and
    cosine so that the angle has no artificial jump at the wrap-around.

    Args:
        path: Path of a feather file containing a ``timestamp`` column and a
            ``winddirection_10m`` column.

    Returns:
        The feature frame indexed by ``timestamp``.
    """
    data = pd.read_feather(path).set_index("timestamp")

    # Seconds since the epoch for every row, as returned by Timestamp.timestamp().
    timestamp_s = data.index.map(pd.Timestamp.timestamp)

    data["day_sin"] = np.sin(timestamp_s * (2 * np.pi / day))
    data["day_cos"] = np.cos(timestamp_s * (2 * np.pi / day))
    data["year_sin"] = np.sin(timestamp_s * (2 * np.pi / year))
    data["year_cos"] = np.cos(timestamp_s * (2 * np.pi / year))

    # Degrees -> radians; plain arrays are used so the assignment is positional.
    wd_rad = data.pop("winddirection_10m").to_numpy() * np.pi / 180
    data["winddirection_10m_sin"] = np.sin(wd_rad)
    data["winddirection_10m_cos"] = np.cos(wd_rad)

    # The index is set exactly once. Resetting it a second time (as the earlier
    # version did) turns the default RangeIndex into a column named "index",
    # which then shows up as a bogus feature in the correlation matrix.
    return data


# One Spearman correlation matrix per station file, in the order of `files`.
corrs: list[pd.DataFrame] = [
    _build_feature_frame(base_path + file).corr("spearman") for file in files
]


In [19]:
# Axis labels are taken from the first station's matrix; the averaging below is
# purely positional.
columns, rows = corrs[0].columns, corrs[0].index

# One 2-D array per station, averaged element-wise across stations.
numpy_data = [station_corr.to_numpy() for station_corr in corrs]
data = np.array(numpy_data).mean(axis=0)

df = pd.DataFrame(data, index=rows, columns=columns)
fig = px.imshow(
    df.round(2),
    text_auto=True,
    color_continuous_scale="RdBu",
    zmin=-1,
    zmax=1,
)
fig.update_layout(
    {"height": 650, "width": 750},
    title_text="Mean correlation of all stations",
    title_x=0.5,
)


#### Display the correlation of one station

Index can be chosen with `station_index`


In [24]:
# Position of the station to display, used for both `corrs` and `files`.
station_index = 10
# Bind the new heatmap to `fig`: without the assignment the figure is discarded
# and the layout/title below is applied to the previously created figure.
fig = px.imshow(corrs[station_index].round(2), text_auto=True, color_continuous_scale="RdBu", zmin=-1, zmax=1)
fig.update_layout({"height": 650, "width": 750}, title_text=f"Correlation of station {files[station_index].split('.')[0]}", title_x=0.5)

#### Display the data from one station

Index can be chosen with `station_index`


In [25]:
# Position in `files` of the station whose raw data is shown.
station_index = 10
station_path = f"{base_path}{files[station_index]}"
pd.read_feather(station_path)

Unnamed: 0,timestamp,temperature_2m,relativehumidity_2m,winddirection_10m,windspeed_10m,precipitation,pm10
0,2019-01-01 01:00:00,2.3,88,219,18.2,0.0,53.753
1,2019-01-01 02:00:00,2.6,87,220,19.7,0.0,31.742
2,2019-01-01 03:00:00,3.2,87,224,20.9,0.0,29.132
3,2019-01-01 04:00:00,3.8,91,228,21.2,0.0,27.218
4,2019-01-01 05:00:00,4.4,95,233,21.6,0.0,24.695
...,...,...,...,...,...,...,...
34488,2022-12-30 20:00:00,4.0,93,195,15.3,0.0,45.000
34489,2022-12-30 21:00:00,4.0,89,188,16.3,0.0,43.200
34490,2022-12-30 22:00:00,4.2,89,181,18.7,0.0,21.500
34491,2022-12-30 23:00:00,4.5,88,180,20.5,0.0,23.500
