In [3]:
from src.data.UWOtools import *  # kept first so the explicit imports below win on any name clash

from pathlib import Path

import pandas as pd
from bokeh.io import output_file, show
from bokeh.layouts import column
from bokeh.models import ColumnDataSource, HoverTool, Legend, LegendItem
from bokeh.palettes import Category20
from bokeh.plotting import figure, output_notebook

output_notebook() # enables bokeh in Jupyter Notebook
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Every measurement site that records the "flow_rate" variable
sites = GetSitesWithVariable("flow_rate")

# Plain Python list of the site names
locations = sites["name"].tolist()

print(sites)

                    name  site_id
0          23_bahnhofstr       34
1    rubpw80sbw_industry       32
2   rub128basin_usterstr       19
3           rubbasin_ara       21
4   3r_rub_morg_overflow       74
5             inflow_ara       15
6         166_luppmenweg       40
7         22a_bahnhofstr       41
8         555_mesikerstr       42
9        11e_russikerstr        2
10        47a_zurcherstr       57


In [5]:
# Fetch all flow-rate readings through the purpose-built query helper.
# Judging by how the next cell uses it, the result is a long-format frame with
# 'timestamp', 'source_name' and 'value' columns — TODO confirm against UWOtools.
df = GetFlowRateTimeSeries()

Starting


In [6]:
# Reshape from long to wide: parse the timestamps, then spread the readings so
# that each row is a timestamp and each column is one sensor ('source_name').
df = (
    df
    .assign(timestamp=pd.to_datetime(df['timestamp']))
    .pivot_table(index='timestamp', columns='source_name', values='value')
)

In [7]:
# Eyeball the first rows to check that the pivot produced what we want.
# Only the last expression of a cell is rendered automatically, so the preview
# has to be displayed explicitly; otherwise its result is silently discarded.
display(df.head(20))

# Number of missing values per sensor column
df.isna().sum()

source_name
bf_f02_555_mesikerstr                   1264733
bf_f03_11e_russikerstr                  1266247
bf_f07_23_bahnhofstr                    1273396
bf_f08_166_luppmenweg                   1288164
bf_f10_22a_bahnhofstr                   1432077
bf_f12_47a_zurcherstr                   1310557
bf_plsRKBA1101_rubbasin_ara_overflow       1086
bf_plsRKBM1101_3r_rub_morg_overflow        1086
bf_plsRKBU1101_rub128basin_usterstr        1086
bf_plsRKBU1102_rub128basin_overflow        1086
bf_plsRKPI1102_rubpw80sbw_overflow         1086
bf_plsZUL1100_inflow_ara                   1086
dtype: int64

Let's have a look at the sampling rate of the different sensors:

In [8]:
def get_sampling_rate(column, frame=None):
    """Return the most frequent interval between consecutive readings of a sensor.

    Parameters
    ----------
    column : hashable
        Name of the sensor column to inspect.
    frame : pandas.DataFrame, optional
        Time-indexed frame that holds ``column``. When omitted, the
        notebook-level ``df`` is used.

    Returns
    -------
    pandas.Timedelta
        The most common gap between timestamps at which the column has a
        non-NaN value, or ``pd.NaT`` when the column has fewer than two
        readings (no gap can be computed).
    """
    data = df if frame is None else frame
    # Keep only the timestamps at which this sensor actually reported a value
    timestamps = data[column].dropna().index
    # Gaps between consecutive readings. Going through a Series keeps this
    # working on pandas versions whose Index has no .diff() method. The first
    # entry has no predecessor and is dropped.
    differences = timestamps.to_series().diff().dropna()
    if differences.empty:
        # Zero or one reading: idxmax() on an empty result would raise
        return pd.NaT
    # Most common gap
    return differences.value_counts().idxmax()

# Most common sampling interval for every sensor column
sampling_rates = {name: get_sampling_rate(name) for name in df.columns}

# Report one "<sensor>: <interval>" line per column
for column, rate in sampling_rates.items():
    print(f"{column}: {rate}")

bf_f02_555_mesikerstr: 0 days 00:05:00
bf_f03_11e_russikerstr: 0 days 00:05:00
bf_f07_23_bahnhofstr: 0 days 00:05:00
bf_f08_166_luppmenweg: 0 days 00:05:00
bf_f10_22a_bahnhofstr: 0 days 00:05:00
bf_f12_47a_zurcherstr: 0 days 00:05:00
bf_plsRKBA1101_rubbasin_ara_overflow: 0 days 00:01:00
bf_plsRKBM1101_3r_rub_morg_overflow: 0 days 00:01:00
bf_plsRKBU1101_rub128basin_usterstr: 0 days 00:01:00
bf_plsRKBU1102_rub128basin_overflow: 0 days 00:01:00
bf_plsRKPI1102_rubpw80sbw_overflow: 0 days 00:01:00
bf_plsZUL1100_inflow_ara: 0 days 00:01:00


# Resample the time series to a common sampling interval of 2.5 min: use `interpolate("linear")` for upsampling and `.mean()` for downsampling

In [9]:
# Sensors sampled every 5 min (see the sampling-rate check): coarser than the
# 2.5-min target grid, so they are upsampled.
upsample_cols = [
    'bf_f02_555_mesikerstr',
    'bf_f03_11e_russikerstr',
    'bf_f07_23_bahnhofstr',
    'bf_f08_166_luppmenweg',
    'bf_f10_22a_bahnhofstr',
    'bf_f12_47a_zurcherstr',
]

# Sensors sampled every 1 min: finer than the target grid, so they are downsampled.
downsample_cols = [
    'bf_plsRKBA1101_rubbasin_ara_overflow',
    'bf_plsRKBM1101_3r_rub_morg_overflow',
    'bf_plsRKBU1101_rub128basin_usterstr',
    'bf_plsRKBU1102_rub128basin_overflow',
    'bf_plsRKPI1102_rubpw80sbw_overflow',
    'bf_plsZUL1100_inflow_ara',
]

# Upsample with linear interpolation. `limit` caps how many consecutive NaNs
# are filled; on the 2.5-min grid, 20 slots span 20 * 2.5 min = 50 min.
# NOTE(review): this value was previously described as "4 measurement cycles,
# ~20 min", which does not match the 2.5-min grid. Confirm the intended gap.
df_upsampled = (
    df[upsample_cols]
    .resample('2.5min')
    .interpolate(method='linear', limit=20)
)

# Downsample by averaging all readings that fall into each 2.5-min bin
df_downsampled = df[downsample_cols].resample('2.5min').mean()

# Put both groups side by side on the shared 2.5-min index
df_resampled = pd.concat([df_upsampled, df_downsampled], axis=1)

In [10]:
# NaN count per column after resampling. It depends a lot on the `limit`
# parameter of interpolate(): how much generated data makes sense?
# Displayed explicitly because only the last expression of a cell is rendered.
display(df_resampled.isna().sum())

df_resampled

source_name,bf_f02_555_mesikerstr,bf_f03_11e_russikerstr,bf_f07_23_bahnhofstr,bf_f08_166_luppmenweg,bf_f10_22a_bahnhofstr,bf_f12_47a_zurcherstr,bf_plsRKBA1101_rubbasin_ara_overflow,bf_plsRKBM1101_3r_rub_morg_overflow,bf_plsRKBU1101_rub128basin_usterstr,bf_plsRKBU1102_rub128basin_overflow,bf_plsRKPI1102_rubpw80sbw_overflow,bf_plsZUL1100_inflow_ara
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019-01-01 00:00:00,4.813899,15.042012,15.9600,,19.9040,,0.0,0.0,15.700000,0.0,0.0,37.369167
2019-01-01 00:02:30,4.647061,14.994626,16.0710,,20.2855,,0.0,0.0,15.611400,0.0,0.0,36.431400
2019-01-01 00:05:00,4.480223,14.947239,16.1820,15.6850,20.6670,,0.0,0.0,15.396033,0.0,0.0,36.842633
2019-01-01 00:07:30,4.591586,14.620884,16.0905,15.6660,20.7725,,0.0,0.0,16.112200,0.0,0.0,38.571350
2019-01-01 00:10:00,4.702949,14.294528,15.9990,15.6470,20.8780,,0.0,0.0,16.246167,0.0,0.0,39.908133
...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-31 23:47:30,9.723166,,3.2390,30.8080,,4.404,0.0,0.0,27.921200,0.0,0.0,55.268750
2021-12-31 23:50:00,9.936902,,3.2470,31.0030,,4.404,0.0,0.0,28.272133,0.0,0.0,53.711100
2021-12-31 23:52:30,10.034887,,3.4180,30.9465,,4.421,0.0,0.0,28.500300,0.0,0.0,52.756600
2021-12-31 23:55:00,10.132872,,3.5890,30.8900,,4.438,0.0,0.0,28.921300,0.0,0.0,53.585033


In [11]:
# Persist the resampled frame so it can be reloaded without repeating the
# query and the resampling. The target directory is created first, because
# the write fails on a fresh checkout where ./temp does not exist yet.
feather_path = Path('./temp/resampled_to_2_30')
feather_path.parent.mkdir(parents=True, exist_ok=True)
df_resampled.to_feather(feather_path)

# Plot the data with bokeh

In [None]:
# Turn the 'timestamp' index into a regular column so it can be addressed by
# name when plotting. Guarded so that re-running this cell does not reset the
# index a second time, which would add a stray 'index' column.
if 'timestamp' not in df_resampled.columns:
    df_resampled = df_resampled.reset_index()
df_resampled.dtypes


In [None]:
# One shared data source for all line glyphs
source = ColumnDataSource(df_resampled)

# Figure with a datetime x-axis; panning and wheel zoom act on the x direction only
p = figure(
    width=1200,
    height=500,
    x_axis_type="datetime",
    title="Flow rate Over Time",
    tools="xpan,xwheel_zoom,reset,save",
    background_fill_color="#efefef",
)
p.xaxis.axis_label = "Timestamp"
p.yaxis.axis_label = "Sensor Value"

# Hover tooltip: '$name' is the hovered renderer's name and '@$name' the value
# of the data-source column carrying that same name.
hover = HoverTool(tooltips=[('Sensor', '$name'), ('Value', '@$name')])
p.add_tools(hover)

# 12-color palette; the color index is the column's position in the frame
# (wrapped at 12), so more than 12 columns would start reusing colors.
colors = Category20[12]

# One line per sensor column, each paired with its own legend entry
legend_items = []
for position, sensor in enumerate(df_resampled.columns):
    if sensor != 'timestamp':  # the timestamp column is the x-axis, not a series
        renderer = p.line(
            x='timestamp',
            y=sensor,
            source=source,
            line_width=2,
            color=colors[position % 12],
            name=sensor,
        )
        legend_items.append(LegendItem(label=sensor, renderers=[renderer]))

# Legend outside the plot area; clicking an entry hides that line
legend = Legend(items=legend_items, click_policy="hide")
p.add_layout(legend, 'right')

# Render inline in the notebook.
# (Alternative kept for reference: output_file("flow_rate.html") writes a standalone HTML file.)
output_notebook()
show(p)


Loading this plot takes ages, and it is hard to read anything detailed from it, so on its own it is not very useful. Its one benefit is that it shows all the data at once, which makes it easy to identify outliers and anomalies. In particular, the Zürcherstrasse and Bahnhofstrasse 23 sensors seem to produce weird data.