# Exploratory Data Analysis on the UWO dataset
WV.DSIOT

Done by KPP (pern) in 2024!

## Setup notebook

In [2]:
# Enable the IPython autoreload extension so that modules edited outside the
# notebook are re-imported automatically.
%load_ext autoreload
# Mode 2: reload all imported modules before each cell is executed (see the IPython
# autoreload documentation), so edits to project modules take effect without
# restarting the kernel.
%autoreload 2

In [3]:
import os

from bokeh.io import output_notebook, show 
from bokeh.plotting import figure
from bokeh.models import DatetimeTickFormatter
from bokeh.palettes import Category20
output_notebook()   # enables Bokeh in Jupyter notebook

import src.data.UWOtools as uwot

## Global definitions

In [4]:
# Date window applied to every evaluation below; (None, None) means no restriction.
# Windows that turned out to be interesting during exploration:
#   "2021-11-1"  .. "2021-12-31"  -> range check shows something
#   "2020-10-01" .. "2020-10-31"  -> metadata shows something
#   "2020-01-01" .. "2020-12-31"  -> year 2020 (currently active)
_restrict_dates = True  # flip to False to evaluate without any date restriction
StartDate, EndDate = ("2020-01-01", "2020-12-31") if _restrict_dates else (None, None)

# Maximum number of data points used per evaluation (e.g. 10000); None uses all data points
NrDataPoints = 50000

# Current working directory of the kernel process (os.getcwd() reports the working
# directory, which is not necessarily the directory containing this notebook)
current_dir = os.getcwd()

Load the data of one sensor, drop duplicate timestamps with `keep='first'`, `keep='last'` and `keep=False`, and inspect the differences.

In [5]:
# === create figure Flow Rates
# Overlays the raw time series of a sensor with its three de-duplicated variants
# (keep='first', keep='last', keep=False) so the effect of each policy can be compared.
p = figure(
    title="Flow Rates",
    x_axis_type="datetime",
    y_axis_label='Flow rate [l/s]',
    y_range=(0, 150),
    sizing_mode="stretch_width",
    height=350,
    tools="pan,wheel_zoom,box_zoom,reset,zoom_in,zoom_out,yzoom_in,yzoom_out",
)

# format the plot
p.xaxis.formatter = DatetimeTickFormatter(hours="%y%m%d %H", days="%Y %m %d", months="%Y %m", years="%Y")

# Category20 is a dict keyed by palette size (3..20). len(Category20) is therefore 18
# and would silently select the 18-colour palette; max() selects the largest palette.
colors = Category20[max(Category20)]

# check some sensors
for sensor_name in ['bf_f10_22a_bahnhofstr']:  # alternative sensor: 'bf_f07_23_bahnhofstr'

    # get time series; the result is used below as a DataFrame with
    # 'timestamp' and 'value' columns
    ts_all = uwot.GetTimeSeries(source_name=sensor_name, start_date=StartDate, end_date=EndDate, limit=NrDataPoints)
    print(f"{sensor_name:<35}: {ts_all.shape[0]} entries found")

    # drop duplicates, but keep the first occurrence of each timestamp
    ts_keep_first = ts_all.drop_duplicates(subset=['timestamp'], keep='first')
    print(f"{sensor_name:<35}: {ts_keep_first.shape[0]} entries after drop duplicates with keep first")

    # drop duplicates, but keep the last occurrence of each timestamp
    ts_keep_last = ts_all.drop_duplicates(subset=['timestamp'], keep='last')
    print(f"{sensor_name:<35}: {ts_keep_last.shape[0]} entries after drop duplicates with keep last")

    # drop every row whose timestamp occurs more than once (no occurrence is kept)
    ts_no_duplicate = ts_all.drop_duplicates(subset=['timestamp'], keep=False)
    print(f"{sensor_name:<35}: {ts_no_duplicate.shape[0]} entries after drop duplicates w/o any")

    # one line per variant: raw data first, then the three de-duplicated frames
    for frame, line_color, legend_label in (
        (ts_all, 'red', 'all'),
        (ts_keep_first, 'black', 'keep first'),
        (ts_keep_last, 'blue', 'keep last'),
        (ts_no_duplicate, 'green', 'keep none'),
    ):
        p.line(x='timestamp', y='value', source=frame,
               line_color=line_color, line_width=1, legend_label=legend_label)

# show the results
p.legend.location = "top_left"
p.legend.click_policy = "hide"  # clicking a legend entry toggles the corresponding line
show(p) # type: ignore

# preview the first raw rows of the sensor processed last
ts_all.head()

bf_f10_22a_bahnhofstr              : 50000 entries found
bf_f10_22a_bahnhofstr              : 6250 entries after drop duplicates with keep first
bf_f10_22a_bahnhofstr              : 6250 entries after drop duplicates with keep last
bf_f10_22a_bahnhofstr              : 0 entries after drop duplicates w/o any


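- Note that for `keep=False` no data points remain: every timestamp occurs more than once (50000 rows collapse to 6250 unique timestamps, i.e. 8 rows per timestamp on average), so dropping all duplicated rows removes everything.
- Zoom into the red region to see how the data points look in detail.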