In [1]:

def scrap_data():
    """
    Scrape water-level data from https://pegel.bonn.de/php/rheinpegel.php
    via `Selenium` & `BeautifulSoup`.

    A headless Chrome session loads the page, clicks the element with id
    ``btn_table`` and reads the inner HTML of the element with id
    ``dataTable``. The first table row is treated as the header and skipped;
    from every remaining row the text of the first two ``<td>`` cells is
    collected.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(date_time_raw, waterlevel_raw)``: the raw text of the first and
        second cell of each data row, in page order. Both lists are empty
        when the table contains no data rows.

    Raises
    ------
    IndexError
        If no element with id ``dataTable`` is found, or a data row has
        fewer than two ``<td>`` cells.
    """
    # Selenium is used to retrieve the raw HTML, BeautifulSoup to pick it apart.
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service
    from chromedriver_py import binary_path
    from bs4 import BeautifulSoup

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=1920x1080")

    # Invoke a new (headless) browser window.
    driver = webdriver.Chrome(service=Service(binary_path), options=chrome_options)
    try:
        # Navigate to the website and click the button that reveals the table.
        driver.get("https://pegel.bonn.de/php/rheinpegel.php")
        driver.find_element(by='id', value='btn_table').click()
        # Retrieve the generated HTML element by id and take its innerHTML.
        waterlevel_data_element = driver.find_elements(by='id', value='dataTable')
        waterlevel_data_html = waterlevel_data_element[0].get_attribute(name='innerHTML')
    finally:
        # quit() (not close()) ends the WebDriver session and the chromedriver
        # process; the finally-clause guarantees this even when scraping fails.
        driver.quit()

    soup = BeautifulSoup(waterlevel_data_html, 'html.parser')
    # Skip the header row; query the <td> cells directly on each parsed row
    # instead of re-parsing its HTML with an unspecified parser.
    data_rows = soup.find_all('tr')[1:]
    cells_per_row = [row.find_all('td') for row in data_rows]
    date_time_raw = [cells[0].text for cells in cells_per_row]
    waterlevel_raw = [cells[1].text for cells in cells_per_row]
    return date_time_raw, waterlevel_raw

def create_dataframe(date_time_raw, waterlevel_raw):
    """
    Build a pandas DataFrame from raw timestamp and water-level strings.

    Parameters
    ----------
    date_time_raw : sequence of str
        Timestamps in the form ``"%m/%d/%Y, %H:%M:%S Uhr"``.
    waterlevel_raw : sequence of str
        Water-level strings; the first run of digits in each string is
        taken as the integer value.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``Datetime`` (parsed timestamps) and ``Waterlevel``
        (integers).

    Raises
    ------
    IndexError
        If a water-level string contains no digits.
    """
    import re
    import pandas as pd

    # Timestamps are parsed first, then the levels.
    timestamps = pd.to_datetime(date_time_raw, format="%m/%d/%Y, %H:%M:%S Uhr")

    digit_run = re.compile(r'(\d+)')
    levels = []
    for raw_value in waterlevel_raw:
        first_number = digit_run.findall(raw_value)[0]
        levels.append(int(first_number))

    return pd.DataFrame({'Datetime': timestamps, 'Waterlevel': levels})

def map_duplicates_to_24_hour_format(dataFrame):
    """
    Map ambiguous 12-hour timestamps to the 24-hour clock.

    As the dates are stored in 12-hour format without indication of AM/PM,
    a timestamp can occur twice per day. Every occurrence except the last
    one (in row order) is treated as the PM reading; the last (or only)
    occurrence is treated as the AM reading.
    NOTE(review): this matches a table listed newest-first -- confirm
    against the scraped page.

    * PM readings with hour 1-11 are shifted forward by 12 hours.
    * The 12 o'clock hour is special on a 12-hour clock: 12:xx PM already
      equals 12:xx in 24-hour time and is left untouched, while 12:xx AM
      is shifted back to 00:xx.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame with a datetime-typed ``Datetime`` column. It is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dataFrame`` object, with ``Datetime`` corrected.
    """
    import pandas as pd

    half_day = pd.Timedelta(hours=12)
    # Compute both masks up front, before any value is shifted.
    is_pm = dataFrame['Datetime'].duplicated(keep='last')
    is_hour_12 = dataFrame['Datetime'].dt.hour == 12

    # 1 PM .. 11 PM -> 13:00 .. 23:00
    shift_forward = is_pm & ~is_hour_12
    dataFrame.loc[shift_forward, 'Datetime'] = (
        dataFrame.loc[shift_forward, 'Datetime'] + half_day
    )

    # 12:xx AM -> 00:xx (12:xx PM stays as it is)
    shift_back = ~is_pm & is_hour_12
    dataFrame.loc[shift_back, 'Datetime'] = (
        dataFrame.loc[shift_back, 'Datetime'] - half_day
    )
    return dataFrame

def scrap_data_into_dataframe():
    """
    Run the whole pipeline: scrape, build the DataFrame, fix the clock.

    Calls `scrap_data`, passes its two result lists to `create_dataframe`
    and returns the frame after `map_duplicates_to_24_hour_format` has
    been applied to it.
    """
    raw_timestamps, raw_levels = scrap_data()
    return map_duplicates_to_24_hour_format(
        create_dataframe(raw_timestamps, raw_levels)
    )

# Run the scrape -> DataFrame pipeline once and preview the first rows.
# `df` is reused by the InfluxDB cells further down.
df = scrap_data_into_dataframe()
print(df.head())

             Datetime  Waterlevel
0 2022-12-02 20:00:00         234
1 2022-12-02 19:45:00         234
2 2022-12-02 19:30:00         234
3 2022-12-02 19:15:00         236
4 2022-12-02 19:00:00         235


In [None]:
# Init influxDB credentials
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment.
load_dotenv()
influx_token = os.environ.get('influx_token')
influx_org = os.environ.get('influx_org')
influx_bucket = os.environ.get('influx_bucket')

# Fail fast with a clear message instead of passing None on to the
# InfluxDB client and getting an obscure error at write time.
_missing_influx_settings = [
    name
    for name, value in (
        ('influx_token', influx_token),
        ('influx_org', influx_org),
        ('influx_bucket', influx_bucket),
    )
    if not value
]
if _missing_influx_settings:
    raise RuntimeError(
        "Missing InfluxDB settings in environment/.env: "
        + ", ".join(_missing_influx_settings)
    )


In [None]:
# Start up influxDB container
import subprocess

# Run the start-up script directly (no intermediate shell needed) and wait
# for it to finish. stderr is merged into stdout so that error messages
# show up in the notebook output, and text=True yields str instead of bytes.
p = subprocess.run(
    ["bash", "run_influx_db.sh"],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
)
print(p.stdout)
# Report (but do not abort on) a non-zero exit status, so a failed start
# is visible while later cells can still be run.
if p.returncode != 0:
    print(f"run_influx_db.sh exited with status {p.returncode}")

In [None]:
# Taken from https://www.influxdata.com/blog/getting-started-with-python-and-influxdb-v2-0/

from influxdb_client import InfluxDBClient
from influxdb_client.client.write_api import SYNCHRONOUS

# Address of the InfluxDB instance the client connects to.
INFLUX_URL = "http://localhost:8086"

client = InfluxDBClient(
    url=INFLUX_URL,
    token=influx_token,
    org=influx_org,
)

In [None]:
# Prepare the pandas DataFrame: the timestamp has to be the index column.
# The guard keeps this cell idempotent -- once `_time` is the index, a
# re-run would otherwise fail with a KeyError in set_index('_time').
if '_time' not in df.index.names:
    df = df.rename(columns={'Datetime': '_time'}).set_index('_time')
df.head()


In [None]:
# Write pandas dataframe to influxDB https://www.influxdata.com/blog/getting-started-with-influxdb-and-pandas/

write_api = client.write_api(write_options=SYNCHRONOUS)
try:
    # The `Waterlevel` column is written as a field of the `waterlevel`
    # measurement. No tag columns are passed: the previous value
    # ['waterlevel'] did not match any column name (names are
    # case-sensitive), so it had no effect.
    write_api.write(influx_bucket, influx_org, record=df,
                    data_frame_measurement_name='waterlevel')
finally:
    # Release the write API and the client even if the write fails.
    write_api.close()
    client.close()
