### Arrange Data

In [1]:
import mysql.connector
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
def load_properties(filepath, sep='=', comment_char='#'):
    """
    Read the file passed as parameter as a properties file.
    """
    props = {}
    with open(filepath, "rt") as f:
        for line in f:
            l = line.strip()
            if l and not l.startswith(comment_char):
                key_value = l.split(sep)
                key = key_value[0].strip()
                value = sep.join(key_value[1:]).strip().strip('"') 
                props[key] = value 
    return props

In [3]:
connection = mysql.connector.connect(**load_properties('./project.properties'))

In [4]:
query = '''
select *, avg(rainfall) OVER(ORDER BY date
     ROWS BETWEEN 2 PRECEDING AND current row)
     as moving_average from weatherdailydelay where sbbregion_isocode = 'RWT'
'''

result = pd.read_sql(sql=query, con=connection)
result_df = pd.DataFrame(result)



In [5]:
result_df.head()

Unnamed: 0,sbbregion_isocode,date,rainfall,temp,zugpuenktlichkeit,moving_average
0,RWT,2021-01-01,0.775,-0.375,94.018834,0.775
1,RWT,2021-01-02,0.025,-0.65,96.219471,0.4
2,RWT,2021-01-03,0.2,-2.25,95.556445,0.333333
3,RWT,2021-01-04,0.1,-3.125,92.288617,0.108333
4,RWT,2021-01-05,0.0,-2.7,94.436709,0.1


In [6]:
### change puenktlichkeit to delay, for better visual understanding of correlation to rain and temperature

result_df.zugpuenktlichkeit = 100 - result_df.zugpuenktlichkeit

In [7]:
result_df.head()

Unnamed: 0,sbbregion_isocode,date,rainfall,temp,zugpuenktlichkeit,moving_average
0,RWT,2021-01-01,0.775,-0.375,5.981166,0.775
1,RWT,2021-01-02,0.025,-0.65,3.780529,0.4
2,RWT,2021-01-03,0.2,-2.25,4.443555,0.333333
3,RWT,2021-01-04,0.1,-3.125,7.711383,0.108333
4,RWT,2021-01-05,0.0,-2.7,5.563291,0.1


In [8]:
import plotly.io as pio
import plotly.graph_objects as go

### Below Zero

In [9]:
filtered_df = result_df[result_df['temp'] < 0]

In [10]:
max_rainfall = max(filtered_df['rainfall'])

In [11]:
rain_range = np.arange(0, max_rainfall + max_rainfall / 5, max_rainfall / 5)

In [12]:
grouped_df = filtered_df.groupby(pd.cut(filtered_df['rainfall'], rain_range)).mean()

In [13]:
grouped_df

Unnamed: 0_level_0,rainfall,temp,zugpuenktlichkeit,moving_average
rainfall,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(0.0, 3.91]",1.12125,-1.52375,12.483444,3.044167
"(3.91, 7.82]",4.475,-0.525,11.317501,3.358333
"(7.82, 11.73]",8.975,-1.025,4.748083,7.858333
"(11.73, 15.64]",12.8625,-1.3875,19.929579,4.8125
"(15.64, 19.55]",18.45,-1.075,32.90936,9.316667


### Lineplot

In [None]:
datatrace1 = {
    'name': 'Rainfall',
    'type': 'bar', 
    'y': rain_range, 
    'x': grouped_df['zugpuenktlichkeit'],
    'orientation': 'h'
}

layout = {
    'title': 'SBB Verspätung gruppiert nach Regenmenge'
}

figdict = {'data': [datatrace1], 
          'layout': layout}

fig = go.Figure(**figdict)

fig.update_layout(
    yaxis = dict(
        tickmode = 'array',
        tickvals = rain_range,
        ticktext = ['(Fast) kein Regenfall (0 - 3.91mm)','Wenig Regenfall (3.91 - 7.82mm)','Mittlerer Regenfall (7.82 - 11.73mm)','Mittelstarker Regenfall (11.73 - 15.64mm)','Starker Regenfall (15.64 - 19.55mm)',]
    )
)

fig.show()

### Above Zero

In [15]:
filtered_above_zero_df = result_df[result_df['temp'] > 0]

In [16]:
max_rainfall_above_zero = max(filtered_above_zero_df['rainfall'])

In [17]:
rain_range_above_zero = np.arange(0, max_rainfall_above_zero + max_rainfall_above_zero / 3, max_rainfall_above_zero / 3)

In [18]:
grouped_above_zero_df = filtered_above_zero_df.groupby(pd.cut(filtered_above_zero_df['rainfall'], rain_range)).mean()

In [19]:
grouped_above_zero_df

Unnamed: 0_level_0,rainfall,temp,zugpuenktlichkeit,moving_average
rainfall,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(0.0, 3.91]",0.901429,9.787857,11.778154,2.315873
"(3.91, 7.82]",5.940833,9.51,13.28745,5.728611
"(7.82, 11.73]",9.989474,9.593421,13.381015,7.586842
"(11.73, 15.64]",13.727632,11.086842,13.014052,10.209649
"(15.64, 19.55]",17.770833,7.133333,12.775379,9.45


### Lineplot

In [None]:
datatrace1 = {
    'name': 'Delay when temperature above 0',
    'type': 'bar', 
    'x': rain_range, 
    'y': grouped_above_zero_df['zugpuenktlichkeit']
}
datatrace2 = {
    'name': 'Delay when temperature below 0',
    'type': 'bar', 
    'x': rain_range, 
    'y': grouped_df['zugpuenktlichkeit'],
}


layout = {
    'title': 'SBB Verspätung gruppiert nach Regenmenge'
}

figdict = {'data': [datatrace1, datatrace2], 
          'layout': layout}

go.Figure(**figdict)