In [None]:
import sys
sys.path.append('../')
import dask
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

import sqlalchemy
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import matplotlib
from RtdRay import RtdRay
from mpl_toolkits.basemap import Basemap
from helpers.StationPhillip import StationPhillip

# Shared data handle for the notebook: later cells call rtd.load_data(columns=[...])
# to read only the columns they need.
rtd = RtdRay(notebook=True)

### Delay analysis

In [None]:
def get_delays(rtd_df):
    """Add delay, on-time and cancellation columns for arrival and departure.

    For each prefix ``ar`` and ``dp`` the following columns are added to
    ``rtd_df`` (the frame is modified in place and also returned):

    * ``<p>_happened``: True when the status ``<p>_cs`` is not ``'c'``.
      ``'c'`` is treated as "cancelled", consistent with the on-time mask
      below, which excludes those rows.
    * ``<p>_cancellation_time_delta``: ``<p>_clt - <p>_pt`` in minutes.
    * ``<p>_delay``: ``<p>_ct - <p>_pt`` in minutes.
    * ``<p>_on_time_3`` / ``<p>_on_time_5``: delay below 4 / below 6 minutes.
      Only filled for rows that are not cancelled and have a delay; all
      other rows stay missing so they do not count towards a mean.
    * ``<p>_fern_on_time_5``: ``<p>_on_time_5`` restricted to rows whose
      ``f`` column equals ``'F'``; missing elsewhere.

    Args:
        rtd_df: frame with the columns ``f`` and, per prefix, ``<p>_cs``,
            ``<p>_clt``, ``<p>_ct`` and ``<p>_pt`` (timestamps).

    Returns:
        The same ``rtd_df`` object with the columns above added.
    """
    one_minute = pd.Timedelta(minutes=1)

    for prefix in ('ar', 'dp'):
        planned = prefix + '_pt'
        delay = prefix + '_delay'
        on_time_5 = prefix + '_on_time_5'

        # A stop "happened" when it was NOT cancelled. The previous code
        # stored `cs == 'c'` here, i.e. the cancelled flag, which inverted
        # the meaning of the column.
        not_cancelled = rtd_df[prefix + '_cs'] != 'c'
        rtd_df[prefix + '_happened'] = not_cancelled

        rtd_df[prefix + '_cancellation_time_delta'] = (
            (rtd_df[prefix + '_clt'] - rtd_df[planned]) / one_minute
        )
        rtd_df[delay] = (rtd_df[prefix + '_ct'] - rtd_df[planned]) / one_minute

        # Assigning the comparison of a row subset leaves every row outside
        # the subset missing instead of False.
        valid = not_cancelled & rtd_df[delay].notnull()
        rtd_df[prefix + '_on_time_3'] = rtd_df.loc[valid, delay] < 4
        rtd_df[on_time_5] = rtd_df.loc[valid, delay] < 6
        rtd_df[prefix + '_fern_on_time_5'] = rtd_df.loc[rtd_df['f'] == 'F', on_time_5]

    return rtd_df

### Analysis per station 

In [None]:
class PerStationAnalysis(StationPhillip):
    """Per-station aggregation of the delay columns plus a bubble map of it.

    ``self.data`` holds one row per station and two-level columns
    ``(column, statistic)``, e.g. ``('dp_delay', 'count')``, as produced by
    the ``groupby('station').agg(...)`` call in ``__init__``.

    The ``*_PLOT`` class attributes are ready-made arguments for ``plot``.
    Each of them maps

    * ``'count_1'`` and ``'count_2'`` to the columns whose ``count``
      statistics are summed to give the marker size, and
    * ``'color_value'`` to the column whose ``mean`` statistic gives the
      marker colour.
    """

    FERN_ON_TIME_PLOT = {
        'count_1': 'ar_fern_on_time_5',
        'count_2': 'dp_fern_on_time_5',
        'color_value': 'dp_fern_on_time_5'
    }

    ALL_ON_TIME_PLOT = {
        'count_1': 'ar_delay',
        'count_2': 'dp_delay',
        'color_value': 'dp_on_time_5'
    }

    ALL_CANCELLATIONS_PLOT =  {
        'count_1': 'ar_delay',
        'count_2': 'dp_delay',
        'color_value': 'dp_happened'
    }

    def __init__(self, rtd_df, data=None):
        """Aggregate ``rtd_df`` per station, or reuse an existing aggregation.

        Args:
            rtd_df: frame with a ``station`` column and the aggregated delay
                columns listed below. The grouped aggregation must offer
                ``.compute()``, i.e. a lazy (dask-style) frame is expected.
                Not used when ``data`` is given.
            data: optional, previously computed aggregation table. When
                passed it is stored unchanged: no aggregation is run and the
                minimum-count filter below is not applied.
        """
        super().__init__(notebook=True)
        if data is not None:
            self.data = data
        else:
            self.data = rtd_df.groupby('station').agg({
                    'ar_delay': ['mean', 'count'],
                    'ar_on_time_3': ['mean'],
                    'ar_on_time_5': ['mean'],
                    'ar_happened': ['mean'],
                    'ar_cancellation_time_delta': ['mean', 'count'],
                    'ar_fern_on_time_5': ['mean', 'count'],
                    'dp_delay': ['mean', 'count'],
                    'dp_on_time_3': ['mean'],
                    'dp_on_time_5': ['mean'],
                    'dp_happened': ['mean'],
                    'dp_cancellation_time_delta': ['mean', 'count'],
                    'dp_fern_on_time_5': ['mean', 'count']
                }).compute()
            # Keep only stations with more than 500 departures that have a
            # delay value (count ignores missing values).
            self.data = self.data.loc[self.data[('dp_delay', 'count')] > 500, :]

    def plot(self, data_to_plot):
        """Scatter every station of ``self.data`` onto a map.

        Marker size is the sum of the two ``count`` columns, marker colour is
        the min-max normalised ``mean`` column (red = lowest, green = highest).

        Args:
            data_to_plot: dict with the keys ``'count_1'``, ``'count_2'`` and
                ``'color_value'``, e.g. one of the ``*_PLOT`` class attributes.

        Side effects:
            Creates a new matplotlib figure and stores the normalised colour
            values in ``self.c``. The figure is not shown; call ``plt.show()``.
        """
        # Corners of the map window in degrees (longitude / latitude).
        left = 5.67
        right = 15.64
        bot = 47.06
        top = 55.06
        plt.figure(figsize=(90/2,50/2))
        m = Basemap(llcrnrlon=left,llcrnrlat=bot,urcrnrlon=right,urcrnrlat=top,
                    resolution='i', projection='tmerc', lat_0 = 51, lon_0 = 10)
        m.drawcoastlines(linewidth=0.72, color='black')
        m.drawcountries(zorder=0, color='black')

        stations = self.data.index
        x = np.zeros(len(stations))
        y = np.zeros(len(stations))
        for i, station in enumerate(stations):
            # The pair is passed to scatter(..., latlon=True) below, so
            # get_location is assumed to return (longitude, latitude).
            # TODO confirm against StationPhillip.
            x[i], y[i] = self.get_location(name=station)

        # Whole-column lookups instead of per-station `.loc[...][0]`: the old
        # form used positional indexing on a labelled Series and assigned a
        # one-element Series to an array slot, both of which are deprecated.
        s = (
            self.data[(data_to_plot['count_1'], 'count')]
            + self.data[(data_to_plot['count_2'], 'count')]
        ).astype(float).to_numpy()
        c = self.data[(data_to_plot['color_value'], 'mean')].astype(float).to_numpy()

        # Min-max normalise while ignoring missing means (a station without
        # any value for the colour column has a NaN mean). The builtin
        # min()/max() gave order-dependent results as soon as a NaN was present.
        finite = c[np.isfinite(c)]
        if finite.size:
            lowest = finite.min()
            span = finite.max() - lowest
            c = c - lowest
            if span > 0:
                # With identical values everywhere the span is 0; dividing
                # would turn every colour into NaN, so keep the zeros.
                c = c / span
        self.c = c

        cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", ["red", 'yellow',"green"])
        m.scatter(x, y, c=c, cmap=cmap, s=s, alpha=0.2, latlon=True)

In [None]:
# Load only the columns that get_delays and PerStationAnalysis read.
rtd_df = rtd.load_data(columns=['station', 'c', 'ar_ct', 'ar_pt', 'dp_ct', 'dp_pt', 'ar_cs', 'ar_clt', 'dp_cs', 'dp_clt', 'f'])

In [None]:
# Add the derived delay / on-time / cancellation columns.
rtd_df = get_delays(rtd_df)

In [None]:
# Aggregate per station; the constructor runs the groupby and calls .compute().
per_station = PerStationAnalysis(rtd_df)

In [None]:
# Rebuild the object from the already aggregated table: because `data` is
# passed, the constructor skips the aggregation. Presumably meant for
# re-running after the class cell was edited.
per_station = PerStationAnalysis(rtd_df, per_station.data)

In [None]:
# Marker size: ar_delay + dp_delay counts; colour: mean of dp_on_time_5.
per_station.plot(per_station.ALL_ON_TIME_PLOT)
plt.show()

### Datapoints over time

In [None]:
rtd_df = rtd.load_data(columns=['ar_pt'])

In [None]:
rtd_df['ar_pt'] = rtd_df['ar_pt'].dt.round(freq='D')

In [None]:
over_time = rtd_df.groupby('ar_pt').agg({'ar_pt': ['count']}).compute()

In [None]:
over_time.plot(kind='area')

### Train type distribution

In [None]:
def train_types_bubble_chart(rtd_df):
    """Show a packed-bubble chart with one bubble per train type.

    The occurrences of each value in column ``'c'`` are counted and turned
    into ``sqrt(count / pi)``, the radius of a circle whose area equals the
    count. These values are printed and handed to ``BubbleChart``.

    NOTE(review): ``BubbleChart`` is neither defined nor imported in the
    visible part of this notebook, so confirm where it comes from and
    whether it expects radii or areas.

    Args:
        rtd_df: frame with the train type column ``'c'``.
    """
    type_counts = rtd_df['c'].value_counts()
    bubble_sizes = np.sqrt(type_counts.to_numpy() / np.pi)
    print(bubble_sizes)
    chart = BubbleChart(bubble_sizes)

    # Equal aspect keeps the bubbles circular; the axes frame is hidden.
    fig, ax = plt.subplots(subplot_kw={"aspect": "equal"})
    ax.axis("off")

    chart.collapse()
    chart.plot(ax)

    # Recompute the data limits so the view covers what was just drawn.
    ax.relim()
    ax.autoscale_view()
    plt.show()

In [None]:
def train_types_pie_chart(rtd_df):
    """Draw a 30x30 inch pie chart of how often each train type occurs.

    Args:
        rtd_df: frame with the train type column ``'c'``; its values are
            counted with ``value_counts()`` and each count becomes a slice.
    """
    rtd_df['c'].value_counts().plot.pie(figsize=(30, 30))