In [None]:
import numpy as np
import pandas as pd
from scipy.stats import gmean

In [None]:
import plotly.graph_objects as go

In [None]:
# Load the hourly bike tables for both cities; "datetime_hour" is parsed
# as a datetime column so the .dt accessors used later work.
filename_DD = "../data/df_DD_for_SARIMAX_2025-04-08_14-28-37.csv"
filename_FB = "../data/df_FB_for_SARIMAX_2025-04-08_14-28-37.csv"

read_options = dict(index_col=None, parse_dates=["datetime_hour"])
df_DD = pd.read_csv(filename_DD, **read_options)
df_FB = pd.read_csv(filename_FB, **read_options)

In [None]:
# Columns of the bike tables that this analysis actually uses.
cols_to_keep = [
    "hex_id",
    "datetime_hour",
    "rent_count",
    "return_count",
    "is_dayoff",
]

In [None]:
# Keep only the analysis columns. The explicit copy makes the result an
# independent frame, so adding columns to it later cannot be treated by pandas
# as a write to a slice of the loaded table (SettingWithCopyWarning).
df_DD = df_DD[cols_to_keep].copy()

In [None]:
# Keep only the analysis columns. The explicit copy makes the result an
# independent frame, so adding columns to it later cannot be treated by pandas
# as a write to a slice of the loaded table (SettingWithCopyWarning).
df_FB = df_FB[cols_to_keep].copy()

In [None]:
# Tag every row with the city its table belongs to.
for city_frame, city_name in ((df_FB, "Freiburg"), (df_DD, "Dresden")):
    city_frame["city"] = city_name

In [None]:
# Stack both city tables (Dresden first) into one frame with a fresh 0..n-1 index.
df_bike = pd.concat((df_DD, df_FB), axis="index", ignore_index=True)

In [None]:
# Days to exclude from the bike data, given as ISO strings and converted
# to datetime.date objects so they compare equal to `.dt.date` values.
outlier_day_strings = (
    "2024-03-22",
    "2024-03-01",
    "2024-03-02",
    "2024-02-02",
    "2023-06-24",
)
outlier_days = []
for day_string in outlier_day_strings:
    outlier_days.append(pd.to_datetime(day_string).date())
outlier_days

In [None]:
# Drop every bike observation that falls on one of the outlier days.
is_outlier_day = df_bike["datetime_hour"].dt.date.isin(outlier_days)
df_bike = df_bike.loc[~is_outlier_day]

In [None]:
# Load the events table; start and end timestamps are parsed as datetimes.
# NOTE(review): the path repeats "data/events" twice -- confirm this matches
# the actual directory layout.
filename_events = "../data/events/data/events/df_events_with_hex_id_and_all_cols_2025-04-23_13-12-48.csv"
df_events = pd.read_csv(
    filename_events,
    index_col=None,
    parse_dates=["StartDateTime", "EndDateTime"],
)

In [None]:
# Events without an end time are given an end three hours after their start.
default_end = df_events["StartDateTime"] + pd.DateOffset(hours=3)
flt = df_events["EndDateTime"].isna()
df_events["EndDateTime"] = df_events["EndDateTime"].where(~flt, default_end)

In [None]:
# Round both event timestamps down to the full hour, the resolution at which
# they are later matched against `datetime_hour` in the bike data.
for time_col in ("StartDateTime", "EndDateTime"):
    df_events[time_col] = df_events[time_col].dt.floor("h")

In [None]:
# Total number of rentals per hexagon over the whole observation period.
hex_id_grouping = df_bike["rent_count"].groupby(df_bike["hex_id"]).sum()

In [None]:
# Keep only hexagons with more than 5000 rentals in total.
hex_id_grouping = hex_id_grouping.loc[hex_id_grouping.gt(5000)]

In [None]:
# Ids of the hexagons that passed the rental-volume filter.
allowed_hex_ids = hex_id_grouping.index.to_list()

In [None]:
# Inspect the hexagon ids kept for the event analysis.
allowed_hex_ids

In [None]:
# Restrict the events to the allowed hexagons; the copy lets later cells
# add columns to df_events without touching the loaded table.
in_allowed_hex = df_events["hex_id"].isin(allowed_hex_ids)
df_events = df_events.loc[in_allowed_hex].copy()

In [None]:
# Number of remaining events per source.
df_events["Source"].value_counts()

In [None]:
# Events published by the two football-club sources.
football_sources = ["dynamo-dresden.de", "scfreiburg.com"]
is_football = df_events["Source"].isin(football_sources)
football_events = df_events.loc[is_football]
football_events

In [None]:
# Label events that have no category as "Unknown".
# The result is assigned back to the column explicitly: calling
# fillna(inplace=True) on an attribute-accessed column is chained assignment,
# which pandas deprecates and which leaves df_events unchanged once
# Copy-on-Write is active.
df_events["EventCategory"] = df_events["EventCategory"].fillna("Unknown")

In [None]:
# Combined label of the form "<SourceGroup> - <EventCategory>".
source_group = df_events["SourceGroup"]
event_category = df_events["EventCategory"]
df_events["category"] = source_group + " - " + event_category

In [None]:
# Frequency of each combined category label; dropna=False also counts rows
# whose label is missing.
df_events["category"].value_counts(dropna=False)

# graph some events - which hours to take?

In [None]:
# Candidate events for plotting: start and end on the same calendar day and
# the end hour is not 0, so both fit on a single 0-23 hour axis.
end_hour_not_zero = df_events["EndDateTime"].dt.hour != 0
same_calendar_day = df_events["StartDateTime"].dt.date == df_events["EndDateTime"].dt.date
flt_tmp = end_hour_not_zero & same_calendar_day
df_tmp = df_events.loc[flt_tmp]

In [None]:
# Draw at most 10 events to plot. The fixed random_state makes the selection
# (and therefore the figures below) reproducible across runs, and the min()
# avoids a ValueError when fewer than 10 candidate events exist.
df_tmp = df_tmp.sample(n=min(10, len(df_tmp)), random_state=42)

In [None]:
# Inspect the events selected for plotting.
df_tmp

In [None]:
# For each sampled event, plot the hourly return counts of its hexagon on the
# event day and on the same weekday one week earlier / later, with vertical
# markers at the event's start and end hour.
# Working hypothesis for returns: take the start hour and the previous hour.

rent_or_return = "return"  # "rent" or "return"
colname = "rent_count" if rent_or_return == "rent" else "return_count"

# Styling does not depend on the event, so it is defined once.
label_names = ["event_day", "day_last_week", "day_next_week"]
color_dict = {
    "event_day": "orange",
    "day_last_week": "blue",
    "day_next_week": "cornflowerblue",
}
linestyle_dict = {
    "event_day": "solid",
    "day_last_week": "dash",
    "day_next_week": "dash",
}

for i, row in df_tmp.iterrows():
    hex_id = row.hex_id
    start_hour = row.StartDateTime.hour
    end_hour = row.EndDateTime.hour
    date = row.StartDateTime.date()

    day_last_week = (date - pd.DateOffset(weeks=1)).date()
    day_next_week = (date + pd.DateOffset(weeks=1)).date()

    fig = go.Figure()

    # Tallest plotted value over all three days. Reset for every event so the
    # markers never reuse a height from a previous figure and the name is
    # always defined, even when no bike rows exist for this event.
    max_value = 0

    for label, dt in zip(label_names, [date, day_last_week, day_next_week]):
        flt = (df_bike.datetime_hour.dt.date == dt) & (df_bike.hex_id == hex_id)
        line_df = df_bike.loc[flt, ["datetime_hour", "rent_count", "return_count"]].copy()
        if len(line_df) > 0:
            line_df["hour"] = line_df.datetime_hour.dt.hour

            fig.add_trace(
                go.Scatter(
                    x=line_df.hour,
                    y=line_df[colname],
                    name=label,
                    line=dict(color=color_dict[label], dash=linestyle_dict[label]),
                )
            )

            max_value = max(max_value, line_df[colname].max())

    # Keep the markers visible even when every plotted count is zero.
    marker_height = max(max_value, 1)

    # annotate start_hour and end_hour with vertical lines
    fig.add_trace(go.Scatter(x=[start_hour, start_hour], y=[0, marker_height], mode='lines', name='Start Hour', line=dict(color='red', width=2)))
    fig.add_trace(go.Scatter(x=[end_hour, end_hour], y=[0, marker_height], mode='lines', name='End Hour', line=dict(color='pink', width=2)))

    fig.update_layout(
        xaxis_title="Hour",
        yaxis_title=rent_or_return,
        legend_title="Legend",
        xaxis=dict(tickmode='linear', dtick=1),
        yaxis=dict(tickmode='linear', dtick=1),
        width=800, height=600
    )

    fig.show()

    

In [None]:
# For each sampled event, plot the hourly rent counts of its hexagon on the
# event day and on the same weekday one week earlier / later, with vertical
# markers at the event's start and end hour.

rent_or_return = "rent"  # "rent" or "return"
colname = "rent_count" if rent_or_return == "rent" else "return_count"

# Styling does not depend on the event, so it is defined once.
label_names = ["event_day", "day_last_week", "day_next_week"]
color_dict = {
    "event_day": "orange",
    "day_last_week": "blue",
    "day_next_week": "cornflowerblue",
}
linestyle_dict = {
    "event_day": "solid",
    "day_last_week": "dash",
    "day_next_week": "dash",
}

for i, row in df_tmp.iterrows():
    hex_id = row.hex_id
    start_hour = row.StartDateTime.hour
    end_hour = row.EndDateTime.hour
    date = row.StartDateTime.date()

    day_last_week = (date - pd.DateOffset(weeks=1)).date()
    day_next_week = (date + pd.DateOffset(weeks=1)).date()

    fig = go.Figure()

    # Tallest plotted value over all three days. Reset for every event so the
    # markers never reuse a height from a previous figure and the name is
    # always defined, even when no bike rows exist for this event.
    max_value = 0

    for label, dt in zip(label_names, [date, day_last_week, day_next_week]):
        flt = (df_bike.datetime_hour.dt.date == dt) & (df_bike.hex_id == hex_id)
        line_df = df_bike.loc[flt, ["datetime_hour", "rent_count", "return_count"]].copy()
        if len(line_df) > 0:
            line_df["hour"] = line_df.datetime_hour.dt.hour

            fig.add_trace(
                go.Scatter(
                    x=line_df.hour,
                    y=line_df[colname],
                    name=label,
                    line=dict(color=color_dict[label], dash=linestyle_dict[label]),
                )
            )

            max_value = max(max_value, line_df[colname].max())

    # Keep the markers visible even when every plotted count is zero.
    marker_height = max(max_value, 1)

    # annotate start_hour and end_hour with vertical lines
    fig.add_trace(go.Scatter(x=[start_hour, start_hour], y=[0, marker_height], mode='lines', name='Event start', line=dict(color='red', width=2)))
    fig.add_trace(go.Scatter(x=[end_hour, end_hour], y=[0, marker_height], mode='lines', name='Event end', line=dict(color='pink', width=2)))

    fig.update_layout(
        xaxis_title="Hour",
        yaxis_title=rent_or_return,
        legend_title="Legend",
        xaxis=dict(tickmode='linear', dtick=1),
        yaxis=dict(tickmode='linear', dtick=1),
        width=800, height=600
    )

    fig.show()

    

In [None]:
# CONCLUSION
# For rent, take the end hour and the hour after that.

# For return, take the start hour and the previous hour.

# Compute the effect for each event individually.

In [None]:
# flt1 = (df_events.StartDateTime.dt.date == df_events.EndDateTime.dt.date).value_counts()
# flt1

In [None]:
# flt2 = df_events.EndDateTime.isna().value_counts()
# flt2

In [None]:
# List the columns available in the combined bike table.
df_bike.columns

In [None]:
# Count columns to evaluate, and the three time windows compared per event.
variables = list(("rent_count", "return_count"))
labels = list(("event", "last_week", "next_week"))

In [None]:
# Per-event mean counts in the event window and in the same window one week
# earlier / later, written to df_events as "<variable>_<label>" columns.
#
# Rentals: the hour in which the event ends (maybe also the hour after that).
# Returns: the start hour and the previous hour (and maybe 2 h before).
# TODO collect hour of event start, hour before, 2 hours before, hour of event
# end, 1 hour afterwards -- all of them separately.

event_stats = []  # not used in this cell; kept so existing references keep working

one_hour = pd.DateOffset(hours=1)
one_week = pd.DateOffset(weeks=1)

# Create (or reset) every result column up front. This makes the cell
# idempotent on re-run and guarantees the columns exist for later cells even
# if no event has matching bike data for some window.
for variable in variables:
    for label in labels:
        df_events[f"{variable}_{label}"] = float("nan")

# Split the bike data by hexagon once instead of rescanning all of df_bike
# for every single event.
bike_by_hex = {hex_id: hex_rows for hex_id, hex_rows in df_bike.groupby("hex_id")}

for i, row in df_events.iterrows():
    start_datetime = row.StartDateTime
    end_datetime = row.EndDateTime
    if pd.isna(end_datetime):
        # Same 3-hour default used when the events table was cleaned.
        end_datetime = start_datetime + pd.DateOffset(hours=3)

    bike_hex = bike_by_hex.get(row.hex_id)
    if bike_hex is None:
        # No bike observations for this hexagon: leave the NaN defaults.
        continue

    return_hours = [start_datetime, start_datetime - one_hour]
    rent_hours = [end_datetime]  # , end_datetime + one_hour

    # The comparison weeks use exactly the same hours as the event window,
    # shifted by one week. (Previously the return baseline repeated the start
    # hour twice, so a two-hour event window was compared with a one-hour
    # baseline.)
    hour_interval_helper = {
        "return_count": {
            "event": return_hours,
            "last_week": [hour - one_week for hour in return_hours],
            "next_week": [hour + one_week for hour in return_hours],
        },
        "rent_count": {
            "event": rent_hours,
            "last_week": [hour - one_week for hour in rent_hours],
            "next_week": [hour + one_week for hour in rent_hours],
        },
    }

    for variable in variables:
        for label in labels:
            hour_interval = hour_interval_helper[variable][label]
            in_window = bike_hex.datetime_hour.isin(hour_interval)
            slice_tmp = bike_hex.loc[in_window, variable]
            if not slice_tmp.empty:
                df_events.at[i, f"{variable}_{label}"] = slice_tmp.mean()

In [None]:
# Echo the count columns that the following cells iterate over.
variables

In [None]:
# Baseline per event: row-wise mean of the last-week and next-week values,
# skipping whichever of the two is missing.
for variable in variables:
    baseline_cols = [f"{variable}_last_week", f"{variable}_next_week"]
    baseline = df_events[baseline_cols].mean(axis=1, skipna=True)
    df_events[f'{variable}_other_weeks'] = baseline

In [None]:
# Event-window count relative to the other-weeks baseline (1.0 = no change).
df_events['rent_ratio'] = df_events["rent_count_event"].div(df_events["rent_count_other_weeks"])
df_events['return_ratio'] = df_events["return_count_event"].div(df_events["return_count_other_weeks"])

In [None]:
# Ratios of 0 (nothing counted in the event window) and +/-inf (nothing counted
# in the comparison weeks) cannot be used in a geometric mean, so both are
# treated as missing.
# np.nan is used instead of pd.NA so the columns stay float64; replacing
# values in a float column with pd.NA can turn it into an object column.
ratio_cols = ["rent_ratio", "return_ratio"]
df_events[ratio_cols] = df_events[ratio_cols].replace([0, np.inf, -np.inf], np.nan)

In [None]:
# Quick check of scipy's gmean on a two-value example (NaNs would be omitted).
gmean([0.1,2], nan_policy="omit")

In [None]:
def gmean_with_nan_handling(series):
    """Return the geometric mean of the numeric, non-missing values in ``series``.

    Values are coerced with ``pd.to_numeric(errors="coerce")``, so entries that
    cannot be read as numbers become NaN and are dropped together with values
    that were already missing.

    Returns NaN when no usable value remains (e.g. a group whose ratios are all
    missing), without handing an empty input to scipy.
    """
    numeric_values = pd.to_numeric(series, errors="coerce").dropna()
    if numeric_values.empty:
        return float("nan")
    # All NaNs are already removed, so no nan_policy is needed here.
    return gmean(numeric_values)


def count_non_na(series):
    """Return the number of non-missing observations in ``series``."""
    return series.notna().sum()

# Per category: geometric mean of each ratio and the number of events that
# contributed a non-missing ratio.
ratio_columns = ["rent_ratio", "return_ratio"]
aggregations = [gmean_with_nan_handling, count_non_na]
stats_by_category = df_events.groupby("category")[ratio_columns].agg(aggregations)

In [None]:
# Inspect the events table, which now also holds the per-event counts and ratios.
df_events

In [None]:
# Geometric-mean ratios and number of contributing events per category.
stats_by_category

In [None]:
# df_events.sort_values("rent_ratio", ascending=False)

# group event category into broader categories

In [None]:
# Number of events per (SourceGroup, EventCategory) pair, ordered by label;
# dropna=False keeps pairs that contain missing values.
df_events[["SourceGroup","EventCategory"]].value_counts(dropna=False).sort_index()