In [200]:
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import os
import seaborn as sns
import numpy as np
import plotly.express as px
from datetime import datetime


In [56]:
# Path of the pickled events snapshot that the next cell loads.
filename = "../data/events/df_events 2025-03-02_12-26-38.pkl"

In [57]:
# Load the events table.
# NOTE(review): unpickling can execute arbitrary code, so `filename` should
# only ever point at a snapshot from a trusted source.
df = pd.read_pickle(filepath_or_buffer=filename)

# preprocessing

In [None]:
# Number of events per raw city label.
df["City"].value_counts()

In [59]:
# Every row whose city is not exactly "Freiburg" is relabelled as "Dresden".
not_freiburg = df["City"].ne("Freiburg")
df.loc[not_freiburg, "City"] = "Dresden"

In [60]:
# Calendar date of each event, derived from the full start timestamp.
# Item assignment is deliberate: `df.StartDate = ...` only updates a column
# that already exists; otherwise it just sets a Python attribute on the frame
# (pandas emits a UserWarning) and no column is created.
df["StartDate"] = df["StartDateTime"].dt.date

In [None]:
# Inspect the derived dates.
df["StartDate"]

In [62]:
# Raw `Source` values listed under the provider group they are reported as.
_SOURCES_BY_GROUP = {
    "GEMA": ["Gema"],
    "Self scraped": ["browse_ai", "Football matches"],
    "deecooob GmbH": [
        "facebook.com",
        "eventim.de",
        "bandsintown.com",
        "songkick.com",
        "wegow.com",
        "setlist.fm",
        "ticketmaster.de",
        "concertful.com",
        "jambase.com",
        "eventbrite.com",
        "partyflock",
        "livenation",
    ],
}

# Flattened lookup: source name -> provider group.
mapping = {
    source: group
    for group, sources in _SOURCES_BY_GROUP.items()
    for source in sources
}

In [63]:
# Translate each raw source into its provider group; sources that are not a
# key of `mapping` end up as NaN.
df["SourceGroup"] = df["Source"].map(mapping)

In [None]:
# Events per provider group.
df["SourceGroup"].value_counts()

In [None]:
# Freiburg events strictly between 2023-06-25 and 2024-07-01 (both exclusive).
window_start = pd.to_datetime("2023-06-25").date()
window_end = pd.to_datetime("2024-07-01").date()
in_window = (df.StartDate > window_start) & (df.StartDate < window_end)
flt = in_window & (df.City == "Freiburg")
df[flt]

In [None]:

# Remove events whose start date falls into windows known to hold wrong dates.
# Each entry is (after, before, city): both date bounds are exclusive, and
# None means "no upper bound" / "any city".
bad_date_windows = [
    ("2024-03-31", "2024-09-03", "Dresden"),
    ("2023-06-25", "2023-07-01", "Freiburg"),
    ("2023-07-31", "2024-09-03", "Freiburg"),
    ("2024-10-31", None, None),
]

for lower_bound, upper_bound, window_city in bad_date_windows:
    initial_len = len(df)

    flt = df.StartDate > pd.to_datetime(lower_bound).date()
    if upper_bound is not None:
        flt = flt & (df.StartDate < pd.to_datetime(upper_bound).date())
    if window_city is not None:
        flt = flt & (df.City == window_city)

    df = df[~flt]
    # Number of rows dropped by this window.
    print(initial_len - len(df))

initial_len = len(df)

# events per day

In [67]:
# Number of events per (city, day).
events_per_day = df.groupby(["City", "StartDate"]).size()
grouped = events_per_day.reset_index(name="Count")

In [68]:
# Order the daily counts chronologically.
grouped = grouped.sort_values("StartDate")

In [69]:
# First and last day that has at least one event.
start_dates = grouped["StartDate"]
min_date = start_dates.min()
max_date = start_dates.max()

In [70]:
# Every calendar day between the first and the last event day (inclusive).
date_range = pd.date_range(min_date, max_date)

In [71]:
# Full City x day grid, so days without events get a row as well.
cities = df["City"].unique()
multi_index = pd.MultiIndex.from_product(
    [cities, date_range],
    names=["City", "StartDate"],
)


In [72]:
# Align the counts to the full grid; grid entries with no matching row get NaN.
indexed_counts = grouped.set_index(["City", "StartDate"])
grouped = indexed_counts.reindex(multi_index).reset_index()

In [None]:
# Peek at the first count after reindexing.
grouped["Count"].iloc[0]

In [None]:
# Same peek after filling missing values with pd.NA.
grouped["Count"].fillna(pd.NA).iloc[0]

In [None]:
# Same peek after casting to the nullable Int64 dtype first.
grouped["Count"].astype("Int64").fillna(pd.NA).iloc[0]

In [76]:
# Store the counts as nullable integers: the cast turns the NaN entries into
# pd.NA by itself, so the former trailing `.fillna(pd.NA)` (replacing missing
# values with the missing-value marker) was a no-op and is dropped.
grouped["Count"] = grouped["Count"].astype("Int64")

In [None]:
# First count after the dtype conversion.
grouped["Count"].iloc[0]

In [None]:
# Bar chart of daily event counts in Freiburg before 2023-08-01.
is_early_freiburg = (grouped["City"] == "Freiburg") & (
    grouped["StartDate"] < pd.to_datetime('2023-08-01')
)
# Take an explicit copy: assigning into a filtered slice of `grouped` triggers
# SettingWithCopyWarning and is not guaranteed to write to the object we plot.
city_data = grouped[is_early_freiburg].copy()
# Days without any event are drawn as zero-height bars.
city_data["Count"] = city_data["Count"].fillna(0)
plt.bar(city_data["StartDate"], city_data["Count"])
plt.xticks(rotation=45)
plt.ylabel("Count")
plt.xlabel("StartDate")
plt.title("Events in Freiburg before 2024")
plt.show()

In [None]:
# Line chart of daily event counts, one line per city, saved as a PNG.
# NOTE(review): machine-specific output folder kept from the original cell;
# consider moving it to a shared configuration cell.
PLOTS_DIR = '/Users/v.sinichenko/Downloads/plots'
# savefig does not create missing folders, so make sure the target exists.
os.makedirs(PLOTS_DIR, exist_ok=True)

# Create the figure
plt.figure(figsize=(8, 5))

colors = plt.colormaps.get_cmap("Dark2")

for i, city in enumerate(['Dresden', "Freiburg"]):
    city_data = grouped[(grouped["City"] == city)]
    # Hand matplotlib plain floats: days without events become NaN and are
    # drawn as gaps, whereas pd.NA from a nullable Int64 column cannot be
    # converted to a float.
    daily_counts = city_data["Count"].astype("float64")
    plt.plot(city_data["StartDate"], daily_counts, linestyle="-", label=city, color=colors(i))

# Formatting
plt.xticks(rotation=45)
plt.ylabel("Number of events per day")
plt.xlabel("Date")
plt.legend(title="City")
plt.grid(True)
#plt.ylim(bottom=0, top=16000)
plt.savefig(os.path.join(PLOTS_DIR, 'daily_events.png'), format='png', bbox_inches='tight')
plt.show()


# Stacked by month

In [80]:
# Calendar month (as a pandas Period) in which each event starts.
df["StartMonth"] = df["StartDateTime"].dt.to_period(freq="M")

In [81]:
# Number of events per (city, month, provider group).
# NOTE(review): groupby drops rows with a NaN key by default, so events whose
# source has no provider group are presumably not counted here — confirm.
monthly_keys = ["City", "StartMonth", "SourceGroup"]
grouped = df.groupby(monthly_keys).size().reset_index(name="Count")

In [None]:
# Display the monthly counts per city and provider group.
grouped

In [83]:
# min_date, max_date = grouped['StartDateTime'].min(), grouped['StartDateTime'].max()

In [84]:
#multi_index = pd.MultiIndex.from_product([df["City"].unique(), date_range], names=["City", "date_rent"])


In [85]:
# grouped = grouped.set_index(["City", "date_rent"]).reindex(multi_index).reset_index()

In [86]:
# grouped.Count.iloc[0]

In [87]:
# grouped.Count.fillna(pd.NA).iloc[0]

In [88]:
# grouped.Count.astype('Int64').fillna(pd.NA).iloc[0]

In [89]:
# grouped.Count =  grouped.Count.astype('Int64').fillna(pd.NA)

In [90]:
# grouped.Count.iloc[0]

In [None]:
# Five evenly spaced HUSL colours as hex strings (displayed for reference).
husl_palette = sns.color_palette("husl", n_colors=5)
husl_colors = husl_palette.as_hex()
husl_colors

In [92]:
# Convert the monthly Period values to timestamps.
# Not re-runnable: a second run fails because the column is no longer periods.
grouped = grouped.assign(StartMonth=grouped["StartMonth"].dt.to_timestamp())

In [93]:
# Provider groups in order of first appearance in the monthly counts.
category_order = list(grouped["SourceGroup"].unique())

In [None]:
# Distinct months present in the monthly counts.
grouped["StartMonth"].unique()

In [None]:
# `to_map` is first assigned inside the plotting loop of the following cell,
# so displaying it here raised NameError on a fresh kernel (Restart & Run All).
# Preview the frame that loop slices per city instead.
grouped.sort_values(by=["SourceGroup"])

In [None]:
# One stacked bar chart per city: events per month, split by provider group.
# NOTE(review): machine-specific output folder kept from the original cell;
# consider moving it to a shared configuration cell.
PLOTS_DIR = '/Users/v.sinichenko/Downloads/plots'
# write_image does not create missing folders, so make sure the target exists.
os.makedirs(PLOTS_DIR, exist_ok=True)

grey = 200
# One fixed colour per provider group. A positional colour sequence would
# shift colours between the two charts whenever a city lacks one of the groups.
source_group_colors = {
    "GEMA": "rgb(188, 38, 26)",
    "Self scraped": f"rgb({grey}, {grey}, {grey})",
    "deecooob GmbH": "rgb(74, 114, 171)",
}

for city in ["Dresden", "Freiburg"]:
    # Sort by group (legend/stack order) and then by month so the row order is
    # deterministic; months are rendered as 'YYYY-MM' labels.
    to_map = (
        grouped[grouped["City"] == city]
        .sort_values(by=["SourceGroup", "StartMonth"])
        .assign(StartMonth=lambda frame: frame["StartMonth"].dt.strftime('%Y-%m').astype(str))
    )

    fig = px.bar(to_map,
                 x="StartMonth",
                 y="Count", color="SourceGroup",
                 barmode="stack",
                 color_discrete_map=source_group_colors)

    # y-axis title
    fig.update_yaxes(title_text='Number of events per month')

    fig.update_layout(template='simple_white')
    fig.update_layout(
        plot_bgcolor='white',  # Set background to white
        paper_bgcolor='white',  # Set outer background to white
        xaxis=dict(
            showgrid=True,  # Show grid lines
            gridcolor='rgb(211, 211, 211)',
            gridwidth=1,
            linecolor='black'
        ),
        yaxis=dict(
            showgrid=True,  # Show grid lines
            gridcolor='rgb(211, 211, 211)',
            gridwidth=1,
            showline=True,
        ),
        legend_title="Major source",
        margin=dict(t=0, b=0, l=0, r=0),
        autosize=True)

    fig.write_image(os.path.join(PLOTS_DIR, f'events_by_source_{city}_events.png'))

    fig.show()

# Table of small sources

In [None]:
# List the column labels of the events table.
df.columns

In [None]:
# Per-source event counts within the "deecooob GmbH" provider group.
# Optional export, currently disabled:
#   .to_csv('/Users/v.sinichenko/Downloads/plots/source_group_counts.csv', index=False)
is_deecooob = df["SourceGroup"] == "deecooob GmbH"
deecooob_sources = df.loc[is_deecooob, ["SourceGroup", "Source"]]
deecooob_sources.value_counts().reset_index(name="Count")

In [None]:
# Event counts for every (provider group, source) pair, ordered by label.
pair_counts = df[["SourceGroup", "Source"]].value_counts()
pair_counts.sort_index().reset_index(name="Count")

In [137]:
# Normalise the spelling of the GEMA source label.
flt = df["Source"].eq("Gema")
df.loc[flt, "Source"] = "GEMA"

In [138]:
# Dresden rows labelled "Football matches" get "dynamo-dresden.de" as source.
is_football = df["Source"].eq("Football matches")
flt = is_football & df["City"].eq("Dresden")
df.loc[flt, "Source"] = "dynamo-dresden.de"

In [139]:
# Freiburg rows labelled "Football matches" get "scfreiburg.com" as source.
is_football = df["Source"].eq("Football matches")
flt = is_football & df["City"].eq("Freiburg")
df.loc[flt, "Source"] = "scfreiburg.com"

In [155]:
# Load the browse_ai snapshot.
# NOTE(review): unpickling can execute arbitrary code; trusted files only.
browse_ai_path = "../data/events/browse_ai/df_browse_ai 2025-01-02_18-50-53.pkl"
df_browse_ai = pd.read_pickle(browse_ai_path)

In [181]:
# Reduce each Origin_URL to a short site label by removing the scheme, the
# "www." prefix and known listing-page path fragments, in this order.
url_fragments_to_strip = [
    "https://",
    "www.",
    "/veranstaltungen/heute/",
    "/veranstaltungen/morgen/",
    "/sachsen.html",
    "/baden-wuerttemberg.html",
    "/veranstaltungen/wochenende/",
    "/veranstaltungen-baden-wuerttemberg-qqxx",
    "/veranstaltungskalender?page=12",
]

origin_url = df_browse_ai["Origin_URL"]
for fragment in url_fragments_to_strip:
    # regex=False: the fragments are literal text. On pandas < 2.0 the default
    # treated them as regular expressions, where "." matches any character and
    # "?" makes the preceding character optional, so "?page=12" never matched.
    origin_url = origin_url.str.replace(fragment, "", regex=False)
df_browse_ai["Origin_URL"] = origin_url

In [None]:
# Share of rows per cleaned origin URL, scaled by 1920.
# NOTE(review): 1920 is unexplained here — presumably a total event count; confirm.
origin_share = df_browse_ai["Origin_URL"].value_counts(normalize=True)
origin_share * 1920

In [None]:
# Manual total of four counts.
sum((1125, 717, 59, 20))

In [None]:
# Number of events left after cleaning.
df.shape[0]

In [None]:
# Events per city in the cleaned table.
df["City"].value_counts()

In [201]:
# Persist the processed events table as a timestamped pickle.
EVENTS_FOLDER = "../data/events/"
# Second-resolution timestamp in the file name keeps exports distinct.
saved_at = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
# os.path.join avoids the doubled "/" that f"{EVENTS_FOLDER}/..." produced,
# because EVENTS_FOLDER already ends with a separator.
output_path = os.path.join(EVENTS_FOLDER, f"df_events with full StartDateTime {saved_at}.pkl")
df.to_pickle(output_path)