In [None]:
import pandas as pd
import os

In [None]:
# Directory that is scanned for the ticket .xlsx exports.
file_path = "../Data_Raw/Tickets_Data"

# Loaded workbooks, keyed by their file name.
tables = dict()

In [None]:
# Load every Excel workbook found directly in `file_path` into `tables`,
# keyed by file name.
# - sorted(): os.listdir returns entries in arbitrary order; sorting makes the
#   load order (and therefore the dict order) reproducible.
# - "~$" prefix: Excel keeps a lock file named "~$<workbook>.xlsx" next to a
#   workbook that is currently open; it is not a readable workbook and would
#   make read_excel fail, so it is skipped.
for file in sorted(os.listdir(file_path)):
    if not file.endswith(".xlsx") or file.startswith("~$"):
        continue
    file_path_full = os.path.join(file_path, file)
    tables[file] = pd.read_excel(file_path_full)

In [None]:
# Work on independent copies so the cleaning steps that follow never modify
# the raw frames held in `tables`. A plain `adjusted_tables = tables` would
# make both names refer to the same dict and the same DataFrame objects.
adjusted_tables = {name: frame.copy() for name, frame in tables.items()}

# Preview one of the loaded workbooks.
adjusted_tables["2022.xlsx"].head()

In [None]:
# Inclusive range of hours of the day that is kept in the cleaned data.
# NOTE(review): presumably the opening hours — confirm.
FIRST_HOUR, LAST_HOUR = 10, 17

# Parse the timestamp column, derive the calendar date and the hour, and keep
# only the rows inside the hour window. A new frame is built for every file
# instead of assigning columns on the stored one, so the input frames are left
# untouched and re-running the cell gives the same result without pandas
# chained-assignment warnings.
for file_name, raw_df in list(adjusted_tables.items()):
    # "-" entries (presumably a placeholder for a missing timestamp) are
    # turned into NA so they parse to NaT.
    timestamps = pd.to_datetime(raw_df["Datum"].replace("-", pd.NA))

    enriched = raw_df.assign(
        Datum=timestamps,
        Date=timestamps.dt.date,
        Hour=timestamps.dt.hour,
    )

    # between() is inclusive on both ends; rows with a NaT timestamp have a
    # NaN hour, compare as False and are therefore dropped as well.
    in_window = enriched["Hour"].between(FIRST_HOUR, LAST_HOUR)

    adjusted_tables[file_name] = enriched[in_window].copy()

In [None]:
# Build one "visitors per day and visitor group" table per file and combine
# them into a single table.
visitors_per_day = {}
daily_pivots = []

for file_name, df in adjusted_tables.items():
    # Total visitors per (date, visitor group).
    visitors_per_day_group = (
        df.groupby(['Date', 'Bezoekersgroep'])['Bezoekers'].sum().reset_index()
    )

    # One row per date, one column per visitor group; a group without
    # visitors on a given day counts as 0.
    pivot_table = (
        visitors_per_day_group
        .pivot(index='Date', columns='Bezoekersgroep', values='Bezoekers')
        .fillna(0)
        .astype(int)
    )

    pivot_table['Total'] = pivot_table.sum(axis=1)

    visitors_per_day[file_name] = pivot_table
    daily_pivots.append(pivot_table)

if daily_pivots:
    # Concatenate once instead of growing the frame inside the loop (which
    # re-copies all previous rows on every iteration and starts from an empty
    # frame). A visitor group that is absent from one file shows up as NaN
    # after the concat; it is filled with 0 to match the per-file convention
    # above, which also keeps every column integer-typed.
    combined_pivot_table = pd.concat(daily_pivots).fillna(0).astype(int)
else:
    # No input files: keep the empty frame the loop-based version produced.
    combined_pivot_table = pd.DataFrame()

# Appended Table

In [None]:
# Put the combined daily table in chronological order ("Date" is the name of
# its index, which sort_values resolves as a sort key).
combined_pivot_table = combined_pivot_table.sort_values(by="Date", ascending=True)

In [None]:
# Output folder for the cleaned visitor data; it is created on demand and an
# already existing folder is not an error.
cleaned_data_path = "../../Data_Sources/Data_Cleaned/Visitors"
os.makedirs(cleaned_data_path, exist_ok=True)

# Write the combined daily table. The index is written too, so the Date values
# end up in the CSV.
output_file = os.path.join(cleaned_data_path, "entrance_data_cleaned.csv")
combined_pivot_table.to_csv(path_or_buf=output_file, index=True)

# Hourly attendance

In [None]:
# Build one "visitors per day and hour" table per file and combine them into
# a single table.
visitors_per_hour = {}
hourly_pivots = []

for file_name, df in adjusted_tables.items():
    # Total visitors per (date, hour).
    visitors_per_hour_group = (
        df.groupby(['Date', 'Hour'])['Bezoekers'].sum().reset_index()
    )

    # One row per date, one column per hour; an hour without visitors counts
    # as 0.
    pivot_table = visitors_per_hour_group.pivot(
        index='Date',
        columns='Hour',
        values='Bezoekers',
    ).fillna(0)
    pivot_table["Total"] = pivot_table.sum(axis=1)

    # Cast the counts to int while Date is still the index, then turn Date
    # back into a regular column.
    pivot_table = pivot_table.astype(int).reset_index()

    visitors_per_hour[file_name] = pivot_table
    hourly_pivots.append(pivot_table)

# Concatenate once instead of growing the frame inside the loop (which
# re-copies all previous rows on every iteration and starts from an empty
# frame). ignore_index gives the combined table unique row labels; every
# per-file table was numbered from 0 by reset_index, so keeping those labels
# would produce duplicates.
if hourly_pivots:
    combined_hourly_table = pd.concat(hourly_pivots, ignore_index=True)
else:
    combined_hourly_table = pd.DataFrame()

In [None]:
# Put the combined hourly table in chronological order.
combined_hourly_table = combined_hourly_table.sort_values(by="Date", ascending=True)

In [None]:
# Show the combined hourly table for a visual check.
combined_hourly_table

In [None]:
# Convert Date to a pandas datetime column and derive the calendar year and
# the English weekday name from it.
parsed_dates = pd.to_datetime(combined_hourly_table['Date'])
combined_hourly_table = combined_hourly_table.assign(
    Date=parsed_dates,
    Year=parsed_dates.dt.year,
    Weekday=parsed_dates.dt.day_name(),
)

# Show the extended table.
combined_hourly_table

In [None]:
# Every column label currently present in the combined hourly table.
all_columns = combined_hourly_table.columns.tolist()


def _is_hour_label(label):
    """Return True when a column label denotes an hour column.

    Numeric labels (int or float) always qualify. String labels qualify only
    when they consist of digits and dots and end in ".0" (for example "10.0").
    """
    if isinstance(label, (int, float)):
        return True
    return (
        isinstance(label, str)
        and label.replace('.', '').isdigit()
        and label.endswith('.0')
    )


hour_columns = [label for label in all_columns if _is_hour_label(label)]

# Columns that get averaged: all hour columns, plus Total when it exists.
numeric_columns = hour_columns + (['Total'] if 'Total' in all_columns else [])
agg_dict = dict.fromkeys(numeric_columns, 'mean')

# Mean visitors per hour (and per day total) for each year/weekday pair,
# rounded to one decimal for readability.
hourly_avg_by_year_weekday = (
    combined_hourly_table
    .groupby(['Year', 'Weekday'])
    .agg(agg_dict)
    .reset_index()
)
hourly_avg_by_year_weekday[numeric_columns] = (
    hourly_avg_by_year_weekday[numeric_columns].round(1)
)

# Order rows by year and then by weekday in calendar order (Monday first),
# using a temporary rank column that is removed again afterwards.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_rank = {day: position for position, day in enumerate(weekday_order)}
hourly_avg_by_year_weekday = (
    hourly_avg_by_year_weekday
    .assign(Weekday_order=hourly_avg_by_year_weekday['Weekday'].map(weekday_rank))
    .sort_values(['Year', 'Weekday_order'])
    .drop(columns='Weekday_order')
)

In [None]:
# Display the result grouped by weekday (Monday..Sunday) and, within each
# weekday, by year. Sorting the raw weekday names would order them
# alphabetically (Friday, Monday, Saturday, ...), so the Weekday column is
# turned into an ordered categorical for this view only; the stored table is
# not modified.
calendar_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
(
    hourly_avg_by_year_weekday
    .assign(
        Weekday=lambda table: pd.Categorical(
            table["Weekday"], categories=calendar_week, ordered=True
        )
    )
    .sort_values(by=["Weekday", "Year"], ascending=[True, True])
)

In [None]:
# Output folder for the cleaned visitor data; it is created on demand and an
# already existing folder is not an error.
cleaned_data_path = "../../Data_Sources/Data_Cleaned/Visitors"
os.makedirs(cleaned_data_path, exist_ok=True)

# Write the year/weekday averages without the row index.
output_file = os.path.join(cleaned_data_path, "hourly_visitors.csv")
hourly_avg_by_year_weekday.to_csv(path_or_buf=output_file, index=False)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Work on a copy so plotting never alters the aggregated table.
plot_data = hourly_avg_by_year_weekday.copy()

# Every column that is not an identifier or the daily total is an hour column.
hour_cols = [col for col in plot_data.columns if col not in ['Hour', 'Year', 'Weekday', 'Total']]

# Long format: one row per (year, weekday, hour) with the average visitors.
melted_data = pd.melt(
    plot_data,
    id_vars=['Year', 'Weekday'],
    value_vars=hour_cols,
    var_name='Hour',
    value_name='Visitors'
)

# Make the hour labels numeric so they are plotted on a continuous x axis.
melted_data['Hour'] = melted_data['Hour'].astype(float)

# Plot style. No plt.figure() call is made here: FacetGrid creates its own
# figure, so a figure opened beforehand would only be rendered as an extra,
# empty output.
sns.set_style("darkgrid")

# Calendar order for the grid columns.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Grid rows: the years that actually occur in the data, ascending. A fixed
# list of years would silently drop any year outside it and draw empty rows
# for listed years that have no data.
year_order = sorted(melted_data['Year'].unique().tolist())

# One panel per (year, weekday).
g = sns.FacetGrid(
    melted_data,
    col='Weekday',
    row='Year',
    col_order=weekday_order,
    row_order=year_order,
    height=3,
    aspect=1.2
)

# Hourly profile in every panel.
g.map_dataframe(
    sns.lineplot,
    x='Hour',
    y='Visitors',
    marker='o'
)

# Titles and axis labels.
g.set_axis_labels('Hour of Day', 'Average Number of Visitors')
g.set_titles('Year: {row_name} | {col_name}')

# Leave room at the top of the grid for the overall title.
plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.suptitle('Average Hourly Visitor Distribution by Year and Weekday', fontsize=16)

plt.show()

# Second figure: heatmap of the average daily total by weekday (rows, in
# calendar order) and year (columns).
plt.figure(figsize=(12, 8))
pivot_data = plot_data.pivot(index='Weekday', columns='Year', values='Total')
pivot_data = pivot_data.reindex(weekday_order)

sns.heatmap(pivot_data, annot=True, fmt='.1f', cmap='YlOrRd', linewidths=.5)
plt.title('Total Average Daily Visitors by Year and Weekday', fontsize=14)
plt.tight_layout()
plt.show()

# Add all calendar days to the combined entrance data

In [None]:
# Re-load the daily table from the same folder this notebook writes its
# cleaned CSV files to, so reading and writing cannot point at different
# locations. The previous "../Data_Cleaned/..." path only resolved to that
# folder when the directory one level above the notebook's folder happened to
# be named "Data_Sources".
df = pd.read_csv(
    os.path.join("../../Data_Sources/Data_Cleaned/Visitors", "entrance_data_cleaned.csv")
)

In [None]:
# Only discard rows that have no date. A bare dropna() also deletes every day
# on which any visitor-group column is empty (for example a group that does
# not occur in one of the yearly files); the date-range fill further down
# would then re-create those days as all-zero rows and wipe their real counts.
# Empty counts in the listed group columns are zero-filled in that later step.
df = df.dropna(subset=["Date"])

In [None]:
# Show the re-loaded daily table for a visual check.
df

In [None]:
# Parse the Date column so it can be matched against a generated calendar.
df = df.assign(Date=pd.to_datetime(df['Date']))

# Calendar covering every day from the earliest to the latest date present.
start_date = df['Date'].min()
end_date = df['Date'].max()
date_range = pd.date_range(start=start_date, end=end_date)
complete_df = pd.DataFrame({'Date': date_range})

# Left join onto the calendar: days without data get NaN in every count column.
filled_df = complete_df.merge(df, how='left', on='Date')

# Treat those gaps as "no visitors" and restore integer counts.
columns_to_fill = ['Extern', 'PO', 'Recreatief Buitenland', 'Recreatief NL', 'Student', 'VO', 'Total']
zero_filled_counts = filled_df[columns_to_fill].fillna(0)
filled_df[columns_to_fill] = zero_filled_counts.astype(int)

# Chronological order.
filled_df = filled_df.sort_values(by='Date')

In [None]:
# Output folder for the cleaned visitor data; it is created on demand and an
# already existing folder is not an error.
cleaned_data_path = "../../Data_Sources/Data_Cleaned/Visitors"
os.makedirs(cleaned_data_path, exist_ok=True)

# Write the gap-filled daily table without the row index (Date is a regular
# column here).
output_file = os.path.join(cleaned_data_path, "entrance_data_cleaned.csv")
filled_df.to_csv(path_or_buf=output_file, index=False)