## USed for merging the 15 attraction dataset containing waiting times

In [None]:
import pandas as pd
import os
import re
!pip install openpyxl




### Merge files on Attraction -> output: Each excel for each attraction with cols ('Datetime', 'WaitTime', 'Month', 'Year')
### 0Attraction open
### -1Virtual Queue
### -2Maintenance
###  -3Closed due to weather
### -4Attraction closed
### 91over 90 minutes

In [48]:


# Folder with the Excel files
folder_path = 'EU_Park/europark_raw_files/'

# Regex to extract attraction, month, and year from filename
pattern = r'^(.*?) - Queue times in (\w+) (\d{4})\.xlsx'

# Dictionary to hold data for each attraction
attraction_data = {}

# Loop through all files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.xlsx'):
        match = re.match(pattern, filename)
        if match:
            attraction = match.group(1).strip()
            month = match.group(2)
            year = int(match.group(3))
            file_path = os.path.join(folder_path, filename)

            # Read the file
            df = pd.read_excel(file_path)

            # Add metadata
            df['Month'] = month
            df['Year'] = year

            # Rename columns for consistency
            df.columns = ['Datetime', 'WaitTime', 'Month', 'Year']

            # Add to the attraction's list
            if attraction not in attraction_data:
                attraction_data[attraction] = []
            attraction_data[attraction].append(df)



In [49]:
attraction_data

{'ARTHUR': [              Datetime  WaitTime  Month  Year
  0     2022-04-01 08:15      -4.0  April  2022
  1     2022-04-01 08:20      -4.0  April  2022
  2     2022-04-01 08:25      -4.0  April  2022
  3     2022-04-01 08:30      -4.0  April  2022
  4     2022-04-01 08:35      -4.0  April  2022
  ...                ...       ...    ...   ...
  3941  2022-04-30 18:55      10.0  April  2022
  3942  2022-04-30 19:00      10.0  April  2022
  3943  2022-04-30 19:05      -4.0  April  2022
  3944  2022-04-30 19:10      -4.0  April  2022
  3945  2022-04-30 19:15      -4.0  April  2022
  
  [3946 rows x 4 columns],
                 Datetime  WaitTime  Month  Year
  0      2023-04-01 08:15      -4.0  April  2023
  1      2023-04-01 08:16      -4.0  April  2023
  2      2023-04-01 08:17      -4.0  April  2023
  3      2023-04-01 08:18      -4.0  April  2023
  4      2023-04-01 08:19      -4.0  April  2023
  ...                 ...       ...    ...   ...
  19742  2023-04-30 20:11      -4.0  Apri

In [50]:
# For each attraction, combine all months and save one Excel file
safe_attraction_name_list = []
save_path = 'EU_Park/europark_attraction_merged_dfs/'
for attraction, dfs in attraction_data.items():
    combined_df = pd.concat(dfs, ignore_index=True)

    # Drop rows where Datetime is missing
    combined_df = combined_df.dropna(subset=['Datetime'])

    # Split Datetime into Date and Time using string operations
    combined_df[['Date', 'Time']] = combined_df['Datetime'].astype(str).str.strip().str.split(' ', expand=True)
    safe_attraction_name = attraction.replace('-', ' ')  # avoid file path issues # Create the output file path
    fil_safe_attraction_name = safe_attraction_name.replace(' ','_')
    safe_attraction_name_list.append(fil_safe_attraction_name)
    # print(safe_attraction_name_list)
    output_file = os.path.join(save_path, f"{fil_safe_attraction_name} - All Queue Times.xlsx")
    combined_df.to_excel(output_file, index=False)
    print(f"✅ Saved merged file for: {attraction} -> {output_file}")

✅ Saved merged file for: ARTHUR -> EU_Park/europark_attraction_merged_dfs/ARTHUR - All Queue Times.xlsx


### Merge All attractions in one single excel file

In [51]:
# Folder where individual attraction Excel files are stored
folder_path = 'EU_Park/europark_attraction_merged_dfs/'

# Get list of merged attraction files
files = [f for f in os.listdir(folder_path) if f.endswith('All Queue Times.xlsx')]
print(files)

['ARTHUR - All Queue Times.xlsx']


In [52]:
merged_df = None

In [53]:
for file in files:
    # Full path to the Excel file
    file_path = os.path.join(folder_path, file)

    # Extract attraction name from filename
    attraction = file.replace(' - All Queue Times.xlsx', '').strip()

    # Read file
    df = pd.read_excel(file_path)
    df = df.drop(columns=['Datetime'], errors='ignore')
    df.rename(columns={'WaitTime': f'{attraction}_WaitTime'}, inplace=True)

    # Merge using pandas only
    if merged_df is None:
        merged_df = df
    else:
        merged_df = pd.merge(merged_df, df, on=['Date', 'Time', 'Month', 'Year'], how='outer')

merged_df = merged_df.sort_values(by=['Date', 'Time'])

output_file = os.path.join(folder_path, 'All_Attractions_Queue_Times_By_Date_Time.csv')
merged_df.to_csv(output_file, index=False)

print(f"✅ Final merged file saved to: {output_file}")

✅ Final merged file saved to: EU_Park/europark_attraction_merged_dfs/All_Attractions_Queue_Times_By_Date_Time.csv


In [55]:
merged_df_dt = pd.read_csv("EU_PARK\europark_attraction_merged_dfs\All_Attractions_Queue_Times_By_Date_Time.csv",index_col=False)

In [56]:
merged_df_dt.dropna()

Unnamed: 0,ARTHUR_WaitTime,Month,Year,Date,Time
0,-4.0,April,2022,2022-04-01,08:15
1,-4.0,April,2022,2022-04-01,08:20
2,-4.0,April,2022,2022-04-01,08:25
3,-4.0,April,2022,2022-04-01,08:30
4,-4.0,April,2022,2022-04-01,08:35
...,...,...,...,...,...
548707,-4.0,April,2025,2025-04-30,19:11
548708,-4.0,April,2025,2025-04-30,19:12
548709,-4.0,April,2025,2025-04-30,19:13
548710,-4.0,April,2025,2025-04-30,19:14


In [57]:
merged_df_dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 548715 entries, 0 to 548714
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   ARTHUR_WaitTime  547261 non-null  float64
 1   Month            548715 non-null  object 
 2   Year             548715 non-null  int64  
 3   Date             548715 non-null  object 
 4   Time             547261 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 20.9+ MB


In [58]:
merged_df_dt.isna().sum()

ARTHUR_WaitTime    1454
Month                 0
Year                  0
Date                  0
Time               1454
dtype: int64

In [59]:
merged_df_dt = merged_df_dt.dropna(subset=['Time'])

In [60]:
def report_missing_values(df):
    total_rows = len(df)
    missing_count = df.isna().sum()
    missing_percent = (missing_count / total_rows) * 100

    result = pd.DataFrame({
        'Missing Count': missing_count,
        'Missing %': missing_percent.round(2)
    })
    result = result.sort_values(by='Missing %', ascending=False)

    print(result)  # Only show columns with missing values

report_missing_values(merged_df_dt)

                 Missing Count  Missing %
ARTHUR_WaitTime              0        0.0
Month                        0        0.0
Year                         0        0.0
Date                         0        0.0
Time                         0        0.0


In [None]:
#merged_df_dt.to_csv("EU_PARK/All_Attractions_Queue_Times_By_Date_Time.csv")

### Merge wind data files to one excel files contating wind data

In [61]:
def merge_fn_wind_prec_temp(df_path,df_type):
    """
    Parameters:
        df_path (str): Path to folder containing Excel files.
        data_type (str): Type of data ('wind', 'prec', 'temp', etc.)
        
    Returns:
        pd.DataFrame: Combined dataframe with Month and Year columns.
    """
    files = [f for f in os.listdir(df_path) if f.endswith('.xlsx')]
    data_final = []
    for file in files:
        file_path = os.path.join(df_path, file)

        # Extract month and year from filename: 'Wind speed in March 2024.xlsx'
        if df_type == 'wind':
            name_parts = file.replace('.xlsx', '').replace('Wind speed in ', '').strip().split()
        elif df_type == 'prec':
            name_parts = file.replace('.xlsx', '').replace('Precipitation probability in ', '').strip().split()
        elif df_type == 'temp':
            name_parts = file.replace('.xlsx', '').replace('Temperatures in ', '').strip().split()
        else:
            raise ValueError(f"Unknown data_type: {df_type}")
        month = name_parts[0]
        year = name_parts[1]

        # Read the file
        df = pd.read_excel(file_path)

        # Add Month and Year columns
        df['Month'] = month
        df['Year'] = int(year)
        data_final.append(df)

    # Combine all dataframes into one
    wind_df = pd.concat(data_final, ignore_index=True)
    wind_df[['Date', 'Time']] = wind_df['date_time'].astype(str).str.strip().str.split(' ', expand=True)
    wind_df = wind_df.drop(columns=['date_time'])

    return wind_df


In [62]:
wind_loc = 'EU_PARK/wind_speed/'
final_wind_df = merge_fn_wind_prec_temp(wind_loc,df_type='wind')

In [63]:
final_wind_df.shape

(464598, 5)

In [64]:
final_wind_df.head()

Unnamed: 0,wind_speed_in_kmh,Month,Year,Date,Time
0,25.0,April,2023,2023-04-01,08:15
1,25.0,April,2023,2023-04-01,08:16
2,25.0,April,2023,2023-04-01,08:17
3,25.0,April,2023,2023-04-01,08:18
4,25.0,April,2023,2023-04-01,08:19


In [65]:
report_missing_values(final_wind_df)

                   Missing Count  Missing %
wind_speed_in_kmh            836       0.18
Time                         836       0.18
Month                          0       0.00
Year                           0       0.00
Date                           0       0.00


### Merge Precipitation data files to one excel files contating Precipitation data

In [66]:
precipitation_loc = 'EU_PARK/precipitation/'
final_prec_df = merge_fn_wind_prec_temp(precipitation_loc,df_type='prec')


In [67]:
final_prec_df.shape

(464598, 5)

In [68]:
final_prec_df.head()

Unnamed: 0,precipitation_in_percent,Month,Year,Date,Time
0,30.0,April,2023,2023-04-01,08:15
1,30.0,April,2023,2023-04-01,08:16
2,30.0,April,2023,2023-04-01,08:17
3,30.0,April,2023,2023-04-01,08:18
4,30.0,April,2023,2023-04-01,08:19


In [69]:
report_missing_values(final_prec_df)

                          Missing Count  Missing %
precipitation_in_percent            836       0.18
Time                                836       0.18
Month                                 0       0.00
Year                                  0       0.00
Date                                  0       0.00


### Merge Temperature data files to one excel files contating Temperature data

In [70]:
temperature_loc = 'EU_PARK/temperatures/'
final_temp_df = merge_fn_wind_prec_temp(temperature_loc,df_type='temp')


In [71]:
final_temp_df.shape

(464598, 5)

In [72]:
final_temp_df.head()

Unnamed: 0,temperature_in_celsius,Month,Year,Date,Time
0,8.8,April,2023,2023-04-01,08:15
1,8.8,April,2023,2023-04-01,08:16
2,8.8,April,2023,2023-04-01,08:17
3,8.8,April,2023,2023-04-01,08:18
4,8.8,April,2023,2023-04-01,08:19


In [73]:
report_missing_values(final_temp_df)

                        Missing Count  Missing %
temperature_in_celsius            836       0.18
Time                              836       0.18
Month                               0       0.00
Year                                0       0.00
Date                                0       0.00


### Merge Final Excel of waiting times with wind data, Precipitaiton and Temperature data based on Month ,year,Date Time

In [74]:
# merged_df_dt : final df for all attraction waiting times
# final_temp_df: final df for all attraction Temperatures
# final_prec_df: final df for all attraction Precipitation
# final_wind_df: final df for all attraction Wind speed in kmh

In [75]:
merge_keys = ['Month', 'Year', 'Date', 'Time']


In [76]:
# Step-by-step outer merges using pandas only
merged = pd.merge(merged_df_dt, final_temp_df, on=['Month', 'Year', 'Date', 'Time'], how='outer')
merged = pd.merge(merged, final_prec_df, on=['Month', 'Year', 'Date', 'Time'], how='outer')
merged = pd.merge(merged, final_wind_df, on=['Month', 'Year', 'Date', 'Time'], how='outer')

# Optional: sort for clean structure
final_merged = merged.sort_values(by=['Year', 'Month', 'Date', 'Time']).reset_index(drop=True)



In [77]:
final_merged.shape

(549604, 8)

In [78]:
report_missing_values(final_merged)

                          Missing Count  Missing %
temperature_in_celsius            85828      15.62
precipitation_in_percent          85828      15.62
wind_speed_in_kmh                 85828      15.62
ARTHUR_WaitTime                    2336       0.43
Time                                836       0.15
Month                                 0       0.00
Year                                  0       0.00
Date                                  0       0.00


In [80]:
final_merged.columns

Index(['ARTHUR_WaitTime', 'Month', 'Year', 'Date', 'Time',
       'temperature_in_celsius', 'precipitation_in_percent',
       'wind_speed_in_kmh'],
      dtype='object')

In [88]:
gb = final_merged.groupby(['Month', 'Year','ARTHUR_WaitTime']).isna().value_counts()

AttributeError: 'DataFrameGroupBy' object has no attribute 'isna'

In [87]:
gb

Month      Year  ARTHUR_WaitTime  Date        Time   temperature_in_celsius  precipitation_in_percent  wind_speed_in_kmh
April      2023  -4.0             2023-04-01  08:15  8.8                     30.0                      25.0                 1
                                  2023-04-27  19:02  18.3                    10.0                      5.0                  1
                                  2023-04-30  08:23  11.8                    20.0                      15.0                 1
                                              08:22  11.8                    20.0                      15.0                 1
                                              08:21  11.8                    20.0                      15.0                 1
                                                                                                                           ..
September  2024   65.0            2024-09-21  11:10  17.0                    10.0                      5.0                 

In [82]:
final_merged_v1.columns

Index(['ARTHUR_WaitTime', 'Month', 'Year', 'Date', 'Time',
       'Atlantica_SuperSplash_WaitTime', 'blue_fire_Megacoaster_WaitTime',
       'Eurosat___CanCan_Coaster_WaitTime', 'Euro_Mir_WaitTime',
       'Fjord_Rafting_WaitTime', 'Matterhorn_Blitz_WaitTime',
       'Pegasus___The_YoungStar_Coaster_WaitTime',
       'Pirates_in_Batavia_WaitTime', 'Silver_Star_WaitTime',
       'Swiss_Bob_Run_WaitTime', 'Voletarium_WaitTime',
       'Voltron_Nevera_WaitTime', 'Water_Rollercoaster_Poseidon_WaitTime',
       'WODAN___Timburcoaster_WaitTime', 'temperature_in_celsius',
       'precipitation_in_percent', 'wind_speed_in_kmh'],
      dtype='object')

In [44]:
final_merged_v1.to_csv("final_df.csv")

In [None]:
final_merged_v1.groupby

In [45]:
df = pd.read_csv("Old_All_Attractions_Queue_Times_By_Date_Time.csv")

In [46]:
report_missing_values(df)

                                          Missing Count  Missing %
Voletarium_WaitTime                               68066      26.61
Voltron_Nevera_WaitTime                           63048      24.65
Fjord_Rafting_WaitTime                            60223      23.54
Atlantica_SuperSplash_WaitTime                    59899      23.42
WODAN___Timburcoaster_WaitTime                    33693      13.17
Water_Rollercoaster_Poseidon_WaitTime             27968      10.93
Pirates_in_Batavia_WaitTime                       26533      10.37
Eurosat___CanCan_Coaster_WaitTime                 26511      10.36
Euro_Mir_WaitTime                                 26454      10.34
Matterhorn_Blitz_WaitTime                         26420      10.33
blue_fire_Megacoaster_WaitTime                    26406      10.32
Pegasus___The_YoungStar_Coaster_WaitTime          26405      10.32
ARTHUR_WaitTime                                   26408      10.32
Silver_Star_WaitTime                               7094       

### Add a column for holiday = 0 or 1 (true or false)

In [38]:
holiday_df = pd.read_excel('EU_PARK/holidays/holidays_europapark_apr2024_apr2025.xlsx')

In [39]:
holiday_df.columns

Index(['Date', 'holiday_event'], dtype='object')

In [40]:
holiday_df.shape

(86, 2)

In [181]:
holiday_df.head()

Unnamed: 0,Date,holiday_event
0,2024-03-29,Karfreitag
1,2024-03-31,Ostersonntag
2,2024-04-01,Ostermontag
3,2024-05-01,Tag der Arbeit
4,2024-05-09,Christi Himmelfahrt
