## USed for merging the 15 attraction dataset containing waiting times

In [144]:
import pandas as pd
import os
import re

### Merge files on Attraction -> output: Each excel for each attraction with cols ('Datetime', 'WaitTime', 'Month', 'Year')

In [145]:


# Folder with the Excel files
folder_path = 'EU_Park/europark_raw_files/'

# Regex to extract attraction, month, and year from filename
pattern = r'^(.*?) - Queue times in (\w+) (\d{4})\.xlsx'

# Dictionary to hold data for each attraction
attraction_data = {}

# Loop through all files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.xlsx'):
        match = re.match(pattern, filename)
        if match:
            attraction = match.group(1).strip()
            month = match.group(2)
            year = int(match.group(3))
            file_path = os.path.join(folder_path, filename)

            # Read the file
            df = pd.read_excel(file_path)

            # Add metadata
            df['Month'] = month
            df['Year'] = year

            # Rename columns for consistency
            df.columns = ['Datetime', 'WaitTime', 'Month', 'Year']

            # Add to the attraction's list
            if attraction not in attraction_data:
                attraction_data[attraction] = []
            attraction_data[attraction].append(df)



In [146]:
attraction_data

{'ARTHUR': [               Datetime  WaitTime  Month  Year
  0      2024-04-01 08:15      -4.0  April  2024
  1      2024-04-01 08:16      -4.0  April  2024
  2      2024-04-01 08:17      -4.0  April  2024
  3      2024-04-01 08:18      -4.0  April  2024
  4      2024-04-01 08:19      -4.0  April  2024
  ...                 ...       ...    ...   ...
  19367  2024-04-30 19:41      -4.0  April  2024
  19368  2024-04-30 19:42      -4.0  April  2024
  19369  2024-04-30 19:43      -4.0  April  2024
  19370  2024-04-30 19:44      -4.0  April  2024
  19371  2024-04-30 19:45      -4.0  April  2024
  
  [19372 rows x 4 columns],
                 Datetime  WaitTime  Month  Year
  0      2025-04-01 08:15      -4.0  April  2025
  1      2025-04-01 08:16      -4.0  April  2025
  2      2025-04-01 08:17      -4.0  April  2025
  3      2025-04-01 08:18      -4.0  April  2025
  4      2025-04-01 08:19      -4.0  April  2025
  ...                 ...       ...    ...   ...
  20060  2025-04-30 19:11   

In [147]:
# For each attraction, combine all months and save one Excel file
safe_attraction_name_list = []
save_path = 'EU_Park/europark_attraction_merged_dfs/'
for attraction, dfs in attraction_data.items():
    combined_df = pd.concat(dfs, ignore_index=True)

    # Drop rows where Datetime is missing
    combined_df = combined_df.dropna(subset=['Datetime'])

    # Split Datetime into Date and Time using string operations
    combined_df[['Date', 'Time']] = combined_df['Datetime'].astype(str).str.strip().str.split(' ', expand=True)
    safe_attraction_name = attraction.replace('-', ' ')  # avoid file path issues # Create the output file path
    fil_safe_attraction_name = safe_attraction_name.replace(' ','_')
    safe_attraction_name_list.append(fil_safe_attraction_name)
    # print(safe_attraction_name_list)
    output_file = os.path.join(save_path, f"{fil_safe_attraction_name} - All Queue Times.xlsx")
    combined_df.to_excel(output_file, index=False)
    print(f"✅ Saved merged file for: {attraction} -> {output_file}")

✅ Saved merged file for: ARTHUR -> EU_Park/europark_attraction_merged_dfs/ARTHUR - All Queue Times.xlsx
✅ Saved merged file for: Atlantica SuperSplash -> EU_Park/europark_attraction_merged_dfs/Atlantica_SuperSplash - All Queue Times.xlsx
✅ Saved merged file for: blue fire Megacoaster -> EU_Park/europark_attraction_merged_dfs/blue_fire_Megacoaster - All Queue Times.xlsx
✅ Saved merged file for: Euro-Mir -> EU_Park/europark_attraction_merged_dfs/Euro_Mir - All Queue Times.xlsx
✅ Saved merged file for: Eurosat - CanCan Coaster -> EU_Park/europark_attraction_merged_dfs/Eurosat___CanCan_Coaster - All Queue Times.xlsx
✅ Saved merged file for: Fjord Rafting -> EU_Park/europark_attraction_merged_dfs/Fjord_Rafting - All Queue Times.xlsx
✅ Saved merged file for: Matterhorn-Blitz -> EU_Park/europark_attraction_merged_dfs/Matterhorn_Blitz - All Queue Times.xlsx
✅ Saved merged file for: Pegasus - The YoungStar Coaster -> EU_Park/europark_attraction_merged_dfs/Pegasus___The_YoungStar_Coaster - All Q

### Merge All attractions in one single excel file

In [148]:
# Folder where individual attraction Excel files are stored
folder_path = 'EU_Park/europark_attraction_merged_dfs/'

# Get list of merged attraction files
files = [f for f in os.listdir(folder_path) if f.endswith('All Queue Times.xlsx')]
print(files)

['ARTHUR - All Queue Times.xlsx', 'Atlantica_SuperSplash - All Queue Times.xlsx', 'blue_fire_Megacoaster - All Queue Times.xlsx', 'Eurosat___CanCan_Coaster - All Queue Times.xlsx', 'Euro_Mir - All Queue Times.xlsx', 'Fjord_Rafting - All Queue Times.xlsx', 'Matterhorn_Blitz - All Queue Times.xlsx', 'Pegasus___The_YoungStar_Coaster - All Queue Times.xlsx', 'Pirates_in_Batavia - All Queue Times.xlsx', 'Silver_Star - All Queue Times.xlsx', 'Swiss_Bob_Run - All Queue Times.xlsx', 'Voletarium - All Queue Times.xlsx', 'Voltron_Nevera - All Queue Times.xlsx', 'Water_Rollercoaster_Poseidon - All Queue Times.xlsx', 'WODAN___Timburcoaster - All Queue Times.xlsx']


In [149]:
merged_df = None

In [150]:
for file in files:
    # Full path to the Excel file
    file_path = os.path.join(folder_path, file)

    # Extract attraction name from filename
    attraction = file.replace(' - All Queue Times.xlsx', '').strip()

    # Read file
    df = pd.read_excel(file_path)
    df = df.drop(columns=['Datetime'], errors='ignore')
    df.rename(columns={'WaitTime': f'{attraction}_WaitTime'}, inplace=True)

    # Merge using pandas only
    if merged_df is None:
        merged_df = df
    else:
        merged_df = pd.merge(merged_df, df, on=['Date', 'Time', 'Month', 'Year'], how='outer')

merged_df = merged_df.sort_values(by=['Date', 'Time'])

output_file = os.path.join(folder_path, 'All_Attractions_Queue_Times_By_Date_Time.csv')
merged_df.to_csv(output_file, index=False)

print(f"✅ Final merged file saved to: {output_file}")

✅ Final merged file saved to: EU_Park/europark_attraction_merged_dfs/All_Attractions_Queue_Times_By_Date_Time.csv


In [151]:
merged_df_dt = pd.read_csv("EU_Park/All_Attractions_Queue_Times_By_Date_Time.csv",index_col=False)

In [152]:
merged_df_dt.dropna()

Unnamed: 0.1,Unnamed: 0,ARTHUR_WaitTime,Month,Year,Date,Time,Atlantica_SuperSplash_WaitTime,blue_fire_Megacoaster_WaitTime,Eurosat___CanCan_Coaster_WaitTime,Euro_Mir_WaitTime,Fjord_Rafting_WaitTime,Matterhorn_Blitz_WaitTime,Pegasus___The_YoungStar_Coaster_WaitTime,Pirates_in_Batavia_WaitTime,Silver_Star_WaitTime,Swiss_Bob_Run_WaitTime,Voletarium_WaitTime,Voltron_Nevera_WaitTime,Water_Rollercoaster_Poseidon_WaitTime,WODAN___Timburcoaster_WaitTime
22834,22970,30.0,April,2024,2024-04-26,11:00,1.0,30.0,30.0,25.0,1.0,25.0,15.0,10.0,25.0,30.0,30.0,91.0,1.0,35.0
22835,22971,30.0,April,2024,2024-04-26,11:01,1.0,30.0,30.0,25.0,1.0,25.0,15.0,10.0,25.0,30.0,30.0,91.0,1.0,35.0
22836,22972,30.0,April,2024,2024-04-26,11:02,1.0,30.0,30.0,25.0,1.0,25.0,15.0,10.0,25.0,30.0,30.0,91.0,1.0,35.0
22837,22973,30.0,April,2024,2024-04-26,11:03,1.0,30.0,30.0,25.0,1.0,25.0,15.0,10.0,25.0,30.0,30.0,91.0,1.0,35.0
22838,22974,30.0,April,2024,2024-04-26,11:04,1.0,30.0,30.0,25.0,1.0,25.0,15.0,10.0,25.0,30.0,30.0,91.0,1.0,35.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216082,217114,-4.0,March,2025,2025-03-31,18:11,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0
216083,217115,-4.0,March,2025,2025-03-31,18:12,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0
216084,217116,-4.0,March,2025,2025-03-31,18:13,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0
216085,217117,-4.0,March,2025,2025-03-31,18:14,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0


In [153]:
merged_df_dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255788 entries, 0 to 255787
Data columns (total 20 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   Unnamed: 0                                255788 non-null  int64  
 1   ARTHUR_WaitTime                           229380 non-null  float64
 2   Month                                     255788 non-null  object 
 3   Year                                      255788 non-null  int64  
 4   Date                                      255788 non-null  object 
 5   Time                                      255788 non-null  object 
 6   Atlantica_SuperSplash_WaitTime            195889 non-null  float64
 7   blue_fire_Megacoaster_WaitTime            229382 non-null  float64
 8   Eurosat___CanCan_Coaster_WaitTime         229277 non-null  float64
 9   Euro_Mir_WaitTime                         229334 non-null  float64
 10  Fjord_Rafting_WaitTi

In [154]:
merged_df_dt.isna().sum()

Unnamed: 0                                      0
ARTHUR_WaitTime                             26408
Month                                           0
Year                                            0
Date                                            0
Time                                            0
Atlantica_SuperSplash_WaitTime              59899
blue_fire_Megacoaster_WaitTime              26406
Eurosat___CanCan_Coaster_WaitTime           26511
Euro_Mir_WaitTime                           26454
Fjord_Rafting_WaitTime                      60223
Matterhorn_Blitz_WaitTime                   26420
Pegasus___The_YoungStar_Coaster_WaitTime    26405
Pirates_in_Batavia_WaitTime                 26533
Silver_Star_WaitTime                         7094
Swiss_Bob_Run_WaitTime                       6727
Voletarium_WaitTime                         68066
Voltron_Nevera_WaitTime                     63048
Water_Rollercoaster_Poseidon_WaitTime       27968
WODAN___Timburcoaster_WaitTime              33693


In [155]:
merged_df_dt = merged_df_dt.dropna(subset=['Time'])

In [156]:
def report_missing_values(df):
    total_rows = len(df)
    missing_count = df.isna().sum()
    missing_percent = (missing_count / total_rows) * 100

    result = pd.DataFrame({
        'Missing Count': missing_count,
        'Missing %': missing_percent.round(2)
    })
    result = result.sort_values(by='Missing %', ascending=False)

    print(result)  # Only show columns with missing values

report_missing_values(merged_df_dt)

                                          Missing Count  Missing %
Voletarium_WaitTime                               68066      26.61
Voltron_Nevera_WaitTime                           63048      24.65
Fjord_Rafting_WaitTime                            60223      23.54
Atlantica_SuperSplash_WaitTime                    59899      23.42
WODAN___Timburcoaster_WaitTime                    33693      13.17
Water_Rollercoaster_Poseidon_WaitTime             27968      10.93
Pirates_in_Batavia_WaitTime                       26533      10.37
Eurosat___CanCan_Coaster_WaitTime                 26511      10.36
Euro_Mir_WaitTime                                 26454      10.34
Matterhorn_Blitz_WaitTime                         26420      10.33
blue_fire_Megacoaster_WaitTime                    26406      10.32
ARTHUR_WaitTime                                   26408      10.32
Pegasus___The_YoungStar_Coaster_WaitTime          26405      10.32
Silver_Star_WaitTime                               7094       

In [157]:
merged_df_dt.to_csv("EU_PARK/All_Attractions_Queue_Times_By_Date_Time.csv")

### Merge wind data files to one excel files contating wind data

In [158]:
def merge_fn_wind_prec_temp(df_path,df_type):
    """
    Parameters:
        df_path (str): Path to folder containing Excel files.
        data_type (str): Type of data ('wind', 'prec', 'temp', etc.)
        
    Returns:
        pd.DataFrame: Combined dataframe with Month and Year columns.
    """
    files = [f for f in os.listdir(df_path) if f.endswith('.xlsx')]
    data_final = []
    for file in files:
        file_path = os.path.join(df_path, file)

        # Extract month and year from filename: 'Wind speed in March 2024.xlsx'
        if df_type == 'wind':
            name_parts = file.replace('.xlsx', '').replace('Wind speed in ', '').strip().split()
        elif df_type == 'prec':
            name_parts = file.replace('.xlsx', '').replace('Precipitation probability in ', '').strip().split()
        elif df_type == 'temp':
            name_parts = file.replace('.xlsx', '').replace('Temperatures in ', '').strip().split()
        else:
            raise ValueError(f"Unknown data_type: {df_type}")
        month = name_parts[0]
        year = name_parts[1]

        # Read the file
        df = pd.read_excel(file_path)

        # Add Month and Year columns
        df['Month'] = month
        df['Year'] = int(year)
        data_final.append(df)

    # Combine all dataframes into one
    wind_df = pd.concat(data_final, ignore_index=True)
    wind_df[['Date', 'Time']] = wind_df['date_time'].astype(str).str.strip().str.split(' ', expand=True)
    wind_df = wind_df.drop(columns=['date_time'])

    return wind_df


In [159]:
wind_loc = 'EU_PARK/wind_speed/'
final_wind_df = merge_fn_wind_prec_temp(wind_loc,df_type='wind')

In [160]:
final_wind_df.shape

(213232, 5)

In [161]:
final_wind_df.head()

Unnamed: 0,wind_speed_in_kmh,Month,Year,Date,Time
0,10.0,April,2024,2024-04-01,08:15
1,10.0,April,2024,2024-04-01,08:16
2,10.0,April,2024,2024-04-01,08:17
3,15.0,April,2024,2024-04-01,08:18
4,15.0,April,2024,2024-04-01,08:19


In [162]:
report_missing_values(final_wind_df)

                   Missing Count  Missing %
wind_speed_in_kmh            418        0.2
Time                         418        0.2
Month                          0        0.0
Year                           0        0.0
Date                           0        0.0


### Merge Precipitation data files to one excel files contating Precipitation data

In [163]:
precipitation_loc = 'EU_PARK/precipitation/'
final_prec_df = merge_fn_wind_prec_temp(precipitation_loc,df_type='prec')


In [164]:
final_prec_df.shape

(213232, 5)

In [165]:
final_prec_df.head()

Unnamed: 0,precipitation_in_percent,Month,Year,Date,Time
0,70.0,April,2024,2024-04-01,08:15
1,70.0,April,2024,2024-04-01,08:16
2,70.0,April,2024,2024-04-01,08:17
3,60.0,April,2024,2024-04-01,08:18
4,60.0,April,2024,2024-04-01,08:19


In [166]:
report_missing_values(final_prec_df)

                          Missing Count  Missing %
precipitation_in_percent            418        0.2
Time                                418        0.2
Month                                 0        0.0
Year                                  0        0.0
Date                                  0        0.0


### Merge Temperature data files to one excel files contating Temperature data

In [167]:
temperature_loc = 'EU_PARK/temperatures/'
final_temp_df = merge_fn_wind_prec_temp(temperature_loc,df_type='temp')


In [168]:
final_temp_df.shape

(213232, 5)

In [169]:
final_temp_df.head()

Unnamed: 0,temperature_in_celsius,Month,Year,Date,Time
0,8.9,April,2024,2024-04-01,08:15
1,8.9,April,2024,2024-04-01,08:16
2,8.9,April,2024,2024-04-01,08:17
3,8.9,April,2024,2024-04-01,08:18
4,8.9,April,2024,2024-04-01,08:19


In [170]:
report_missing_values(final_temp_df)

                        Missing Count  Missing %
temperature_in_celsius            418        0.2
Time                              418        0.2
Month                               0        0.0
Year                                0        0.0
Date                                0        0.0


### Merge Final Excel of waiting times with wind data, Precipitaiton and Temperature data based on Month ,year,Date Time

In [171]:
# merged_df_dt : final df for all attraction waiting times
# final_temp_df: final df for all attraction Temperatures
# final_prec_df: final df for all attraction Precipitation
# final_wind_df: final df for all attraction Wind speed in kmh

In [172]:
merge_keys = ['Month', 'Year', 'Date', 'Time']


In [173]:
# Step-by-step outer merges using pandas only
merged = pd.merge(merged_df_dt, final_temp_df, on=['Month', 'Year', 'Date', 'Time'], how='outer')
merged = pd.merge(merged, final_prec_df, on=['Month', 'Year', 'Date', 'Time'], how='outer')
merged = pd.merge(merged, final_wind_df, on=['Month', 'Year', 'Date', 'Time'], how='outer')

# Optional: sort for clean structure
final_merged = merged.sort_values(by=['Year', 'Month', 'Date', 'Time']).reset_index(drop=True)



In [184]:
final_merged.shape

(256515, 23)

In [174]:
report_missing_values(final_merged)

                                          Missing Count  Missing %
Voletarium_WaitTime                               68793      26.82
Voltron_Nevera_WaitTime                           63775      24.86
Fjord_Rafting_WaitTime                            60950      23.76
Atlantica_SuperSplash_WaitTime                    60626      23.63
WODAN___Timburcoaster_WaitTime                    34420      13.42
Water_Rollercoaster_Poseidon_WaitTime             28695      11.19
wind_speed_in_kmh                                 27318      10.65
precipitation_in_percent                          27318      10.65
temperature_in_celsius                            27318      10.65
Pirates_in_Batavia_WaitTime                       27260      10.63
Eurosat___CanCan_Coaster_WaitTime                 27238      10.62
Euro_Mir_WaitTime                                 27181      10.60
Matterhorn_Blitz_WaitTime                         27147      10.58
Pegasus___The_YoungStar_Coaster_WaitTime          27132      1

In [175]:
final_temp_df.columns

Index(['temperature_in_celsius', 'Month', 'Year', 'Date', 'Time'], dtype='object')

### Add a column for holiday = 0 or 1 (true or false)

In [178]:
holiday_df = pd.read_excel('EU_PARK/holidays/holidays_europapark_apr2024_apr2025.xlsx')

In [179]:
holiday_df.columns

Index(['Date', 'holiday_event'], dtype='object')

In [183]:
holiday_df.shape

(86, 2)

In [181]:
holiday_df.head()

Unnamed: 0,Date,holiday_event
0,2024-03-29,Karfreitag
1,2024-03-31,Ostersonntag
2,2024-04-01,Ostermontag
3,2024-05-01,Tag der Arbeit
4,2024-05-09,Christi Himmelfahrt
