In [2]:
import numpy as np
import pandas as pd

In [3]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [7]:
# Define the folder path and list of files
folder_path = "/content/drive/Shared drives/Hackathon/data/cleaned_data/"
file_names = [
    "entity_schedule_cleaned.csv",
    "attendance_cleaned.csv",
    "parade_night_show_cleaned.csv",
    "waiting_times_cleaned.csv",
    "weather_data_cleaned.csv"
]

# Dictionary to store DataFrames
dataframes = {}

# Iterate over files and load each into a separate DataFrame
for file in file_names:
    file_path = folder_path + file  # Construct full path
    dataframes[file] = pd.read_csv(file_path)  # Read full data into a DataFrame

In [8]:
# Set pandas options to display full DataFrame width
pd.set_option("display.max_columns", None)  # Show all columns
pd.set_option("display.width", 1000)  # Adjust width to prevent wrapping
pd.set_option("display.max_rows", 100)  # Show more rows if needed

In [9]:
entity_schedule_df = dataframes["entity_schedule_cleaned.csv"]
attendance_df = dataframes["attendance_cleaned.csv"]
parade_df = dataframes["parade_night_show_cleaned.csv"]
waiting_times_df = dataframes["waiting_times_cleaned.csv"]
weather_df = dataframes["weather_data_cleaned.csv"]

In [81]:
# Ensure date columns are in datetime format for accurate merging
attendance_df["USAGE_DATE"] = pd.to_datetime(attendance_df["USAGE_DATE"])
waiting_times_df["WORK_DATE"] = pd.to_datetime(waiting_times_df["WORK_DATE"])

# Drop the "FACILITY_NAME" column from attendance_df
attendance_df = attendance_df.drop(columns=["FACILITY_NAME"], errors="ignore")

# Rename USAGE_DATE to WORK_DATE in attendance_df for consistency
attendance_df = attendance_df.rename(columns={"USAGE_DATE": "WORK_DATE"})

# Merge the two DataFrames on WORK_DATE
merged_df = pd.merge(attendance_df, waiting_times_df, on="WORK_DATE", how="outer")  # Use "inner" if you want only common dates

# Print the first few rows of the merged DataFrame
print(merged_df.head())

   WORK_DATE  attendance                 DEB_TIME  DEB_TIME_HOUR                 FIN_TIME ENTITY_DESCRIPTION_SHORT  WAIT_TIME_MAX  NB_UNITS  GUEST_CARRIED  CAPACITY  ADJUST_CAPACITY  OPEN_TIME  UP_TIME  DOWNTIME  NB_MAX_UNIT  SUM_UP_DOWN
0 2018-01-01         NaN  2018-01-01 21:00:00.000             21  2018-01-01 21:15:00.000           Roller Coaster              0       2.0            0.0     0.000             0.00          0        0         0          2.0            0
1 2018-01-01         NaN  2018-01-01 19:30:00.000             19  2018-01-01 19:45:00.000              Bumper Cars              5      18.0          148.0   254.749           254.75         15       15         0         18.0           15
2 2018-01-01         NaN  2018-01-01 22:30:00.000             22  2018-01-01 22:45:00.000              Rapids Ride              0       1.0            0.0     0.000             0.00          0        0         0          2.0            0
3 2018-01-01         NaN  2018-01-01 12:45:00.00

In [82]:
# Drop rows where ATTENDANCE is NaN
merged_df_att = merged_df.dropna(subset=["attendance"])

# Print the first few rows of the cleaned DataFrame
print(merged_df_att.head())

        WORK_DATE  attendance                 DEB_TIME  DEB_TIME_HOUR                 FIN_TIME ENTITY_DESCRIPTION_SHORT  WAIT_TIME_MAX  NB_UNITS  GUEST_CARRIED  CAPACITY  ADJUST_CAPACITY  OPEN_TIME  UP_TIME  DOWNTIME  NB_MAX_UNIT  SUM_UP_DOWN
211400 2018-06-01     46804.0  2018-06-01 19:00:00.000             19  2018-06-01 19:15:00.000           Roller Coaster             15       2.0           50.0     75.00            75.00         15       15         0          2.0           15
211401 2018-06-01     46804.0  2018-06-01 20:30:00.000             20  2018-06-01 20:45:00.000                  Zipline             40      12.0           69.0    101.25           101.25         15       15         0         12.0           15
211402 2018-06-01     46804.0  2018-06-01 16:30:00.000             16  2018-06-01 16:45:00.000         Spinning Coaster             15       5.0          283.0    526.25           438.50         15       15         0          6.0           15
211403 2018-06-01     46804.

In [83]:
# Ensure WORK_DATE columns are in datetime format
parade_df["WORK_DATE"] = pd.to_datetime(parade_df["WORK_DATE"])
merged_df_att["WORK_DATE"] = pd.to_datetime(merged_df["WORK_DATE"])

# Merge parade_df with merged_df on WORK_DATE
second_merged_df = pd.merge(merged_df_att, parade_df, on="WORK_DATE", how="outer")  # Use "inner" for only matching dates

# Print the first few rows of the final merged DataFrame
print(second_merged_df.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df_att["WORK_DATE"] = pd.to_datetime(merged_df["WORK_DATE"])


   WORK_DATE  attendance                 DEB_TIME  DEB_TIME_HOUR                 FIN_TIME ENTITY_DESCRIPTION_SHORT  WAIT_TIME_MAX  NB_UNITS  GUEST_CARRIED  CAPACITY  ADJUST_CAPACITY  OPEN_TIME  UP_TIME  DOWNTIME  NB_MAX_UNIT  SUM_UP_DOWN  Unnamed: 0 NIGHT_SHOW PARADE_1 PARADE_2
0 2018-06-01     46804.0  2018-06-01 19:00:00.000           19.0  2018-06-01 19:15:00.000           Roller Coaster           15.0       2.0           50.0     75.00            75.00       15.0     15.0       0.0          2.0         15.0         NaN        NaN      NaN      NaN
1 2018-06-01     46804.0  2018-06-01 20:30:00.000           20.0  2018-06-01 20:45:00.000                  Zipline           40.0      12.0           69.0    101.25           101.25       15.0     15.0       0.0         12.0         15.0         NaN        NaN      NaN      NaN
2 2018-06-01     46804.0  2018-06-01 16:30:00.000           16.0  2018-06-01 16:45:00.000         Spinning Coaster           15.0       5.0          283.0    526.2

In [91]:
# Convert datetime columns to proper format
second_merged_df["DEB_TIME"] = pd.to_datetime(second_merged_df["DEB_TIME"])
second_merged_df["FIN_TIME"] = pd.to_datetime(second_merged_df["FIN_TIME"])
weather_df["dt_iso"] = pd.to_datetime(weather_df["dt_iso"])

# Extract only the date part for merging
second_merged_df["WORK_DATE"] = second_merged_df["DEB_TIME"].dt.date  # Extract only date
weather_df["DATE_ONLY"] = weather_df["dt_iso"].dt.date  # Extract only date

# Reduce memory usage by keeping only necessary columns before merging
weather_df = weather_df[['dt_iso', 'DATE_ONLY', 'temp', 'visibility', 'dew_point', 'feels_like', 'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed', 'wind_deg', 'wind_gust', 'rain_1h', 'rain_3h', 'snow_1h', 'snow_3h', 'clouds_all', 'weather_main', 'weather_description']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_df["dt_iso"] = pd.to_datetime(weather_df["dt_iso"])


In [92]:
# Drop rows where ATTENDANCE is NaN
second_merged_df = second_merged_df.dropna(subset=["attendance"])

# Print the first few rows of the cleaned DataFrame
print(second_merged_df.head())

    WORK_DATE  attendance            DEB_TIME  DEB_TIME_HOUR            FIN_TIME ENTITY_DESCRIPTION_SHORT  WAIT_TIME_MAX  NB_UNITS  GUEST_CARRIED  CAPACITY  ADJUST_CAPACITY  OPEN_TIME  UP_TIME  DOWNTIME  NB_MAX_UNIT  SUM_UP_DOWN  Unnamed: 0 NIGHT_SHOW PARADE_1 PARADE_2
0  2018-06-01     46804.0 2018-06-01 19:00:00           19.0 2018-06-01 19:15:00           Roller Coaster           15.0       2.0           50.0     75.00            75.00       15.0     15.0       0.0          2.0         15.0         NaN        NaN      NaN      NaN
1  2018-06-01     46804.0 2018-06-01 20:30:00           20.0 2018-06-01 20:45:00                  Zipline           40.0      12.0           69.0    101.25           101.25       15.0     15.0       0.0         12.0         15.0         NaN        NaN      NaN      NaN
2  2018-06-01     46804.0 2018-06-01 16:30:00           16.0 2018-06-01 16:45:00         Spinning Coaster           15.0       5.0          283.0    526.25           438.50       15.0     15

In [93]:
# Check for missing values in DEB_TIME
missing_deb_time = second_merged_df[second_merged_df["DEB_TIME"].isna()]

# Print summary
print(f"Total missing DEB_TIME values: {missing_deb_time.shape[0]}")

# Filter rows
row_df = second_merged_df[second_merged_df["DEB_TIME"] .isna()]

# Display the filtered DataFrame
print(row_df)

Total missing DEB_TIME values: 0
Empty DataFrame
Columns: [WORK_DATE, attendance, DEB_TIME, DEB_TIME_HOUR, FIN_TIME, ENTITY_DESCRIPTION_SHORT, WAIT_TIME_MAX, NB_UNITS, GUEST_CARRIED, CAPACITY, ADJUST_CAPACITY, OPEN_TIME, UP_TIME, DOWNTIME, NB_MAX_UNIT, SUM_UP_DOWN, Unnamed: 0, NIGHT_SHOW, PARADE_1, PARADE_2]
Index: []


In [94]:
# 🔥 Step 1: Filter weather data **before merging** (reduce size)
valid_dates = set(second_merged_df["WORK_DATE"])  # Get only necessary dates
filtered_weather_df = weather_df[weather_df["DATE_ONLY"].isin(valid_dates)]  # Keep only matching dates

# 🔥 Step 2: Sort dataframes for efficient `merge_asof()`
second_merged_df = second_merged_df.sort_values("DEB_TIME")
filtered_weather_df = filtered_weather_df.sort_values("dt_iso")

# 🔥 Step 3: Use `merge_asof()` in chunks to avoid memory overload
chunk_size = 50000  # Adjust based on memory limits
third_merged_dfs = []  # Store results

for i in range(0, len(second_merged_df), chunk_size):
    chunk = second_merged_df.iloc[i:i + chunk_size]  # Get small chunk of data
    merged_chunk = pd.merge_asof(
        chunk, filtered_weather_df,
        left_on="DEB_TIME", right_on="dt_iso",
        direction="backward"
    )
    third_merged_dfs.append(merged_chunk)

# 🔥 Step 4: Combine all processed chunks into a single DataFrame
third_merged_df = pd.concat(third_merged_dfs, ignore_index=True)

# Drop unnecessary columns
third_merged_df = third_merged_df.drop(columns=["DATE_ONLY"])

# Print a sample to verify
print(third_merged_df.head())

    WORK_DATE  attendance            DEB_TIME  DEB_TIME_HOUR            FIN_TIME ENTITY_DESCRIPTION_SHORT  WAIT_TIME_MAX  NB_UNITS  GUEST_CARRIED  CAPACITY  ADJUST_CAPACITY  OPEN_TIME  UP_TIME  DOWNTIME  NB_MAX_UNIT  SUM_UP_DOWN  Unnamed: 0 NIGHT_SHOW PARADE_1 PARADE_2              dt_iso   temp  visibility  dew_point  feels_like  temp_min  temp_max  pressure  humidity  wind_speed  wind_deg  wind_gust  rain_1h  rain_3h  snow_1h  snow_3h  clouds_all weather_main weather_description
0  2018-06-01     46804.0 2018-06-01 09:00:00            9.0 2018-06-01 09:15:00            Haunted House            0.0       9.0            0.0       0.0              0.0        0.0      0.0       0.0          9.0          0.0         NaN        NaN      NaN      NaN 2018-06-01 09:00:00  18.55         NaN      16.35       18.73     17.74     18.94      1020        87        2.95       264        NaN      NaN      NaN      NaN      NaN         100       Clouds     overcast clouds
1  2018-06-01     46804.0 20

In [96]:
third_merged_df = third_merged_df.drop(columns=["Unnamed: 0"], errors="ignore")
third_merged_df = third_merged_df.drop(columns=["dt_iso"], errors="ignore")

# Print the full head of the DataFrame
print(third_merged_df.head(10))
print(third_merged_df.columns)
print(weather_df.columns)

    WORK_DATE  attendance            DEB_TIME  DEB_TIME_HOUR            FIN_TIME ENTITY_DESCRIPTION_SHORT  WAIT_TIME_MAX  NB_UNITS  GUEST_CARRIED  CAPACITY  ADJUST_CAPACITY  OPEN_TIME  UP_TIME  DOWNTIME  NB_MAX_UNIT  SUM_UP_DOWN NIGHT_SHOW PARADE_1 PARADE_2   temp  visibility  dew_point  feels_like  temp_min  temp_max  pressure  humidity  wind_speed  wind_deg  wind_gust  rain_1h  rain_3h  snow_1h  snow_3h  clouds_all weather_main weather_description
0  2018-06-01     46804.0 2018-06-01 09:00:00            9.0 2018-06-01 09:15:00            Haunted House            0.0       9.0            0.0      0.00             0.00        0.0      0.0       0.0          9.0          0.0        NaN      NaN      NaN  18.55         NaN      16.35       18.73     17.74     18.94      1020        87        2.95       264        NaN      NaN      NaN      NaN      NaN         100       Clouds     overcast clouds
1  2018-06-01     46804.0 2018-06-01 09:00:00            9.0 2018-06-01 09:15:00           K

In [98]:
# Drop rows where ATTENDANCE is NaN
third_merged_df = third_merged_df.dropna(subset=["attendance"])

# Print the first few rows of the cleaned DataFrame
print(third_merged_df.head())

    WORK_DATE  attendance            DEB_TIME  DEB_TIME_HOUR            FIN_TIME ENTITY_DESCRIPTION_SHORT  WAIT_TIME_MAX  NB_UNITS  GUEST_CARRIED  CAPACITY  ADJUST_CAPACITY  OPEN_TIME  UP_TIME  DOWNTIME  NB_MAX_UNIT  SUM_UP_DOWN NIGHT_SHOW PARADE_1 PARADE_2   temp  visibility  dew_point  feels_like  temp_min  temp_max  pressure  humidity  wind_speed  wind_deg  wind_gust  rain_1h  rain_3h  snow_1h  snow_3h  clouds_all weather_main weather_description
0  2018-06-01     46804.0 2018-06-01 09:00:00            9.0 2018-06-01 09:15:00            Haunted House            0.0       9.0            0.0       0.0              0.0        0.0      0.0       0.0          9.0          0.0        NaN      NaN      NaN  18.55         NaN      16.35       18.73     17.74     18.94      1020        87        2.95       264        NaN      NaN      NaN      NaN      NaN         100       Clouds     overcast clouds
1  2018-06-01     46804.0 2018-06-01 09:00:00            9.0 2018-06-01 09:15:00           K

In [99]:
cleaned_file_path = "/content/drive/Shared drives/Hackathon/data/cleaned_data/merged_no_ES.csv"
third_merged_df.to_csv(cleaned_file_path, index=False)
print(f"Cleaned dataset saved to: {cleaned_file_path}")

Cleaned dataset saved to: /content/drive/Shared drives/Hackathon/data/cleaned_data/merged_no_ES.csv


In [100]:
file_path = "/content/drive/Shared drives/Hackathon/data/cleaned_data/merged_no_ES.csv"
merged_no_ES_df = pd.read_csv(file_path)

  merged_no_ES_df = pd.read_csv(file_path)


In [104]:
# Convert datetime columns to proper format
merged_no_ES_df["DEB_TIME"] = pd.to_datetime(merged_no_ES_df["DEB_TIME"])
merged_no_ES_df["FIN_TIME"] = pd.to_datetime(merged_no_ES_df["FIN_TIME"])
entity_schedule_df["DEB_TIME"] = pd.to_datetime(entity_schedule_df["DEB_TIME"])
entity_schedule_df["FIN_TIME"] = pd.to_datetime(entity_schedule_df["FIN_TIME"])

In [110]:
# Ensure ENTITY_DESCRIPTION_SHORT is a string for proper merging
merged_no_ES_df["ENTITY_DESCRIPTION_SHORT"] = merged_no_ES_df["ENTITY_DESCRIPTION_SHORT"].astype(str)
entity_schedule_df["ENTITY_DESCRIPTION_SHORT"] = entity_schedule_df["ENTITY_DESCRIPTION_SHORT"].astype(str)

In [111]:
# Sort DataFrames to align them for merge_asof()
merged_no_ES_df = merged_no_ES_df.sort_values(["ENTITY_DESCRIPTION_SHORT", "DEB_TIME"])
entity_schedule_df = entity_schedule_df.sort_values(["ENTITY_DESCRIPTION_SHORT", "DEB_TIME"])

# Verify sorting
print("Sorting check (merged_no_ES_df):", merged_no_ES_df[["ENTITY_DESCRIPTION_SHORT", "DEB_TIME"]].head())
print("Sorting check (entity_schedule_df):", entity_schedule_df[["ENTITY_DESCRIPTION_SHORT", "DEB_TIME"]].head())

Sorting check (merged_no_ES_df):     ENTITY_DESCRIPTION_SHORT            DEB_TIME
5                Bumper Cars 2018-06-01 09:00:00
25               Bumper Cars 2018-06-01 09:15:00
51               Bumper Cars 2018-06-01 09:30:00
76               Bumper Cars 2018-06-01 09:45:00
106              Bumper Cars 2018-06-01 10:00:00
Sorting check (entity_schedule_df):       ENTITY_DESCRIPTION_SHORT            DEB_TIME
19528              Bumper Cars 2018-01-01 08:30:00
287                Bumper Cars 2018-01-02 08:00:00
13229              Bumper Cars 2018-01-03 08:00:00
5357               Bumper Cars 2018-01-04 08:00:00
1335               Bumper Cars 2018-01-05 08:00:00


In [114]:
# Create a new column, assuming all attractions are open initially
merged_no_ES_df["REF_CLOSING_DESCRIPTION"] = "Open"

# Iterate over each row in entity_schedule_df to check closures
for _, row in entity_schedule_df.iterrows():
    closure_start = row["DEB_TIME"]
    closure_end = row["FIN_TIME"]
    attraction = row["ENTITY_DESCRIPTION_SHORT"]
    closure_reason = row["REF_CLOSING_DESCRIPTION"]

    # If it's a park closure, mark all attractions as closed
    if row["ENTITY_TYPE"] == "PARK":
        merged_no_ES_df.loc[
            (merged_no_ES_df["DEB_TIME"] >= closure_start) &
            (merged_no_ES_df["FIN_TIME"] <= closure_end),
            "REF_CLOSING_DESCRIPTION"
        ] = closure_reason
    else:
        # Otherwise, mark only the specific attraction as closed
        merged_no_ES_df.loc[
            (merged_no_ES_df["ENTITY_DESCRIPTION_SHORT"] == attraction) &
            (merged_no_ES_df["DEB_TIME"] >= closure_start) &
            (merged_no_ES_df["FIN_TIME"] <= closure_end),
            "REF_CLOSING_DESCRIPTION"
        ] = closure_reason

In [122]:
cleaned_file_path = "/content/drive/Shared drives/Hackathon/data/cleaned_data/merged_with_ES.csv"
merged_no_ES_df.to_csv(cleaned_file_path, index=False)
print(f"Cleaned dataset saved to: {cleaned_file_path}")

Cleaned dataset saved to: /content/drive/Shared drives/Hackathon/data/cleaned_data/merged_with_ES.csv


In [143]:
merged_no_ES_df["reference"] = merged_no_ES_df["REF_CLOSING_DESCRIPTION"].apply(
    lambda x: "not in ES" if x == "Open" else "ES"
)
merged_no_ES_df["REF_CLOSING_DESCRIPTION"] = merged_no_ES_df["REF_CLOSING_DESCRIPTION"].replace("Open", "Overture")

In [145]:
cleaned_file_path = "/content/drive/Shared drives/Hackathon/data/cleaned_data/merged_final.csv"
merged_no_ES_df.to_csv(cleaned_file_path, index=False)
print(f"Cleaned dataset saved to: {cleaned_file_path}")

Cleaned dataset saved to: /content/drive/Shared drives/Hackathon/data/cleaned_data/merged_final.csv


In [4]:
file_path = "/content/drive/Shared drives/Hackathon/data/cleaned_data/merged_final.csv"
merged_final_df = pd.read_csv(file_path)

In [40]:
all_null_list = ['visibility', 'wind_gust', 'rain_3h', 'snow_3h']
merged_final_df = merged_final_df.drop(columns=all_null_list, errors="ignore")

In [44]:
# List of columns to convert to datetime
datetime_columns = ["WORK_DATE"]

# Convert columns to datetime format
merged_final_df[datetime_columns] = merged_final_df[datetime_columns].apply(pd.to_datetime, errors="coerce")

In [49]:
merged_final_df["DEB_TIME_ONLY"] = merged_final_df["DEB_TIME"].astype(str).str.split(" ").str[1]  # Extracts time part
merged_final_df["FIN_TIME_ONLY"] = merged_final_df["FIN_TIME"].astype(str).str.split(" ").str[1]  # Extracts time part

In [56]:
# Convert to full Date-Time format
merged_final_df["DEB_TIME"] = pd.to_datetime(merged_final_df["DEB_TIME"])
merged_final_df["FIN_TIME"] = pd.to_datetime(merged_final_df["FIN_TIME"])
#print(type(merged_final_df["DEB_TIME"].iloc[0]))  # Output: <class 'pandas._libs.tslibs.timestamps.Timestamp'>

merged_final_df["DEB_TIME_ONLY"] = merged_final_df["DEB_TIME"].dt.time
merged_final_df["FIN_TIME_ONLY"] = merged_final_df["FIN_TIME"].dt.time
#print(type(merged_final_df["DEB_TIME_ONLY"].iloc[0]))  # Output: <class 'datetime.time'>

In [70]:
from datetime import datetime

# Convert to datetime.time format only for valid times
time_columns = ["NIGHT_SHOW", "PARADE_1", "PARADE_2"]

for col in time_columns:
    merged_final_df[col] = merged_final_df[col].apply(
        lambda x: datetime.strptime(x, "%H:%M:%S").time() if isinstance(x, str) and x != "no parade" else x
    )

In [75]:
# parade1, 2 and night show
from datetime import datetime, timedelta

# Function to determine if a show occurs within DEB_TIME and FIN_TIME
def show_flag(row, column_name):
    # If it's "no parade", return 0
    if row[column_name] == "no parade" or pd.isna(row[column_name]):
        return 0

    try:
        # Get DEB_TIME_ONLY and FIN_TIME_ONLY
        deb_time = row["DEB_TIME_ONLY"]
        fin_time = row["FIN_TIME_ONLY"]
        show_time = row[column_name]

        # Ensure DEB_TIME_ONLY and FIN_TIME_ONLY are datetime.time objects
        if isinstance(deb_time, str):
            deb_time = datetime.strptime(deb_time, "%H:%M:%S").time()
        if isinstance(fin_time, str):
            fin_time = datetime.strptime(fin_time, "%H:%M:%S").time()

        # Convert everything to full datetime for comparison
        deb_time_dt = datetime.combine(datetime.today(), deb_time)
        fin_time_dt = datetime.combine(datetime.today(), fin_time)
        show_time_dt = datetime.combine(datetime.today(), show_time)

        # Add 15 minutes to show time
        show_time_plus_15 = show_time_dt + timedelta(minutes=15)

        # ✅ Check for **partial** overlap
        if deb_time_dt < show_time_plus_15 and fin_time_dt > show_time_dt:
            return 1  # Mark as 1 if it overlaps at all
    except Exception as e:
        print(f"Error processing row {row.name} for {column_name}: {e}")  # Debugging error output
        return 0  # Default to 0

    return 0  # Default to 0

# Apply the function using lambda
merged_final_df["NIGHT_SHOW_FLAG"] = merged_final_df.apply(lambda row: show_flag(row, "NIGHT_SHOW"), axis=1)
merged_final_df["PARADE_1_FLAG"] = merged_final_df.apply(lambda row: show_flag(row, "PARADE_1"), axis=1)
merged_final_df["PARADE_2_FLAG"] = merged_final_df.apply(lambda row: show_flag(row, "PARADE_2"), axis=1)

In [84]:
column_drop = ['NIGHT_SHOW', 'PARADE_1', 'PARADE_2', 'SUM_UP_DOWN']
merged_final_df = merged_final_df.drop(columns=column_drop, errors="ignore")

In [87]:
new_order = ['WORK_DATE', 'attendance', 'DEB_TIME', 'DEB_TIME_HOUR', 'DEB_TIME_ONLY','FIN_TIME', 'FIN_TIME_ONLY',
             'ENTITY_DESCRIPTION_SHORT', 'WAIT_TIME_MAX', 'NB_UNITS', 'GUEST_CARRIED',
             'CAPACITY', 'ADJUST_CAPACITY', 'OPEN_TIME', 'UP_TIME', 'DOWNTIME', 'NB_MAX_UNIT',
             'NIGHT_SHOW_FLAG', 'PARADE_1_FLAG', 'PARADE_2_FLAG',
             'temp', 'dew_point', 'feels_like', 'temp_min', 'temp_max', 'pressure', 'humidity',
             'wind_speed', 'wind_deg', 'rain_1h', 'snow_1h', 'clouds_all', 'weather_main', 'weather_description',
             'REF_CLOSING_DESCRIPTION', 'reference']
merged_final_df = merged_final_df[new_order]

In [88]:
print(merged_final_df.columns)

Index(['WORK_DATE', 'attendance', 'DEB_TIME', 'DEB_TIME_HOUR', 'DEB_TIME_ONLY', 'FIN_TIME', 'FIN_TIME_ONLY', 'ENTITY_DESCRIPTION_SHORT', 'WAIT_TIME_MAX', 'NB_UNITS', 'GUEST_CARRIED', 'CAPACITY', 'ADJUST_CAPACITY', 'OPEN_TIME', 'UP_TIME', 'DOWNTIME', 'NB_MAX_UNIT', 'NIGHT_SHOW_FLAG', 'PARADE_1_FLAG', 'PARADE_2_FLAG', 'temp', 'dew_point', 'feels_like', 'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed', 'wind_deg', 'rain_1h', 'snow_1h', 'clouds_all', 'weather_main', 'weather_description', 'REF_CLOSING_DESCRIPTION', 'reference'], dtype='object')


In [89]:
cleaned_file_path = "/content/drive/Shared drives/Hackathon/data/cleaned_data/merged_final_2.csv"
merged_final_df.to_csv(cleaned_file_path, index=False)
print(f"Cleaned dataset saved to: {cleaned_file_path}")

Cleaned dataset saved to: /content/drive/Shared drives/Hackathon/data/cleaned_data/merged_final_2.csv
