In [1]:
import numpy as np
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
file_path = "/content/drive/Shared drives/Hackathon/data/parade_night_show.csv"
df = pd.read_csv(file_path)

In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671 entries, 0 to 670
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  671 non-null    int64 
 1   WORK_DATE   671 non-null    object
 2   NIGHT_SHOW  671 non-null    object
 3   PARADE_1    671 non-null    object
 4   PARADE_2    190 non-null    object
dtypes: int64(1), object(4)
memory usage: 26.3+ KB
None


In [5]:
print(df.head())

   Unnamed: 0  WORK_DATE NIGHT_SHOW  PARADE_1  PARADE_2
0           0  2018/10/1   20:00:00  17:30:00  12:10:00
1           1  2018/10/2   20:00:00  17:30:00  12:10:00
2           2  2018/10/3   20:00:00  17:30:00  12:10:00
3           3  2018/10/4   20:00:00  17:30:00  12:10:00
4           4  2018/10/5   20:00:00  17:30:00  12:10:00


In [6]:
### Step 0: Filter dataset for "PortAventura World" ###

In [7]:
### Step 1: Check for missing values ###
missing_counts = df.isnull().sum()
total_missing = missing_counts.sum()
print("Missing values per column before handling:\n", missing_counts)
print(f"Total missing values: {total_missing}")

Missing values per column before handling:
 Unnamed: 0      0
WORK_DATE       0
NIGHT_SHOW      0
PARADE_1        0
PARADE_2      481
dtype: int64
Total missing values: 481


In [10]:
# Replace NaN values in PARADE_2 with "no parade"
df["PARADE_2"] = df["PARADE_2"].fillna("no parade")

# Verify changes
print("Missing values after handling:")
print(df.isnull().sum())

Missing values after handling:
Unnamed: 0    0
WORK_DATE     0
NIGHT_SHOW    0
PARADE_1      0
PARADE_2      0
dtype: int64


In [11]:
### Step 2: Check for missing dates ###
# Convert "USAGE_DATE" to datetime
date_col = "WORK_DATE"
df[date_col] = pd.to_datetime(df[date_col])

# Detect Missing Dates ###
# Generate the full expected date range
date_range = pd.date_range(start=df[date_col].min(), end=df[date_col].max())

# Find the missing dates
existing_dates = set(df[date_col])
missing_dates = sorted(set(date_range) - existing_dates)

# Print missing dates info
print(f"Total missing dates: {len(missing_dates)}")
if missing_dates:
    print(pd.DataFrame({"Missing Dates": missing_dates}).to_string(index=False))  # Print all missing dates

# Add Missing Dates to the Dataset ###
# Reindex DataFrame with full date range
#df = df.set_index("USAGE_DATE").reindex(date_range).reset_index()
#df.rename(columns={"index": "USAGE_DATE"}, inplace=True)

# Forward-fill or interpolate missing values
#df["FACILITY_NAME"] = df["FACILITY_NAME"].fillna(method="ffill")  # Ensure facility name is filled
#df["attendance"] = df["attendance"].interpolate(method="linear")  # Fill missing attendance values

Total missing dates: 747
Missing Dates
   2020-03-15
   2020-03-16
   2020-03-17
   2020-03-18
   2020-03-19
   2020-03-20
   2020-03-21
   2020-03-22
   2020-03-23
   2020-03-24
   2020-03-25
   2020-03-26
   2020-03-27
   2020-03-28
   2020-03-29
   2020-03-30
   2020-03-31
   2020-04-01
   2020-04-02
   2020-04-03
   2020-04-04
   2020-04-05
   2020-04-06
   2020-04-07
   2020-04-08
   2020-04-09
   2020-04-10
   2020-04-11
   2020-04-12
   2020-04-13
   2020-04-14
   2020-04-15
   2020-04-16
   2020-04-17
   2020-04-18
   2020-04-19
   2020-04-20
   2020-04-21
   2020-04-22
   2020-04-23
   2020-04-24
   2020-04-25
   2020-04-26
   2020-04-27
   2020-04-28
   2020-04-29
   2020-04-30
   2020-05-01
   2020-05-02
   2020-05-03
   2020-05-04
   2020-05-05
   2020-05-06
   2020-05-07
   2020-05-08
   2020-05-09
   2020-05-10
   2020-05-11
   2020-05-12
   2020-05-13
   2020-05-14
   2020-05-15
   2020-05-16
   2020-05-17
   2020-05-18
   2020-05-19
   2020-05-20
   2020-05-21
   2020-0

In [12]:
### Step 3: negative value

In [13]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671 entries, 0 to 670
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Unnamed: 0  671 non-null    int64         
 1   WORK_DATE   671 non-null    datetime64[ns]
 2   NIGHT_SHOW  671 non-null    object        
 3   PARADE_1    671 non-null    object        
 4   PARADE_2    671 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 26.3+ KB
None


In [14]:
# Display the cleaned dataset
print("Cleaned Dataset:")
print(df.head())

Cleaned Dataset:
   Unnamed: 0  WORK_DATE NIGHT_SHOW  PARADE_1  PARADE_2
0           0 2018-10-01   20:00:00  17:30:00  12:10:00
1           1 2018-10-02   20:00:00  17:30:00  12:10:00
2           2 2018-10-03   20:00:00  17:30:00  12:10:00
3           3 2018-10-04   20:00:00  17:30:00  12:10:00
4           4 2018-10-05   20:00:00  17:30:00  12:10:00


In [15]:
### Step 5: Export the cleaned dataset ###
cleaned_file_path = "/content/drive/Shared drives/Hackathon/data/cleaned_data/parade_night_show.csv"
df.to_csv(cleaned_file_path, index=False)
print(f"Cleaned dataset saved to: {cleaned_file_path}")

Cleaned dataset saved to: /content/drive/Shared drives/Hackathon/data/cleaned_data/parade_night_show.csv
