In [1]:
import numpy as np
import pandas as pd

In [2]:
# Colab-only helper for attaching Google Drive to the runtime's filesystem.
from google.colab import drive

# Directory under which the Drive contents become visible.
DRIVE_MOUNT_POINT = '/content/drive'

# force_remount=True re-attaches the Drive even when it is already mounted.
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [3]:
# Raw attendance export on the mounted shared Drive.
file_path = "/content/drive/Shared drives/Hackathon/data/attendance.csv"

# Read the full CSV into a DataFrame using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=file_path)

In [4]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that only appends a stray "None" line).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2367 entries, 0 to 2366
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   USAGE_DATE     2367 non-null   object
 1   FACILITY_NAME  2367 non-null   object
 2   attendance     2367 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 55.6+ KB
None


In [5]:
### Step 0: Filter dataset for "PortAventura World" ###
# The only facility kept for the rest of the notebook.
TARGET_FACILITY = "PortAventura World"

# Keep only the rows of the target facility. The explicit .copy() makes `df`
# an independent frame, so later column writes modify this frame directly
# instead of a selection of the raw data (which triggers pandas'
# SettingWithCopyWarning). The original row labels are kept as-is.
df = df[df["FACILITY_NAME"] == TARGET_FACILITY].copy()

# Sanity check: only the target facility should be left.
unique_values = df["FACILITY_NAME"].unique()
print(unique_values)

['PortAventura World']


In [6]:
### Step 1: Check for missing values ###
# Per-column count of null cells (isna is the same check as isnull).
missing_counts = df.isna().sum(axis=0)

# Collapse the per-column counts into one dataset-wide figure.
total_missing = missing_counts.sum()

print("Missing values per column before handling:\n", missing_counts)
print(f"Total missing values: {total_missing}")

Missing values per column before handling:
 USAGE_DATE       0
FACILITY_NAME    0
attendance       0
dtype: int64
Total missing values: 0


In [7]:
### Step 2: Check for missing dates ###
# Parse "USAGE_DATE" into datetimes. `assign` builds a new frame instead of
# writing the column in place: `df` can be a filtered selection of the raw
# data at this point, and an in-place column write on such a selection
# triggers pandas' SettingWithCopyWarning.
df = df.assign(USAGE_DATE=pd.to_datetime(df["USAGE_DATE"]))

# Every day between the first and the last recorded date
# (pd.date_range defaults to daily frequency).
date_range = pd.date_range(start=df["USAGE_DATE"].min(), end=df["USAGE_DATE"].max())

# Days of the expected range that have no row in the data, oldest first.
existing_dates = set(df["USAGE_DATE"])
missing_dates = sorted(set(date_range) - existing_dates)

# Report the gaps.
print(f"Total missing dates: {len(missing_dates)}")
if missing_dates:
    print(pd.DataFrame({"Missing Dates": missing_dates}).to_string(index=False))  # Print all missing dates

# NOTE: the gaps are only reported. No rows are added for the missing days and
# no attendance is interpolated for them, so `df` keeps only the recorded days.

Total missing dates: 335
Missing Dates
   2020-03-14
   2020-03-15
   2020-03-16
   2020-03-17
   2020-03-18
   2020-03-19
   2020-03-20
   2020-03-21
   2020-03-22
   2020-03-23
   2020-03-24
   2020-03-25
   2020-03-26
   2020-03-27
   2020-03-28
   2020-03-29
   2020-03-30
   2020-03-31
   2020-04-01
   2020-04-02
   2020-04-03
   2020-04-04
   2020-04-05
   2020-04-06
   2020-04-07
   2020-04-08
   2020-04-09
   2020-04-10
   2020-04-11
   2020-04-12
   2020-04-13
   2020-04-14
   2020-04-15
   2020-04-16
   2020-04-17
   2020-04-18
   2020-04-19
   2020-04-20
   2020-04-21
   2020-04-22
   2020-04-23
   2020-04-24
   2020-04-25
   2020-04-26
   2020-04-27
   2020-04-28
   2020-04-29
   2020-04-30
   2020-05-01
   2020-05-02
   2020-05-03
   2020-05-04
   2020-05-05
   2020-05-06
   2020-05-07
   2020-05-08
   2020-05-09
   2020-05-10
   2020-05-11
   2020-05-12
   2020-05-13
   2020-05-14
   2020-05-15
   2020-05-16
   2020-05-17
   2020-05-18
   2020-05-19
   2020-05-20
   2020-0

In [8]:
### Step 3: Check for negative attendance values ###
# Boolean flag per row: True where the recorded attendance is below zero.
is_negative = df["attendance"].lt(0)

# Summing the flags counts the True entries.
negative_count = is_negative.sum()
print(f"Number of negative attendance values: {negative_count}")

Number of negative attendance values: 37


In [9]:
# Collect every row whose "attendance" value is negative.
negative_attendance_df = df[df["attendance"] < 0]

# Backward-compatible alias: this frame used to be called
# `negative_up_time_df`, although it has always been filtered on "attendance".
negative_up_time_df = negative_attendance_df

# Print all matching rows; to_string() avoids pandas' default row truncation.
print("Rows with Negative attendance:")
print(negative_attendance_df.to_string(index=False))

Rows with Negative attendance:
USAGE_DATE      FACILITY_NAME  attendance
2020-07-08 PortAventura World       -9914
2020-07-10 PortAventura World       -9959
2020-07-11 PortAventura World       -4108
2020-07-12 PortAventura World       -3004
2020-09-01 PortAventura World        -792
2020-09-03 PortAventura World        -788
2020-09-04 PortAventura World        -755
2020-09-07 PortAventura World        -844
2020-09-14 PortAventura World       -2474
2020-09-15 PortAventura World       -2571
2020-09-16 PortAventura World        -699
2020-09-17 PortAventura World       -2049
2020-09-18 PortAventura World       -1505
2020-09-21 PortAventura World       -1129
2020-09-22 PortAventura World       -1685
2020-09-23 PortAventura World       -1630
2020-09-24 PortAventura World       -3700
2020-09-25 PortAventura World       -3076
2020-09-28 PortAventura World       -1790
2020-09-29 PortAventura World       -2336
2020-10-01 PortAventura World       -3979
2020-10-02 PortAventura World       -3986
202

In [11]:
### Step 4: Replace negative attendance values ###
# Each negative reading is replaced by the mean of up to NEIGHBOUR_WINDOW valid
# (>= 0) readings before it and up to NEIGHBOUR_WINDOW valid readings after it,
# where "before"/"after" follow the row order of `df`.
#
# Differences from a naive in-place loop over the frame:
#   * neighbours are taken from a snapshot of the ORIGINAL readings, so a value
#     that was itself imputed is never used to impute another one and the
#     result does not depend on processing order;
#   * rows are addressed by position, so the code does not rely on the index
#     being a sorted integer index (no `idx - 1` / `idx + 1` label arithmetic);
#   * the column is converted to float64 explicitly, because the imputed means
#     are fractional and must not depend on implicit int64 -> float64
#     upcasting during item assignment.
NEIGHBOUR_WINDOW = 3

# Snapshot of the readings as floats (a copy, never a view of the frame).
original_values = df["attendance"].to_numpy(dtype="float64", copy=True)

# Row positions of the readings to replace and of the readings usable as
# neighbours. NaN (if any) falls in neither group.
negative_positions = np.flatnonzero(original_values < 0)
valid_positions = np.flatnonzero(original_values >= 0)
valid_values = original_values[valid_positions]

# Index labels of the negative rows, kept under the name used previously.
negative_indices = df.index[negative_positions]

imputed_values = original_values.copy()
for pos in negative_positions:
    # Number of valid readings located before this row; valid_values[:split]
    # are the earlier neighbours and valid_values[split:] the later ones.
    split = np.searchsorted(valid_positions, pos)
    neighbours = valid_values[max(split - NEIGHBOUR_WINDOW, 0):split + NEIGHBOUR_WINDOW]

    # Only replace when at least one valid neighbour exists.
    if neighbours.size > 0:
        imputed_values[pos] = neighbours.mean()

if negative_positions.size > 0:
    # Build a new frame rather than writing into `df` in place, which avoids
    # chained-assignment warnings when `df` is a filtered selection.
    df = df.assign(attendance=imputed_values)

    # Make it visible when some negatives could not be replaced.
    unresolved_count = int((imputed_values < 0).sum())
    if unresolved_count:
        print(f"Warning: {unresolved_count} negative attendance values had no valid neighbours and were left unchanged")

# Print the first few rows to verify
#print(df.head(10))

In [12]:
# Count the negative attendance values again (same check as Step 3),
# presumably to confirm the replacement step left none behind.
negative_count = df["attendance"].lt(0).sum()
print(f"Number of negative attendance values: {negative_count}")

Number of negative attendance values: 0


In [13]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that only appends a stray "None" line).
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1182 entries, 0 to 2365
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   USAGE_DATE     1182 non-null   datetime64[ns]
 1   FACILITY_NAME  1182 non-null   object        
 2   attendance     1182 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 69.2+ KB
None


In [14]:
# Preview of the cleaned data: a caption line followed by the first five rows
# in plain-text form.
print("Cleaned Dataset:", df.head(), sep="\n")

Cleaned Dataset:
  USAGE_DATE       FACILITY_NAME  attendance
0 2018-06-01  PortAventura World     46804.0
2 2018-06-02  PortAventura World     57940.0
4 2018-06-03  PortAventura World     44365.0
6 2018-06-04  PortAventura World     37617.0
8 2018-06-05  PortAventura World     32438.0


In [17]:
### Step 5: Export the cleaned dataset ###
# Destination of the cleaned CSV on the mounted shared Drive.
cleaned_file_path = "/content/drive/Shared drives/Hackathon/data/cleaned_data/attendance_cleaned.csv"

# index=False leaves the DataFrame's row labels out of the file.
df.to_csv(path_or_buf=cleaned_file_path, index=False)

print("Cleaned dataset saved to: " + cleaned_file_path)

Cleaned dataset saved to: /content/drive/Shared drives/Hackathon/data/cleaned_data/attendance_cleaned.csv
