In [None]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

In [None]:
def _safe_ratio(numerator, denominator):
    """Divide two Series element-wise, giving NaN instead of inf where the denominator is 0."""
    # Series.where keeps the values where the condition holds and puts NaN elsewhere,
    # so a zero denominator ends up as NaN rather than +/-inf in the result.
    return numerator / denominator.where(denominator != 0)


def extract_and_map_features(df):
    """
    Rename the forecast table's Dutch columns to English names and derive features.

    The input is copied, so the caller's DataFrame is not modified. Every derived
    feature is only added when the columns it depends on are present.

    Parameters:
    df (DataFrame): Raw forecast data using the original (Dutch) column names

    Returns:
    DataFrame: Copy of the input with renamed columns plus the engineered features
        is_open, is_weekend, total_international_visitors, forecast_accuracy,
        forecast_error, forecast_error_percentage, <visitor type>_percentage,
        booking_conversion_primary, booking_conversion_secondary, day_of_year,
        is_month_start, is_month_end, days_from_today, holiday_with_event and
        school_holiday_weekend. school_holiday and public_holiday are replaced
        by 0/1 flags (1 = a value was present). Ratios whose denominator is 0
        are NaN.
    """
    column_mapping = {
        # Basic datetime features
        'Datum': 'date',
        'Dag': 'day',
        'Jaar': 'year',
        'Kwartaal': 'quarter',
        'Maand': 'month',
        'Week': 'week',
        'Weekdag': 'weekday',
        # Operational features
        'NEMO open/dicht': 'nemo_status',
        'Openingstijd': 'opening_time',
        'Sluitingstijd': 'closing_time',
        # Calendar and event features
        'Schoolvakantie': 'school_holiday',
        'Regio': 'region',
        'Feestdag': 'public_holiday',
        'Evenement': 'event',
        # International visitor features
        'DU': 'visitors_germany',
        'BE': 'visitors_belgium',
        'FR': 'visitors_france',
        'IT': 'visitors_italy',
        'GB': 'visitors_uk',
        'Overig': 'visitors_other',
        # Visitor type features
        'Recreatief NL': 'recreational_domestic',
        'Recreatief Buitenland': 'recreational_international',
        'Scholen': 'schools',
        'Student': 'students',
        'Extern/Events': 'external_events',
        'Po': 'primary_education',
        'Vo': 'secondary_education',
        # Actual numbers
        'Totaal': 'total_visitors',
        'Totaal (afgerond)': 'total_visitors_rounded',
        'Maat': 'crew_size',
        # Forecast numbers
        'FC Recreatief NL': 'forecast_recreational_domestic',
        'FC Recreatief Buitenland': 'forecast_recreational_international',
        'FC Scholen': 'forecast_schools',
        'FC Student': 'forecast_students',
        'FC Extern/Events': 'forecast_external_events',
        'Boekingen PO': 'bookings_primary',
        'Boekingen VO': 'bookings_secondary',
        'FC Maat': 'forecast_maat',
        'FC Totaal': 'forecast_total',
        'FC Totaal (afgerond)': 'forecast_total_rounded',
        'FC Maat (afgerond)': 'forecast_maat_rounded',
        'FC opmerkingen': 'forecast_notes',
        # Studio and other forecasts
        'Forecast Maat PB': 'forecast_maat_pb',
        'Forecast Maat S&S': 'forecast_maat_ss',
        'Forecast Maat Horeca': 'forecast_maat_horeca',
        'Prognose Studio': 'studio_forecast',
        'Prognose Cumulatief': 'cumulative_forecast',
        # Hours forecasts
        'Urenprognose PB': 'hours_forecast_pb',
        'Urenprognose S&S': 'hours_forecast_ss',
        'Urenprognose S&S kantoor': 'hours_forecast_ss_office',
        'Urenprognose Horeca': 'hours_forecast_horeca',
    }

    # Work on a renamed copy so the caller's frame stays untouched; columns that
    # are not in the mapping keep their original names.
    df_features = df.copy().rename(columns=column_mapping)

    def has(*names):
        """Return True when every given column currently exists in df_features."""
        return all(name in df_features.columns for name in names)

    # Parse dates written like "31-Jan-24"; values that do not match become NaT
    if has('date'):
        df_features['date'] = pd.to_datetime(df_features['date'], format='%d-%b-%y', errors='coerce')

    # Binary features from categorical columns
    if has('nemo_status'):
        df_features['is_open'] = (df_features['nemo_status'].str.lower() == 'open').astype(int)

    if has('weekday'):
        weekend_names = ['Saturday', 'Sunday', 'zaterdag', 'zondag']
        df_features['is_weekend'] = df_features['weekday'].isin(weekend_names).astype(int)

    # Holiday columns become 0/1 flags: any non-missing value counts as "holiday"
    for col in ('school_holiday', 'public_holiday'):
        if has(col):
            df_features[col] = df_features[col].notna().astype(int)

    # Total international visitors: sum of the per-country columns. The country
    # columns are kept (they were previously dropped here, which discarded the
    # features and never produced the total). 'visitors_other' stays excluded,
    # matching the original column selection.
    international_cols = [
        col for col in df_features.columns
        if isinstance(col, str) and col.startswith('visitors_') and col != 'visitors_other'
    ]
    if international_cols:
        # min_count=1 leaves the total missing when every country value is missing,
        # instead of reporting a misleading 0. Assumes numeric columns - TODO confirm.
        df_features['total_international_visitors'] = (
            df_features[international_cols].sum(axis=1, min_count=1)
        )

    # Forecast quality: actual vs forecast totals
    if has('total_visitors', 'forecast_total'):
        actual = df_features['total_visitors']
        forecast = df_features['forecast_total']
        df_features['forecast_accuracy'] = _safe_ratio(actual, forecast)
        df_features['forecast_error'] = actual - forecast
        df_features['forecast_error_percentage'] = _safe_ratio(df_features['forecast_error'], forecast) * 100

    # Visitor composition as a percentage of the actual total
    visitor_columns = ['recreational_domestic', 'recreational_international', 'schools', 'students', 'external_events']
    for col in visitor_columns:
        if has(col, 'total_visitors'):
            df_features[f'{col}_percentage'] = _safe_ratio(df_features[col], df_features['total_visitors']) * 100

    # Booking conversion rates: actual school visitors per booking
    if has('bookings_primary', 'primary_education'):
        df_features['booking_conversion_primary'] = _safe_ratio(
            df_features['primary_education'], df_features['bookings_primary']
        )

    if has('bookings_secondary', 'secondary_education'):
        df_features['booking_conversion_secondary'] = _safe_ratio(
            df_features['secondary_education'], df_features['bookings_secondary']
        )

    # Time-based features
    if has('date'):
        dates = df_features['date']
        df_features['day_of_year'] = dates.dt.dayofyear
        df_features['is_month_start'] = dates.dt.is_month_start.astype(int)
        df_features['is_month_end'] = dates.dt.is_month_end.astype(int)
        # Depends on the moment of execution: positive for past dates, negative for future ones
        df_features['days_from_today'] = (pd.Timestamp.now() - dates).dt.days

    # Event and holiday interactions
    if has('event', 'public_holiday'):
        df_features['holiday_with_event'] = (
            (df_features['public_holiday'] == 1) & (df_features['event'].notna())
        ).astype(int)

    if has('school_holiday', 'is_weekend'):
        df_features['school_holiday_weekend'] = (
            (df_features['school_holiday'] == 1) & (df_features['is_weekend'] == 1)
        ).astype(int)

    return df_features

In [None]:
directory_path = "../Data_Raw/Forecasts/"

# os.listdir returns entries in arbitrary order; sorting keeps the processing
# order (and the printed log) reproducible between runs.
csv_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.csv'))

# Collect the per-file frames and concatenate once at the end: calling pd.concat
# inside the loop re-copies all previously loaded rows on every iteration.
forecast_frames = []
for file in csv_files:
    file_path = os.path.join(directory_path, file)
    print(f"Processing: {file}")
    # NOTE(review): process_forecast_file is not defined in the visible cells;
    # it has to be defined or imported before this cell runs - confirm.
    df = process_forecast_file(file_path)
    # Remember which file each row came from
    df['source_file'] = file
    forecast_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# directory holds no CSV files (same result as before in that case).
combined_df = pd.concat(forecast_frames, ignore_index=True) if forecast_frames else pd.DataFrame()

In [None]:
# Order the combined forecast rows by their "date" column (ascending)
combined_df = combined_df.sort_values(by=["date"])

In [None]:
# Destination folder and file for the cleaned forecast export
cleaned_data_path = "../../Data_Sources/Data_Cleaned/Visitors/"
output_file = os.path.join(cleaned_data_path, "forecast_data_cleaned.csv")

# Make sure the folder exists (no error when it is already there), then write
# the combined frame without its row index
os.makedirs(cleaned_data_path, exist_ok=True)
combined_df.to_csv(output_file, index=False)