In [None]:
import io
import json
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

# Historical data

In [None]:
# Load the cleaned visitor entrance data. Later cells read its 'Date' and
# 'Total' columns to build daily visitor totals.
# NOTE(review): the variable name is spelled "entance" (sic); later cells refer
# to it under this spelling, so any rename has to be done notebook-wide.
file_path = "../Data_Cleaned/Visitors/entrance_data_cleaned.csv"

combined_entance_table = pd.read_csv(file_path)

In [None]:
def fetch_knmi_data(start_date, end_date, station=240, timeout=30):  # 240 is Amsterdam Schiphol
    """
    Fetch daily weather data from the KNMI "daggegevens" endpoint.

    Parameters:
    - start_date: Start date in format 'YYYYMMDD'
    - end_date: End date in format 'YYYYMMDD'
    - station: Weather station number (240 = Amsterdam Schiphol)
    - timeout: Seconds to wait for the HTTP request before giving up

    Returns:
    - Pandas DataFrame with one row per station/day and the columns
      STN, Date (datetime64), MeanTemp_C (deg C), Sunshine_hours and
      Precipitation_mm (mm), or None when the request or the parsing fails.

    Notes:
    - KNMI reports RH and SQ as -1 when the amount is below 0.05 of the unit;
      those sentinels are mapped to 0 so they do not become negative amounts.
    - Sunshine_hours is returned in KNMI's raw unit (tenths of an hour); this
      function does not rescale it.
    """
    # Set up API parameters
    url = 'https://www.daggegevens.knmi.nl/klimatologie/daggegevens'
    params = {
        'stns': station,
        'vars': 'TG:RH:SQ',
        'start': start_date,
        'end': end_date,
    }

    # Make the request; transport failures follow the same "print and return
    # None" contract as a non-200 answer instead of raising into the caller.
    print(f"Fetching KNMI data from {start_date} to {end_date}...")
    try:
        response = requests.post(url, data=params, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return None

    if response.status_code != 200:
        print(f"Error fetching data: {response.status_code}")
        print(response.text)
        return None

    # The payload is a '#'-commented preamble followed by CSV rows. The column
    # names live in the comment line that starts with "# STN,".
    header = None
    data_lines = []
    for raw_line in response.text.splitlines():  # splitlines() also copes with CRLF
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith('#'):
            candidate = line.lstrip('#').strip()
            if candidate.startswith('STN,'):
                header = candidate
        else:
            data_lines.append(line)

    if not header or not data_lines:
        print("Failed to parse KNMI data")
        return None

    column_names = [col.strip() for col in header.split(',')]

    try:
        df = pd.read_csv(
            io.StringIO('\n'.join(data_lines)),
            sep=',',
            names=column_names,
            skipinitialspace=True
        )
    except Exception as e:
        print(f"Error parsing CSV data: {e}")
        print("First few lines of data:")
        for i, line in enumerate(data_lines[:5]):
            print(f"Line {i+1}: {repr(line)}")
        return None

    # Process the date column
    df['YYYYMMDD'] = pd.to_datetime(df['YYYYMMDD'], format='%Y%m%d')

    # -1 is KNMI's marker for "less than 0.05 units"; treat it as zero before
    # any scaling, otherwise it turns into -0.1 mm of rain.
    for col in ('RH', 'SQ'):
        if col in df.columns:
            df[col] = df[col].where(df[col] != -1, 0)

    # Temperatures and precipitation are delivered multiplied by 10.
    for col in ('TG', 'TX', 'TN', 'RH'):
        if col in df.columns:
            df[col] = df[col] / 10.0

    # Create more readable column names (only those present are renamed)
    column_mapping = {
        'YYYYMMDD': 'Date',
        'TG': 'MeanTemp_C',
        'TX': 'MaxTemp_C',
        'TN': 'MinTemp_C',
        'RH': 'Precipitation_mm',
        'SQ': 'Sunshine_hours',
    }
    rename_cols = {k: v for k, v in column_mapping.items() if k in df.columns}
    return df.rename(columns=rename_cols)


In [None]:
# Make sure the visitor dates are real datetimes so they can be compared and
# merged with the datetime 'Date' column returned by fetch_knmi_data.
combined_entance_table["Date"] = pd.to_datetime(combined_entance_table['Date'])

# KNMI expects YYYYMMDD strings. The range starts at the first visitor record
# and deliberately runs up to today (not just the last visitor record), so the
# weather table also covers the most recent days.
start_date = combined_entance_table['Date'].min().strftime('%Y%m%d')
end_date = datetime.now().strftime('%Y%m%d')

# Fetch weather data (None when the request or parsing failed)
weather_data = fetch_knmi_data(start_date, end_date)

if weather_data is not None:
    print(weather_data.head())

    # One row per day: total visitors summed over all entries of that day.
    daily_visitors = combined_entance_table.groupby('Date')['Total'].sum().reset_index()

    # Inner join keeps only the days present in both tables.
    merged_data = pd.merge(daily_visitors, weather_data, on='Date', how='inner')

    # Create scatter plots to visualize relationships
    fig, axes = plt.subplots(2, 1, figsize=(12, 10))

    # Temperature vs Visitors, with a regression line on top
    sns.scatterplot(data=merged_data, x='MeanTemp_C', y='Total', ax=axes[0])
    sns.regplot(data=merged_data, x='MeanTemp_C', y='Total',
                scatter=False, ax=axes[0], color='red')
    # Labels are set AFTER regplot: regplot relabels the axes with the raw
    # column names, which would otherwise replace these readable labels.
    axes[0].set_title('Temperature vs. Number of Visitors')
    axes[0].set_xlabel('Mean Temperature (°C)')
    axes[0].set_ylabel('Total Daily Visitors')

    # Precipitation vs Visitors, with a regression line on top
    sns.scatterplot(data=merged_data, x='Precipitation_mm', y='Total', ax=axes[1])
    sns.regplot(data=merged_data, x='Precipitation_mm', y='Total',
                scatter=False, ax=axes[1], color='red')
    axes[1].set_title('Precipitation vs. Number of Visitors')
    axes[1].set_xlabel('Precipitation (mm)')
    axes[1].set_ylabel('Total Daily Visitors')

    plt.tight_layout()
    plt.show()

    # Calculate correlation
    correlation = merged_data[['Total', 'MeanTemp_C', 'Precipitation_mm']].corr()
    print("\nCorrelation between weather factors and visitors:")
    print(correlation)

    # Advanced analysis: Day of week + weather
    merged_data['Weekday'] = merged_data['Date'].dt.day_name()

    # Ordered categorical so the legend lists Monday..Sunday, not alphabetical order
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    merged_data['Weekday'] = pd.Categorical(merged_data['Weekday'], categories=weekday_order, ordered=True)

    # Create a figure for more detailed analysis
    plt.figure(figsize=(14, 8))

    # Plot visitors by temperature, colored by weekday, with size indicating precipitation
    sns.scatterplot(
        data=merged_data,
        x='MeanTemp_C',
        y='Total',
        hue='Weekday',
        size='Precipitation_mm',
        sizes=(20, 200),
        palette='viridis'
    )

    plt.title('Museum Visitors by Temperature, Weekday and Precipitation', fontsize=16)
    plt.xlabel('Mean Temperature (°C)', fontsize=12)
    plt.ylabel('Total Daily Visitors', fontsize=12)
    # Move the legend outside the plotting area so it does not cover points.
    plt.legend(title='Weekday', bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.show()
else:
    print("Failed to fetch weather data.")

In [None]:
# Inspect the frame returned by fetch_knmi_data (it is None if the fetch failed).
weather_data

In [None]:
# The station id is not used after this point. errors="ignore" keeps the cell
# re-runnable: without it a second run raises KeyError because STN is already gone.
weather_data = weather_data.drop(columns=["STN"], errors="ignore")

In [None]:
# Convert Date column to datetime
weather_data['Date'] = pd.to_datetime(weather_data['Date'])
weather_data["Sunshine_hours"] = weather_data["Sunshine_hours"] * 0.1

# Get the first and last date in the dataset
start_date = weather_data['Date'].min()
end_date = weather_data['Date'].max()

# Create a complete date range
date_range = pd.date_range(start=start_date, end=end_date)

# Create a new DataFrame with the complete date range
complete_df = pd.DataFrame({'Date': date_range})

# Merge with the original data
filled_df = pd.merge(complete_df, weather_data, on='Date', how='left')

# Fill NaN values with 0
columns_to_fill = ["MeanTemp_C", "Precipitation_mm", "Sunshine_hours"]
filled_df[columns_to_fill] = filled_df[columns_to_fill].fillna(0).astype(float)

# Sort by date
filled_df = filled_df.sort_values('Date')

In [None]:
# Inspect the gap-filled daily weather table before it is written to disk.
filled_df

In [None]:
import os

# Destination folder and file for the cleaned weather table
# (both relative to the notebook's working directory).
cleaned_data_path = "../../Data_Sources/Data_Cleaned/Weather"
output_file = os.path.join(cleaned_data_path, "weather_data.csv")

# Make sure the folder exists (no error if it already does), then write the
# gap-filled frame without the index column.
os.makedirs(cleaned_data_path, exist_ok=True)
filled_df.to_csv(output_file, index=False)

# Weather Forecast

In [None]:
def get_weather_forecast(api_key, location, days=3, timeout=10):
    """
    Get weather forecast for a location from WeatherAPI.com.

    Parameters:
    api_key (str): Your WeatherAPI.com API key
    location (str): Location name, zip code, or lat,lon coordinates
    days (int): Number of days of forecast (1-14)
    timeout (float): Seconds to wait for the HTTP request before giving up

    Returns:
    dict: Weather forecast data (the decoded JSON body), or None when the
    request fails or the body is not valid JSON. Failures are reported with
    print(); nothing is raised to the caller.
    """
    # API endpoint
    url = "https://api.weatherapi.com/v1/forecast.json"

    # Request parameters
    params = {
        'key': api_key,
        'q': location,
        'days': days,
        'aqi': 'no',  # Not Include air quality data
        'alerts': 'no'  # Not Include weather alerts
    }

    def _redact(err):
        # The key travels in the query string, and requests puts the full URL
        # into many of its error messages. Mask it so it never ends up in
        # saved notebook output.
        text = str(err)
        return text.replace(str(api_key), "***") if api_key else text

    try:
        # Without an explicit timeout requests can block indefinitely.
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()  # Raise exception for HTTP errors
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {_redact(http_err)}")
        return None
    except requests.exceptions.Timeout as timeout_err:
        # Checked before ConnectionError: a connect timeout belongs to both families.
        print(f"Timeout error occurred: {_redact(timeout_err)}")
        return None
    except requests.exceptions.ConnectionError as conn_err:
        print(f"Connection error occurred: {_redact(conn_err)}")
        return None
    except requests.exceptions.RequestException as req_err:
        print(f"Request error occurred: {_redact(req_err)}")
        return None

    # Decoding is handled separately from the request: recent requests versions
    # raise a JSON error that is also a RequestException, so in a single try
    # block the generic request handler would shadow this one. Every JSON
    # decode error used by requests derives from ValueError.
    try:
        return response.json()
    except ValueError as json_err:
        print(f"JSON decode error: {json_err}")
        return None

In [None]:
def create_df_from_forecast(forecast):
    """
    Flatten a WeatherAPI forecast payload into a per-day DataFrame.

    Parameters:
    forecast (dict or None): Payload as returned by get_weather_forecast. It is
        read at forecast["forecast"]["forecastday"], where every entry
        provides "date" and a "day" dict with "avgtemp_c" and "totalprecip_mm".

    Returns:
    pandas.DataFrame: Columns "date", "avgtemp_c", "totalprecip_mm", one row
    per forecast day. An empty frame with those columns is returned when
    `forecast` is None (get_weather_forecast's failure value) or empty.

    Raises:
    KeyError: If a non-empty payload lacks one of the expected keys.
    """
    columns = ["date", "avgtemp_c", "totalprecip_mm"]

    # get_weather_forecast returns None on any failure; give callers an empty,
    # correctly shaped table instead of a TypeError from subscripting None.
    if not forecast:
        return pd.DataFrame(columns=columns)

    records = [
        {
            "date": day["date"],
            "avgtemp_c": day["day"]["avgtemp_c"],
            "totalprecip_mm": day["day"]["totalprecip_mm"],
        }
        for day in forecast["forecast"]["forecastday"]
    ]
    # Passing columns= keeps the column order and headers stable even when the
    # payload contains zero forecast days.
    return pd.DataFrame(records, columns=columns)

In [None]:
# The WeatherAPI.com key is read from the environment so it is never stored in
# the notebook or its version history.
# NOTE(review): a literal key was previously committed in this cell; it should
# be treated as compromised and rotated.
API_KEY = os.environ.get("WEATHERAPI_KEY")
if not API_KEY:
    raise RuntimeError(
        "WEATHERAPI_KEY environment variable is not set; "
        "export your WeatherAPI.com key before running this cell."
    )
location = "Amsterdam"
days_to_forecast = 4

# None is returned (and a message printed) when the request fails.
forecast_data = get_weather_forecast(API_KEY, location, days_to_forecast)



In [None]:
# Show the forecast as a per-day table. forecast_data is None when
# get_weather_forecast failed, so check its printed error first if this cell errors.
create_df_from_forecast(forecast_data)