# Data Collection and Integration of Meteostat Dataset

In this notebook, we will extend our data collection process to include historical weather data from the Meteostat dataset, accessed through its Python library. This will allow us to obtain data dating back to 1990, significantly enhancing the robustness of our predictive models.

We will:

- Set up Meteostat and install the necessary libraries.
- Download data for the variables: `temperature`, `rainfall`, `snowfall`.
- Process and save the data in the same format and structure as our existing datasets.
- Integrate the new data with our existing data cleaning pipeline.


### 1. Import libraries

In [2]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
import time

### 2. Import Custom Modules

In [3]:
from src.data.fetch_data import (
    get_nearest_station,
    download_meteostat_data
)
from src.data.processing import (
    process_meteostat_data,
    compile_meteostat_data
)

Raw data will be saved to: /workspace/SkiSnow/data/raw/cds


### 3. Define Data Paths

In [None]:
# Top of the project's data tree.
# NOTE(review): absolute, machine-specific path — confirm it matches the
# environment this notebook is run in.
data_root = '/workspace/SkiSnow/data'

# Raw files go to <data_root>/raw/cds.
# NOTE(review): the folder is named 'cds' although this notebook collects
# Meteostat data — presumably shared with the existing raw-data layout; verify.
raw_data_root = os.path.join(
    os.path.join(data_root, 'raw'),
    'cds',
)

# Ensure the folder (and any missing parents) exists; a no-op when already present.
os.makedirs(name=raw_data_root, exist_ok=True)

print(f"Raw data will be saved to: {raw_data_root}")

### 4. Define the List of Resorts and Their Coordinates

In [4]:
# One row per resort: (key, latitude, longitude, months open).
# The key has the form '<region>/<resort>'; months are two-digit strings
# separated by spaces and listed in season order.
_RESORT_ROWS = [
    ('austrian_alps/st_anton', 47.1787, 10.3143, '12 01 02 03 04'),
    ('austrian_alps/kitzbuhel', 47.4967, 12.4429, '11 12 01 02 03 04'),
    ('austrian_alps/solden', 47.0190, 11.0606, '10 11 12 01 02 03 04 05'),
    ('swiss_alps/st_moritz', 46.5407, 9.8855, '11 12 01 02 03 04'),
    ('swiss_alps/verbier', 46.1465, 7.2769, '12 01 02 03 04'),
    ('italian_alps/cortina_d_ampezzo', 46.5905, 12.1857, '12 01 02 03 04'),
    ('italian_alps/val_gardena', 46.6219, 11.7673, '12 01 02 03 04'),
    ('italian_alps/sestriere', 45.0055, 6.9335, '12 01 02 03 04'),
    ('slovenian_alps/kranjska_gora', 46.5347, 13.8336, '12 01 02 03'),
    ('slovenian_alps/mariborsko_pohorje', 46.5652, 15.6431, '12 01 02 03'),
    ('slovenian_alps/krvavec', 46.3471, 14.5875, '12 01 02 03 04'),
]

# Expand the rows into the nested mapping used by the rest of the notebook.
# str.split() yields a fresh list per resort, so entries never share a list.
resorts = {
    resort_key: {
        'latitude': lat,
        'longitude': lon,
        'months_open': months.split(),
    }
    for resort_key, lat, lon, months in _RESORT_ROWS
}


### 5. Execute the Data Retrieval Workflow

In [5]:
def run_data_retrieval(resorts, start_year, end_year, raw_data_root):
    """
    Orchestrate the Meteostat download and compile steps for all resorts.

    For every resort, `download_meteostat_data` is called with a
    resort-specific directory under `raw_data_root`, then — after a short
    pause — `compile_meteostat_data` is called with that same directory and
    a CSV path located directly in `raw_data_root`.

    Parameters:
    - resorts (dict): Maps a resort name (e.g. 'austrian_alps/st_anton') to a
      dict holding at least 'latitude' and 'longitude'. Any other keys (such
      as 'months_open') are not read by this function.
    - start_year (int): Starting year for data retrieval.
    - end_year (int): Ending year for data retrieval.
    - raw_data_root (str): Root directory for raw data.

    Returns:
    - None

    Raises:
    - KeyError: If a resort entry lacks 'latitude' or 'longitude'.
    """
    for resort_name, resort_info in resorts.items():
        latitude = resort_info['latitude']
        longitude = resort_info['longitude']

        # Resort names contain '/', which would otherwise create nested
        # folders; flatten once and reuse for both the directory and the CSV.
        flat_name = resort_name.replace('/', '_')

        # Resort-specific raw data directory
        resort_raw_dir = os.path.join(raw_data_root, flat_name)

        # Compiled CSV sits directly in raw_data_root, next to the resort directory
        compiled_csv_path = os.path.join(raw_data_root, f"{flat_name}_meteostat.csv")

        # Download data
        download_meteostat_data(resort_name, latitude, longitude, start_year, end_year, resort_raw_dir)

        # Pause to respect any rate limits
        time.sleep(1)

        # Compile data
        compile_meteostat_data(resort_name, resort_raw_dir, compiled_csv_path)

# %%
if __name__ == "__main__":
    # Year range handed to the retrieval run (adjust as needed)
    start_year, end_year = 1990, 2023

    # Download and compile the data for every configured resort
    run_data_retrieval(
        resorts,
        start_year,
        end_year,
        raw_data_root,
    )
