# Data Cleaning and Preparation

In this notebook, we will perform data cleaning and preparation for all resorts across the Alps. This includes:

- Loading the raw data
- Handling missing values
- Correcting data types
- Normalizing resort names to handle special characters
- Filtering data based on resort operating dates
- Saving the cleaned data for further analysis

### 1. Import Libraries

In [None]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### 2. Import custom modules

In [None]:
from src.data.cleaning import (
    get_all_csv_files_with_metadata,
    clean_and_filter_data,
    save_cleaned_data
)
from src.features.feature_engineering import (
    categorize_season,
    add_operating_season_indicator
)
from src.features.anomaly_detection import (
    detect_snow_depth_anomalies,
    handle_snow_depth_anomalies
)

### 3. Define Resort Operating Seasons

In [None]:
# Operating window for each resort.
# Keys are '<country>/<resort>' strings; each value holds the 'open' and
# 'close' dates as 'MM-DD' strings. Every entry here closes earlier in the
# calendar year than it opens.
# The loop in the season categorisation cell looks entries up by the
# data_frames key and passes them to categorize_season.
resort_seasons = {
    'austrian_alps/st_anton': {'open': '12-01', 'close': '04-30'},
    'austrian_alps/kitzbuhel': {'open': '10-15', 'close': '05-01'},
    'austrian_alps/solden': {'open': '11-01', 'close': '05-01'},
    'swiss_alps/st_moritz': {'open': '11-25', 'close': '05-01'},
    'swiss_alps/verbier': {'open': '12-01', 'close': '04-30'},
    'italian_alps/cortina_d_ampezzo': {'open': '11-25', 'close': '04-05'},
    'italian_alps/val_gardena': {'open': '12-01', 'close': '04-15'},
    'italian_alps/sestriere': {'open': '12-01', 'close': '04-15'},
    'slovenian_alps/kranjska_gora': {'open': '12-15', 'close': '04-15'},
    'slovenian_alps/mariborsko_pohorje': {'open': '12-01', 'close': '04-05'},
    'slovenian_alps/krvavec': {'open': '12-01', 'close': '04-30'},
}

### 4. Load and Clean Data

In [None]:
# Where the raw CSVs live and where the cleaned output will be written
raw_data_root = '../data/raw/cds'
processed_data_root = '../data/processed/cds'

# Collect one metadata record per CSV file found under the raw root
csv_files = get_all_csv_files_with_metadata(raw_data_root)
print(f"Found {len(csv_files)} CSV files after excluding specified resorts.")

# Cleaned frames, keyed by the key returned from clean_and_filter_data
data_frames = {}
for file_info in csv_files:
    # Anything not tagged 'new' is reported and left out
    if file_info['type'] != 'new':
        print(f"Excluded 'old' dataset: {file_info['file_path']}")
        continue

    key, df = clean_and_filter_data(file_info)
    # Skip files for which no usable key/frame pair came back
    if not key or df is None:
        continue

    data_frames[key] = df
    print(f"Loaded and cleaned data for {key}: {df.shape[0]} rows.")

### 4. Feature Engineering: Season Categorisation

In [None]:
for key, df in data_frames.items():
    resort = key

    # Resorts without a configured operating window are reported and left as-is
    if resort not in resort_seasons:
        print(f"No season information for {resort}. Data not categorized.")
        continue

    season_info = resort_seasons[resort]

    # Tag rows with their season, then derive the operating-season flag
    df = add_operating_season_indicator(
        categorize_season(df, season_info, resort)
    )

    # Store the enriched frame back under the same key
    data_frames[key] = df
    print(f"Season categorized and operating season indicator added for {resort}.")

### 5. Handle Missing Values and Anomalies

In [None]:
for key, df in data_frames.items():
    # Forward-fill missing 'snow_depth' values (applies to every row, not
    # only operating-season rows).
    if 'snow_depth' in df.columns:
        # Assign the filled Series back to the column. Calling
        # fillna(..., inplace=True) on a column selection acts on an
        # intermediate object and is not guaranteed to update the frame;
        # the `method=` argument of fillna is also deprecated in favour of
        # ffill().
        df['snow_depth'] = df['snow_depth'].ffill()
        print(f"{key}: Imputed missing 'snow_depth' values.")

    # Flag anomalous snow depths (threshold=20), then let the handler
    # imported from src.features.anomaly_detection process the flagged rows.
    df = detect_snow_depth_anomalies(df, threshold=20)
    df = handle_snow_depth_anomalies(df)

    # Store the processed frame back under the same key
    data_frames[key] = df
    print(f"Anomaly detection and handling completed for {key}.")

### 6. Rounding and Unit Conversion

In [None]:
# Number of decimal places to keep for each numeric column
columns_to_round = dict(
    snow_depth=1,
    precipitation_sum=1,
    temperature_min=1,
    temperature_max=1,
)

for key, df in data_frames.items():
    for column, decimals in columns_to_round.items():
        # Columns absent from this resort's frame are simply skipped
        if column not in df.columns:
            continue
        df[column] = df[column].round(decimals)
        print(f"{key}: Rounded '{column}' to {decimals} decimal places.")

    data_frames[key] = df
    print(f"Processed numerical columns for {key}.")

### 7. Save Cleaned Data

In [None]:
# Hand every cleaned DataFrame, together with the processed-data root, to
# save_cleaned_data (imported from src.data.cleaning).
save_cleaned_data(data_frames, processed_data_root)

# Data Cleaning and Preparation

In this notebook, we will perform data cleaning and preparation for all resorts across the Alps. This includes:

- Loading the raw data
- Handling missing values
- Correcting data types
- Normalizing resort names to handle special characters
- Filtering data based on resort operating dates
- Saving the cleaned data for further analysis

## 1. Import Libraries

In [20]:
import pandas as pd
import os
import unicodedata
import re
from dateutil.relativedelta import relativedelta
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## 2. Handling Special Characters in File and Resort Names

To avoid issues with special characters (like accents and apostrophes) in file names and resort names, we'll define a normalization function. This function will:

- Convert names to lowercase
- Remove accents and diacritics
- Replace non-alphanumeric characters with underscores

In [21]:
def normalize_name(name):
    """
    Turn a file or resort name into a lowercase ASCII identifier.

    Parameters:
    - name (str): The raw name, possibly containing accents or punctuation.

    Returns:
    - str: The name lowercased, with accents stripped, every run of
      characters outside a-z/0-9 collapsed into one underscore, and any
      leading or trailing underscores removed.
    """
    lowered = name.lower()
    # NFKD splits accented letters into base letter + combining mark; the
    # ASCII round-trip then discards everything that is not plain ASCII.
    decomposed = unicodedata.normalize('NFKD', lowered)
    ascii_only = decomposed.encode('ASCII', 'ignore').decode('utf-8')
    return re.sub(r'[^a-z0-9]+', '_', ascii_only).strip('_')

### 2 (a) Utility function for standardising columns.

In [22]:
def standardize_columns(df):
    """
    Bring a raw DataFrame onto the column names used by the rest of the notebook.

    Parameters:
    - df (pd.DataFrame): The DataFrame to standardize.

    Returns:
    - pd.DataFrame: A new DataFrame in which 'time' is renamed to 'date',
      'precipitation' to 'precipitation_sum' and 'snowfall' to 'snow_depth'
      (each only when present), and which is guaranteed to have a
      'snow_depth' column (filled with NaN if it had to be created).
    """
    # rename() ignores mapping keys that are not present, so a single call
    # covers every optional source column.
    column_aliases = {
        'time': 'date',
        'precipitation': 'precipitation_sum',
        'snowfall': 'snow_depth',
    }
    standardized = df.rename(columns=column_aliases)

    # Downstream steps expect 'snow_depth' to exist even when the source lacks it
    if 'snow_depth' not in standardized.columns:
        standardized = standardized.assign(snow_depth=np.nan)

    return standardized

### 2 (b) Set data paths

In [23]:
raw_data_root = '../data/raw/cds'
processed_data_root = '../data/processed/cds'

### 3. Function to get list of all CSV files in the raw data directory

We'll create a function to traverse the directory structure and collect all CSV files. While doing so, we'll normalize the country and resort names to ensure consistency.

In [24]:
def get_all_csv_files_with_metadata(root_dir, exclude_resorts=None):
    """
    Walk `root_dir/<country>/<resort>/` and describe every usable CSV file.

    Country and resort folder names are normalized with `normalize_name`.
    Resorts on the exclusion list are skipped entirely. A CSV is kept only
    when its header contains all of 'temperature_min', 'temperature_max',
    'precipitation_sum' and 'snow_depth'; other files are reported as 'old'
    datasets and skipped, as are files that cannot be read.

    Directory listings are sorted so the result (and the printed log) has
    the same order on every run; `os.listdir` alone returns entries in
    arbitrary order.

    Parameters:
    - root_dir (str): Root folder containing one sub-folder per country.
    - exclude_resorts (iterable of str, optional): Normalized
      '<country>/<resort>' keys to skip. Defaults to the built-in list of
      resorts excluded for insufficient data.

    Returns:
    - list of dict: One dict per accepted file with the keys 'type'
      (always 'new' for accepted files), 'country', 'resort' and
      'file_path'.
    """
    if exclude_resorts is None:
        exclude_resorts = [
            'french_alps/chamonix',
            'french_alps/val_d_isere_tignes',
            'french_alps/les_trois_vallees',
            'swiss_alps/verbier',
            'swiss_alps/zermatt'
        ]
    excluded_keys = set(exclude_resorts)

    # Header columns that identify a 'new' dataset
    new_columns = {'temperature_min', 'temperature_max', 'precipitation_sum', 'snow_depth'}

    csv_files = []
    for country in sorted(os.listdir(root_dir)):
        country_path = os.path.join(root_dir, country)
        if not os.path.isdir(country_path):
            continue
        normalized_country = normalize_name(country)

        for resort in sorted(os.listdir(country_path)):
            resort_path = os.path.join(country_path, resort)
            if not os.path.isdir(resort_path):
                continue
            normalized_resort = normalize_name(resort)

            key = f"{normalized_country}/{normalized_resort}"
            if key in excluded_keys:
                print(f"Excluding resort due to insufficient data: {key}")
                continue  # Skip this resort

            for file in sorted(os.listdir(resort_path)):
                if not file.endswith('.csv'):
                    continue
                file_path = os.path.join(resort_path, file)

                try:
                    # Reading a single row is enough to inspect the header
                    df_sample = pd.read_csv(file_path, nrows=1)
                    columns = df_sample.columns.tolist()
                except Exception as e:
                    # Best effort: report the unreadable file and move on
                    print(f"Error reading {file_path}: {e}")
                    continue

                if not new_columns.issubset(columns):
                    print(f"Skipping 'old' dataset: {file_path}")
                    continue  # Skip this file

                csv_files.append({
                    'type': 'new',
                    'country': normalized_country,
                    'resort': normalized_resort,
                    'file_path': file_path
                })
    return csv_files

## 4. Data Cleaning Steps

We will perform the following data cleaning steps for each resort:

1. Remove empty rows prior to `2021-03-23`
2. Handle missing values
3. Handle duplicates
4. Correct data types
5. Filter data based on each resort's opening and closing dates
6. Save cleaned data

## 4.1 Function to Clean and Filter a Single CSV File

In [25]:
def clean_and_filter_data(file_info, optional_cutoff_date=None):
    """
    Load one resort CSV, standardize it and apply basic cleaning.

    Parameters:
    - file_info (dict): Must provide 'country', 'resort' and 'file_path'.
    - optional_cutoff_date (optional): When truthy, it is parsed with
      pd.to_datetime and rows dated before it are dropped.

    Returns:
    - key (str): '<country>/<resort>' identifier, or None if processing failed.
    - df (pd.DataFrame): Cleaned DataFrame with a fresh index, or None if
      processing failed.
    """
    country, resort, file_path = (
        file_info[field] for field in ('country', 'resort', 'file_path')
    )
    key = f"{country}/{resort}"

    try:
        df = standardize_columns(pd.read_csv(file_path))

        # Parse dates; unparseable values become NaT and their rows are dropped
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        df = df.dropna(subset=['date'])

        # Keep only rows on or after the cutoff, when one was requested
        if optional_cutoff_date:
            df = df[df['date'] >= pd.to_datetime(optional_cutoff_date)]

        # Only Kranjska Gora is rescaled: its values are divided by 10
        # (millimeters to centimeters, per the log message below).
        if 'snow_depth' in df.columns:
            if key != 'slovenian_alps/kranjska_gora':
                print(f"{key}: 'snow_depth' is assumed to be in centimeters. No conversion applied.")
            else:
                df['snow_depth'] = df['snow_depth'] / 10
                print(f"{key}: Converted 'snow_depth' from millimeters to centimeters.")

        return key, df.reset_index(drop=True)
    except Exception as e:
        # Any failure is reported and signalled to the caller as (None, None)
        print(f"Error processing {file_path}: {e}")
        return None, None

### 4.2 Process All CSV Files

In [26]:
# Define the root directory
# Root folder holding the raw CSV files
raw_data_root = '../data/raw/cds'

# Collect one metadata record per CSV file found under the raw root
csv_files = get_all_csv_files_with_metadata(raw_data_root)
print(f"Found {len(csv_files)} CSV files after excluding specified resorts.")

# Cleaned frames, keyed by '<country>/<resort>'
data_frames = {}
for file_info in csv_files:
    # Anything not tagged 'new' is reported and left out
    if file_info['type'] != 'new':
        print(f"Excluded 'old' dataset: {file_info['file_path']}")
        continue

    key, df = clean_and_filter_data(file_info)
    # clean_and_filter_data returns (None, None) when a file could not be processed
    if not key or df is None:
        continue

    data_frames[key] = df
    print(f"Loaded and cleaned data for {key}: {df.shape[0]} rows.")

Skipping 'old' dataset: ../data/raw/cds/austrian_alps/kitzbühel/kitzbühel.csv
Skipping 'old' dataset: ../data/raw/cds/austrian_alps/st._anton/st._anton.csv
Skipping 'old' dataset: ../data/raw/cds/austrian_alps/sölden/sölden.csv
Excluding resort due to insufficient data: french_alps/chamonix
Excluding resort due to insufficient data: french_alps/les_trois_vallees
Excluding resort due to insufficient data: french_alps/val_d_isere_tignes
Skipping 'old' dataset: ../data/raw/cds/italian_alps/cortina_d'ampezzo/cortina_d'ampezzo.csv
Skipping 'old' dataset: ../data/raw/cds/italian_alps/sestriere/sestriere.csv
Skipping 'old' dataset: ../data/raw/cds/italian_alps/val_gardena/val_gardena.csv
Skipping 'old' dataset: ../data/raw/cds/slovenian_alps/kranjska_gora/kranjska_gora.csv
Skipping 'old' dataset: ../data/raw/cds/slovenian_alps/krvavec/krvavec.csv
Skipping 'old' dataset: ../data/raw/cds/slovenian_alps/mariborsko_pohorje/mariborsko_pohorje.csv
Skipping 'old' dataset: ../data/raw/cds/swiss_alps/

## 4.3 Filter data based on each resort's opening and closing dates

Each resort operates during specific dates in the year. We'll filter the data to include only the dates when each resort is open.

Here are the approximate opening and closing dates for each resort:

- **Austrian Alps:**
  - **St. Anton:** Opens early December (`12-01`), closes late April (`04-30`)
  - **Kitzbühel:** Opens mid-October (`10-15`), closes May (`05-01`)
  - **Sölden:** Opens early November (`11-01`), closes early May (`05-01`)
  
- **Swiss Alps:**
  - **Zermatt:** Opens mid-November (`11-15`), closes late April (`04-30`)
  - **St. Moritz:** Opens late November (`11-25`), closes early May (`05-01`)
  - **Verbier:** Opens early December (`12-01`), closes late April (`04-30`)
  
- **Italian Alps:**
  - **Cortina d'Ampezzo:** Opens late November (`11-25`), closes early April (`04-05`)
  - **Val Gardena:** Opens early December (`12-01`), closes mid-April (`04-15`)
  - **Sestriere:** Opens early December (`12-01`), closes mid-April (`04-15`)
  
- **Slovenian Alps:**
  - **Kranjska Gora:** Opens mid-December (`12-15`), closes mid-April (`04-15`)
  - **Mariborsko Pohorje:** Opens December (`12-01`), closes early April (`04-05`)
  - **Krvavec:** Opens December (`12-01`), closes April (`04-30`)

We'll define the `resort_seasons` dictionary with normalized keys to match the keys in `data_frames`.


In [27]:
# Operating window for each resort.
# Keys are normalized '<country>/<resort>' strings so they line up with the
# keys of data_frames; each value holds the 'open' and 'close' dates as
# 'MM-DD' strings. get_season_dates treats a 'close' that falls before
# 'open' in the calendar year as belonging to the following year.
# NOTE(review): 'swiss_alps/verbier' is listed here but is also on the
# exclusion list in get_all_csv_files_with_metadata, so it will not appear
# in data_frames.
resort_seasons = {
    'austrian_alps/st_anton': {'open': '12-01', 'close': '04-30'},
    'austrian_alps/kitzbuhel': {'open': '10-15', 'close': '05-01'},
    'austrian_alps/solden': {'open': '11-01', 'close': '05-01'},
    'swiss_alps/st_moritz': {'open': '11-25', 'close': '05-01'},
    'swiss_alps/verbier': {'open': '12-01', 'close': '04-30'},
    'italian_alps/cortina_d_ampezzo': {'open': '11-25', 'close': '04-05'},
    'italian_alps/val_gardena': {'open': '12-01', 'close': '04-15'},
    'italian_alps/sestriere': {'open': '12-01', 'close': '04-15'},
    'slovenian_alps/kranjska_gora': {'open': '12-15', 'close': '04-15'},
    'slovenian_alps/mariborsko_pohorje': {'open': '12-01', 'close': '04-05'},
    'slovenian_alps/krvavec': {'open': '12-01', 'close': '04-30'},
}


### 4.4 Handle seasons that span across years

In [28]:
def get_season_dates(year, open_mm_dd, close_mm_dd):
    """
    Build the opening and closing timestamps of the season that opens in `year`.

    Parameters:
    - year (int): Calendar year in which the season opens.
    - open_mm_dd (str): Opening date in 'MM-DD' format.
    - close_mm_dd (str): Closing date in 'MM-DD' format.

    Returns:
    - tuple: (open_date, close_date) as pd.Timestamp objects. When the
      closing date would fall before the opening date within `year`, it is
      moved one year forward.
    """
    open_month, open_day = [int(part) for part in open_mm_dd.split('-')]
    close_month, close_day = [int(part) for part in close_mm_dd.split('-')]

    open_date = pd.Timestamp(year=year, month=open_month, day=open_day)
    close_date = pd.Timestamp(year=year, month=close_month, day=close_day)

    # A winter season wraps around New Year: push the close into the next year
    wraps_into_next_year = close_date < open_date
    if wraps_into_next_year:
        close_date = close_date + pd.DateOffset(years=1)

    return open_date, close_date

### 4.5 Assign DataFrame rows to a season based on the operating dates

In [29]:
def categorize_season(df, season_info, resort_key):
    """
    Label every row that falls inside an operating season with a season id.

    Parameters:
    - df (pd.DataFrame): DataFrame containing a datetime 'date' column.
    - season_info (dict): Dictionary with 'open' and 'close' dates in 'MM-DD' format.
    - resort_key (str): Key to identify the resort (e.g., 'french_alps/chamonix').
      Not used by the computation itself.

    Returns:
    - pd.DataFrame: Copy of `df` with an added 'season_id' column holding
      '<open year>-<close year>' for rows inside a season and None elsewhere.
      The input DataFrame is never modified.
    """
    df = df.copy()
    df['season_id'] = None  # Initialize season identifier

    # Nothing to label without season information or without rows
    if not season_info or df.empty:
        return df

    open_mm_dd = season_info['open']
    close_mm_dd = season_info['close']

    years = df['date'].dt.year.dropna().unique()
    if len(years) == 0:
        return df

    first_year = int(years.min())
    last_year = int(years.max())

    # Start one year before the first record: a season that opened in the
    # previous calendar year can still cover the earliest rows (for example
    # January-April of the first data year). Iterating only over years that
    # occur in the data would leave those rows without a season_id.
    for year in range(first_year - 1, last_year + 1):
        open_date, close_date = get_season_dates(year, open_mm_dd, close_mm_dd)

        # Rows within the current season (both bounds inclusive)
        season_mask = (df['date'] >= open_date) & (df['date'] <= close_date)
        if not season_mask.any():
            continue

        df.loc[season_mask, 'season_id'] = f"{year}-{close_date.year}"

    return df

### 4.7 Apply Season Categorization

In [30]:
for key, df in data_frames.items():
    resort = key

    # Resorts without a configured operating window are reported and left as-is
    if resort not in resort_seasons:
        print(f"No season information for {resort}. Data not categorized.")
        continue

    season_info = resort_seasons[resort]

    # Tag each row with its season and store the result under the same key
    df = categorize_season(df, season_info, resort)
    data_frames[key] = df
    print(f"Season categorized for {resort}.")

Season categorized for austrian_alps/kitzbuhel.
Season categorized for austrian_alps/st_anton.
Season categorized for austrian_alps/solden.
Season categorized for italian_alps/cortina_d_ampezzo.
Season categorized for italian_alps/sestriere.
Season categorized for italian_alps/val_gardena.
Season categorized for slovenian_alps/kranjska_gora.
Season categorized for slovenian_alps/krvavec.
Season categorized for slovenian_alps/mariborsko_pohorje.
Season categorized for swiss_alps/st_moritz.


### 4.8 Add Operating Season Indicator

In [31]:
def add_operating_season_indicator(df):
    """
    Return a copy of `df` with a boolean 'is_operating_season' column.

    The flag is True for rows whose 'season_id' is not null and False
    otherwise. The input DataFrame is left untouched.
    """
    in_season = df['season_id'].notnull()
    return df.assign(is_operating_season=in_season)

# Replace each stored frame with its flagged copy; only existing keys are
# reassigned, so the dictionary keeps its size while being iterated.
for key in data_frames:
    df = add_operating_season_indicator(data_frames[key])
    data_frames[key] = df
    print(f"Operating season indicator added for {key}.")

Operating season indicator added for austrian_alps/kitzbuhel.
Operating season indicator added for austrian_alps/st_anton.
Operating season indicator added for austrian_alps/solden.
Operating season indicator added for italian_alps/cortina_d_ampezzo.
Operating season indicator added for italian_alps/sestriere.
Operating season indicator added for italian_alps/val_gardena.
Operating season indicator added for slovenian_alps/kranjska_gora.
Operating season indicator added for slovenian_alps/krvavec.
Operating season indicator added for slovenian_alps/mariborsko_pohorje.
Operating season indicator added for swiss_alps/st_moritz.


### 4.9  Impute Missing Values Appropriately

Implement imputation methods based on dataset type.
The code does the following:
- Imputes snow depth increases only when precipitation occurs during the operating season and conditions logically support snow accumulation.
- Uses patterns in the original dataset to guide imputation, focusing on years where similar weather conditions occurred.
- Avoids imputation during the off-season unless there is a strong indication of unusual snowfall.

In [32]:
for key, df in data_frames.items():
    # The imputation below is restricted to operating-season rows, so the
    # flag column is a hard requirement.
    if 'is_operating_season' not in df.columns:
        raise ValueError(f"'is_operating_season' column missing for {key}")

    # Operating-season rows only. The explicit copy makes the .loc
    # assignments below write to an independent frame instead of a slice of
    # `df` (which triggers SettingWithCopyWarning); the values are merged
    # back into `df` with update() at the end.
    season_df = df[df['is_operating_season']].copy()

    if 'snow_depth' in season_df.columns:
        # Missing snow_depth values within the operating season
        missing_mask = season_df['snow_depth'].isnull()

        # --- Case 1: precipitation while temperature_min <= 0 ---
        precip_freeze = (season_df['precipitation_sum'] > 0) & (season_df['temperature_min'] <= 0) & missing_mask

        if precip_freeze.any():
            # Reference rows from every resort: operating season,
            # temperature_min <= 0 and precipitation present.
            # NOTE(review): this is rebuilt for every resort and reads the
            # frames already updated earlier in this loop, so values imputed
            # for previous resorts feed into the average - confirm intended.
            historical_data = pd.concat([
                other_df[(other_df['is_operating_season']) &
                         (other_df['temperature_min'] <= 0) &
                         (other_df['precipitation_sum'] > 0)]
                for other_df in data_frames.values()
            ], ignore_index=True)

            if not historical_data.empty:
                # Mean snow_depth of the reference rows (mean() skips NaN);
                # despite the variable name this is a depth, not a
                # day-over-day increase.
                typical_increase = historical_data['snow_depth'].mean()

                if not np.isnan(typical_increase):
                    season_df.loc[precip_freeze, 'snow_depth'] = typical_increase
                    print(f"{key}: Imputed 'snow_depth' based on historical average increase.")
                else:
                    print(f"{key}: Insufficient historical data for imputation.")
            else:
                print(f"{key}: No historical reference found for imputation under freezing conditions.")

        # --- Case 2: no precipitation (temperature-based melting) ---
        # `missing_mask` was computed before case 1; the two cases are
        # mutually exclusive on precipitation_sum, so this is still accurate.
        no_precip = (season_df['precipitation_sum'] == 0) & missing_mask

        temp_melt_rates = pd.Series(0.0, index=season_df.index)  # Explicitly set to float

        # Melt rate as a fraction of the previous snow depth, by temperature_max band
        temp_melt_rates[(season_df['temperature_max'] > 0) & (season_df['temperature_max'] <= 2)] = 0.005  # Minimal Melt
        temp_melt_rates[(season_df['temperature_max'] > 2) & (season_df['temperature_max'] <= 5)] = 0.01  # Moderate Melt
        temp_melt_rates[(season_df['temperature_max'] > 5) & (season_df['temperature_max'] <= 8)] = 0.015  # High Melt
        temp_melt_rates[season_df['temperature_max'] > 8] = 0.02  # Maximum Melt Rate

        if no_precip.any():
            # NOTE(review): shift(1) runs over the filtered operating-season
            # rows, so the first day of a season takes the last day of the
            # previous season as its "previous" value - confirm intended.
            previous_snow_depth = season_df['snow_depth'].shift(1)
            # Only days with a positive melt rate (temperature_max > 0) are filled
            melt_mask = temp_melt_rates > 0
            # A missing previous value is treated as 0, so the imputed depth is 0
            season_df.loc[no_precip & melt_mask, 'snow_depth'] = (
                previous_snow_depth[no_precip & melt_mask].fillna(0) *
                (1 - temp_melt_rates[no_precip & melt_mask])
            )
            print(f"{key}: Applied temperature-based melting on days with no precipitation and above-freezing temperatures.")

        # Merge the operating-season values back into the full frame.
        # update() aligns on the index and ignores NaN, so off-season rows
        # and still-missing values are left untouched.
        df.update(season_df)

    # Store the frame (operating-season gaps imputed, off-season unchanged)
    data_frames[key] = df

austrian_alps/kitzbuhel: Imputed 'snow_depth' based on historical average increase.
austrian_alps/kitzbuhel: Applied temperature-based melting on days with no precipitation and above-freezing temperatures.
austrian_alps/st_anton: Imputed 'snow_depth' based on historical average increase.
austrian_alps/st_anton: Applied temperature-based melting on days with no precipitation and above-freezing temperatures.
austrian_alps/solden: Imputed 'snow_depth' based on historical average increase.
austrian_alps/solden: Applied temperature-based melting on days with no precipitation and above-freezing temperatures.
italian_alps/cortina_d_ampezzo: Imputed 'snow_depth' based on historical average increase.
italian_alps/cortina_d_ampezzo: Applied temperature-based melting on days with no precipitation and above-freezing temperatures.
italian_alps/sestriere: Imputed 'snow_depth' based on historical average increase.
italian_alps/sestriere: Applied temperature-based melting on days with no precipitation

### 4.10 Incorporating anomaly detection

In [33]:
def detect_snow_depth_anomalies(df, threshold=20):
    """
    Flag off-season rows whose snow_depth is above a threshold.

    Parameters:
    - df (pd.DataFrame): DataFrame containing 'snow_depth' and 'is_operating_season'.
    - threshold (float): Largest snow_depth still considered plausible
      outside the operating season.

    Returns:
    - pd.DataFrame: Copy of `df` with a boolean 'snow_depth_anomaly' column.

    Raises:
    - ValueError: If 'is_operating_season' is not a column of `df`.
    """
    if 'is_operating_season' not in df.columns:
        raise ValueError("'is_operating_season' column is missing in the DataFrame.")

    flagged = df.copy()

    # Anomaly = strictly above the threshold AND outside the operating season
    exceeds_threshold = flagged['snow_depth'].gt(threshold)
    outside_season = flagged['is_operating_season'].eq(False)
    flagged['snow_depth_anomaly'] = exceeds_threshold & outside_season

    return flagged

for key, df in data_frames.items():
    # Work on the flagged copy returned by the detector
    flagged = detect_snow_depth_anomalies(df, threshold=20)

    # Blank out every flagged snow depth so it is treated as missing
    flagged.loc[flagged['snow_depth_anomaly'], 'snow_depth'] = np.nan

    # The helper column is not needed after this step
    df = flagged.drop(columns=['snow_depth_anomaly'])

    # Store the corrected frame back under the same key
    data_frames[key] = df

### 4.11 Handle rounding and unit conversion

In [34]:
# Number of decimal places to keep for each numeric column
columns_to_round = dict(
    snow_depth=1,
    precipitation_sum=1,
    temperature_min=1,
    temperature_max=1,
)

for key, df in data_frames.items():
    for column, decimals in columns_to_round.items():
        # Columns absent from this resort's frame are simply skipped
        if column not in df.columns:
            continue
        df[column] = df[column].round(decimals)
        print(f"{key}: Rounded '{column}' to {decimals} decimal places.")

    data_frames[key] = df
    print(f"Processed numerical columns for {key}.")

austrian_alps/kitzbuhel: Rounded 'snow_depth' to 1 decimal places.
austrian_alps/kitzbuhel: Rounded 'precipitation_sum' to 1 decimal places.
austrian_alps/kitzbuhel: Rounded 'temperature_min' to 1 decimal places.
austrian_alps/kitzbuhel: Rounded 'temperature_max' to 1 decimal places.
Processed numerical columns for austrian_alps/kitzbuhel.
austrian_alps/st_anton: Rounded 'snow_depth' to 1 decimal places.
austrian_alps/st_anton: Rounded 'precipitation_sum' to 1 decimal places.
austrian_alps/st_anton: Rounded 'temperature_min' to 1 decimal places.
austrian_alps/st_anton: Rounded 'temperature_max' to 1 decimal places.
Processed numerical columns for austrian_alps/st_anton.
austrian_alps/solden: Rounded 'snow_depth' to 1 decimal places.
austrian_alps/solden: Rounded 'precipitation_sum' to 1 decimal places.
austrian_alps/solden: Rounded 'temperature_min' to 1 decimal places.
austrian_alps/solden: Rounded 'temperature_max' to 1 decimal places.
Processed numerical columns for austrian_alps/so

## 5. Save Cleaned Data

We'll save the cleaned and filtered DataFrames to the `data/processed` directory, maintaining the normalized folder structure.
Because earlier runs have already written cleaned files into these folders, each file name carries a timestamp so that existing files are not overwritten.

In [35]:
from datetime import datetime  # Used only to build the timestamp suffix

# One timestamp for the whole run, in 'YYYY-MM-DD_HH-MM-SS' format. Taking
# it once (instead of once per resort) guarantees that every file written
# by this run carries the same suffix, even if the loop crosses a second
# boundary.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

for key, df in data_frames.items():
    try:
        # Split the key back into country and resort; a key without exactly
        # one '/' raises ValueError, which is reported below.
        country, resort = key.split('/')

        # <processed root>/<country>/<resort>/, created on demand
        processed_dir = os.path.join(processed_data_root, country, resort)
        os.makedirs(processed_dir, exist_ok=True)

        # The timestamp in the file name prevents overwriting earlier exports
        processed_file_path = os.path.join(processed_dir, f"{resort}_cleaned_{timestamp}.csv")

        # Write the cleaned DataFrame without the index column
        df.to_csv(processed_file_path, index=False)

        print(f"Saved cleaned data to {processed_file_path}.")

    except Exception as e:
        # Best effort: report the failure and continue with the next resort
        print(f"Error saving data for {key}: {e}")


Saved cleaned data to ../data/processed/cds/austrian_alps/kitzbuhel/kitzbuhel_cleaned_2024-10-15_12-03-53.csv.
Saved cleaned data to ../data/processed/cds/austrian_alps/st_anton/st_anton_cleaned_2024-10-15_12-03-53.csv.
Saved cleaned data to ../data/processed/cds/austrian_alps/solden/solden_cleaned_2024-10-15_12-03-53.csv.
Saved cleaned data to ../data/processed/cds/italian_alps/cortina_d_ampezzo/cortina_d_ampezzo_cleaned_2024-10-15_12-03-53.csv.
Saved cleaned data to ../data/processed/cds/italian_alps/sestriere/sestriere_cleaned_2024-10-15_12-03-53.csv.
Saved cleaned data to ../data/processed/cds/italian_alps/val_gardena/val_gardena_cleaned_2024-10-15_12-03-53.csv.
Saved cleaned data to ../data/processed/cds/slovenian_alps/kranjska_gora/kranjska_gora_cleaned_2024-10-15_12-03-53.csv.
Saved cleaned data to ../data/processed/cds/slovenian_alps/krvavec/krvavec_cleaned_2024-10-15_12-03-53.csv.
Saved cleaned data to ../data/processed/cds/slovenian_alps/mariborsko_pohorje/mariborsko_pohorje_