# Expand Dataframes Based on Family Types and Countries

In [None]:
import os
import json
import numpy as np
import pandas as pd

from config import COUNTRY_CODE, COUNTRY_CODE_TO_NAME, COUNTRIES, \
    JSON_MASTER_FILE_PATH, YEARLY_FILE_TEMPLATE, \
    FAMILY_NOISE_FACTOR, WEATHER_NOISE_FACTOR, \
    EXP_PROFILES_EXPAND_PATH, EXP_WEATHER_EXPAND_PATH, \
    WEATHER_COMBINED_PATH, FAMILY_COMBINED_PATH

### Functions

In [None]:
def load_master_file(country_code, output_file):
    """
    Load the master file for a specific country.

    The file is read from ``f"{output_file}_{country_code}.csv"`` and its
    'datetime' column is parsed as dates. When the file has no 'pattern'
    column, one is derived: 'Weekend' for rows where 'is_weekend' or
    'is_holiday' equals 1, 'Weekday' otherwise.

    Parameters:
    - country_code (str): Country code used as the file-name suffix.
    - output_file (str): Path prefix of the master file, without the
      ``_<country_code>.csv`` suffix.

    Returns:
    - pd.DataFrame: The master dataframe for the specified country.
    """
    file_path = f"{output_file}_{country_code}.csv"
    master_df = pd.read_csv(file_path, parse_dates=['datetime'])

    # Create the 'pattern' column in master_df
    if 'pattern' not in master_df.columns:
        # Vectorized comparison instead of a Python-level call per row;
        # this also behaves sensibly on a file with a header but no rows.
        is_day_off = (master_df['is_weekend'] == 1) | (master_df['is_holiday'] == 1)
        master_df['pattern'] = np.where(is_day_off, 'Weekend', 'Weekday')
        print("\tCreated 'pattern' column in master_df.")

    return master_df

def get_family_types_for_country(country_name, family_types_json):
    """
    Look up the family types listed for one country in the loaded JSON.

    Parameters:
    - country_name (str): Name of the country to look for (compared against each entry's 'Country').
    - family_types_json (list): The loaded JSON as a list of dictionaries.

    Returns:
    - list: The 'Family Type' of every item under 'Families' of the first matching entry;
      an empty list when no entry matches or the entry has no 'Families'.
    """
    # Only the first entry whose 'Country' matches is considered.
    matching_entry = next(
        (record for record in family_types_json if record.get('Country') == country_name),
        None,
    )
    if matching_entry is None:
        return []

    families = matching_entry.get('Families', [])
    return [item['Family Type'] for item in families]

def load_family_csv(directory, country_name, family_type):
    """
    Read the combined CSV of one family type of one country, if it exists.

    The file is expected at ``<directory>/<country>_<family type>_combined.csv``,
    where spaces in both names and apostrophes in the family type are replaced
    by hyphens.

    Parameters:
    - directory (str): Path to the directory containing the CSV files.
    - country_name (str): Name of the country.
    - family_type (str): Family type for the country.

    Returns:
    - pd.DataFrame or None: DataFrame if the file exists, otherwise None.
    """
    # Build the file-system-safe name parts
    country_slug = country_name.replace(" ", "-")
    family_slug = family_type.translate(str.maketrans({" ": "-", "'": "-"}))

    file_name = f"{country_slug}_{family_slug}_combined.csv"
    file_path = os.path.join(directory, file_name)

    # Guard clause: report and bail out when the file is absent
    if not os.path.isfile(file_path):
        print(f"\tFile not found: {file_name}")
        return None

    print(f"  Loaded: {file_name}")
    return pd.read_csv(file_path)

def load_weather_csv(directory, country_name):
    """
    Read the combined weather CSV of one country, if it exists.

    The file is expected at ``<directory>/<country>_weather_combined.csv``,
    where spaces in the country name are replaced by hyphens.

    Parameters:
    - directory (str): Path to the directory containing the CSV files.
    - country_name (str): Name of the country.

    Returns:
    - pd.DataFrame or None: DataFrame if the file exists, otherwise None.
    """
    file_name = "{}_weather_combined.csv".format(country_name.replace(" ", "-"))
    file_path = os.path.join(directory, file_name)

    found = os.path.isfile(file_path)
    if found:
        print(f"  Loaded: {file_name}")
    else:
        print(f"\tFile not found: {file_name}")

    # Only touch the disk again when the file is really there
    return pd.read_csv(file_path) if found else None

def apply_dynamic_noise(expanded_df, col, noise_factor=0.1):
    """
    Apply dynamic noise to a column in a DataFrame based on its statistical properties.

    Zero-mean Gaussian noise is added to every row. Its standard deviation is
    ``|noise_factor * mean(column)|``, i.e. proportional to the magnitude of the
    column mean (not to the column's own standard deviation). The result is
    rounded to 3 decimals. The input DataFrame is not modified.

    Parameters:
    - expanded_df (pd.DataFrame): The DataFrame containing the column.
    - col (str): The column to which noise is applied.
    - noise_factor (float): A scaling factor for the noise (default=0.1).

    Returns:
    - pd.Series: The column with dynamic noise applied.
    """
    values = expanded_df[col]

    # np.random.normal rejects a negative scale, which is what
    # noise_factor * mean yields for a column with a negative mean.
    # Use the magnitude so such columns get noise instead of a ValueError.
    noise_scale = abs(noise_factor * values.mean())
    scaled_noise = np.random.normal(0, noise_scale, size=len(expanded_df))

    return np.round(values + scaled_noise, 3)

def expand_family_data(master_df, family_df, main_dir, noise_factor=0.1):
    """
    Expand family data over the master data and save the result as CSV.

    Every row of the master data is left-joined with the family template row
    that shares its season, pattern and hour. Noise is then applied to each
    consumption column (values are clipped at 0), the total electricity usage
    is recomputed, helper columns are dropped, and the result is written to
    ``<main_dir>/<country>_<family type>_expanded.csv``.

    Parameters:
    - master_df (pd.DataFrame): The full-year master data (hourly steps).
    - family_df (pd.DataFrame): The family consumption data (template for expansion).
      Must provide 'country' and 'family_type' columns (any letter case).
    - main_dir (str): Directory the expanded CSV file is written to.
    - noise_factor (float): The factor to apply random noise to consumption values.

    Returns:
    - pd.DataFrame: Expanded DataFrame with family data for the full year.

    Raises:
    - KeyError: If 'season', 'pattern' or 'hour' is missing from either input,
      or if one of the columns to drop is absent from the merged data.
    """
    # Normalize column names (lowercase, stripped) on shallow copies so the
    # caller's DataFrames keep their original headers.
    master_df = master_df.copy(deep=False)
    family_df = family_df.copy(deep=False)
    master_df.columns = master_df.columns.str.lower().str.strip()
    family_df.columns = family_df.columns.str.lower().str.strip()

    # Ensure required columns are present
    merge_keys = ["season", "pattern", "hour"]
    for col in merge_keys:
        if col not in master_df.columns:
            raise KeyError(f"Column '{col}' is missing in master_df.")
        if col not in family_df.columns:
            raise KeyError(f"Column '{col}' is missing in family_df.")

    # Merge master with family template based on season, pattern, and hour.
    # Columns present in both inputs keep the master's name; the family's
    # version gets the '_template' suffix.
    expanded_df = pd.merge(
        master_df,
        family_df,
        on=merge_keys,
        how="left",
        suffixes=('', '_template')
    )

    # Only real consumption columns get noise and count towards the total;
    # '_template' duplicates are dropped below and must not be summed in.
    consumption_columns = [
        col for col in expanded_df.columns
        if "consumption" in col and not col.endswith('_template')
    ]
    for col in consumption_columns:
        # Consumption cannot be negative, so clip after adding noise.
        expanded_df[col] = apply_dynamic_noise(expanded_df, col, noise_factor).clip(lower=0)

    # Recalculate the total as the sum of all individual consumption columns
    expanded_df['total_electricity_usage'] = np.round(expanded_df[consumption_columns].sum(axis=1), 3)

    # Drop the template/action helper columns (if any), then the columns that
    # are not part of the expanded output.
    kept_columns = [
        col for col in expanded_df.columns
        if not col.endswith(('_template', '_action'))
    ]
    expanded_df = expanded_df[kept_columns]
    expanded_df = expanded_df.drop(
        columns=['country', 'family_type', 'holiday_desc', 'year', 'month', 'day', 'hour']
    )

    # Prepare the country name and family type for safe file saving.
    # .iloc[0] takes the first row by position, whatever the index labels are.
    formatted_country = family_df['country'].iloc[0].replace(" ", "-")
    formatted_family_type = family_df['family_type'].iloc[0].replace(" ", "-").replace("'", "-")

    # Save the expanded data to a csv file
    output_name = f"{formatted_country}_{formatted_family_type}_expanded.csv"
    expanded_df.to_csv(os.path.join(main_dir, output_name), index=False)

    return expanded_df

def expand_weather_data(master_df, weather_df, main_dir, noise_factor=0.1):
    """
    Expand weather data over the master data and save the result as CSV.

    Every row of the master data is left-joined with the weather template row
    that shares its season and hour. Each column whose name contains "value"
    is multiplied by a random factor drawn uniformly from
    ``[1 - noise_factor, 1 + noise_factor]`` and rounded to 3 decimals. Helper
    columns are dropped and the result is written to
    ``<main_dir>/<country>_weather_expanded.csv``.

    Parameters:
    - master_df (pd.DataFrame): The full-year master data (hourly steps).
    - weather_df (pd.DataFrame): The weather data (template for expansion).
      Must provide a 'country' column (any letter case).
    - main_dir (str): Directory the expanded CSV file is written to.
    - noise_factor (float): The factor to apply random noise to the value columns.

    Returns:
    - pd.DataFrame: Expanded DataFrame with weather data for the full year.

    Raises:
    - KeyError: If 'season' or 'hour' is missing from either input, or if one
      of the columns to drop is absent from the merged data.
    """
    # Normalize column names (lowercase, stripped) on shallow copies so the
    # caller's DataFrames keep their original headers.
    master_df = master_df.copy(deep=False)
    weather_df = weather_df.copy(deep=False)
    master_df.columns = master_df.columns.str.lower().str.strip()
    weather_df.columns = weather_df.columns.str.lower().str.strip()

    # Ensure required columns are present
    merge_keys = ["season", "hour"]
    for col in merge_keys:
        if col not in master_df.columns:
            raise KeyError(f"Column '{col}' is missing in master_df.")
        if col not in weather_df.columns:
            raise KeyError(f"Column '{col}' is missing in weather_df.")

    # Merge master with weather template based on season and hour.
    # Columns present in both inputs keep the master's name; the weather
    # version gets the '_template' suffix.
    expanded_df = pd.merge(
        master_df,
        weather_df,
        on=merge_keys,
        how="left",
        suffixes=('', '_template')
    )

    # Apply multiplicative noise to the value columns (template copies excluded)
    value_columns = [
        col for col in expanded_df.columns
        if "value" in col and not col.endswith('_template')
    ]
    for col in value_columns:
        jitter = np.random.uniform(-noise_factor, noise_factor, len(expanded_df))
        expanded_df[col] = np.round(expanded_df[col] * (1 + jitter), 3)

    # Drop the template/description helper columns (if any), then the columns
    # that are not part of the expanded output.
    kept_columns = [
        col for col in expanded_df.columns
        if not col.endswith(('_template', '_description'))
    ]
    expanded_df = expanded_df[kept_columns]
    expanded_df = expanded_df.drop(
        columns=['country', 'holiday_desc', 'year', 'month', 'day', 'hour']
    )

    # Prepare the country name for safe file saving.
    # .iloc[0] takes the first row by position, whatever the index labels are.
    formatted_country = weather_df['country'].iloc[0].replace(" ", "-")

    # Save the expanded data to a csv file
    output_name = f"{formatted_country}_weather_expanded.csv"
    expanded_df.to_csv(os.path.join(main_dir, output_name), index=False)

    return expanded_df

### Load Family Types

In [3]:
# Read the JSON master file (used below to look up the family types per country).
# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, which may mis-read non-ASCII names.
with open(JSON_MASTER_FILE_PATH, "r", encoding="utf-8") as f:
    family_types_json = json.load(f)

### Expand Family Profiles

In [None]:
# Make sure the output directory exists (no error if it is already there)
os.makedirs(EXP_PROFILES_EXPAND_PATH, exist_ok=True)

for country_code in COUNTRY_CODE:
    # Only process the countries selected in the configuration
    if COUNTRY_CODE_TO_NAME.get(country_code) not in COUNTRIES:
        continue

    country_processed = COUNTRY_CODE_TO_NAME[country_code]
    print(f"\nProcessing data for {country_processed} ({country_code})")

    # Country-level inputs: without them nothing can be expanded for this country
    try:
        master_df = load_master_file(country_code, YEARLY_FILE_TEMPLATE)
        print(f"\nMaster File for {country_code}:")
        # display(master_df.head(2))

        family_types_per_country = get_family_types_for_country(country_processed, family_types_json)
        # display(family_types_per_country)
    except Exception as e:
        print(f"\t1.Error: {e}")
        continue

    # Each family type is handled on its own, so one missing file or failure
    # does not skip the remaining family types of the country.
    for family_type in family_types_per_country:
        print(f"  Family Type: {family_type}")
        try:
            family_df = load_family_csv(FAMILY_COMBINED_PATH, country_processed, family_type)
            if family_df is None:
                # load_family_csv already reported the missing file
                continue
            # display(family_df.head(2))

            # Expand the template over the full year and save it as CSV
            full_year_family_data = expand_family_data(master_df, family_df, EXP_PROFILES_EXPAND_PATH, noise_factor=FAMILY_NOISE_FACTOR)

            # Display the first few rows
            # display(full_year_family_data.head(2))
        except Exception as e:
            print(f"\t2.Error: {e}")

### Expand Weather Profiles

In [None]:
# Make sure the output directory exists (no error if it is already there)
os.makedirs(EXP_WEATHER_EXPAND_PATH, exist_ok=True)

for country_code in COUNTRY_CODE:
    # Only process the countries selected in the configuration
    if COUNTRY_CODE_TO_NAME.get(country_code) not in COUNTRIES:
        continue

    country_processed = COUNTRY_CODE_TO_NAME[country_code]
    print(f"\nProcessing data for {country_processed} ({country_code})")

    try:
        master_df = load_master_file(country_code, YEARLY_FILE_TEMPLATE)
        print(f"\nMaster File for {country_code}:")
        # display(master_df.head(2))

        weather_df = load_weather_csv(WEATHER_COMBINED_PATH, country_processed)
        if weather_df is None:
            # load_weather_csv already reported the missing file
            continue
        # display(weather_df.head(2))

        # Expand the template over the full year and save it as CSV
        full_year_weather_data = expand_weather_data(master_df, weather_df, EXP_WEATHER_EXPAND_PATH, noise_factor=WEATHER_NOISE_FACTOR)

        # Display the first few rows
        # display(full_year_weather_data.head(2))
    except Exception as e:
        print(f"Error: {e}")