In [None]:
import os
import json
import glob
import pandas as pd
import matplotlib.pyplot as plt

from config import SEASON_COLORS, AGGREGATE_BY, NUMBER_FAMILIES_PER_COUNTRY, JSON_MASTER_FILE_PATH, SAVE_PLOTS, \
    EXP_PROFILES_EXPAND_PATH, EXP_WEATHER_EXPAND_PATH, CSV_FINAL_PROFILES_WEATHER

### Load Family Types

In [2]:
# Load the master family definition file. JSON is UTF-8 by specification, so the
# encoding is pinned instead of relying on the platform's locale default
# (which would mis-decode non-ASCII names on e.g. Windows).
with open(JSON_MASTER_FILE_PATH, "r", encoding="utf-8") as f:
    family_types_json = json.load(f)

In [3]:
def process_family_json(json_data):
    """
    Builds a per-country lookup of household size for every family type.

    Parameters:
    - json_data (list): Parsed master JSON. Each item is a dict with a 'Country' key and a
      'Families' list; each family is a dict with 'Family Type' and 'Members' keys.

    Returns:
    - dict: {country: {family_type: number_of_members}}, where the number of members is
      the length of the family's 'Members' collection.
    """
    return {
        record['Country']: {
            household['Family Type']: len(household['Members'])
            for household in record['Families']
        }
        for record in json_data
    }

# Build the {country: {family type: member count}} lookup; the plotting code below
# uses it to order the subplots and to label them with the household size.
family_member_counts = process_family_json(family_types_json)
# display(family_member_counts)

### Get CSV Mapping

In [4]:
def get_csv_mapping(PROJ_PATH, family_types_json, process_type):
    """
    Maps the countries (and family types) declared in the master JSON to CSV files in a folder.

    A file matches when its *file name* (not its directory) contains the expected token:
    - "family":  "<country>_<family type>", spaces and apostrophes replaced by "-".
    - "weather": "<country>", spaces replaced by "-".

    Parameters:
    - PROJ_PATH (str): Folder searched (non-recursively) for "*.csv" files.
    - family_types_json (list): Parsed master JSON. Every entry needs a "Country" key and,
      for "family", a "Families" list whose items carry a "Family Type" key.
    - process_type (str): Either "family" or "weather".

    Returns:
    - dict: For "family", {country: {family_type: [paths]}}; for "weather", {country: [paths]}.
      Every path list is sorted alphabetically and may be empty.

    Raises:
    - ValueError: If process_type is neither "family" nor "weather". (Previously this fell
      through to the return statement and failed with an UnboundLocalError.)
    """
    if process_type not in ("family", "weather"):
        raise ValueError(
            f"Invalid process_type {process_type!r}. Choose from 'family' or 'weather'."
        )

    # glob returns files in arbitrary order; callers take element [0], so sort to make
    # that choice reproducible across runs and machines.
    named_files = [
        (os.path.basename(path), path)
        for path in sorted(glob.glob(f"{PROJ_PATH}/*.csv"))
    ]

    def files_containing(token):
        """Return the paths whose file name contains token (directory names are ignored)."""
        return [path for name, path in named_files if token in name]

    # NOTE(review): this is substring matching, so a token that is a prefix of another one
    # (e.g. "Single" vs "Single-Parent") also picks up the longer name's files.
    csv_mapping = {}
    if process_type == "family":
        for entry in family_types_json:
            country = entry["Country"]
            country_token = country.replace(' ', '-')
            csv_mapping[country] = {}
            for family in entry["Families"]:
                family_type = family["Family Type"]
                family_type_clean = family_type.replace(' ', '-').replace("'", '-')
                csv_mapping[country][family_type] = files_containing(
                    f"{country_token}_{family_type_clean}"
                )
    else:  # "weather"
        for entry in family_types_json:
            country = entry["Country"]
            csv_mapping[country] = files_containing(country.replace(' ', '-'))

    return csv_mapping

In [None]:
# Map every country / family type from the master JSON to the matching profile CSV(s)
# found in EXP_PROFILES_EXPAND_PATH, then show the mapping for a visual sanity check.
csv_mapping_family = get_csv_mapping(EXP_PROFILES_EXPAND_PATH, family_types_json, "family")
print("Family CSV Mapping:")
display(csv_mapping_family)

In [None]:
# Map every country from the master JSON to the matching weather CSV(s) found in
# EXP_WEATHER_EXPAND_PATH, then show the mapping for a visual sanity check.
csv_mapping_weather = get_csv_mapping(EXP_WEATHER_EXPAND_PATH, family_types_json, "weather")
print("\nWeather CSV Mapping")
display(csv_mapping_weather)

## Combine Profiles with Weather and Plot Energy Signatures

In [None]:
def combine_family_and_weather_debug(family_data, weather_data):
    """
    Joins family data with weather data on 'datetime' and reports the rows that did not pair up.

    Parameters:
    - family_data (pd.DataFrame): Family profile rows; must have a 'datetime' column.
    - weather_data (pd.DataFrame): Weather rows; must have a 'datetime' column.

    Returns:
    - tuple: (matched, unmatched_family, unmatched_weather)
        * matched: rows whose 'datetime' exists in both inputs. For column names shared by
          both inputs, only the family-side values are kept, under the original name.
        * unmatched_family / unmatched_weather: rows of the outer merge present on one side
          only; these still carry the '_merge' indicator and any '_x'/'_y' suffixed columns.

    Raises:
    - KeyError: If either input lacks a 'datetime' column.
    """
    for frame in (family_data, weather_data):
        if 'datetime' not in frame.columns:
            raise KeyError("Both family_data and weather_data must contain a 'datetime' column.")

    # Outer merge with an indicator column so that one pass yields both the matches
    # and the leftovers from each side.
    merged = pd.merge(family_data, weather_data, on=['datetime'], how='outer', indicator=True)

    origin = merged['_merge']
    unmatched_family = merged[origin == 'left_only']
    unmatched_weather = merged[origin == 'right_only']
    matched = merged[origin == 'both'].drop(columns=['_merge'])

    # Columns present in both inputs come back as '<name>_x' (family) and '<name>_y'
    # (weather): drop the weather copy and give the family copy its plain name back.
    shared_names = [
        label[:-2]
        for label in matched.columns
        if label.endswith('_x') and label[:-2] + '_y' in matched.columns
    ]
    matched = matched.drop(columns=[base + '_y' for base in shared_names])
    matched = matched.rename(columns={base + '_x': base for base in shared_names})

    return matched, unmatched_family, unmatched_weather


def combine_family_and_weather(family_csv_mapping, weather_csv_mapping, output_dir):
    """
    Merges each family profile CSV with its country's weather CSV and writes the results.

    For every country / family type, the first mapped family file is merged with the first
    mapped weather file of that country (via combine_family_and_weather_debug) and saved as
    "<country>_<family type>_combined.csv" in output_dir (spaces and apostrophes in the
    names become "-"). Rows that exist on one side only are saved next to it as
    "..._unmatched_family.csv" / "..._unmatched_weather.csv", but only when there are any.

    Parameters:
    - family_csv_mapping (dict): {country: {family_type: [csv paths]}}.
    - weather_csv_mapping (dict): {country: [csv paths]}.
    - output_dir (str): Destination folder; created if it does not exist.

    Returns:
    - None. Progress and skipped inputs are reported with print().

    Notes:
    - A country or family type whose path list is empty (or whose first path is not a file)
      is skipped with a message instead of raising IndexError.
    - Both CSVs must contain a 'datetime' column; pd.read_csv(parse_dates=['datetime'])
      raises if it is missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    for country, family_files in family_csv_mapping.items():
        print(f"\nProcessing country: {country}")

        # The mapping may hold an empty list for a country without weather files,
        # so guard the [0] access rather than relying on the .get() default.
        weather_candidates = weather_csv_mapping.get(country) or []
        weather_file = weather_candidates[0] if weather_candidates else None
        if not weather_file or not os.path.isfile(weather_file):
            print(f"Weather file missing for {country}. Skipping.")
            continue

        weather_data = pd.read_csv(weather_file, parse_dates=['datetime'])
        print(f"Weather data loaded for {country}: {weather_file}")

        formatted_country = country.replace(" ", "-")

        for family_type, family_file_list in family_files.items():
            family_file = family_file_list[0] if family_file_list else None
            if not family_file or not os.path.isfile(family_file):
                print(f"Family file missing for {family_type} in {country}. Skipping.")
                continue

            family_data = pd.read_csv(family_file, parse_dates=['datetime'])

            master_data, unmatched_family, unmatched_weather = combine_family_and_weather_debug(
                family_data, weather_data
            )

            formatted_family_type = family_type.replace(" ", "-").replace("'", "-")
            file_stem = f"{formatted_country}_{formatted_family_type}"

            # Save combined data
            master_file_path = os.path.join(output_dir, f"{file_stem}_combined.csv")
            master_data.to_csv(master_file_path, index=False)
            print(f"Combined data saved for {country} - {family_type}: {master_file_path}")

            # Save unmatched data only if non-empty
            for side, leftovers in (("family", unmatched_family), ("weather", unmatched_weather)):
                if leftovers.empty:
                    continue
                leftovers_path = os.path.join(output_dir, f"{file_stem}_unmatched_{side}.csv")
                leftovers.to_csv(leftovers_path, index=False)
                print(f"Unmatched {side} rows saved: {leftovers_path}")


# Merge every family profile with its country's weather data and write the resulting
# CSV files into CSV_FINAL_PROFILES_WEATHER.
combine_family_and_weather(csv_mapping_family, csv_mapping_weather, CSV_FINAL_PROFILES_WEATHER)

In [8]:
def get_df_combined(PROJ_PATH, family_types_json):
    """
    Maps every country / family type from the master JSON to the CSV files in PROJ_PATH.

    A file matches when its file name contains "<country>_<family type>" (spaces and
    apostrophes replaced by "-"). That token is also contained in the
    "..._unmatched_*.csv" side files and in files of longer family names sharing the same
    prefix, so the result is ordered to make element [0] reliable: the file named exactly
    "<country>_<family type>_combined.csv" comes first when present, followed by the
    remaining matches in alphabetical order.

    Parameters:
    - PROJ_PATH (str): Folder searched (non-recursively) for "*.csv" files.
    - family_types_json (list): Parsed master JSON; each entry has a "Country" key and a
      "Families" list whose items carry a "Family Type" key.

    Returns:
    - dict: {country: {family_type: [paths]}}; a list is empty when nothing matches.
    """
    # glob order is arbitrary; sort once so the fallback order is reproducible.
    named_files = [
        (os.path.basename(path), path)
        for path in sorted(glob.glob(f"{PROJ_PATH}/*.csv"))
    ]

    csv_mapping = {}
    for entry in family_types_json:
        country = entry["Country"]
        country_token = country.replace(' ', '-')
        csv_mapping[country] = {}
        for family in entry["Families"]:
            family_type = family["Family Type"]
            family_type_clean = family_type.replace(' ', '-').replace("'", '-')
            token = f"{country_token}_{family_type_clean}"
            combined_name = f"{token}_combined.csv"

            matching_files = [path for name, path in named_files if token in name]
            # Stable sort: False (exact combined file) sorts before True (everything else),
            # and the alphabetical order of the rest is preserved.
            matching_files.sort(key=lambda path: os.path.basename(path) != combined_name)
            csv_mapping[country][family_type] = matching_files

    return csv_mapping


# Index the CSV files in CSV_FINAL_PROFILES_WEATHER by country and family type;
# the plotting loop reads the first file listed for each family.
csv_combined_mapping = get_df_combined(CSV_FINAL_PROFILES_WEATHER, family_types_json)

In [9]:
def aggregate_data(data, aggregation_level="daily", records_per_day=24, decimals=2):
    """
    Aggregates the given data to the specified level without modifying the input frame.

    Weather columns are averaged, electricity / consumption columns are summed, and the
    'is_weekend' / 'is_holiday' flags are summed and divided by records_per_day so that
    they express a number of days.

    Parameters:
    - data (pd.DataFrame): Input rows. Must contain the aggregated columns
      ('temperature_value', 'humidity_value', 'solrad-diffuse_value', 'solrad-direct_value',
      'wind-speed_value', 'total_electricity_usage', 'heating_consumption',
      'cooling_consumption', 'is_weekend', 'is_holiday') and the grouping columns of the
      chosen level. 'year', 'month', 'week' and 'day' are derived from 'datetime' when
      absent; 'season', 'quarter' and 'day_name' must already be present when needed.
      Every other column whose name contains 'consumption' is summed as well.
    - aggregation_level (str): 'daily', 'weekly', 'monthly' or 'seasonal'.
    - records_per_day (int | float): Number of input rows that make up one day
      (24 for hourly data). Used to turn summed flags into day counts.
    - decimals (int): Number of decimals the aggregated values are rounded to.

    Returns:
    - pd.DataFrame: One row per group with the grouping columns followed by the aggregated
      columns. For non-daily levels 'is_weekend' / 'is_holiday' are renamed to
      'total_weekend_days' / 'total_holiday_days'.

    Raises:
    - ValueError: If aggregation_level is not supported or records_per_day is not positive.
    """
    group_cols_by_level = {
        "daily": ["year", "season", "quarter", "month", "week", "day", "day_name"],
        "weekly": ["year", "week"],
        "monthly": ["year", "month"],
        "seasonal": ["year", "season"],
    }
    if aggregation_level not in group_cols_by_level:
        raise ValueError("Invalid aggregation level. Choose from 'daily', 'weekly', 'monthly', or 'seasonal'.")
    if records_per_day <= 0:
        raise ValueError("records_per_day must be a positive number.")
    group_cols = group_cols_by_level[aggregation_level]

    # Derive missing calendar columns on a new frame (assign returns a copy) so the
    # caller's DataFrame does not silently gain columns.
    # NOTE(review): 'week' is the ISO week while 'year' is the calendar year, so around
    # New Year a weekly group can mix days from both ends of a year - confirm this is intended.
    calendar_parts = {
        "year": lambda stamps: stamps.dt.year,
        "month": lambda stamps: stamps.dt.month,
        "week": lambda stamps: stamps.dt.isocalendar().week,
        "day": lambda stamps: stamps.dt.day,
    }
    missing_parts = {
        name: derive(data["datetime"])
        for name, derive in calendar_parts.items()
        if name not in data.columns
    }
    frame = data.assign(**missing_parts) if missing_parts else data

    # Define aggregation functions for different columns
    aggregation_functions = {
        "temperature_value": "mean",
        "humidity_value": "mean",
        "solrad-diffuse_value": "mean",
        "solrad-direct_value": "mean",
        "wind-speed_value": "mean",
        "total_electricity_usage": "sum",
        "heating_consumption": "sum",
        "cooling_consumption": "sum",
        # Sum consumptions for individual family members
        **{
            col: "sum"
            for col in frame.columns
            if "consumption" in col and col not in ["heating_consumption", "cooling_consumption"]
        },
        "is_weekend": "sum",
        "is_holiday": "sum",
    }

    # Group and aggregate the data
    aggregated_data = frame.groupby(group_cols).agg(aggregation_functions).reset_index()

    # The flags are set on every row of a flagged day, so the sum counts rows;
    # dividing by the rows per day converts it to days.
    flag_columns = ["is_weekend", "is_holiday"]
    aggregated_data[flag_columns] = aggregated_data[flag_columns] / records_per_day

    # Round only the aggregated values; grouping columns are left untouched.
    value_columns = list(aggregation_functions)
    aggregated_data[value_columns] = aggregated_data[value_columns].round(decimals)

    # Above day level the flags are day totals, so name them accordingly.
    if aggregation_level != "daily":
        aggregated_data = aggregated_data.rename(
            columns={"is_weekend": "total_weekend_days", "is_holiday": "total_holiday_days"}
        )

    return aggregated_data


def plot_energy_signatures_sorted(data, x_col, y_col, family_member_counts, country, ax_list, season_colors=None):
    """
    Plots energy signatures for a specific country with one subplot per family type.

    Family types are ordered by their number of members (ascending) and drawn on the axes
    in that order; families beyond the number of available axes are not drawn. All
    subplots share the same y-limits, derived from the whole country's data.

    Parameters:
    - data (pd.DataFrame): Rows for one country; needs 'family_type', x_col and y_col, plus
      'season' when season_colors is given.
    - x_col (str): Column plotted on the x-axis.
    - y_col (str): Column plotted on the y-axis.
    - family_member_counts (dict): {country: {family_type: member_count}}. Unknown countries
      or family types count as 0 members.
    - country (str): Country whose member counts are used for ordering and titles.
    - ax_list (sequence of Axes): Indexable sequence of axes to draw on.
    - season_colors (dict | None): {season: color}. When given, points are colored per
      season and a legend is placed below the middle subplot; when None, all points of a
      family are drawn in one default-colored scatter and no legend is added.

    Returns:
    - None. The axes are modified in place.
    """
    # One lookup for the country; a country missing from the mapping behaves like
    # a country whose families are all unknown (0 members).
    member_counts = family_member_counts.get(country, {})

    # Sort family types by number of members
    sorted_families = sorted(
        data['family_type'].unique(),
        key=lambda ft: member_counts.get(ft, 0)
    )

    # Consistent y-limits for the country: pad both ends by 5% of the maximum
    y_peak = data[y_col].max()
    y_min = data[y_col].min() - 0.05 * y_peak
    y_max = y_peak + 0.05 * y_peak

    # zip stops at the shorter sequence, so the number of axes caps the families drawn.
    for position, (ax, family) in enumerate(zip(ax_list, sorted_families)):
        family_data = data[data['family_type'] == family]

        if season_colors is not None:
            # One scatter per season so each season gets its own color and legend entry
            for season, color in season_colors.items():
                season_data = family_data[family_data['season'] == season]
                ax.scatter(
                    season_data[x_col],
                    season_data[y_col],
                    label=season,
                    color=color,
                    alpha=0.7
                )
        else:
            ax.scatter(family_data[x_col], family_data[y_col])

        # Set subplot title and labels
        ax.set_title(f"{family} ({member_counts.get(family, 0)} members)", fontsize=10)
        ax.set_xlabel("Outside Temperature (°C)")
        ax.set_ylabel("Total Consumption (kWh)" if position == 0 else "")  # y-label on the first subplot only
        ax.set_ylim(y_min, y_max)
        ax.grid(True)

    if season_colors is not None:
        # Legend entries come from the first subplot; the legend itself is anchored below
        # the middle subplot. The index is derived from ax_list so it is always valid.
        handles, labels = ax_list[0].get_legend_handles_labels()
        legend_ax = ax_list[len(ax_list) // 2]
        legend_ax.legend(handles, labels, loc='lower center', bbox_to_anchor=(0.5, -0.25), ncol=len(season_colors), fontsize=10)

In [None]:
# Draw one figure per country: a row of subplots (one per family type) showing
# electricity usage against outside temperature, optionally saved as PNG and PDF.

# The output folder is only needed when plots are saved; create it once, up front.
PLOT_FOLDER = f"{CSV_FINAL_PROFILES_WEATHER}/plots"
if SAVE_PLOTS:
    os.makedirs(PLOT_FOLDER, exist_ok=True)

x_col = "temperature_value"
y_col = "total_electricity_usage"
# Alternative y-axes computed below: "total_family_consumption", "total_hvac_consumption"

hvac_columns = ['heating_consumption', 'cooling_consumption']

for country, family_files in csv_combined_mapping.items():
    print(f"\nProcessing Country: {country}")

    country_frames = []
    for family_type, family_file_list in family_files.items():
        if not family_file_list:
            continue
        family_file = family_file_list[0]
        if not os.path.isfile(family_file):
            print(f"\t[MISSING-L2] Family file missing for {family_type} in {country}.")
            continue

        master_family_data = pd.read_csv(family_file, parse_dates=['datetime'])

        # Aggregate only for a supported level; any other AGGREGATE_BY value keeps the raw rows.
        if AGGREGATE_BY in ['daily', 'seasonal', 'weekly', 'monthly']:
            data_aggregated = aggregate_data(master_family_data, aggregation_level=AGGREGATE_BY)
        else:
            data_aggregated = master_family_data

        # Per-member consumption columns are collapsed into a single total and then
        # dropped, so frames of families with different members can be concatenated.
        family_consumption_columns = [
            col for col in data_aggregated.columns
            if 'consumption' in col and col not in hvac_columns
        ]
        data_aggregated = data_aggregated.assign(
            family_type=family_type,
            total_family_consumption=data_aggregated[family_consumption_columns].sum(axis=1),
            total_hvac_consumption=data_aggregated[hvac_columns].sum(axis=1),
        ).drop(columns=family_consumption_columns)

        country_frames.append(data_aggregated)

    if not country_frames:
        # Nothing to draw: skip instead of creating and saving an empty figure.
        print(f"\tNo combined data found for {country}. Skipping plot.")
        continue

    country_data = pd.concat(country_frames)  # Combine all family data for the country

    # squeeze=False always returns a 2-D array of axes, so a single-subplot
    # configuration (NUMBER_FAMILIES_PER_COUNTRY == 1) still yields an indexable sequence.
    FIG, AXES = plt.subplots(1, NUMBER_FAMILIES_PER_COUNTRY, figsize=(25, 5), sharey=False, squeeze=False)
    AXES = AXES.ravel()

    plot_energy_signatures_sorted(country_data, x_col, y_col, family_member_counts, country=country, ax_list=AXES, season_colors=SEASON_COLORS)

    FIG.tight_layout()

    if SAVE_PLOTS:
        # Save as PNG with high resolution
        FIG.savefig(os.path.join(PLOT_FOLDER, f"{country}.png"), dpi=300, bbox_inches="tight")
        # Save as PDF
        FIG.savefig(os.path.join(PLOT_FOLDER, f"{country}.pdf"), bbox_inches="tight")
    plt.show()

    # Release the figure; otherwise every country's figure stays alive in memory.
    plt.close(FIG)