## Import libraries

In [1]:
import pandas as pd
import numpy as np
import os
import glob

## Read all required files (ffilel price + ffiled macro)

In [2]:
# Carbon dir
path_HBEA = "../../02_Data_Processed/01_Carbon_Markets/01_Regional/HBEA_forward_filled.parquet"
path_GDEA = "../../02_Data_Processed/01_Carbon_Markets/01_Regional/GDEA_forward_filled.parquet"

macro_dir_hubei = "../../02_Data_Processed/02_Macroeconomic_Indicators/03_Forward_Filled_Daily/hubei/"
macro_dir_guangdong = "../../02_Data_Processed/02_Macroeconomic_Indicators/03_Forward_Filled_Daily/guangdong/"
macro_dir_national_global = "../../02_Data_Processed/02_Macroeconomic_Indicators/03_Forward_Filled_Daily/national_or_global/"


In [3]:
macro_paths_hubei = glob.glob(os.path.join(macro_dir_hubei, "*.parquet"))
macro_paths_guangdong = glob.glob(os.path.join(macro_dir_guangdong, "*.parquet"))
macro_paths_national_global = glob.glob(os.path.join(macro_dir_national_global, "*.parquet"))


In [4]:

hbea = pd.read_parquet(path_HBEA)
gdea = pd.read_parquet(path_GDEA)

## Detect frequency

In [5]:
def detect_frequency(df: pd.DataFrame):
    if 'date' in df.columns:
        df = df.set_index('date')
    df.index = pd.to_datetime(df.index)

    # Find where the value changes (i.e., not forward-filled)
    changes = df['value'].ne(df['value'].shift())
    change_dates = df.index[changes]
    # Calculate gaps
    gaps = (change_dates[1:] - change_dates[:-1]).days
    median_gap = np.median(gaps)
    if median_gap <= 1.5:
        return 'D'
    elif 25 < median_gap < 35:
        return 'M'
    elif 80 < median_gap < 100:
        return 'Q'
    else:
        return f'unknown (median gap: {median_gap})'

In [6]:
macro_groups = {
    "hubei": macro_paths_hubei,
    "guangdong": macro_paths_guangdong,
    "national_global": macro_paths_national_global,
}

dfs_by_group = {}

for group_name, file_list in macro_groups.items():
    group_dfs = {}
    for path in file_list:
        name = os.path.splitext(os.path.basename(path))[0]
        df = pd.read_parquet(path)
        df = df.set_index('date')
        group_dfs[name] = df
    dfs_by_group[group_name] = group_dfs

## Shift either 1 day (daily) or 15 day (monthely, quarterly) based on frequency


In [7]:
shift_map = {'D': 1, 'M': 15, 'Q': 15}

shifted_dfs_by_group = {}

for group, group_dfs in dfs_by_group.items():
    shifted_group = {}
    for name, df in group_dfs.items():
        freq = detect_frequency(df) 
        shift_n = shift_map.get(freq, 1)
        name_shifted = f"{name}_{shift_n}"
        shifted_group[name_shifted] = df.shift(shift_n)
    shifted_dfs_by_group[group] = shifted_group

In [8]:
for key in shifted_dfs_by_group.keys():
    for k in shifted_dfs_by_group[key].keys():
        print(k)

Hubei_ElectricityConsumption_Monthly_ffill_daily_15
Hubei_IndustrialAddedValue_RealPrices_AboveScaleIndustry_YoY_ffill_daily_15
Hubei_GDP_Cumulative_ffill_daily_15
Guangdong_GDP_Cumulative_ffill_daily_15
Guangdong_ElectricityConsumption_Monthly_ffill_daily_15
Guangdong_IndustrialAddedValue_RealPrices_AboveScaleIndustry_YoY_ffill_daily_15
China_Output_CrudeOilProcessing_Monthly_ffill_daily_15
SpotPrice_ThermalCoal_ARA_Europe_ffill_daily_1
Hubei_ElectricityConsumption_Monthly_ffill_daily_15
FuturesSettle(Cont)_BrentCrude_ffill_daily_1
Hubei_IndustrialAddedValue_RealPrices_AboveScaleIndustry_YoY_ffill_daily_15
FuturesClose(Cont)_NYMEX_NatGas_ffill_daily_1
China_Output_CrudeSteel_Monthly_ffill_daily_15
Guangdong_GDP_Cumulative_ffill_daily_15
SpotPrice_ThermalCoal_欧洲ARA港_ffill_daily_1
China_ElectricityGeneration_ThermalPower_Monthly_ffill_daily_15
China_TotalSocialFinancing_Monthly_ffill_daily_15
FuturesSettle(Cont)_EUA_Futures_ffill_daily_1
China_CPI_YoY_ffill_daily_15
China_TotalElectrici

## Final join

In [11]:
def prep_macro_for_join_index(df, new_col_name):
    if 'date' in df.columns:
        df = df.set_index('date')
    df.index = pd.to_datetime(df.index)
    df.index.name = 'date'
    return df[['value']].rename(columns={'value': new_col_name})

def join_macros_on_index(main_df, macro_groups, group_keys):
    result = main_df.copy()
    if 'date' in result.columns:
        result = result.set_index('date')
    result.index = pd.to_datetime(result.index)
    result.index.name = 'date'
    for group in group_keys:
        for col_name, macro_df in macro_groups[group].items():
            macro_ready = prep_macro_for_join_index(macro_df, col_name)
            result = result.join(macro_ready, how='left')
    return result


In [12]:
hbea_final = join_macros_on_index(hbea, shifted_dfs_by_group, ['hubei', 'national_global'])
gdea_final = join_macros_on_index(gdea, shifted_dfs_by_group, ['guangdong', 'national_global'])
hbea_final

ValueError: columns overlap but no suffix specified: Index(['Hubei_ElectricityConsumption_Monthly_ffill_daily_15'], dtype='object')