## Import packages

In [10]:
import os
from pathlib import Path
import logging
import warnings

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import ruptures as rpt

from datetime import datetime
import exchange_calendars as xc

# for Parquet I/O
import pyarrow as pa
import pyarrow.parquet as pq

In [11]:
# Load the manifest of raw CSV names from
# "../../../01_Data_Raw/02_Macroeconomic_Indicators/_fileNames.txt".
# Each entry is whitespace-trimmed; blank lines are skipped.
file_names = []
with open("../../../01_Data_Raw/02_Macroeconomic_Indicators/_fileNames.txt", "r") as manifest:
    for raw_line in manifest:
        entry = raw_line.strip()
        if entry:
            file_names.append(entry)
file_names

['广东_工业增加值_可比价_规模以上工业企业_当月同比.csv',
 '湖北_工业增加值_可比价_规模以上工业企业_当月同比.csv',
 '中国_产量_原油加工量_当月值.csv',
 '中国_产量_原煤_当月值.csv',
 '中国_产量_水泥_当月值.csv',
 '中国_产量_粗钢_当月值.csv',
 '中国_发电量_火电_当月值.csv',
 '中国_全社会用电量_当月值.csv',
 '中国_社会融资规模_当月值.csv',
 '广东_用电量_当月值.csv',
 '湖北_用电量_当月值.csv',
 '现货价_动力煤_欧洲ARA港.csv',
 '中国_CPI_当月同比.csv',
 '中国_GDP_现价_累计值.csv',
 '广东_GDP_累计值.csv',
 '湖北_GDP_累计值.csv',
 '中国_制造业PMI.csv',
 '期货结算价(连续)_布伦特原油.csv',
 '期货结算价(连续)_欧盟排放配额(EUA).csv',
 '期货收盘价(连续)_NYMEX天然气.csv',
 'CFETS_即期汇率_美元兑人民币.csv']

## For every file listed in `_fileNames.txt`
1. Remove the first $8$ rows (metadata).
2. Remove the last $2$ rows (metadata).
3. Cast the first column to datetime.
4. Remove all data prior to 2012 (and anything dated after 2025-04-29).
5. Cast the second column to numeric.

In [12]:
def remove_meta_data_and_cast_types(relative_path_input, filename,
                                    start_date="2012-01-01", end_date="2025-04-29"):
    """Strip metadata rows from one raw CSV, cast its columns, and save it as Parquet.

    The CSV is read as GBK text, skipping the first 8 and the last 2 lines.
    The two remaining columns are named ``date`` and ``value``; entries that
    cannot be parsed become NaT / NaN. Only rows whose date lies inside
    ``[start_date, end_date]`` (both ends inclusive) are kept, so rows with an
    unparseable date are dropped as well.

    Parameters
    ----------
    relative_path_input : str or path-like
        Directory that contains the raw CSV.
    filename : str
        Name of the CSV file inside ``relative_path_input``.
    start_date, end_date : anything accepted by ``pd.to_datetime``
        Inclusive bounds of the date window to keep. The defaults reproduce
        the window that used to be hard-coded.

    Returns
    -------
    str
        Path of the written file:
        ``../_temp_data_files/01_<stem>_Interim1.parquet``.
    """
    input_file = Path(relative_path_input) / filename
    output_file = f"../_temp_data_files/01_{Path(filename).stem}_Interim1.parquet"

    # The raw file has no usable header row, so column names are supplied here.
    # skipfooter is only supported by the python parsing engine.
    df = pd.read_csv(
        input_file,
        skiprows=8,
        skipfooter=2,
        encoding="gbk",
        engine="python",
        header=None,
        names=["date", "value"],
    )
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df["value"] = pd.to_numeric(df["value"], errors="coerce")

    window_start = pd.to_datetime(start_date)
    window_end = pd.to_datetime(end_date)
    # Comparisons against NaT are False, so unparseable dates fall out here too.
    df = df[(df["date"] >= window_start) & (df["date"] <= window_end)]

    # Create the scratch directory on demand so a fresh checkout does not fail.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(output_file, index=False)
    return output_file

In [13]:
# Convert every raw CSV into its interim Parquet file and record the resulting
# paths, one per line, so that later cells can reload them from disk.
output_txt = "01_Interim1_paths.txt"
raw_dir = "../../../01_Data_Raw/02_Macroeconomic_Indicators"
with open(output_txt, "w") as f_out:
    for raw_name in file_names:
        interim_path = remove_meta_data_and_cast_types(raw_dir, raw_name)
        print(interim_path, file=f_out)

## Group Files Based on Frequency

In [14]:
def infer_freq(df):
    """Classify the sampling frequency of a frame from its first column.

    The first column is sorted, consecutive differences are taken, and the
    whole-day part of the median difference decides the label:

    * exactly 1 day        -> "D"
    * 28 to 31 days        -> "M"
    * 86 to 94 days        -> "Q"
    * anything else        -> "Other_<median_days>"
    """
    ordered_dates = df.iloc[:, 0].sort_values()
    gaps = ordered_dates.diff().dropna()
    median_days = gaps.median().days

    if median_days == 1:
        return "D"  # Daily
    if 27 < median_days < 32:
        return "M"  # Monthly
    if 85 < median_days < 95:
        return "Q"  # Quarterly
    return f"Other_{median_days}"

In [15]:
# Load the interim Parquet paths recorded earlier (blank lines are ignored).
with open("01_Interim1_paths.txt", "r") as path_list:
    parquet_files = [stripped for stripped in map(str.strip, path_list) if stripped]

# Bucket every file under the frequency label inferred from its date column.
freq_groups = {}
for parquet_path in parquet_files:
    label = infer_freq(pd.read_parquet(parquet_path))
    if label not in freq_groups:
        freq_groups[label] = []
    freq_groups[label].append(parquet_path)

# Show which files ended up in which frequency group.
for label, members in freq_groups.items():
    print(f"{label}:")
    for member in members:
        print(f"  {Path(member).name}")

M:
  01_广东_工业增加值_可比价_规模以上工业企业_当月同比_Interim1.parquet
  01_湖北_工业增加值_可比价_规模以上工业企业_当月同比_Interim1.parquet
  01_中国_产量_原油加工量_当月值_Interim1.parquet
  01_中国_产量_原煤_当月值_Interim1.parquet
  01_中国_产量_水泥_当月值_Interim1.parquet
  01_中国_产量_粗钢_当月值_Interim1.parquet
  01_中国_发电量_火电_当月值_Interim1.parquet
  01_中国_全社会用电量_当月值_Interim1.parquet
  01_中国_社会融资规模_当月值_Interim1.parquet
  01_广东_用电量_当月值_Interim1.parquet
  01_湖北_用电量_当月值_Interim1.parquet
  01_中国_CPI_当月同比_Interim1.parquet
  01_中国_制造业PMI_Interim1.parquet
D:
  01_现货价_动力煤_欧洲ARA港_Interim1.parquet
  01_期货结算价(连续)_布伦特原油_Interim1.parquet
  01_期货结算价(连续)_欧盟排放配额(EUA)_Interim1.parquet
  01_期货收盘价(连续)_NYMEX天然气_Interim1.parquet
  01_CFETS_即期汇率_美元兑人民币_Interim1.parquet
Q:
  01_中国_GDP_现价_累计值_Interim1.parquet
  01_广东_GDP_累计值_Interim1.parquet
  01_湖北_GDP_累计值_Interim1.parquet


In [16]:
# Report, per frequency group, the share of NaN cells in each interim file.
# The denominator is every cell of the frame (rows x all columns), so the
# date column is counted alongside the value column.
for group_label, group_files in freq_groups.items():
    print(f"{group_label}:")
    for parquet_path in group_files:
        frame = pd.read_parquet(parquet_path)
        row_count = len(frame)
        if row_count > 0:
            total_cells = row_count * frame.shape[1]
            pct_missing = 100 * frame.isna().sum().sum() / total_cells
        else:
            # An empty frame is reported as 0% rather than dividing by zero.
            pct_missing = 0
        print(f"  {Path(parquet_path).name}: {pct_missing:.2f}% missing")

M:
  01_广东_工业增加值_可比价_规模以上工业企业_当月同比_Interim1.parquet: 8.18% missing
  01_湖北_工业增加值_可比价_规模以上工业企业_当月同比_Interim1.parquet: 8.49% missing
  01_中国_产量_原油加工量_当月值_Interim1.parquet: 6.29% missing
  01_中国_产量_原煤_当月值_Interim1.parquet: 7.23% missing
  01_中国_产量_水泥_当月值_Interim1.parquet: 7.23% missing
  01_中国_产量_粗钢_当月值_Interim1.parquet: 6.29% missing
  01_中国_发电量_火电_当月值_Interim1.parquet: 6.29% missing
  01_中国_全社会用电量_当月值_Interim1.parquet: 0.63% missing
  01_中国_社会融资规模_当月值_Interim1.parquet: 0.00% missing
  01_广东_用电量_当月值_Interim1.parquet: 1.89% missing
  01_湖北_用电量_当月值_Interim1.parquet: 1.89% missing
  01_中国_CPI_当月同比_Interim1.parquet: 0.00% missing
  01_中国_制造业PMI_Interim1.parquet: 0.00% missing
D:
  01_现货价_动力煤_欧洲ARA港_Interim1.parquet: 25.33% missing
  01_期货结算价(连续)_布伦特原油_Interim1.parquet: 14.69% missing
  01_期货结算价(连续)_欧盟排放配额(EUA)_Interim1.parquet: 14.68% missing
  01_期货收盘价(连续)_NYMEX天然气_Interim1.parquet: 15.76% missing
  01_CFETS_即期汇率_美元兑人民币_Interim1.parquet: 16.78% missing
Q:
  01_中国_GDP_现价_累计值_Interim1.parquet

## Fill all Missing Data

### Forward Fill

In [17]:
# Forward-fill every interim series and store the result (Parquet + CSV) under
# 01_Forward_Filled. A boolean "<col>_filled" column marks the entries that
# were missing in the input and received a value from the fill.
dir_forward_fill = Path("../../../02_Data_Processed/02_Macroeconomic_Indicators/01_Forward_Filled")
dir_forward_fill.mkdir(parents=True, exist_ok=True)

output_filenames = []

for interim_paths in freq_groups.values():
    for interim_path in interim_paths:
        frame = pd.read_parquet(interim_path)
        # Column 0 is the date; snapshot the remaining names because flag
        # columns are appended to the frame inside the loop.
        for value_col in list(frame.columns[1:]):
            was_missing = frame[value_col].isna()
            frame[value_col] = frame[value_col].ffill()
            # Leading gaps have nothing to copy from, stay NaN, and are not flagged.
            frame[f"{value_col}_filled"] = was_missing & frame[value_col].notna()

        base_name = Path(interim_path).stem.replace("_Interim1", "").removeprefix("01_")
        parquet_target = dir_forward_fill / f"{base_name}_ffill.parquet"
        csv_target = dir_forward_fill / f"{base_name}_ffill.csv"
        frame.to_parquet(parquet_target, index=False)
        frame.to_csv(csv_target, index=False)
        output_filenames.append(parquet_target.name)

# Write _fileNames.txt listing the Parquet outputs, one name per line.
with open(dir_forward_fill / "_fileNames.txt", "w") as manifest:
    manifest.writelines(name + "\n" for name in output_filenames)

### Fill with interpolation

In [18]:
# Linearly interpolate every interim series and store the result (Parquet +
# CSV) under 02_Interpolated. A boolean "<col>_filled" column marks the
# entries that were missing in the input and received an interpolated value.
dir_interp = Path("../../../02_Data_Processed/02_Macroeconomic_Indicators/02_Interpolated")
dir_interp.mkdir(parents=True, exist_ok=True)

output_filenames_interp = []

for freq, files in freq_groups.items():
    for f in files:
        df = pd.read_parquet(f)
        for col in df.columns[1:]:  # skip date col
            filled_col = f"{col}_filled"
            isna = df[col].isna()
            # Forward direction only: leading gaps are left as NaN.
            interpolated = df[col].interpolate(method="linear", limit_direction="forward")
            # Round ONLY the interpolated entries to one decimal. Rounding the
            # whole column would also truncate observed values (e.g. a
            # 4-decimal exchange rate), which the forward-filled output keeps
            # at source precision.
            df[col] = interpolated.where(~isna, interpolated.round(1))
            df[filled_col] = isna & df[col].notna()
        base_name = Path(f).stem.replace("_Interim1", "").removeprefix("01_")
        out_path_parquet = dir_interp / f"{base_name}_interp.parquet"
        out_path_csv = dir_interp / f"{base_name}_interp.csv"
        df.to_parquet(out_path_parquet, index=False)
        df.to_csv(out_path_csv, index=False)
        output_filenames_interp.append(out_path_parquet.name)

# Write _fileNames.txt listing the Parquet outputs, one name per line.
with open(dir_interp / "_fileNames.txt", "w") as f:
    for name in output_filenames_interp:
        f.write(name + "\n")