In [1]:
import os
import re
import pandas as pd

# Names of the imputation models whose per-model log files are read by the cells below.
model_names = [
    "iTransformer",
    "SAITS",
    "NonstationaryTransformer",
    "ETSformer",
    "PatchTST",
    "Crossformer",
    "Informer",
    "Autoformer",
    "Pyraformer",
    "Transformer",
    "BRITS",
    "MRNN",
    "GRUD",
    "TimesNet",
    "MICN",
    "SCINet",
    "StemGNN",
    "FreTS",
    "Koopa",
    "DLinear",
    "FiLM",
    "CSDI",
    "USGAN",
    "GPVAE",
]

# Capture groups, in order: MAE value, MAE spread, MSE value, MSE spread,
# MRE value, MRE spread, average inference time.
metrics_pattern = re.compile(
    r"MAE=(\d+\.\d+) ± (\d+\.\d+), MSE=(\d+\.\d+) ± (\d+\.\d+), MRE=(\d+\.\d+) ± (\d+\.\d+), average inference time=(\d+\.\d+)"
)
# Single capture group: the trainable-parameter count, digits with optional commas.
params_pattern = re.compile(
    r"the number of trainable parameters: ([\d,]+)"
)

def extract_and_format_naive_classification(content):
    """Turn naive-imputation classification log lines into a column-oriented table.

    Lines are expected to start with
    ``<classifier> with <imputation> imputation PR_AUC: <mean>±<std>, ROC_AUC: <mean>±<std>``;
    every other line is ignored.

    Values are assigned *positionally*: the first three matching lines fill the
    'Mean' row, the next three 'Median', then 'LOCF', then 'Linear'. Within a row
    the three lines are taken as XGB, RNN and Transformer, in that order. The
    classifier and imputation names captured from the line are not consulted.

    Args:
        content: iterable of log lines (strings).

    Returns:
        dict mapping "name" and the six "PR_AUC w ..." / "ROC_AUC w ..." column
        names to lists with one entry per imputation method. Metric entries are
        formatted as "mean (std)" with three decimals. Entries that the log did
        not provide are the placeholder string "0". Matching lines beyond the
        twelve expected ones are ignored.
    """
    imputation_methods = ['Mean', 'Median', 'LOCF', 'Linear']
    classifiers = ['XGB', 'RNN', 'Transformer']
    # Each matching line contributes two values (PR_AUC, ROC_AUC).
    values_per_method = 2 * len(classifiers)
    line_pattern = re.compile(
        r"(\w+)\s+with\s+\w+\s+imputation\s+PR_AUC:\s+([\d.]+)±([\d.]+),\s+ROC_AUC:\s+([\d.]+)±([\d.]+)"
    )

    formatted_data = {method: [] for method in imputation_methods}

    current_method_index = 0
    for line in content:
        # All rows are full: stop instead of indexing past the method list.
        if current_method_index >= len(imputation_methods):
            break
        match = line_pattern.match(line)
        if match is None:
            continue
        _, pr_auc_mean, pr_auc_std, roc_auc_mean, roc_auc_std = match.groups()
        bucket = formatted_data[imputation_methods[current_method_index]]
        bucket.append(f"{float(pr_auc_mean):.3f} ({float(pr_auc_std):.3f})")
        bucket.append(f"{float(roc_auc_mean):.3f} ({float(roc_auc_std):.3f})")

        # Three lines (six values) complete one imputation method.
        if len(bucket) == values_per_method:
            current_method_index += 1

    # Column-oriented layout, ready to be passed to pd.DataFrame.
    final_data = {"name": []}
    final_data.update({f"PR_AUC w {classifier}": [] for classifier in classifiers})
    final_data.update({f"ROC_AUC w {classifier}": [] for classifier in classifiers})

    for method in imputation_methods:
        values = formatted_data[method]
        # Pad incomplete rows so a truncated log still yields a full table.
        values = values + ["0"] * (values_per_method - len(values))
        final_data["name"].append(method)
        for position, classifier in enumerate(classifiers):
            # Values are interleaved per line: PR_AUC at even, ROC_AUC at odd offsets.
            final_data[f"PR_AUC w {classifier}"].append(values[2 * position])
            final_data[f"ROC_AUC w {classifier}"].append(values[2 * position + 1])

    return final_data

In [2]:
# Build one CSV per dataset from the "point01" imputation logs.
for dataset in ["BeijingAir", "Electricity", "ETT_h1", "ItalyAir", "Pedestrian", "PeMS", "PhysioNet2012", "PhysioNet2019"]:
    log_dir = f"./imputation_log/point01_log/{dataset}_log"
    output_dir = "./results/imputation/point01"
    columns = ["Model", "Size", "MAE", "MSE", "MRE", "Time"]

    # One record per model. Every field except "Model" starts as the "0"
    # placeholder, which is what remains when a log file (or a log line) is missing.
    records = []
    for model in model_names:
        record = {column: "0" for column in columns}
        record["Model"] = model

        file_path = os.path.join(log_dir, f"{model}_{dataset}.log")
        if os.path.exists(file_path):
            with open(file_path, 'r') as file:
                for line in file:
                    params_match = params_pattern.search(line)
                    if params_match:
                        record["Size"] = params_match.group(1)
                    metrics_match = metrics_pattern.search(line)
                    if metrics_match:
                        mae, mae_std, mse, mse_std, mre, mre_std, infer_time = metrics_match.groups()
                        record["MAE"] = f"{float(mae):.3f} ({float(mae_std):.3f})"
                        record["MSE"] = f"{float(mse):.3f} ({float(mse_std):.3f})"
                        record["MRE"] = f"{float(mre):.3f} ({float(mre_std):.3f})"
                        record["Time"] = infer_time
                        # Only the first metrics line of a log is used.
                        break
        records.append(record)

    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)
    df = pd.DataFrame(records, columns=columns)
    df.to_csv(f"{output_dir}/{dataset}.csv", index=False)

In [3]:
# Build one CSV per dataset from the "point05" imputation logs.
for dataset in ["BeijingAir", "Electricity", "ETT_h1", "ItalyAir", "Pedestrian", "PeMS"]:
    log_dir = f"./imputation_log/point05_log/{dataset}_log"
    output_dir = "./results/imputation/point05"
    columns = ["Model", "Size", "MAE", "MSE", "MRE", "Time"]

    # One record per model. Every field except "Model" starts as the "0"
    # placeholder, which is what remains when a log file (or a log line) is missing.
    records = []
    for model in model_names:
        record = {column: "0" for column in columns}
        record["Model"] = model

        file_path = os.path.join(log_dir, f"{model}_{dataset}.log")
        if os.path.exists(file_path):
            with open(file_path, 'r') as file:
                for line in file:
                    params_match = params_pattern.search(line)
                    if params_match:
                        record["Size"] = params_match.group(1)
                    metrics_match = metrics_pattern.search(line)
                    if metrics_match:
                        mae, mae_std, mse, mse_std, mre, mre_std, infer_time = metrics_match.groups()
                        record["MAE"] = f"{float(mae):.3f} ({float(mae_std):.3f})"
                        record["MSE"] = f"{float(mse):.3f} ({float(mse_std):.3f})"
                        record["MRE"] = f"{float(mre):.3f} ({float(mre_std):.3f})"
                        record["Time"] = infer_time
                        # Only the first metrics line of a log is used.
                        break
        records.append(record)

    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)
    df = pd.DataFrame(records, columns=columns)
    df.to_csv(f"{output_dir}/{dataset}.csv", index=False)

In [4]:
# Build one CSV per dataset from the "point09" imputation logs.
for dataset in ["BeijingAir", "Electricity", "ETT_h1", "ItalyAir", "Pedestrian", "PeMS"]:
    log_dir = f"./imputation_log/point09_log/{dataset}_log"
    output_dir = "./results/imputation/point09"
    columns = ["Model", "Size", "MAE", "MSE", "MRE", "Time"]

    # One record per model. Every field except "Model" starts as the "0"
    # placeholder, which is what remains when a log file (or a log line) is missing.
    records = []
    for model in model_names:
        record = {column: "0" for column in columns}
        record["Model"] = model

        file_path = os.path.join(log_dir, f"{model}_{dataset}.log")
        if os.path.exists(file_path):
            with open(file_path, 'r') as file:
                for line in file:
                    params_match = params_pattern.search(line)
                    if params_match:
                        record["Size"] = params_match.group(1)
                    metrics_match = metrics_pattern.search(line)
                    if metrics_match:
                        mae, mae_std, mse, mse_std, mre, mre_std, infer_time = metrics_match.groups()
                        record["MAE"] = f"{float(mae):.3f} ({float(mae_std):.3f})"
                        record["MSE"] = f"{float(mse):.3f} ({float(mse_std):.3f})"
                        record["MRE"] = f"{float(mre):.3f} ({float(mre_std):.3f})"
                        record["Time"] = infer_time
                        # Only the first metrics line of a log is used.
                        break
        records.append(record)

    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)
    df = pd.DataFrame(records, columns=columns)
    df.to_csv(f"{output_dir}/{dataset}.csv", index=False)

In [5]:
# Build one CSV per dataset from the "block05" imputation logs.
for dataset in ["BeijingAir", "Electricity", "ETT_h1", "ItalyAir", "PeMS"]:
    log_dir = f"./imputation_log/block05_log/{dataset}_log"
    output_dir = "./results/imputation/block05"
    columns = ["Model", "Size", "MAE", "MSE", "MRE", "Time"]

    # One record per model. Every field except "Model" starts as the "0"
    # placeholder, which is what remains when a log file (or a log line) is missing.
    records = []
    for model in model_names:
        record = {column: "0" for column in columns}
        record["Model"] = model

        file_path = os.path.join(log_dir, f"{model}_{dataset}.log")
        if os.path.exists(file_path):
            with open(file_path, 'r') as file:
                for line in file:
                    params_match = params_pattern.search(line)
                    if params_match:
                        record["Size"] = params_match.group(1)
                    metrics_match = metrics_pattern.search(line)
                    if metrics_match:
                        mae, mae_std, mse, mse_std, mre, mre_std, infer_time = metrics_match.groups()
                        record["MAE"] = f"{float(mae):.3f} ({float(mae_std):.3f})"
                        record["MSE"] = f"{float(mse):.3f} ({float(mse_std):.3f})"
                        record["MRE"] = f"{float(mre):.3f} ({float(mre_std):.3f})"
                        record["Time"] = infer_time
                        # Only the first metrics line of a log is used.
                        break
        records.append(record)

    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)
    df = pd.DataFrame(records, columns=columns)
    df.to_csv(f"{output_dir}/{dataset}.csv", index=False)

In [6]:
# Build one CSV per dataset from the "subseq05" imputation logs.
for dataset in ["BeijingAir", "Electricity", "ETT_h1", "ItalyAir", "Pedestrian", "PeMS"]:
    log_dir = f"./imputation_log/subseq05_log/{dataset}_log"
    output_dir = "./results/imputation/subseq05"
    columns = ["Model", "Size", "MAE", "MSE", "MRE", "Time"]

    # One record per model. Every field except "Model" starts as the "0"
    # placeholder, which is what remains when a log file (or a log line) is missing.
    records = []
    for model in model_names:
        record = {column: "0" for column in columns}
        record["Model"] = model

        file_path = os.path.join(log_dir, f"{model}_{dataset}.log")
        if os.path.exists(file_path):
            with open(file_path, 'r') as file:
                for line in file:
                    params_match = params_pattern.search(line)
                    if params_match:
                        record["Size"] = params_match.group(1)
                    metrics_match = metrics_pattern.search(line)
                    if metrics_match:
                        mae, mae_std, mse, mse_std, mre, mre_std, infer_time = metrics_match.groups()
                        record["MAE"] = f"{float(mae):.3f} ({float(mae_std):.3f})"
                        record["MSE"] = f"{float(mse):.3f} ({float(mse_std):.3f})"
                        record["MRE"] = f"{float(mre):.3f} ({float(mre_std):.3f})"
                        record["Time"] = infer_time
                        # Only the first metrics line of a log is used.
                        break
        records.append(record)

    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)
    df = pd.DataFrame(records, columns=columns)
    df.to_csv(f"{output_dir}/{dataset}.csv", index=False)


In [7]:
# Summarise the naive-imputation log into results/naive_imputation.csv.
file_path = './naive_log/naive_imputation.log'
with open(file_path, 'r') as file:
    log_content = file.read()

# Each "Successfully saved ..." line yields a (type, dataset) pair taken from
# the two path components that follow "data/".
save_pattern = re.compile(r'Successfully saved the given data into data/([^/]+)/([^/]+)')
save_matches = save_pattern.findall(log_content)

# Metric lines look like "[<time>]: <method> imputation MAE: .., MSE: .., MRE: ..".
# The pattern is anchored on end-of-line (MULTILINE) rather than on a literal
# "\n", so a final line without a trailing newline is still picked up.
metric_pattern = re.compile(r'\[(.*?)\]: (.*?) imputation MAE: (.*?), MSE: (.*?), MRE: (.*?)$', re.MULTILINE)
metric_matches = metric_pattern.findall(log_content)

# One row per metric line. Pairing with the save lines is positional: every
# block of 4 consecutive metric lines is attributed to one save line, so an
# IndexError here means the log does not contain 4 metric lines per save line.
rows = []
for i, (log_time, method, mae, mse, mre) in enumerate(metric_matches):
    log_type, dataset = save_matches[i // 4]
    rows.append({
        'type': log_type,
        'dataset': dataset,
        'method': method,
        'mae': round(float(mae), 3),
        'mse': round(float(mse), 3),
        'mre': round(float(mre), 3),
    })

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("results", exist_ok=True)
df = pd.DataFrame(rows, columns=['type', 'dataset', 'method', 'mae', 'mse', 'mre'])
df.to_csv("results/naive_imputation.csv", index=False)

In [8]:
# Build one CSV per classification log directory, one row per imputation model.
datasets = ["PhysioNet2012_point_rate01_log", "Pedestrian_subseq_rate05_log", "Pedestrian_point_rate09_log", "Pedestrian_point_rate05_log", "Pedestrian_point_rate01_log"]

# Shared tail of every metric line; groups are PR_AUC value, PR_AUC spread,
# ROC_AUC value, ROC_AUC spread. Raw strings keep "\d" and "\." as regex
# escapes instead of (invalid) Python string escapes.
metric_tail = r"PR_AUC: (\d+\.\d+)±(\d+\.\d+), ROC_AUC: (\d+\.\d+)±(\d+\.\d+)"
# This pattern does not depend on the model, so it is compiled once.
xgb_without_pattern = re.compile(rf"XGB without imputation {metric_tail}")

column_suffixes = ["wt XGB", "w XGB", "w RNN", "w Transformer"]
columns = (
    ["Model"]
    + [f"PR_AUC {suffix}" for suffix in column_suffixes]
    + [f"ROC_AUC {suffix}" for suffix in column_suffixes]
)

for dataset in datasets:
    # Define the path to the log directory
    log_dir = f"classification_log/{dataset}"

    # Log files are named after the bare dataset, not the full directory name.
    if 'PhysioNet2012' in dataset:
        name = 'PhysioNet2012'
    elif 'Pedestrian' in dataset:
        name = 'Pedestrian'
    else:
        # Fail loudly instead of silently reusing the name of a previous dataset.
        raise ValueError(f"Cannot derive the log file name for dataset {dataset!r}")

    records = []
    for model in model_names:
        # re.escape keeps the model name literal inside the pattern.
        escaped_model = re.escape(model)
        patterns = {
            "wt XGB": xgb_without_pattern,
            "w XGB": re.compile(rf"XGB with {escaped_model} imputation {metric_tail}"),
            "w RNN": re.compile(rf"RNN with {escaped_model} imputation {metric_tail}"),
            "w Transformer": re.compile(rf"Transformer with {escaped_model} imputation {metric_tail}"),
        }

        # "0" is the placeholder kept for metrics that are absent from the log
        # (or for every metric when the log file itself is missing).
        record = {column: "0" for column in columns}
        record["Model"] = model

        log_file_path = os.path.join(log_dir, f"{model}_{name}.log")
        if os.path.exists(log_file_path):
            with open(log_file_path, 'r') as file:
                for line in file:
                    # Every pattern is tried on every line; a later matching
                    # line overwrites the value taken from an earlier one.
                    for suffix, pattern in patterns.items():
                        match = pattern.search(line)
                        if match is None:
                            continue
                        pr_auc, pr_conf_int, roc_auc, roc_conf_int = match.groups()
                        record[f"PR_AUC {suffix}"] = f"{float(pr_auc):.3f} ({float(pr_conf_int):.3f})"
                        record[f"ROC_AUC {suffix}"] = f"{float(roc_auc):.3f} ({float(roc_conf_int):.3f})"

        records.append(record)

    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs("./results/classification", exist_ok=True)
    df_all = pd.DataFrame(records, columns=columns)
    # dataset[:-4] drops the trailing "_log" from the directory name.
    df_all.to_csv(f"./results/classification/{dataset[:-4]}.csv", index=False)

# Convert each naive-imputation classification log into a CSV table.
datasets = ["PhysioNet2012", "Pedestrian_subseq05","Pedestrian_point01","Pedestrian_point05","Pedestrian_point09"]

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("./results/classification", exist_ok=True)

for dataset in datasets:
    file_path = f'./naive_log/downstream_classification_naive_{dataset}.log'
    with open(file_path, 'r') as file:
        content = file.readlines()

    # Parse the log lines into a column-oriented dict (one entry per imputation method).
    reformatted_data = extract_and_format_naive_classification(content)

    df_final = pd.DataFrame(reformatted_data)
    df_final.to_csv(f"./results/classification/downstream_classification_naive_{dataset}.csv", index=False)

In [9]:
# Build one CSV per regression log directory, one row per model that has a log file.
datasets = ["ETT_h1_point_rate01","ETT_h1_block_rate05", "ETT_h1_point_rate05", "ETT_h1_subseq_rate05",
            "PeMS_block_rate05", "PeMS_point_rate05","PeMS_subseq_rate05"]

# Metric line formats. The last six groups of every pattern are MAE value/spread,
# MSE value/spread, MRE value/spread; the "with" patterns additionally capture
# the imputation name as their first group.
xgb_without_pattern = re.compile(r"XGB \(without imputation\) regression MAE: (\d+\.\d+)±(\d+\.\d+), MSE: (\d+\.\d+)±(\d+\.\d+), MRE: (\d+\.\d+)±(\d+\.\d+)")
xgb_with_pattern = re.compile(r"XGB \(with (.+) imputation\) regression MAE: (\d+\.\d+)±(\d+\.\d+), MSE: (\d+\.\d+)±(\d+\.\d+), MRE: (\d+\.\d+)±(\d+\.\d+)")
rnn_with_pattern = re.compile(r"RNN \(with (.+) imputation\) regression MAE: (\d+\.\d+)±(\d+\.\d+), MSE: (\d+\.\d+)±(\d+\.\d+), MRE: (\d+\.\d+)±(\d+\.\d+)")
transformer_with_pattern = re.compile(r"Transformer \(with (.+) imputation\) regression MAE: (\d+\.\d+)±(\d+\.\d+), MSE: (\d+\.\d+)±(\d+\.\d+), MRE: (\d+\.\d+)±(\d+\.\d+)")

# (column suffix, pattern) pairs; the order fixes the column order of the CSV.
learner_patterns = [
    ("wt XGB", xgb_without_pattern),
    ("w XGB", xgb_with_pattern),
    ("w RNN", rnn_with_pattern),
    ("w Transformer", transformer_with_pattern),
]
columns = ["Model"] + [
    f"{metric} {suffix}"
    for suffix, _pattern in learner_patterns
    for metric in ("MAE", "MRE", "MSE")
]

for dataset in datasets:
    # Define the path to the log directory
    log_dir = f"./regression_log/{dataset}_log"

    # Log files are named after the bare dataset, not the full setting name.
    if 'ETT_h1' in dataset:
        dataset_name = 'ETT_h1'
    elif 'PeMS' in dataset:
        dataset_name = 'PeMS'
    else:
        # Fail loudly instead of silently reusing the name of a previous dataset.
        raise ValueError(f"Cannot derive the log file name for dataset {dataset!r}")

    records = []
    for model in model_names:
        log_file_path = os.path.join(log_dir, f"{model}_{dataset_name}.log")
        if not os.path.exists(log_file_path):
            # Models without a log file get no row in this table.
            continue

        # Metrics that never appear in the log stay as empty strings.
        record = {column: "" for column in columns}
        record["Model"] = model

        with open(log_file_path, 'r') as file:
            for line in file:
                # Every pattern is tried on every line; a later matching line
                # overwrites the value taken from an earlier one.
                for suffix, pattern in learner_patterns:
                    match = pattern.search(line)
                    if match is None:
                        continue
                    # Drop the leading imputation-name group where present.
                    mae, mae_std, mse, mse_std, mre, mre_std = match.groups()[-6:]
                    record[f"MAE {suffix}"] = f"{float(mae):.3f} ({float(mae_std):.3f})"
                    record[f"MRE {suffix}"] = f"{float(mre):.3f} ({float(mre_std):.3f})"
                    record[f"MSE {suffix}"] = f"{float(mse):.3f} ({float(mse_std):.3f})"

        records.append(record)

    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs('./results/regression', exist_ok=True)
    df_all = pd.DataFrame(records, columns=columns)
    df_all.to_csv(f'./results/regression/{dataset}.csv', index=False)

# Summarise the naive-imputation regression log into a single CSV.
log_file_path = './naive_log/downstream_regression_naive.log'
with open(log_file_path, 'r') as file:
    log_data = file.readlines()

# A "Start running ..." line announces the data/<type>/<dataset> that the
# following result lines belong to.
pattern_type_dataset = re.compile(r"Start running downstream regression task on data/(?P<type>[\w_]+)/(?P<dataset>[\w_]+)")
# A result line names the downstream model, the text in parentheses (stored as
# "Method"), and value±spread for MAE, MSE and MRE.
pattern_results = re.compile(r"(?P<model>\w+) \((?P<method>[\w\s]+)\) regression MAE: (?P<MAE>[\d.]+)±(?P<MAE_CI>[\d.]+), MSE: (?P<MSE>[\d.]+)±(?P<MSE_CI>[\d.]+), MRE: (?P<MRE>[\d.]+)±(?P<MRE_CI>[\d.]+)")

# Container for extracted data: one dict per result line
data = []

current_type = None
current_dataset = None

# Parse the log file
for line in log_data:
    match_type_dataset = pattern_type_dataset.search(line)
    if match_type_dataset:
        current_type = match_type_dataset.group('type')
        current_dataset = match_type_dataset.group('dataset')
        continue

    match_results = pattern_results.search(line)
    # Result lines seen before any "Start running ..." line are skipped.
    if not (match_results and current_type and current_dataset):
        continue

    model = match_results.group('model').strip()
    entry = {
        'Method': match_results.group('method').strip(),
        'Type': current_type,
        'Dataset': current_dataset,
    }
    # Each entry only carries the three columns of its own downstream model;
    # the pivot below merges the entries that share Method/Type/Dataset.
    for metric in ('MAE', 'MSE', 'MRE'):
        value = float(match_results.group(metric))
        spread = float(match_results.group(f'{metric}_CI'))
        entry[f"{metric} w {model}"] = f"{value:.3f} ({spread:.3f})"
    data.append(entry)

# Convert list of dicts to DataFrame
df = pd.DataFrame(data)
# Collapse to one row per (Method, Type, Dataset), taking the first value found
# for each metric column.
df_pivot = df.pivot_table(index=['Method', 'Type', 'Dataset'], 
                          values=['MAE w XGB', 'MSE w XGB', 'MRE w XGB',
                                  'MAE w RNN', 'MSE w RNN', 'MRE w RNN',
                                  'MAE w Transformer', 'MSE w Transformer', 'MRE w Transformer'], 
                          aggfunc='first').reset_index()

# Reorder columns to the MAE / MRE / MSE order used by the other regression tables
df_pivot = df_pivot[['Method', 'Type', 'Dataset', 
                     'MAE w XGB', 'MRE w XGB', 'MSE w XGB', 
                     'MAE w RNN', 'MRE w RNN', 'MSE w RNN', 
                     'MAE w Transformer', 'MRE w Transformer', 'MSE w Transformer']]

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('results/regression', exist_ok=True)
df_pivot.to_csv('results/regression/downstream_regression_naive.csv', index=False)


In [10]:
datasets = ["ETT_h1_point_rate01","ETT_h1_block_rate05", "ETT_h1_point_rate05", "ETT_h1_subseq_rate05",
            "PeMS_block_rate05", "PeMS_point_rate05","PeMS_subseq_rate05"]

# Define regex patterns to extract the required metrics with confidence intervals
xgb_without_pattern = re.compile(r"XGB \(without imputation\) forecasting MAE: (\d+\.\d+)±(\d+\.\d+), MSE: (\d+\.\d+)±(\d+\.\d+), MRE: (\d+\.\d+)±(\d+\.\d+)")
xgb_with_pattern = re.compile(r"XGB \(with (.+) imputation\) forecasting MAE: (\d+\.\d+)±(\d+\.\d+), MSE: (\d+\.\d+)±(\d+\.\d+), MRE: (\d+\.\d+)±(\d+\.\d+)")
rnn_with_pattern = re.compile(r"RNN \(with (.+) imputation\) forecasting MAE: (\d+\.\d+)±(\d+\.\d+), MSE: (\d+\.\d+)±(\d+\.\d+), MRE: (\d+\.\d+)±(\d+\.\d+)")
transformer_with_pattern = re.compile(r"Transformer \(with (.+) imputation\) forecasting MAE: (\d+\.\d+)±(\d+\.\d+), MSE: (\d+\.\d+)±(\d+\.\d+), MRE: (\d+\.\d+)±(\d+\.\d+)")

# Build one results CSV per dataset from the per-model downstream-forecasting logs.
# Relies on `datasets`, `model_names`, `xgb_without_pattern` and `xgb_with_pattern`,
# which are defined outside this block.


def _format_mean_std(mean, std):
    """Format two numeric strings as "mean (std)", each with three decimals."""
    return f"{float(mean):.3f} ({float(std):.3f})"


def _log_dataset_name(dataset):
    """Return the dataset tag used in the log file names for a `datasets` entry.

    Raises:
        ValueError: if the entry contains neither 'ETT_h1' nor 'PeMS'. Without
            this check the tag would be undefined on the first iteration, or
            silently carried over from the previous dataset on later ones.
    """
    for known_name in ('ETT_h1', 'PeMS'):
        if known_name in dataset:
            return known_name
    raise ValueError(
        f"Cannot derive the log file dataset name from {dataset!r}: "
        "expected it to contain 'ETT_h1' or 'PeMS'"
    )


# (column suffix, regex) for every forecaster line looked for in a log file.
# The order here fixes the column order of the output CSV.
forecaster_patterns = (
    ("wt XGB", xgb_without_pattern),
    ("w XGB", xgb_with_pattern),
    ("w RNN", rnn_with_pattern),
    ("w Transformer", transformer_with_pattern),
)
result_columns = ["Model"] + [
    f"{metric} {suffix}"
    for suffix, _ in forecaster_patterns
    for metric in ("MAE", "MRE", "MSE")
]

# Loop through each dataset
for dataset in datasets:
    # Define the path to the log directory
    log_dir = f"./forecasting_log/{dataset}_log"
    dataset_name = _log_dataset_name(dataset)

    # One row per model whose log file exists; models without a log are skipped.
    rows = []
    for model in model_names:
        log_file_path = os.path.join(log_dir, f"{model}_{dataset_name}.log")
        if not os.path.exists(log_file_path):
            continue

        # Metrics that never appear in the log stay as empty strings.
        results = dict.fromkeys(result_columns, "")
        results["Model"] = model

        with open(log_file_path, 'r') as file:
            for line in file:
                # Every pattern is tried on every line; if a pattern matches
                # several lines of the file, the last match wins.
                for suffix, pattern in forecaster_patterns:
                    found = pattern.search(line)
                    if found is None:
                        continue
                    # The last six groups are the MAE, MSE, MRE (mean, std) pairs;
                    # the "with imputation" patterns carry one extra leading group
                    # that is not reported.
                    mae, mae_std, mse, mse_std, mre, mre_std = found.groups()[-6:]
                    results[f"MAE {suffix}"] = _format_mean_std(mae, mae_std)
                    results[f"MRE {suffix}"] = _format_mean_std(mre, mre_std)
                    results[f"MSE {suffix}"] = _format_mean_std(mse, mse_std)

        rows.append(results)

    # Passing `columns` keeps the header intact even when no log file was found.
    df_all = pd.DataFrame(rows, columns=result_columns)
    df_all.to_csv(f'./results/forecasting/{dataset}.csv', index=False)

# Parse the naive-imputation downstream forecasting log into a single CSV with one
# row per (Method, Type, Dataset) and one "mean (std)" column per metric/forecaster.
log_file_path = './naive_log/downstream_forecasting_naive.log'
with open(log_file_path, 'r') as file:
    log_data = file.readlines()

# Define regex patterns for extracting relevant data
# A "Start running ..." line announces the type/dataset that the following result lines belong to.
pattern_type_dataset = re.compile(r"Start running downstream forecasting task on data/(?P<type>[\w_]+)/(?P<dataset>[\w_]+)")
pattern_results = re.compile(r"(?P<model>\w+) \((?P<method>[\w\s]+)\) forecasting MAE: (?P<MAE>[\d.]+)±(?P<MAE_CI>[\d.]+), MSE: (?P<MSE>[\d.]+)±(?P<MSE_CI>[\d.]+), MRE: (?P<MRE>[\d.]+)±(?P<MRE_CI>[\d.]+)")

# Fixed layout of the output CSV.
index_columns = ['Method', 'Type', 'Dataset']
metric_columns = [
    f"{metric} w {forecaster}"
    for forecaster in ('XGB', 'RNN', 'Transformer')
    for metric in ('MAE', 'MRE', 'MSE')
]

# Container for extracted data
data = []

current_type = None
current_dataset = None

# Parse the log file
for line in log_data:
    match_type_dataset = pattern_type_dataset.search(line)
    if match_type_dataset:
        current_type = match_type_dataset.group('type')
        current_dataset = match_type_dataset.group('dataset')
        continue

    match_results = pattern_results.search(line)
    # Result lines seen before any "Start running" header are ignored.
    if match_results and current_type and current_dataset:
        method = match_results.group('method').strip()
        model = match_results.group('model').strip()

        entry = {
            'Method': method,
            'Type': current_type,
            'Dataset': current_dataset,
        }
        for metric in ('MAE', 'MSE', 'MRE'):
            mean = float(match_results.group(metric))
            spread = float(match_results.group(f"{metric}_CI"))
            entry[f"{metric} w {model}"] = f"{mean:.3f} ({spread:.3f})"
        data.append(entry)

# Convert list of dicts to DataFrame
df = pd.DataFrame(data)

# Only pivot on metric columns that were actually parsed: asking pivot_table for a
# column that is absent (empty log, or a forecaster missing from it) raises KeyError.
present_columns = [column for column in metric_columns if column in df.columns]
if present_columns:
    # Each entry fills the columns of a single forecaster, so 'first' (which skips
    # missing values) merges the entries of one (Method, Type, Dataset) into one row.
    df_pivot = df.pivot_table(index=index_columns,
                              values=present_columns,
                              aggfunc='first').reset_index()
else:
    df_pivot = pd.DataFrame(columns=index_columns + metric_columns)

# Reorder columns to match the example format; metrics absent from the log become
# blank cells instead of aborting the export.
df_pivot = df_pivot.reindex(columns=index_columns + metric_columns)
df_pivot.to_csv('./results/forecasting/downstream_forecasting_naive.csv', index=False)
