In [1]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

In [2]:
from task_config import ZeroFlatConfig1, ZeroFlatConfig2, FewFlatConfig1, FewFlatConfig2
configs_list = [ZeroFlatConfig1, ZeroFlatConfig2, FewFlatConfig1, FewFlatConfig2]

In [36]:
def extract_claim_subclaim(code):
    if pd.isna(code) or code == '-1':
        return -1, -1
    try:
        claim, _ = map(int, code.split('_'))
        return claim, code
    except Exception:
        return -1, -1

In [37]:
def calculate_metrics(y_true, y_pred):
    labels = sorted(set(y_true) | set(y_pred))
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, average='macro', zero_division=0
    )
    # print(f"Precision: {precision:.3f} | Recall: {recall:.3f} | F1 Score: {f1:.3f}")
    return precision, recall, f1

In [49]:
def evaluate_predictions(df):
    metrics = []

    df[['true_claim', 'true_subclaim']] = df['sub_claim_code'].apply(lambda x: pd.Series(extract_claim_subclaim(x)))
    df[['pred_claim', 'pred_subclaim']] = df['prediction'].apply(lambda x: pd.Series(extract_claim_subclaim(x)))

    total = len(df)
    invalid = len(df[df['pred_subclaim'] == -1])
    valid = total - invalid
    
    metrics.append({
        "Category": "Summary",
        "Precision": None,
        "Recall": None,
        "F1 Score": None,
        "Support": f"{valid}/{total} ({valid / total:.1%} valid)"
    })

    # Filter valid examples only
    valid_df = df[(df['pred_subclaim'] != -1) & (df['true_subclaim'] != -1)]

    # ---------- Overall Metrics ----------
    precision, recall, f1 = calculate_metrics(valid_df['true_subclaim'], valid_df['pred_subclaim'])
    metrics.append({
            "Category": "All Sub-claims",
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "Support": None
    })

    # print("\n--- Overall Claim Classification ---")
    precision, recall, f1 = calculate_metrics(valid_df['true_claim'], valid_df['pred_claim'])
    metrics.append({
            "Category": "All Claims",
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "Support": None
    })

    # ---------- Per-Claim Category ----------
    for claim in sorted(valid_df['true_claim'].unique()):
        subset = valid_df[valid_df['true_claim'] == claim]
        precision, recall, f1 = calculate_metrics(subset['true_claim'], subset['pred_claim'])
        metrics.append({
            "Category": f"Claim {claim}",
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "Support": None
        })

    # ---------- Per Sub-Claim Category ----------
    for subclaim in sorted(valid_df['true_subclaim'].unique()):
        subset = valid_df[valid_df['true_subclaim'] == subclaim]
        calculate_metrics(subset['true_subclaim'], subset['pred_subclaim'])
        metrics.append({
            "Category": f"Sub-claim {subclaim}",
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "Support": None
        })

    return pd.DataFrame(metrics)
    


In [50]:
def print_statistics(df):

    df[['true_claim', 'true_subclaim']] = df['sub_claim_code'].apply(lambda x: pd.Series(extract_claim_subclaim(x)))
    df[['pred_claim', 'pred_subclaim']] = df['prediction'].apply(lambda x: pd.Series(extract_claim_subclaim(x)))

    total = len(df)
    invalid = len(df[df['pred_subclaim'] == -1])
    valid = total - invalid
    
    valid_df = df[(df['pred_subclaim'] != -1) & (df['true_subclaim'] != -1)]

    for claim in sorted(valid_df['true_claim'].unique()):
        subset = valid_df[valid_df['true_claim'] == claim]
        print(f"Claim: {claim}: {len(subset)}")
        
    for subclaim in sorted(valid_df['true_subclaim'].unique()):
        subset = valid_df[valid_df['true_subclaim'] == subclaim]
        print(f"Sublaim: {subclaim}: {len(subset)}")

In [51]:
combined_df = None
printed_stats_test = False
printed_stats_val = False

for task_config in configs_list:
    result_dir = task_config.result_dir.format("val" if task_config.val_mode else "test") 
    result_path = f"{result_dir}/{task_config.id}.csv"
    if task_config.val_mode and not printed_stats_val:
        print_statistics(df)
        printed_stats_val = True
    if not task_config.val_mode and not printed_stats_test:
        print_statistics(df)
        printed_stats_test = True
    df = pd.read_csv(result_path)
    metrics_df = evaluate_predictions(df)
    metrics_df.insert(0, 'Task', task_config.name)
    if combined_df is None:
        combined_df = metrics_df
    else:
        combined_df = pd.concat([combined_df, metrics_df], ignore_index=True)

Claim: 0: 627
Claim: 1: 62
Claim: 2: 52
Claim: 3: 31
Claim: 4: 25
Claim: 5: 73
Sublaim: 0_0: 627
Sublaim: 1_1: 13
Sublaim: 1_2: 4
Sublaim: 1_3: 12
Sublaim: 1_4: 20
Sublaim: 1_6: 3
Sublaim: 1_7: 10
Sublaim: 2_1: 41
Sublaim: 2_3: 11
Sublaim: 3_1: 6
Sublaim: 3_2: 12
Sublaim: 3_3: 13
Sublaim: 4_1: 7
Sublaim: 4_2: 7
Sublaim: 4_4: 5
Sublaim: 4_5: 6
Sublaim: 5_1: 43
Sublaim: 5_2: 30
Claim: 0: 1730
Claim: 1: 242
Claim: 2: 168
Claim: 3: 119
Claim: 4: 169
Claim: 5: 215
Sublaim: 0_0: 1730
Sublaim: 1_1: 48
Sublaim: 1_2: 21
Sublaim: 1_3: 30
Sublaim: 1_4: 68
Sublaim: 1_6: 12
Sublaim: 1_7: 63
Sublaim: 2_1: 122
Sublaim: 2_3: 46
Sublaim: 3_1: 25
Sublaim: 3_2: 49
Sublaim: 3_3: 45
Sublaim: 4_1: 62
Sublaim: 4_2: 34
Sublaim: 4_4: 38
Sublaim: 4_5: 35
Sublaim: 5_1: 150
Sublaim: 5_2: 65


In [43]:
import pandas as pd

# Show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 0)

combined_df.head(len(combined_df))

Unnamed: 0,Task,Category,Precision,Recall,F1 Score,Support
0,ClaimCheck::ZeroFlat::Test,Summary,,,,2643/2701 (97.9% valid)
1,ClaimCheck::ZeroFlat::Test,All Sub-claims,0.314758,0.300081,0.262905,
2,ClaimCheck::ZeroFlat::Test,All Claims,0.497916,0.45455,0.453625,
3,ClaimCheck::ZeroFlat::Test,Claim 0,0.166667,0.144605,0.154854,
4,ClaimCheck::ZeroFlat::Test,Claim 1,0.166667,0.07989,0.108007,
5,ClaimCheck::ZeroFlat::Test,Claim 2,0.2,0.075,0.109091,
6,ClaimCheck::ZeroFlat::Test,Claim 3,0.166667,0.065826,0.094378,
7,ClaimCheck::ZeroFlat::Test,Claim 4,0.25,0.115385,0.157895,
8,ClaimCheck::ZeroFlat::Test,Claim 5,0.166667,0.024806,0.043185,
9,ClaimCheck::ZeroFlat::Test,Sub-claim 0_0,0.166667,0.024806,0.043185,
