In [4]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

In [5]:
from task_config import *
# configs_list = [ZeroFlatConfig1, ZeroFlatConfig2, FewFlatConfig1, FewFlatConfig2]
configs_list = [
    ZeroFlatConfig1, ZeroFlatConfig2, 
    FewFlatConfig1, FewFlatConfig2, 
    ZeroHierConfig1, ZeroHierConfig2
]

In [6]:
def extract_claim_subclaim(code):
    if pd.isna(code) or code == '-1':
        return -1, -1
    try:
        claim, _ = map(int, code.split('_'))
        return claim, code
    except Exception:
        return -1, -1

In [4]:
def calculate_metrics(y_true, y_pred):
    if y_true.unique()[0] == "0_0":
        print("Here")
    TP = (y_true == y_pred).sum()

    total_pred = len(y_pred)
    total_true = len(y_true)

    precision = TP / total_pred
    recall = TP / total_true
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

    print(f"Precision: {precision:.4f} | Recall: {recall:.4f} | F1 Score: {f1:.4f}")
    return precision, recall, f1

In [5]:
def evaluate_predictions(df):
    metrics = []

    df[['true_claim', 'true_subclaim']] = df['sub_claim_code'].apply(lambda x: pd.Series(extract_claim_subclaim(x)))
    df[['pred_claim', 'pred_subclaim']] = df['prediction'].apply(lambda x: pd.Series(extract_claim_subclaim(x)))

    total = len(df)
    invalid = len(df[df['pred_subclaim'] == -1])
    valid = total - invalid
    
    metrics.append({
        "Category": "Summary",
        "Precision": None,
        "Recall": None,
        "F1 Score": None,
        "Support": f"{valid}/{total} ({valid / total:.1%} valid)"
    })

    # Filter valid examples only
    valid_df = df[(df['pred_subclaim'] != -1) & (df['true_subclaim'] != -1)]

    # ---------- Overall Metrics ----------
    precision, recall, f1 = calculate_metrics(valid_df['true_subclaim'], valid_df['pred_subclaim'])
    metrics.append({
            "Category": "All Sub-claims",
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "Support": None
    })

    # print("\n--- Overall Claim Classification ---")
    precision, recall, f1 = calculate_metrics(valid_df['true_claim'], valid_df['pred_claim'])
    metrics.append({
            "Category": "All Claims",
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "Support": None
    })

    # ---------- Per-Claim Category ----------
    for claim in sorted(valid_df['true_claim'].unique()):
        subset = valid_df[valid_df['true_claim'] == claim]
        precision, recall, f1 = calculate_metrics(subset['true_claim'], subset['pred_claim'])
        metrics.append({
            "Category": f"Claim {claim}",
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "Support": None
        })

    # ---------- Per Sub-Claim Category ----------
    for subclaim in sorted(valid_df['true_subclaim'].unique()):

        subset = valid_df[valid_df['true_subclaim'] == subclaim][["true_subclaim", "pred_subclaim"]]

        # if "0_0" == subclaim:
        #     print(subset.head(100))
        #     print("HERE")
        #     calculate_metrics(subset['true_subclaim'], subset['pred_subclaim'])
        #     return None
        precision, recall, f1 = calculate_metrics(subset['true_subclaim'], subset['pred_subclaim'])
        metrics.append({
            "Category": f"Sub-claim {subclaim}",
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "Support": None
        })

    return pd.DataFrame(metrics)
    


In [6]:
def print_statistics(df):

    df[['true_claim', 'true_subclaim']] = df['sub_claim_code'].apply(lambda x: pd.Series(extract_claim_subclaim(x)))
    df[['pred_claim', 'pred_subclaim']] = df['prediction'].apply(lambda x: pd.Series(extract_claim_subclaim(x)))

    total = len(df)
    invalid = len(df[df['pred_subclaim'] == -1])
    valid = total - invalid
    
    valid_df = df[(df['pred_subclaim'] != -1) & (df['true_subclaim'] != -1)]

    for claim in sorted(valid_df['true_claim'].unique()):
        subset = valid_df[valid_df['true_claim'] == claim]
        print(f"Claim: {claim}: {len(subset)}")
        
    for subclaim in sorted(valid_df['true_subclaim'].unique()):
        subset = valid_df[valid_df['true_subclaim'] == subclaim]
        print(f"Sublaim: {subclaim}: {len(subset)}")

In [7]:
combined_df = None
printed_stats_test = False
printed_stats_val = False

for task_config in configs_list:
    result_dir = task_config.result_dir.format("val" if task_config.val_mode else "test") 
    result_path = f"{result_dir}/{task_config.id}.csv"
    df = pd.read_csv(result_path)

    if task_config.val_mode and not printed_stats_val:
        print_statistics(df)
        printed_stats_val = True
    if not task_config.val_mode and not printed_stats_test:
        print_statistics(df)
        printed_stats_test = True
    
    metrics_df = evaluate_predictions(df)
    metrics_df.insert(0, 'Task', task_config.name)
    if combined_df is None:
        combined_df = metrics_df
    else:
        combined_df = pd.concat([combined_df, metrics_df], ignore_index=True)

Claim: 0: 1730
Claim: 1: 242
Claim: 2: 168
Claim: 3: 119
Claim: 4: 169
Claim: 5: 215
Sublaim: 0_0: 1730
Sublaim: 1_1: 48
Sublaim: 1_2: 21
Sublaim: 1_3: 30
Sublaim: 1_4: 68
Sublaim: 1_6: 12
Sublaim: 1_7: 63
Sublaim: 2_1: 122
Sublaim: 2_3: 46
Sublaim: 3_1: 25
Sublaim: 3_2: 49
Sublaim: 3_3: 45
Sublaim: 4_1: 62
Sublaim: 4_2: 34
Sublaim: 4_4: 38
Sublaim: 4_5: 35
Sublaim: 5_1: 150
Sublaim: 5_2: 65
Precision: 0.6387 | Recall: 0.6387 | F1 Score: 0.6387
Precision: 0.6950 | Recall: 0.6950 | F1 Score: 0.6950
Precision: 0.8676 | Recall: 0.8676 | F1 Score: 0.8676
Precision: 0.4793 | Recall: 0.4793 | F1 Score: 0.4793
Precision: 0.3750 | Recall: 0.3750 | F1 Score: 0.3750
Precision: 0.3950 | Recall: 0.3950 | F1 Score: 0.3950
Precision: 0.4615 | Recall: 0.4615 | F1 Score: 0.4615
Precision: 0.1488 | Recall: 0.1488 | F1 Score: 0.1488
Here
Precision: 0.8676 | Recall: 0.8676 | F1 Score: 0.8676
Precision: 0.1667 | Recall: 0.1667 | F1 Score: 0.1667
Precision: 0.8095 | Recall: 0.8095 | F1 Score: 0.8095
Precis

In [8]:
import pandas as pd

# Show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 0)

combined_df.head(len(combined_df))

Unnamed: 0,Task,Category,Precision,Recall,F1 Score,Support
0,ClaimCheck::ZeroFlat,Summary,,,,2643/2701 (97.9% valid)
1,ClaimCheck::ZeroFlat,All Sub-claims,0.638668,0.638668,0.638668,
2,ClaimCheck::ZeroFlat,All Claims,0.695044,0.695044,0.695044,
3,ClaimCheck::ZeroFlat,Claim 0,0.86763,0.86763,0.86763,
4,ClaimCheck::ZeroFlat,Claim 1,0.479339,0.479339,0.479339,
5,ClaimCheck::ZeroFlat,Claim 2,0.375,0.375,0.375,
6,ClaimCheck::ZeroFlat,Claim 3,0.394958,0.394958,0.394958,
7,ClaimCheck::ZeroFlat,Claim 4,0.461538,0.461538,0.461538,
8,ClaimCheck::ZeroFlat,Claim 5,0.148837,0.148837,0.148837,
9,ClaimCheck::ZeroFlat,Sub-claim 0_0,0.86763,0.86763,0.86763,


In [9]:
combined_df[combined_df["Task"] == "ClaimCheck::ZeroFlat"][1:][["Task", "Category", "Precision", "Recall", "F1 Score"]]

Unnamed: 0,Task,Category,Precision,Recall,F1 Score
1,ClaimCheck::ZeroFlat,All Sub-claims,0.638668,0.638668,0.638668
2,ClaimCheck::ZeroFlat,All Claims,0.695044,0.695044,0.695044
3,ClaimCheck::ZeroFlat,Claim 0,0.86763,0.86763,0.86763
4,ClaimCheck::ZeroFlat,Claim 1,0.479339,0.479339,0.479339
5,ClaimCheck::ZeroFlat,Claim 2,0.375,0.375,0.375
6,ClaimCheck::ZeroFlat,Claim 3,0.394958,0.394958,0.394958
7,ClaimCheck::ZeroFlat,Claim 4,0.461538,0.461538,0.461538
8,ClaimCheck::ZeroFlat,Claim 5,0.148837,0.148837,0.148837
9,ClaimCheck::ZeroFlat,Sub-claim 0_0,0.86763,0.86763,0.86763
10,ClaimCheck::ZeroFlat,Sub-claim 1_1,0.166667,0.166667,0.166667


In [10]:
combined_df[combined_df["Task"] == "ClaimCheck::FewShotFlat::Similar"][1:][["Task", "Category", "Precision", "Recall", "F1 Score"]]

Unnamed: 0,Task,Category,Precision,Recall,F1 Score
55,ClaimCheck::FewShotFlat::Similar,All Sub-claims,0.626074,0.626074,0.626074
56,ClaimCheck::FewShotFlat::Similar,All Claims,0.677251,0.677251,0.677251
57,ClaimCheck::FewShotFlat::Similar,Claim 0,0.882006,0.882006,0.882006
58,ClaimCheck::FewShotFlat::Similar,Claim 1,0.488372,0.488372,0.488372
59,ClaimCheck::FewShotFlat::Similar,Claim 2,0.380952,0.380952,0.380952
60,ClaimCheck::FewShotFlat::Similar,Claim 3,0.383178,0.383178,0.383178
61,ClaimCheck::FewShotFlat::Similar,Claim 4,0.373333,0.373333,0.373333
62,ClaimCheck::FewShotFlat::Similar,Claim 5,0.165289,0.165289,0.165289
63,ClaimCheck::FewShotFlat::Similar,Sub-claim 0_0,0.882006,0.882006,0.882006
64,ClaimCheck::FewShotFlat::Similar,Sub-claim 1_1,0.190476,0.190476,0.190476


In [11]:
combined_df[combined_df["Task"] == "ClaimCheck::ZeroHier"][1:][["Task", "Category", "Precision", "Recall", "F1 Score"]]

Unnamed: 0,Task,Category,Precision,Recall,F1 Score
109,ClaimCheck::ZeroHier,All Sub-claims,0.623361,0.623361,0.623361
110,ClaimCheck::ZeroHier,All Claims,0.633607,0.633607,0.633607
111,ClaimCheck::ZeroHier,Claim 0,0.978934,0.978934,0.978934
112,ClaimCheck::ZeroHier,Claim 1,0.041475,0.041475,0.041475
113,ClaimCheck::ZeroHier,Claim 2,0.013245,0.013245,0.013245
114,ClaimCheck::ZeroHier,Claim 3,0.009709,0.009709,0.009709
115,ClaimCheck::ZeroHier,Claim 4,0.006897,0.006897,0.006897
116,ClaimCheck::ZeroHier,Claim 5,0.15082,0.15082,0.15082
117,ClaimCheck::ZeroHier,Sub-claim 0_0,0.978934,0.978934,0.978934
118,ClaimCheck::ZeroHier,Sub-claim 1_1,0.0,0.0,0.0


In [11]:
%matplotlib notebook


In [1]:
! pip install seaborn


Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

In [26]:
for task_config in configs_list:
    result_dir = task_config.result_dir.format("val" if task_config.val_mode else "test") 
    result_path = f"{result_dir}/{task_config.id}.csv"
    df = pd.read_csv(result_path)

    df[['true_claim', 'true_subclaim']] = df['sub_claim_code'].apply(lambda x: pd.Series(extract_claim_subclaim(x)))
    df[['pred_claim', 'pred_subclaim']] = df['prediction'].apply(lambda x: pd.Series(extract_claim_subclaim(x)))


    df =  df[(df['pred_subclaim'] != -1) & (df['true_subclaim'] != -1)]    
    df =  df[(df['pred_subclaim'] != "0_0") & (df['true_subclaim'] != "0_0")]

    true_claim_labels = df['true_claim']
    pred_claim_labels = df['pred_claim']

    true_subclaim_labels = df['true_subclaim']
    pred_subclaim_labels = df['pred_subclaim']

    ## Claims 
    claim_labels = sorted(list(set(true_claim_labels) | set(pred_claim_labels)))
    cm_1 = confusion_matrix(true_claim_labels, pred_claim_labels, labels=claim_labels)

    plt.figure(figsize=(14, 10))
    sns.heatmap(cm_1, annot=True, fmt=".2f", cmap="Blues",
                xticklabels=claim_labels, yticklabels=claim_labels)
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.title('Normalized Confusion Matrix')
    plt.savefig(f"results/images/confusion_matrix_claim_{task_config.id}.png", dpi=300, bbox_inches='tight')

    ## Sub_Claims 
    subclaim_labels = sorted(list(set(true_subclaim_labels) | set(pred_subclaim_labels)))
    cm_2 = confusion_matrix(true_subclaim_labels, pred_subclaim_labels, labels=subclaim_labels)

    plt.figure(figsize=(24, 20))
    sns.heatmap(cm_2, annot=True, cmap="Blues",
                xticklabels=subclaim_labels, yticklabels=subclaim_labels)
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.title('Normalized Confusion Matrix')
    plt.savefig(f"results/images/confusion_matrix_subclaim_{task_config.id}.png", dpi=300, bbox_inches='tight')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [14]:
plt.figure(figsize=(14, 10))
sns.heatmap(cm_norm, annot=True, fmt=".2f", cmap="Blues",
            xticklabels=claim_labels, yticklabels=claim_labels)
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.title('Normalized Confusion Matrix')
plt.show()

<IPython.core.display.Javascript object>