# Inter-annotator agreement span-matching and consolidation pipeline for span-based annotations 🧪
This notebook evaluates inter-annotator agreement (IAA) on span-based labels in Danish political debates and builds a consolidated “gold” dataset of agreed spans.  
It loads each annotator’s CSV, computes pairwise soft‐match metrics (Dice ≥ 0.6 + containment), extracts longest common substrings, and outputs a final gold‐span dataset for model training.


In [None]:
"""
GOAL: Evaluate inter-annotator agreement (IAA) for span-based annotations and create a reliable gold dataset of agreed-upon labels.
"""

# ---------------------------------------------------------
# PIPELINE OVERVIEW:
# ---------------------------------------------------------

"""
1. Load CSV annotation files for each annotator.
   - Each CSV includes: debate_unit_id, full_text, and multiple label columns (e.g., answer, evasion).
   - Spans for each label are semicolon-separated text segments.

2. Normalize & combine data:
   - Extract each labeled span per annotator and explode into one row per (debate_unit_id, label, span, annotator).
   - Combine all annotators' data into one master dataframe.

3. Compute pairwise soft span matching:
   - For each (debate_unit_id, label), compare spans between all annotators who annotated that debate.
   - Compute Dice coefficient (soft match) between spans.
   - Also check for containment: whether one span is fully contained in the other.

4. Agreement Criteria:
   - Dice >= 0.6 → considered a soft match.
   - Containment = True → means high textual agreement.
   - Label must be the same.

5. Generate gold span set:
   - If at least two annotators agree (Dice >= 0.6 AND same label AND containment = True), extract shared span.
   - Save as a new row in a gold dataset.

6. Export results:
   - Full dataframe with matched spans, Dice, containment, label, etc.
   - Clean gold dataset with agreed spans & labels for model training.

"""

# ---------------------------------------------------------
# REASONS FOR CHOICES:
# ---------------------------------------------------------

"""
✅ Dice threshold of 0.6:
   - Literature (e.g. Zhang et al. 2017) uses soft matching thresholds around 0.5-0.7.
   - 0.6 balances precision and recall — it allows for minor differences in highlight boundaries.

✅ Containment check:
   - Especially useful for annotators who consistently over/under-highlight.
   - If one span is entirely inside another, it's a strong agreement signal.

✅ Use of shared span:
   - Ensures high precision and conservative gold data.
   - Simplifies pipeline and avoids overfitting to annotator habits.
"""

import os
import glob
import pandas as pd
import itertools
from difflib import SequenceMatcher

### ---------- UTILITIES ---------- ###

def dice_coefficient(a, b):
    a_tokens, b_tokens = set(a.split()), set(b.split())
    if not a_tokens or not b_tokens:
        return 0.0
    overlap = len(a_tokens & b_tokens)
    return 2 * overlap / (len(a_tokens) + len(b_tokens))

def is_contained(span1, span2):
    return span1 in span2 or span2 in span1

def get_overlap_substring(a, b):
    a, b = a.strip(), b.strip()
    if a in b:
        return a
    elif b in a:
        return b
    else:
        match = SequenceMatcher(None, a, b).find_longest_match(0, len(a), 0, len(b))
        return a[match.a: match.a + match.size].strip()

### ---------- STEP 1: Load & Normalize ---------- ###

def extract_spans(df, label_col):
    rows = []
    for _, row in df.iterrows():
        # Only split if the value is not missing
        spans = str(row[label_col]).split(';') if pd.notna(row[label_col]) else []
        for span in spans:
            cleaned = span.strip()
            # Skip if cleaned is empty or equals "nan" (case insensitive)
            if cleaned and cleaned.lower() != "nan":
                rows.append({
                    'user_id': row['user_id'],
                    'user_id_study': row['user_id_study'],
                    'debate_unit_id': row['debate_unit_id'],
                    'label': label_col,
                    'span': cleaned,
                    'full_text': row['full_text']
                })
    return pd.DataFrame(rows)

def load_all_annotations(data_folder):
    all_csvs = glob.glob(os.path.join(data_folder, '*.csv'))
    all_dfs = []
    for path in all_csvs:
        df = pd.read_csv(path)
        # Treat _SH_PLD etc. as the same user
        user_id = os.path.basename(path).split('Annotations - ')[-1].split('.')[0].split('_')[0]
        df['user_id'] = user_id
        all_dfs.append(df)
    return pd.concat(all_dfs, ignore_index=True)

def build_master_span_table(df, labels):
    return pd.concat([extract_spans(df, label) for label in labels], ignore_index=True)

### ---------- STEP 2: Compute Soft Matches ---------- ###

def compute_soft_matches(df_spans, dice_threshold=0.6):
    gold_rows = []
    match_rows = []

    grouped = df_spans.groupby(['debate_unit_id', 'label'])
    for (debate_id, label), group in grouped:
        annos = list(group.to_dict('records'))
        for a1, a2 in itertools.combinations(annos, 2):
            if a1['user_id'] == a2['user_id']:
                continue
            dice = dice_coefficient(a1['span'], a2['span'])
            containment = is_contained(a1['span'], a2['span'])
            match_rows.append({
                'debate_unit_id': debate_id,
                'label': label,
                'annotator_1': a1['user_id'],
                'annotator_2': a2['user_id'],
                'span_1': a1['span'],
                'span_2': a2['span'],
                'dice': dice,
                'contained': containment
            })
            if dice >= dice_threshold and containment:
                shared_span = get_overlap_substring(a1['span'], a2['span'])
                if shared_span.strip():
                    gold_rows.append({
                        'debate_unit_id': debate_id,
                        'label': label,
                        'span': shared_span,
                        'source_annotators': f"{a1['user_id']},{a2['user_id']}"
                    })

    return pd.DataFrame(match_rows), pd.DataFrame(gold_rows).drop_duplicates()


In [None]:
# 🧾 Load all annotations
data_folder = './data' 
df_all = load_all_annotations(data_folder)

# Remove linebreaks
import re

def clean_linebreaks(text):
    if pd.isna(text):
        return text
    # Replace colon followed by any whitespace (including newline) with ": "
    text = re.sub(r':\s+', ': ', text)
    return text.strip()

cols_to_fix = ['full_text', 'answer', 'stretch', 'evasion', 'self_promotion', 'attack']

for col in cols_to_fix:
    if col in df_all.columns:
        df_all[col] = df_all[col].apply(clean_linebreaks)


# Replace ; in full text with , 
df_all['full_text'] = df_all['full_text'].str.replace(';', ',', regex=False) # For Streamlit data reasons

# Replace p
df_all

In [None]:
# Remove linebreaks
import re

def clean_linebreaks(text):
    if pd.isna(text):
        return text
    # Replace colon followed by any whitespace (including newline) with ": "
    text = re.sub(r':\s+', ': ', text)
    return text.strip()

cols_to_fix = ['full_text', 'answer', 'stretch', 'evasion', 'self_promotion', 'attack']

for col in cols_to_fix:
    if col in df_all.columns:
        df_all[col] = df_all[col].apply(clean_linebreaks)


# Replace ; in full text with , 
df_all['full_text'] = df_all['full_text'].str.replace(';', ',', regex=False) # For Streamlit data reasons

# Replace party stuff again for safety
import re

# Define a mapping of party names (including historical names) to pseudonyms
party_pseudonyms = {
    # Socialdemokratiet
    "Socialdemokratiet": "Parti_A",
    "Socialdemokraterne": "Parti_A",
    "Socialdemokraternes": "Parti_As",
    "Socialdemokratiets": "Parti_As",
    "Socialdemokratisk": "Parti_As",
    "Socialdemokrater": "Parti_A",

    # Venstre
    "Venstre": "Parti_B",
    "Venstres": "Parti_Bs",

    # Radikale Venstre
    "Radikale Venstre": "Parti_C",
    "Det Radikale Venstre": "Parti_C",
    #"Radikale": "Parti_C",
    "Radikales": "Parti_Cs",
    "De Radikale": "Parti_C",
    "De Radikales": "Parti_Cs",
    "Radikale": "Parti_C",

    # Konservative Folkeparti
    "Konservative Folkeparti": "Parti_D",
    "Det Konservative Folkeparti": "Parti_D",
    "Konservative": "Parti_D",
    "Konservatives": "Parti_Ds",
    "De Konservative": "Parti_D",
    "De Konservatives": "Parti_Ds",
    "konservativ side": "Parti_Ds side",

    # Socialistisk Folkeparti
    "Socialistisk Folkeparti": "Parti_E",
    "Socialistisk Folkepartis": "Parti_Es",
    "Socialistiske Folkeparti": "Parti_E",
    "Socialistiskes": "Parti_Es",
    "SF": "Parti_E",
    "SFs": "Parti_Es",
    "SF's": "Parti_Es",

    # Dansk Folkeparti
    "Dansk Folkeparti": "Parti_F",
    "Dansk Folkepartis": "Parti_Fs",

    # Fremskridtspartiet (Historisk DF-navn)
    "Fremskridtspartiet": "Parti_F",
    "Fremskridtspartiets": "Parti_Fs",

    # Enhedslisten
    "Enhedslisten": "Parti_G",
    "Enhedslistens": "Parti_Gs",
    "Rød-Grøn Alliance": "Parti_G",
    "Rød-Grønne Alliance": "Parti_G",

    # Liberal Alliance
    "Liberal Alliance": "Parti_H",
    "Liberale Alliance": "Parti_H",
    "Liberal Alliances": "Parti_Hs",
    "Liberales": "Parti_Hs",  # Genitive form

    # Ny Alliance (Historisk før LA)
    "Ny Alliance": "Parti_H",
    "Ny Alliances": "Parti_Hs",

    # Alternativet
    "Alternativet": "Parti_I",
    "Alternativets": "Parti_Is",

    # Danmarksdemokraterne
    "Danmarksdemokraterne": "Parti_J",
    "Danmarksdemokraternes": "Parti_Js",

    # Nye Borgerlige
    "Nye Borgerlige": "Parti_K",
    "Nye Borgerliges": "Parti_Ks",

    # Frie Grønne
    "Frie Grønne": "Parti_L",
    "De Frie Grønne": "Parti_L",
    "Frie Grønnes": "Parti_Ls",

    # Kristendemokraterne
    "Kristendemokraterne": "Parti_M",
    "Kristendemokraternes": "Parti_Ms",
    "De Kristne Demokrater": "Parti_M",
    "Kristendemokratiet": "Parti_M",
    "Kristendemokratiets": "Parti_Ms",
}

# Compile regex pattern to match any of the party names (case insensitive)
party_pattern = re.compile(r'\b(' + '|'.join(re.escape(party) for party in party_pseudonyms.keys()) + r')\b', re.IGNORECASE)

# Function to replace party names with pseudonyms
def replace_party_names(text):
    if pd.isna(text):  # Handle missing values
        return text
    
    # Perform case-insensitive replacement while preserving original case
    return party_pattern.sub(lambda match: party_pseudonyms.get(match.group(0), 
                                                                party_pseudonyms.get(match.group(0).title(), 
                                                                match.group(0))), text)

# Apply function to the utterance and text cols
df_all["full_text"] = df_all["full_text"].astype(str).apply(replace_party_names)
df_all["answer"] = df_all["answer"].astype(str).apply(replace_party_names)
df_all["stretch"] = df_all["stretch"].astype(str).apply(replace_party_names)
df_all["evasion"] = df_all["evasion"].astype(str).apply(replace_party_names)
df_all["self_promotion"] = df_all["self_promotion"].astype(str).apply(replace_party_names)
df_all["attack"] = df_all["attack"].astype(str).apply(replace_party_names)
df_all["other"] = df_all["other"].astype(str).apply(replace_party_names)

print("✅ Party names replaced with pseudonyms, including historical names.")

df_all.to_csv("output/df_all.csv")

df_all

# Get demographics on

In [None]:
import pandas as pd

# Mapping for user_id to study field
# This originally held a full list which is removed here since these were the original user-ids and not the re-pseudonymized (so that annotators' cannot see contributions from other annotators, if they happened to see or hear their usernames during the study)

user_id_to_study = {
 ### anonymized ###
}

# Create the new column 'user_id_study' based on the user_id
df_all['user_id_study'] = df_all['user_id'].map(user_id_to_study)

## Run cleaner functions

In [83]:
# 🧠 Supported labels
labels = ['answer', 'evasion', 'attack', 'self_promotion', 'stretch']

# 🔄 Explode into one span per row
df_spans = build_master_span_table(df_all, labels)

# 🔍 Compute Dice + containment matching
matches_df, gold_df = compute_soft_matches(df_spans, dice_threshold=0.6)

# Save them
df_spans.to_csv("output/prelim_dataframes/df_spans.csv")
matches_df.to_csv("output/prelim_dataframes/matches_df.csv")
gold_df.to_csv("output/prelim_dataframes/gold_df.csv")

In [None]:
# Take a look
gold_df # before we have 232, then 733, then 736

In [None]:
# Also take a look at this one
df_spans 

In [None]:
## Get the Annotator Contributions (continued in next chunk)
import pandas as pd

# Load the spans dataframe
df_spans_check = pd.read_csv("output/prelim_dataframes/df_spans.csv")

# Compute per-annotator stats
user_stats = (
    df_spans_check
    .groupby(['user_id_study', 'user_id'])
    .agg(
        total_annotations=('span', 'count'),
        unique_units=('debate_unit_id', pd.Series.nunique)
    )
    .reset_index()
)
user_stats['normalized_per_unit'] = user_stats['total_annotations'] / user_stats['unique_units']

# Aggregate per study
study_stats = (
    user_stats
    .groupby('user_id_study')
    .agg(
        annotators=('user_id', 'nunique'),
        total_annotations=('total_annotations', 'sum'),
        mean_per_annotator=('total_annotations', 'mean'),
        sd_per_annotator=('total_annotations', 'std'),
        mean_normalized=('normalized_per_unit', 'mean'),
        sd_normalized=('normalized_per_unit', 'std')
    )
    .reset_index()
)

# Overall row
overall = pd.Series({
    'user_id_study': 'Overall',
    'annotators': user_stats['user_id'].nunique(),
    'total_annotations': user_stats['total_annotations'].sum(),
    'mean_per_annotator': user_stats['total_annotations'].mean(),
    'sd_per_annotator': user_stats['total_annotations'].std(),
    'mean_normalized': user_stats['normalized_per_unit'].mean(),
    'sd_normalized': user_stats['normalized_per_unit'].std()
})
final_stats = pd.concat([study_stats, overall.to_frame().T], ignore_index=True)

# Round numeric columns to two decimals (except annotators)
for col in ['total_annotations', 'mean_per_annotator', 'sd_per_annotator', 'mean_normalized', 'sd_normalized']:
    final_stats[col] = final_stats[col].astype(float).round(2)

# Ensure annotators is integer
final_stats['annotators'] = final_stats['annotators'].astype(int)

# Display
final_stats

Unnamed: 0,user_id_study,annotators,total_annotations,mean_per_annotator,sd_per_annotator,mean_normalized,sd_normalized
0,cognitive science,5,428.0,85.6,50.05,3.76,1.24
1,linguistics,10,1744.0,174.4,106.37,4.18,0.99
2,rhetorics,4,493.0,123.25,68.09,5.23,2.22
3,Overall,19,2665.0,140.26,92.35,4.29,1.39


### Getting relative rates

In [None]:
# Getting relative rates
import pandas as pd

# 1) Count spans per (user, unit)
unit_user = (
    df_spans_check
    .groupby(['debate_unit_id','user_id'])
    .size()
    .rename('user_spans')
    .reset_index()
)

# 2) Compute average spans per unit across all users
unit_density = (
    unit_user
    .groupby('debate_unit_id')['user_spans']
    .mean()
    .rename('mean_spans_per_unit')
    .reset_index()
)

# 3) Merge back so each (user, unit) row has actual vs. expected
merged = unit_user.merge(unit_density, on='debate_unit_id')

# 4) Summarize per‐user
user_density = (
    merged
    .groupby('user_id')
    .agg(
        total_spans=('user_spans','sum'),
        expected_spans=('mean_spans_per_unit','sum'),
        units_seen=('debate_unit_id','nunique')
    )
    .reset_index()
)

# 5) Compute relative rate
user_density['relative_rate'] = user_density['total_spans'] / user_density['expected_spans']

# 6) Bring in each user’s study field
user_info = (
    df_spans_check[['user_id','user_id_study']]
    .drop_duplicates()
)
user_density = user_density.merge(user_info, on='user_id')

# 7) Aggregate by study
study_density = (
    user_density
    .groupby('user_id_study')
    .agg(
        annotators=('user_id','nunique'),
        mean_relative_rate=('relative_rate','mean'),
        sd_relative_rate=('relative_rate','std')
    )
    .reset_index()
)

# 8) z‐score the per-user rates across all annotators
user_density['relative_rate_z'] = (
    (user_density['relative_rate'] - user_density['relative_rate'].mean())
    / user_density['relative_rate'].std()
)

# 9) Round for presentation
study_density[['mean_relative_rate','sd_relative_rate']] = study_density[['mean_relative_rate','sd_relative_rate']].round(2)
user_density['relative_rate'] = user_density['relative_rate'].round(2)
user_density['relative_rate_z'] = user_density['relative_rate_z'].round(2)

print(study_density)
print(user_density[['user_id','user_id_study','relative_rate','relative_rate_z']])


In [None]:
# Remove duplicates (before I had 150, then 556, then 557
gold_df_no_dupes = gold_df.drop_duplicates(subset = ['span'])
gold_df_no_dupes 

In [87]:
# Ensure no trailing spaces etc.
gold_df_no_dupes['span'] = gold_df_no_dupes['span'].str.strip()
gold_df_no_dupes['label'] = gold_df_no_dupes['label'].str.strip()

# Total unique training examples
num_unique = len(gold_df_no_dupes)
print(f"🔢 Total unique labeled spans: {num_unique}")

# Label distribution (as % of unique spans)
label_dist = (
    gold_df_no_dupes['label']
    .value_counts(normalize=True)
    .mul(100.0)  # ensures float
    .round(2)
    .reset_index()
    .rename(columns={'index': 'Label', 'label': 'Percentage'})
)

print("📊 Label Distribution (% of unique labeled spans):")
print(label_dist)

label_counts = (
    gold_df_no_dupes['label']
    .value_counts()
    .reset_index()
    .rename(columns={'index': 'Label', 'label': 'Count'})
)

print("📊 Label Count:")
print(label_counts)

🔢 Total unique labeled spans: 557
📊 Label Distribution (% of unique labeled spans):
       Percentage  proportion
0          answer       50.09
1         evasion       32.32
2          attack       11.31
3  self_promotion        6.10
4         stretch        0.18
📊 Label Count:
            Count  count
0          answer    279
1         evasion    180
2          attack     63
3  self_promotion     34
4         stretch      1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gold_df_no_dupes['span'] = gold_df_no_dupes['span'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gold_df_no_dupes['label'] = gold_df_no_dupes['label'].str.strip()


In [None]:
# Get tiebreak for cases where span is the same but the label is not the same (formerly these were 157 rows, now theyre 455)
def get_disagreement_cases(df_spans, dice_threshold=0.6):
    rows = []
    grouped = df_spans.groupby('debate_unit_id')
    
    for debate_id, group in grouped:
        annos = list(group.to_dict('records'))
        for a1, a2 in itertools.combinations(annos, 2):
            if a1['user_id'] == a2['user_id']:
                continue
            dice = dice_coefficient(a1['span'], a2['span'])
            contained = is_contained(a1['span'], a2['span'])
            if dice >= dice_threshold and contained and a1['label'] != a2['label']:
                rows.append({
                    'debate_unit_id': debate_id,
                    'annotator_1': a1['user_id'],
                    'label_1': a1['label'],
                    'span_1': a1['span'],
                    'annotator_2': a2['user_id'],
                    'label_2': a2['label'],
                    'span_2': a2['span'],
                    'dice': dice,
                    'contained': contained
                })
    
    return pd.DataFrame(rows)


tie_break_df = get_disagreement_cases(df_spans, dice_threshold=0.6)
tie_break_df # disse skal ses igennem af en tredje annotør (gør imorgen?)

In [89]:

# 💾 Save outputs
matches_df.to_csv('/Users/pbrams/Desktop/AARHUS_UNIVERSITY/kandidat/thesis_work/annotations/output/span_matches_with_dice.csv', index=False)
gold_df_no_dupes.to_csv('/Users/pbrams/Desktop/AARHUS_UNIVERSITY/kandidat/thesis_work/annotations/output/gold_df_no_dupes.csv', index=False)
tie_break_df.to_csv("/Users/pbrams/Desktop/AARHUS_UNIVERSITY/kandidat/thesis_work/annotations/output/tie_break_cases.csv", index=False)

print("✅ Done. Outputs saved:")
print("- span_matches_with_dice.csv")
print("- gold_df_no_dupes.csv")
print("- tie_break_cases.csv")


✅ Done. Outputs saved:
- span_matches_with_dice.csv
- gold_df_no_dupes.csv
- tie_break_cases.csv


## Extra processing for DebateUnits who were seen by 4 annotators

In [None]:
# I have some cases where there has been 4 annotators on a debate unit, meaning that annotator A and B have marked a span as answer, and annotator C and D have marked a span as an answer, but the two spans are partially overlapping.
# 1) keep longest span of the two - assumes that more context = better signal, and keeps the full rehtorical move if some annotators included it. Option 2 is use the shared (overlapping) span. I extract the longest common subspan shared between annotators (like the current method, stays consistent)

import re
from collections import defaultdict
from difflib import SequenceMatcher
import pandas as pd

def extract_clauses(text):
    """Split text into small clause units based on sentence delimiters."""
    return re.split(r'[.,;]', text)

def contains_clause_overlap(s1, s2):
    """Check if any clause in s1 exists in s2 or vice versa."""
    clauses1 = [cl.strip() for cl in extract_clauses(s1) if cl.strip()]
    clauses2 = [cl.strip() for cl in extract_clauses(s2) if cl.strip()]
    return any(clause in s2 for clause in clauses1) or any(clause in s1 for clause in clauses2)

def get_longest_common_substring(a, b):
    """Get the longest character-based common substring between two spans."""
    match = SequenceMatcher(None, a, b).find_longest_match(0, len(a), 0, len(b))
    return a[match.a: match.a + match.size].strip()

def word_count(text):
    return len(text.strip().split())


def collapse_spans_by_clause_overlap(gold_df, min_words=3):
    final_rows = []
    grouped = gold_df.groupby(['debate_unit_id', 'label'])

    for (debate_id, label), group in grouped:
        spans = group['span'].tolist()
        annotators = group['source_annotators'].tolist()
        used_pairs = set()
        span_to_annotators = defaultdict(set)
        span_to_notes = defaultdict(list)

        # Store all candidate shared spans
        candidate_spans = []

        for i in range(len(spans)):
            for j in range(i + 1, len(spans)):
                s1, s2 = spans[i], spans[j]
                uid1 = annotators[i]
                uid2 = annotators[j]
                pair_id = tuple(sorted([uid1, uid2]))

                if pair_id in used_pairs:
                    continue

                if contains_clause_overlap(s1, s2):
                    shared = get_longest_common_substring(s1, s2)
                    if shared.strip() and word_count(shared) >= min_words:
                        candidate_spans.append({
                            'shared_span': shared.strip(),
                            'annotators': set(uid1.split(',')) | set(uid2.split(',')),
                            'note': f"Clause overlap from {uid1} & {uid2}"
                        })
                        used_pairs.add(pair_id)

        if candidate_spans:
            # Sort candidates by span length (descending), keep only longest one
            longest = max(candidate_spans, key=lambda x: word_count(x['shared_span']))
            final_rows.append({
                'debate_unit_id': debate_id,
                'label': label,
                'span': longest['shared_span'],
                'source_annotators': ",".join(sorted(longest['annotators'])),
                'construction_note': longest['note']
            })
        else:
            # No matches — keep individual spans
            for span, annotator in zip(spans, annotators):
                final_rows.append({
                    'debate_unit_id': debate_id,
                    'label': label,
                    'span': span.strip(),
                    'source_annotators': annotator,
                    'construction_note': "Singleton span — no overlapping pairs" # Unique or unmatched span"
                })

    return pd.DataFrame(final_rows)

collapsed_gold_df = collapse_spans_by_clause_overlap(gold_df, min_words=4) # dupe df?
collapsed_gold_df.to_csv('/Users/pbrams/Desktop/AARHUS_UNIVERSITY/kandidat/thesis_work/annotations/output/collapsed_gold_df.csv', index=False)
collapsed_gold_df

In [None]:
# Coalesce duplicate answers with the same span for pairs of annotators: 
collapsed_gold_df['span'] = collapsed_gold_df['span'].str.strip()
collapsed_gold_df['label'] = collapsed_gold_df['label'].str.strip()
collapsed_gold_df['source_annotators'] = collapsed_gold_df['source_annotators'].str.strip()

# Group by span identity
coalesced = (
    collapsed_gold_df
    .groupby(['debate_unit_id', 'label', 'span'], as_index=False)
    .agg({
        'source_annotators': lambda x: ','.join(sorted(set(','.join(x).split(',')))),
        'construction_note': lambda x: '; '.join(sorted(set(x)))
    })
)

# Remove accidental double spaces, clean up
coalesced['source_annotators'] = coalesced['source_annotators'].str.replace(r'\s+', '', regex=True)
coalesced['construction_note'] = coalesced['construction_note'].str.strip()

# Label distribution (as % of unique spans)
label_dist = (
    coalesced['label']
    .value_counts(normalize=True)
    .mul(100.0)  # ensures float
    .round(2)
    .reset_index()
    .rename(columns={'index': 'Label', 'label': 'Percentage'})
)

print("📊 Label Distribution (% of unique labeled spans):")
print(label_dist)

label_counts = (
    coalesced['label']
    .value_counts()
    .reset_index()
    .rename(columns={'index': 'Label', 'label': 'Count'})
)

print("📊 Label Count:")
print(label_counts)

coalesced.to_csv('/Users/pbrams/Desktop/AARHUS_UNIVERSITY/kandidat/thesis_work/annotations/output/coalesced_gold_df.csv', index=False)
coalesced


# Add tiebreaking

In [None]:
# Work a bit on tie-break
import re
from collections import defaultdict
from difflib import SequenceMatcher
import itertools
import pandas as pd

# Helper functions (redefine for safety)
def extract_clauses(text):
    """Split text into small clause units based on sentence delimiters."""
    return re.split(r'[.,;]', text)

def contains_clause_overlap(s1, s2):
    """Check if any clause in s1 exists in s2 or vice versa."""
    clauses1 = [cl.strip() for cl in extract_clauses(s1) if cl.strip()]
    clauses2 = [cl.strip() for cl in extract_clauses(s2) if cl.strip()]
    return any(clause in s2 for clause in clauses1) or any(clause in s1 for clause in clauses2)

def get_longest_common_substring(a, b):
    """Get the longest character-based common substring between two spans."""
    match = SequenceMatcher(None, a, b).find_longest_match(0, len(a), 0, len(b))
    return a[match.a: match.a + match.size].strip()

def word_count(text):
    return len(text.strip().split())

# Process the tiebreak DataFrame
def process_tiebreak_df(df_tiebreak, min_words=4):
    rows = []
    for _, row in df_tiebreak.iterrows():
        s1 = row['span_1']
        s2 = row['span_2']
        
        # Calculate the shared span if there is clause overlap
        if contains_clause_overlap(s1, s2):
            shared = get_longest_common_substring(s1, s2)
        else:
            shared = ""
        
        # Check if the shared span meets the minimum word threshold
        if shared and word_count(shared) >= min_words:
            shared_span = shared.strip()
            note = f"Clause overlap from {row['annotator_1']} & {row['annotator_2']}"
        else:
            # If not, we fallback to a combined span 
            shared_span = s1.strip() + " / " + s2.strip()
            note = f"No sufficient clause overlap from {row['annotator_1']} & {row['annotator_2']}"
        
        # Create a combined label (e.g., "evasion + attack")
        disagreement_label = row['label_1'].strip() + " + " + row['label_2'].strip()
        
        # Combine annotators (if there are multiple, split and sort)
        annos = sorted(set(row['annotator_1'].split(",") + row['annotator_2'].split(",")))
        source_ann = ",".join(annos)
        
        rows.append({
            'debate_unit_id': row['debate_unit_id'],
            'label': disagreement_label,
            'span': shared_span,
            'source_annotators': source_ann,
            'construction_note': note,
            'disagreement': True
        })
    return pd.DataFrame(rows)

# Get tiebreak df
tie_break_df = pd.read_csv('/Users/pbrams/Desktop/AARHUS_UNIVERSITY/kandidat/thesis_work/annotations/output/tie_break_cases.csv')

# List of valid disagreement labels (if one of these is paired with 'answer', we skip it)
valid_labels = ['evasion', 'attack', 'self_promotion', 'stretch']

# Step 2: Filter tie_break_df for rows where both label_1 and label_2 are in valid_labels.
tie_break_valid = tie_break_df[
    tie_break_df['label_1'].isin(valid_labels) & tie_break_df['label_2'].isin(valid_labels)
].copy()

tie_break_valid

# Now, process tie_break_df
tiebreak_processed_df = process_tiebreak_df(tie_break_valid, min_words=4)

# Filter out the no sufficient overlap
tiebreak_processed_df = tiebreak_processed_df[
    ~tiebreak_processed_df['construction_note'].str.contains("No sufficient clause overlap", case=False, na=False)
]

# Display the resulting DataFrame
tiebreak_processed_df


In [None]:
# Getting a tie break cases overview
import pandas as pd

tie_break_df = pd.read_csv('/Users/pbrams/Desktop/AARHUS_UNIVERSITY/kandidat/thesis_work/annotations/output/tie_break_cases.csv')
tie_break_df

# Create a cross-tabulation of disagreements
confusion = (
    tie_break_df
    .groupby(['label_1', 'label_2'])
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)

# Add a percentage column
total_disagreements = confusion['count'].sum()
confusion['percent'] = (confusion['count'] / total_disagreements * 100).round(1)

# Sort by descending count (or percent)
confusion = confusion.sort_values('count', ascending=False)

confusion

Unnamed: 0,label_1,label_2,count,percent
1,answer,evasion,252,56.2
6,evasion,attack,50,11.2
7,evasion,self_promotion,42,9.4
2,answer,self_promotion,32,7.1
0,answer,attack,23,5.1
8,evasion,stretch,21,4.7
3,answer,stretch,13,2.9
4,attack,self_promotion,8,1.8
5,attack,stretch,4,0.9
9,self_promotion,stretch,3,0.7


In [None]:
# I want to append this to coalesced now, but also priotizie the ones in coalesced if there are any ones in there that the tiebreak also overlaps with

# Making a version of coalesced_gold_df
import pandas as pd
coalesced = pd.read_csv('/Users/pbrams/Desktop/AARHUS_UNIVERSITY/kandidat/thesis_work/annotations/output/coalesced_gold_df.csv')

# Step 1: Ensure gold_df has a disagreement_status column, and mark gold agreements as False.
coalesced['disagreement'] = False

# Ensure that coalesced (gold df) has the expected columns and flag
coalesced['span'] = coalesced['span'].str.strip()
coalesced['label'] = coalesced['label'].str.strip()
coalesced['source_annotators'] = coalesced['source_annotators'].str.strip()

# Define a minimum word threshold for considering a subpart as "existing"
min_words_threshold = 4

# Prepare a list to accumulate tiebreak rows that should be added
rows_to_add = []

# Loop through each row in the tiebreak DataFrame
for idx, tb_row in tiebreak_processed_df.iterrows():
    debate_id = tb_row['debate_unit_id']
    tb_span = tb_row['span'].strip()
    
    # Flag to determine whether to skip this tiebreak row
    skip = False
    
    # Check if this debate_unit_id exists in the gold (coalesced) DataFrame
    if debate_id in coalesced['debate_unit_id'].values:
        # Get the subset of gold rows for the current debate unit
        gold_subset = coalesced[coalesced['debate_unit_id'] == debate_id]
        
        # Check each gold row in this subset
        for _, gold_row in gold_subset.iterrows():
            gold_span = gold_row['span'].strip()
            
            # 3) If the span exactly exists, then skip this tiebreak row
            if gold_span == tb_span:
                skip = True
                break
            
            # 4) Check if subparts of the tiebreak span already exist:
            # Compute the longest common substring between tb_span and gold_span
            common = get_longest_common_substring(tb_span, gold_span)
            if common and word_count(common) >= min_words_threshold:
                skip = True
                break
    # If debate_unit_id doesn't exist in gold, or if none of the checks triggered a skip,
    # we add the row.
    if not skip:
        # Ensure the disagreement flag is set to True (even if it might already be True)
        tb_row['disagreement'] = True
        rows_to_add.append(tb_row)

# Convert the collected rows to a DataFrame
if rows_to_add:
    appended_tiebreak_df = pd.DataFrame(rows_to_add)
else:
    appended_tiebreak_df = pd.DataFrame(columns=tiebreak_processed_df.columns)

# display the rows to be added
print("Rows from tiebreak that will be appended:")
appended_tiebreak_df


In [None]:

# Append tie_break_valid rows to gold_df
gold_df_augmented = pd.concat([coalesced, appended_tiebreak_df], ignore_index=True)
gold_df_augmented.to_csv('/Users/pbrams/Desktop/AARHUS_UNIVERSITY/kandidat/thesis_work/annotations/output/coalesced_gold_augmented_df.csv', index=False)

gold_df_augmented

In [95]:
# Add the ones where dice is low, but containment is fulfilled for manual review.
'''
I feel like
Containment is a stronger structural signal than token overlap: if one annotator's span is fully included in another's, 
they are often referring to the same rhetorical move — just with broader or narrower emphasis.

Dice can be sensitive to short spans and stopword mismatch, meaning it may unfairly penalize what is otherwise a clear agreement.

'''

"\nI feel like\nContainment is a stronger structural signal than token overlap: if one annotator's span is fully included in another's, \nthey are often referring to the same rhetorical move — just with broader or narrower emphasis.\n\nDice can be sensitive to short spans and stopword mismatch, meaning it may unfairly penalize what is otherwise a clear agreement.\n\n"

In [96]:
def find_containment_only_matches_with_label_merge(matches_df, gold_df, min_words=3):
    matches_df['span_1'] = matches_df['span_1'].str.strip()
    matches_df['span_2'] = matches_df['span_2'].str.strip()
    gold_spans = set(
        (row['debate_unit_id'], row['label'], row['span'].strip())
        for _, row in gold_df.iterrows()
    )

    output_rows = []

    for _, row in matches_df.iterrows():
        if not (row['contained'] and row['dice'] < 0.6):
            continue

        key1 = (row['debate_unit_id'], row['label'], row['span_1'])
        key2 = (row['debate_unit_id'], row['label'], row['span_2'])

        if key1 in gold_spans or key2 in gold_spans:
            continue

        shared_span = get_longest_common_substring(row['span_1'], row['span_2'])
        if shared_span and word_count(shared_span) >= min_words:
            output_rows.append({
                'debate_unit_id': row['debate_unit_id'],
                'label': row['label'].strip(),
                'span': shared_span.strip(),
                'span_1': row['span_1'],
                'span_2': row['span_2'],
                #'source_annotators': f"{row['annotator_1']} and {row['annotator_2']}",
                'source_annotators': f"{row['annotator_1']},{row['annotator_2']}",
                'construction_note': 'Contained but low Dice (post hoc)',
                'dice': row['dice'],
                'contained': row['contained']
            })

    df = pd.DataFrame(output_rows)

    # 🧠 Merge duplicates by span with label consolidation
    def merge_labels(labels):
        return ' + '.join(sorted(set(labels)))

    def merge_annotators(sources):
        # Flatten and deduplicate
        raw = ','.join(sources)  # Keep it comma-separated
        all_annos = sorted(set(re.split(r'[,\s]+', raw)))
        return f"common substring of {', '.join(all_annos)}"

    df_merged = (
        df.groupby(['debate_unit_id', 'span'], as_index=False)
          .agg({
              'label': merge_labels,
              'source_annotators': merge_annotators,
              'construction_note': lambda x: '; '.join(sorted(set(x))),
              'span_1': 'first',
              'span_2': 'first',
              'dice': 'first',
              'contained': 'first'
          })
    )

    return df_merged

# Secondary step to catch dupes very different in length within same debateunitid
def collapse_to_shortest_contained_span(df):
    from collections import defaultdict

    final_rows = []
    grouped = df.groupby('debate_unit_id')

    for debate_id, group in grouped:
        rows = group.to_dict('records')
        used = set()
        collapsed_spans = []

        for i in range(len(rows)):
            if i in used:
                continue
            span_i = rows[i]['span']
            annos_i = set(re.split(r'[,\s]+', rows[i]['source_annotators'].replace("common substring of", "").strip()))
            label_i = set(rows[i]['label'].split(' + '))
            note_i = rows[i]['construction_note']

            # Look for contained spans
            for j in range(i + 1, len(rows)):
                if j in used:
                    continue
                span_j = rows[j]['span']

                if span_i in span_j or span_j in span_i:
                    # Use the shorter one
                    shorter_span = span_i if len(span_i) <= len(span_j) else span_j
                    span_i = shorter_span

                    # Merge annotators and labels
                    annos_j = set(re.split(r'[,\s]+', rows[j]['source_annotators'].replace("common substring of", "").strip()))
                    annos_i.update(annos_j)

                    label_j = set(rows[j]['label'].split(' + '))
                    label_i.update(label_j)

                    used.add(j)

            final_rows.append({
                'debate_unit_id': debate_id,
                'label': ' + '.join(sorted(label_i)),
                'span': span_i.strip(),
                'source_annotators': 'common substring of ' + ', '.join(sorted(annos_i)),
                'construction_note': 'Reduced to shortest span after containment check'
            })

            used.add(i)

    return pd.DataFrame(final_rows)


In [97]:
# Primary check
containment_posthoc_df = find_containment_only_matches_with_label_merge(matches_df, gold_df, min_words=3)

# Secondary check
collapsed_shortest_df = collapse_to_shortest_contained_span(containment_posthoc_df)
collapsed_shortest_df.to_csv("output/prelim_dataframes/posthoc_containment_matches.csv", index=False)


In [None]:
# Now I do manual review (1 keep span and label, 0 discard) and then we reload in, and then append to gold df
import pandas as pd
containment_posthos_df_edited = pd.read_csv("output/prelim_dataframes/posthoc_containment_matches_manual_edit.csv", sep=";")

# Get the gold + tiebreak aug df
gold_df_augmented = pd.read_csv('/Users/pbrams/Desktop/AARHUS_UNIVERSITY/kandidat/thesis_work/annotations/output/coalesced_gold_augmented_df.csv')
gold_df_augmented_c = gold_df_augmented.copy()

gold_df_augmented_c['quality_status'] = gold_df_augmented_c['disagreement'].map({
    False: 'pure gold',
    True: 'from disagreement'
})

# Drop the old one
gold_df_augmented_c = gold_df_augmented_c.drop(columns='disagreement') # 439 rows

# Now merge
# Filter and tag
to_add = containment_posthos_df_edited[containment_posthos_df_edited['keep_flag'] == 1].copy()
to_add['quality_status'] = 'result of low dice but containment check'

# Add missing columns to match
if 'disagreement' in to_add.columns:
    to_add = to_add.drop(columns='disagreement')
if 'keep_flag' in to_add.columns:
    to_add = to_add.drop(columns='keep_flag')

# Make sure both dataframes have same columns
common_cols = list(set(gold_df_augmented_c.columns) & set(to_add.columns))
gold_df_augmented_c = gold_df_augmented_c[common_cols]
to_add = to_add[common_cols]

# Append
gold_final_aug_df = pd.concat([gold_df_augmented_c, to_add], ignore_index=True)

# Reorder
ordered_cols = [
    'debate_unit_id',
    'label',
    'span',
    'quality_status',
    'source_annotators',
    'construction_note'
]

gold_final_aug_df = gold_final_aug_df[ordered_cols]

gold_final_aug_df.to_csv("output/coalesced_gold_aug_three_levels_18_april.csv")
gold_final_aug_df # 595 rows

In [None]:

counts = gold_final_aug_df['quality_status'].value_counts(dropna=False)

counts.rename_axis('quality_status').reset_index(name='count')


Unnamed: 0,quality_status,count
0,pure gold,369
1,result of low dice but containment check,155
2,from disagreement,70


In [None]:
label_counts = gold_final_aug_df['label'].value_counts(dropna=False)
label_counts_df = label_counts.rename_axis('label').reset_index(name='count')
label_counts_df

Unnamed: 0,label,count
0,answer,267
1,evasion,169
2,attack,54
3,self_promotion,33
4,evasion + attack,25
5,evasion + self_promotion,23
6,evasion + stretch,11
7,attack + self_promotion,4
8,attack + stretch,4
9,self_promotion + stretch,3


In [4]:
# Filter rows where quality_status == 'from disagreement'
filtered_df = gold_final_aug_df_check[gold_final_aug_df_check['quality_status'] == 'from disagreement']

# Breakdown of row counts by 'label'
label_counts = filtered_df['label'].value_counts(dropna=False)
label_counts_df = label_counts.rename_axis('label').reset_index(name='count')
label_counts_df

Unnamed: 0,label,count
0,evasion + attack,25
1,evasion + self_promotion,23
2,evasion + stretch,11
3,attack + self_promotion,4
4,attack + stretch,4
5,self_promotion + stretch,3
