# Gold-span inspections for in-paper tables 🔍

This notebook loads the gold span dataset and computes descriptive statistics on annotation behavior, span/text lengths, and label distributions. It covers:
1. Annotation counts per user and study group, and normalized annotation rates.
2. Text statistics (word, character, sentence counts) for each annotated span and its full debate context.
3. Debate-unit–level metrics (annotations per unit, unit lengths).
4. Label frequency and normalization by number of debate units, both overall and stratified by user/study.


In [2]:
import pandas as pd

# Read the spans table that every later cell in this notebook analyses.
spans_csv_path = "output/prelim_dataframes/df_spans.csv"
df_spans = pd.read_csv(spans_csv_path)

In [None]:
import pandas as pd
import re

# --- Text processing functions ---
def count_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    tokens = text.split()
    return len(tokens)

def count_characters(text):
    """Return the length of ``text``; whitespace and punctuation are included."""
    n_characters = len(text)
    return n_characters

def count_sentences(text):
    """Approximate the sentence count of ``text`` by counting '.', '!' and '?'.

    Every occurrence is counted individually, so an ellipsis ("...") adds
    three and text without any of these marks yields zero.
    """
    return sum(text.count(mark) for mark in ('.', '!', '?'))

# --- Annotation counts ---
# Each row of df_spans is counted as one annotation, so group sizes are annotation counts.
by_user = df_spans.groupby('user_id')
annotations_per_user = by_user.size()
print(f"Average number of annotations per user: {annotations_per_user.mean():.2f}")
print(f"Standard deviation of annotations per user: {annotations_per_user.std():.2f}")
print("\nNumber of annotations per user:")
print(annotations_per_user)

# The study-group breakdown is only produced when the grouping column is present.
has_study_column = 'user_id_study' in df_spans.columns
if has_study_column:
    # Total number of annotation rows contributed by each study group.
    annotations_per_study = df_spans.groupby('user_id_study').size()
    print("\nNumber of annotations per study type:")
    print(annotations_per_study)
    print(f"\nAverage per study: {annotations_per_study.mean():.2f}")
    print(f"Standard deviation per study: {annotations_per_study.std():.2f}")

    # Annotations per user per study: count per (study, user), then summarise per study.
    per_study_user_counts = df_spans.groupby(['user_id_study', 'user_id']).size()
    annotations_per_study_user = per_study_user_counts.groupby(level='user_id_study').agg(['mean', 'std'])
    print("\nAverage annotations per user per study type (with SD):")
    print(annotations_per_study_user)

    # Same per-user summary, printed one line at a time for the three named study types.
    for study in ('linguistics', 'cognitive science', 'rhetorics'):
        in_study = df_spans['user_id_study'] == study
        study_data = df_spans.loc[in_study]
        user_counts = study_data.groupby('user_id').size()
        print(f"\n{study.capitalize()} - Avg: {user_counts.mean():.2f}, SD: {user_counts.std():.2f}")

# --- Span text stats ---
# Length metrics are computed twice: on the 'span' column (reported as "annotation")
# and on 'full_text' after a leading "[<digits>] " marker has been stripped.
leading_index_marker = re.compile(r'^\[\d+\]\s')
text_counters = {'word': count_words, 'character': count_characters, 'sentence': count_sentences}

for col, source in (('span', 'annotation'), ('full_text', 'full_text')):
    base_col = col
    if col == 'full_text':
        # Keep the stripped text as its own column; the raw 'full_text' column is left untouched.
        df_spans['cleaned_full_text'] = df_spans[col].apply(lambda x: leading_index_marker.sub('', x))
        base_col = 'cleaned_full_text'

    # Adds word_count_<source>, character_count_<source> and sentence_count_<source> to df_spans.
    for metric, counter in text_counters.items():
        df_spans[f'{metric}_count_{source}'] = df_spans[base_col].apply(counter)

    print(f"\n{source.capitalize()} statistics:")
    for metric in text_counters:
        col_name = f'{metric}_count_{source}'
        print(f"{metric.capitalize()} count - Mean: {df_spans[col_name].mean():.2f}, SD: {df_spans[col_name].std():.2f}")

# --- Debate unit analysis ---
# Number of annotation rows each debate unit collected.
by_debate_unit = df_spans.groupby('debate_unit_id')
annotations_per_debate_unit = by_debate_unit.size()
print(f"\nEach debate unit received an average of {annotations_per_debate_unit.mean():.2f} annotations (SD = {annotations_per_debate_unit.std():.2f})")

total_unique_units = df_spans['debate_unit_id'].nunique()
print(f"Total unique debate units: {total_unique_units}")

# Number of distinct debate units each annotator has rows for.
units_by_user = df_spans.groupby('user_id')['debate_unit_id']
debate_units_per_user = units_by_user.nunique()
print(f"Average debate units per annotator: {debate_units_per_user.mean():.2f}, SD: {debate_units_per_user.std():.2f}")

# --- Debate unit text length stats ---
# Lengths are measured on the raw 'full_text' column, averaged within each debate unit
# first, so a unit with many annotation rows does not weigh more than one with few.
for name, metric_func in (('word', count_words), ('character', count_characters), ('sentence', count_sentences)):
    unit_col = f'{name}_count_unit'
    df_spans[unit_col] = df_spans['full_text'].apply(metric_func)
    grouped = df_spans.groupby('debate_unit_id')[unit_col].mean()
    print(f"Avg {name} count per debate unit: {grouped.mean():.2f} (SD = {grouped.std():.2f})")

# --- Normalized annotations ---
# Annotations per debate unit for each annotator: total annotation rows divided by
# the number of distinct debate units that annotator worked on.
normalized_annotations = annotations_per_user / debate_units_per_user
print(f"\nAverage normalized annotations per user: {normalized_annotations.mean():.2f}")
print(f"Standard deviation: {normalized_annotations.std():.2f}")

if 'user_id_study' in df_spans.columns:
    # Build the same per-annotator ratio inside each study group, then summarise it.
    # (Previously this section reported the mean/SD of *unique debate units per user*,
    # which is not a normalized annotation rate despite the printed labels.)
    by_study_and_user = df_spans.groupby(['user_id_study', 'user_id'])
    normalized_per_study_user = (
        by_study_and_user.size() / by_study_and_user['debate_unit_id'].nunique()
    )
    norm_mean = normalized_per_study_user.groupby(level='user_id_study').mean()
    norm_std = normalized_per_study_user.groupby(level='user_id_study').std()
    print("\nAverage normalized annotations per study type:")
    print(norm_mean)
    print("\nStandard deviation of normalized annotations per study type:")
    print(norm_std)


In [None]:
import pandas as pd

# Step 1: Count the annotation rows that fall on each debate unit
annotations_per_debate_unit = df_spans.groupby('debate_unit_id').size()

# Step 2: Mean and standard deviation of annotations per debate unit
# TODO: there is code elsewhere that still needs fixing for this statistic, I think in IAA_and_get_labels
avg_annotations_per_debate_unit, std_annotations_per_debate_unit = (
    annotations_per_debate_unit.agg(['mean', 'std'])
)

# Step 3: Total number of distinct debate units that have at least one annotation row
total_unique_debate_units = df_spans['debate_unit_id'].nunique()

# Step 4: Number of distinct debate units each annotator contributed to
debate_units_per_annotator = df_spans.groupby('user_id')['debate_unit_id'].nunique()

# Step 5: Mean and standard deviation of distinct debate units per annotator
avg_debate_units_per_annotator, std_debate_units_per_annotator = (
    debate_units_per_annotator.agg(['mean', 'std'])
)

# Step 6: Calculate word count, character count, and sentence count for each debate unit
def count_words(text):
    """Count the words in ``text``, splitting on any run of whitespace."""
    words = text.split()
    word_total = len(words)
    return word_total

def count_characters(text):
    """Count every character in ``text``, spaces and punctuation included."""
    character_total = len(text)
    return character_total

def count_sentences(text):
    """Estimate the number of sentences as the number of '.', '!' and '?' characters.

    This is a punctuation count, not a sentence splitter: repeated marks such as
    "?!" or "..." each contribute one per character.
    """
    sentence_endings = ('.', '!', '?')
    total = 0
    for char in text:
        if char in sentence_endings:
            total += 1
    return total

# Measure the raw 'full_text' of every row; the three columns are added to df_spans.
length_counters = (
    ('word_count', count_words),
    ('character_count', count_characters),
    ('sentence_count', count_sentences),
)
for length_col, counter in length_counters:
    df_spans[length_col] = df_spans['full_text'].apply(counter)

# Step 7 + 8: Average each length within a debate unit first (one value per unit),
# then take the mean and standard deviation across debate units.
per_unit_lengths = df_spans.groupby('debate_unit_id')[[name for name, _ in length_counters]].mean()

avg_word_count_per_debate_unit = per_unit_lengths['word_count'].mean()
avg_character_count_per_debate_unit = per_unit_lengths['character_count'].mean()
avg_sentence_count_per_debate_unit = per_unit_lengths['sentence_count'].mean()

std_word_count_per_debate_unit = per_unit_lengths['word_count'].std()
std_character_count_per_debate_unit = per_unit_lengths['character_count'].std()
std_sentence_count_per_debate_unit = per_unit_lengths['sentence_count'].std()

# Print the results as sentences
print(f"Each debate unit received an average of {avg_annotations_per_debate_unit:.2f} annotations with an SD of {std_annotations_per_debate_unit:.2f}.")
print(f"A total of {total_unique_debate_units} unique debate units were annotated.")
print(f"On average, each annotator contributed to {avg_debate_units_per_annotator:.2f} unique debate units, with a SD of {std_debate_units_per_annotator:.2f}.")
print(f"In terms of length, the debate units had an average of M = {avg_word_count_per_debate_unit:.2f} words (SD = {std_word_count_per_debate_unit:.2f}),")
print(f"M = {avg_character_count_per_debate_unit:.2f} characters (SD = {std_character_count_per_debate_unit:.2f}), and")
print(f"M = {avg_sentence_count_per_debate_unit:.2f} sentences (SD = {std_sentence_count_per_debate_unit:.2f}).")


Each debate unit received an average of 9.06 annotations with an SD of 5.61.
A total of 294 unique debate units were annotated.
On average, each annotator contributed to 34.68 unique debate units, with a SD of 20.93.
In terms of length, the debate units had an average of M = 617.12 words (SD = 319.53),
M = 3479.46 characters (SD = 1837.19), and
M = 29.50 sentences (SD = 16.40).


In [8]:
# Now check distributions of labels in the raw spans
import pandas as pd

# Labels we're interested in (listed for reference; not used by the computations in this cell)
labels = ['answer', 'stretch', 'evasion', 'self_promotion', 'attack']

# Step 1: Count raw occurrences of each label across all data
label_counts = df_spans['label'].value_counts()

# Step 2: Calculate percentages for each label
label_percentages = df_spans['label'].value_counts(normalize=True) * 100

# Step 3: Count raw occurrences per user (MultiIndex: user_id, label)
label_counts_per_user = df_spans.groupby('user_id')['label'].value_counts()

# Step 4: For each user, average the counts over the labels that user actually used
avg_labels_per_user = label_counts_per_user.groupby('user_id').mean()

# Step 5: Normalize per-user label counts by the number of unique debate units
# that user annotated; the division aligns on the shared 'user_id' index level.
debate_units_per_user = df_spans.groupby('user_id')['debate_unit_id'].nunique()
normalized_label_counts_per_user = label_counts_per_user / debate_units_per_user

# Steps 6-8: Study-level statistics. Everything that touches 'user_id_study' lives
# under one guard so the cell still runs when the column is absent (previously the
# study-level normalization ran unguarded and failed in that case).
if 'user_id_study' in df_spans.columns:
    # Step 6: Count raw occurrences per study type (MultiIndex: user_id_study, label)
    label_counts_per_study = df_spans.groupby('user_id_study')['label'].value_counts()

    # Step 7: For each study type, average the counts over the labels that occur in it
    avg_labels_per_study = label_counts_per_study.groupby('user_id_study').mean()

    # Step 8: Normalize per-study label counts by the number of unique debate units
    # annotated within that study type.
    debate_units_per_study = df_spans.groupby('user_id_study')['debate_unit_id'].nunique()
    normalized_label_counts_per_study = label_counts_per_study / debate_units_per_study

# Step 9: Print the results
# Collect (heading, table) pairs in display order, then print them in one loop.
# Study-level tables are only included when the grouping column exists.
study_column_present = 'user_id_study' in df_spans.columns

label_report = [
    ("Raw counts of labels across all data:", label_counts),
    ("\nPercentages of labels across all data:", label_percentages),
    ("\nRaw counts of labels per user:", label_counts_per_user),
    ("\nAverage number of labels per user:", avg_labels_per_user),
]

if study_column_present:
    label_report.append(("\nRaw counts of labels per study type:", label_counts_per_study))
    label_report.append(("\nAverage number of labels per study type:", avg_labels_per_study))

# Counts divided by the number of unique debate units (per user, then per study type)
label_report.append(("\nNormalized label counts per user (per debate unit):", normalized_label_counts_per_user))

if study_column_present:
    label_report.append(("\nNormalized label counts per study type (per debate unit):", normalized_label_counts_per_study))

for heading, table in label_report:
    print(heading)
    print(table)


Raw counts of labels across all data:
label
answer            1099
evasion            810
attack             428
self_promotion     242
stretch             86
Name: count, dtype: int64

Percentages of labels across all data:
label
answer            41.238274
evasion           30.393996
attack            16.060038
self_promotion     9.080675
stretch            3.227017
Name: proportion, dtype: float64

Raw counts of labels per user:
user_id   label         
2WJLAP22  answer             33
          evasion            19
          self_promotion     13
          stretch             6
          attack              4
                           ... 
Z1NERR2S  answer            177
          attack             98
          evasion            76
          self_promotion     24
          stretch             5
Name: count, Length: 93, dtype: int64

Average number of labels per user:
user_id
2WJLAP22    15.00
3P6E0LDX    27.20
4AXSP923    10.60
7MXM4GWL    13.80
8ER9GXMV    68.60
8UN6GI5O    18.

In [9]:
import os

# Snapshot df_spans, including whatever count columns earlier cells added to it,
# and write it out as CSV (the DataFrame index is written as the first column).
df_spans_w_counts = df_spans.copy()
counts_csv_path = "output/prelim_dataframes/inspections/df_spans_w_counts.csv"
# to_csv does not create missing directories, so make sure the target folder exists.
os.makedirs(os.path.dirname(counts_csv_path), exist_ok=True)
df_spans_w_counts.to_csv(counts_csv_path)

In [None]:
# Checking 
import pandas as pd

# Load the debates export for a visual sanity check.
# The path is project-relative instead of a machine-specific absolute /Users/... path.
# NOTE(review): assumes the notebook is run from the `annotations/` directory, the same
# working directory the other relative `output/...` paths in this notebook rely on — confirm.
checking = pd.read_csv("output/for_pretraining/ALL_debates_with_turns_and_anon_18_april_debates_wo_labelledrows.csv")
checking