In [2]:
import pandas as pd

# --- Step 1: Read Annotator Data ---
# Replace these file names with your actual Excel file names.
df_a1 = pd.read_excel('Annotator1_plumbing.xlsx')
df_a2 = pd.read_excel('Annotator2_plumbing.xlsx')

# --- Step 1.1: Clean Column Names ---
df_a1.columns = df_a1.columns.str.strip()
df_a2.columns = df_a2.columns.str.strip()

# --- Step 1.2: Rename Columns if Needed ---
# For Annotator 1:
if 'Argument' not in df_a1.columns and 'Arguments' in df_a1.columns:
    df_a1.rename(columns={'Arguments': 'Argument'}, inplace=True)
if 'Argument2' not in df_a1.columns and 'Arguments2' in df_a1.columns:
    df_a1.rename(columns={'Arguments2': 'Argument2'}, inplace=True)
if 'Sense' not in df_a1.columns and 'Senses' in df_a1.columns:
    df_a1.rename(columns={'Senses': 'Sense'}, inplace=True)
if 'Sense2' not in df_a1.columns and 'Senses2' in df_a1.columns:
    df_a1.rename(columns={'Senses2': 'Sense2'}, inplace=True)

# For Annotator 2:
if 'Argument' not in df_a2.columns and 'Arguments' in df_a2.columns:
    df_a2.rename(columns={'Arguments': 'Argument'}, inplace=True)
if 'Argument2' not in df_a2.columns and 'Arguments2' in df_a2.columns:
    df_a2.rename(columns={'Arguments2': 'Argument2'}, inplace=True)
if 'Sense' not in df_a2.columns and 'Senses' in df_a2.columns:
    df_a2.rename(columns={'Senses': 'Sense'}, inplace=True)
if 'Sense2' not in df_a2.columns and 'Senses2' in df_a2.columns:
    df_a2.rename(columns={'Senses2': 'Sense2'}, inplace=True)

# --- Step 1.3: (Optional) Preserve Order from Annotator1 ---
df_a1['order'] = df_a1.index

# --- Step 2: Reshape Each Annotator's Data to Long Format ---
df_a1_long = pd.DataFrame({
    "Argument": pd.concat([df_a1["Argument"], df_a1["Argument2"]], ignore_index=True),
    "Sense_a1": pd.concat([df_a1["Sense"], df_a1["Sense2"]], ignore_index=True)
})
df_a2_long = pd.DataFrame({
    "Argument": pd.concat([df_a2["Argument"], df_a2["Argument2"]], ignore_index=True),
    "Sense_a2": pd.concat([df_a2["Sense"], df_a2["Sense2"]], ignore_index=True)
})

# --- Step 2.1: Remove blank or missing Argument entries before merging ---
df_a1_long = df_a1_long[df_a1_long["Argument"].notna() & df_a1_long["Argument"].str.strip().ne("")]
df_a2_long = df_a2_long[df_a2_long["Argument"].notna() & df_a2_long["Argument"].str.strip().ne("")]

# --- Step 3: Merge the Two Long DataFrames on "Argument" ---
merged_df = pd.merge(df_a1_long, df_a2_long, on="Argument", how="outer", sort=False)

# --- Step 4: Process Missing Values ---
merged_df["Sense_a1"] = merged_df["Sense_a1"].fillna("N/A")
merged_df["Sense_a2"] = merged_df["Sense_a2"].fillna("N/A")

# Remove rows where both annotations are "N/A"
merged_df = merged_df[~((merged_df["Sense_a1"] == "N/A") & (merged_df["Sense_a2"] == "N/A"))]

# --- Step 5: Compute Row-Level Agreement ---
def compute_agreement(row):
    s1 = row["Sense_a1"].lower().strip()
    s2 = row["Sense_a2"].lower().strip()
    if s1 == "n/a" or s2 == "n/a":
        return 0
    return 1 if s1 == s2 else 0

merged_df["Agreement"] = merged_df.apply(compute_agreement, axis=1)

# --- Step 6: Compute Global Metrics ---
A1_inter_A2 = merged_df['Agreement'].sum()
A1_union_A2 = len(merged_df)
accuracy = A1_inter_A2 / A1_union_A2 if A1_union_A2 > 0 else 0

precision = A1_inter_A2 / merged_df[merged_df['Sense_a1'] != "N/A"].shape[0] if merged_df['Sense_a1'].ne("N/A").any() else 0
recall = A1_inter_A2 / merged_df[merged_df['Sense_a2'] != "N/A"].shape[0] if merged_df['Sense_a2'].ne("N/A").any() else 0
f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

# --- Step 7: Print Results ---
print("Global Agreement Metrics:")
print(f"  Overall Agreement (row-level): {merged_df['Agreement'].mean():.2f}")
print(f"  Accuracy: {accuracy:.2f}")
print(f"  Precision: {precision:.2f}")
print(f"  Recall: {recall:.2f}")
print(f"  F1 Score: {f1:.2f}")

# --- Step 8: Save the Final Merged Comparison to Excel ---
output_filename = "Results/A1 VS A2_RV_Plumbing.xlsx"
merged_df.to_excel(output_filename, index=False)
print(f"Comparison results saved to: '{output_filename}'")


Global Agreement Metrics:
  Overall Agreement (row-level): 0.64
  Accuracy: 0.64
  Precision: 0.73
  Recall: 0.74
  F1 Score: 0.74
Comparison results saved to: 'Results/A1 VS A2_RV_Plumbing.xlsx'


In [3]:
import pandas as pd

# --- Step 1: Read Annotator Data ---
# Replace these file names with your actual Excel file names.
df_a1 = pd.read_excel('Annotator1_plumbing.xlsx')
df_a2 = pd.read_excel('Final Annotation of RV Plumbing.xlsx')

# --- Step 1.1: Clean Column Names ---
df_a1.columns = df_a1.columns.str.strip()
df_a2.columns = df_a2.columns.str.strip()

# --- Step 1.2: Rename Columns if Needed ---
# For Annotator 1:
if 'Argument' not in df_a1.columns and 'Arguments' in df_a1.columns:
    df_a1.rename(columns={'Arguments': 'Argument'}, inplace=True)
if 'Argument2' not in df_a1.columns and 'Arguments2' in df_a1.columns:
    df_a1.rename(columns={'Arguments2': 'Argument2'}, inplace=True)
if 'Sense' not in df_a1.columns and 'Senses' in df_a1.columns:
    df_a1.rename(columns={'Senses': 'Sense'}, inplace=True)
if 'Sense2' not in df_a1.columns and 'Senses2' in df_a1.columns:
    df_a1.rename(columns={'Senses2': 'Sense2'}, inplace=True)

# For Annotator 2:
if 'Argument' not in df_a2.columns and 'Arguments' in df_a2.columns:
    df_a2.rename(columns={'Arguments': 'Argument'}, inplace=True)
if 'Argument2' not in df_a2.columns and 'Arguments2' in df_a2.columns:
    df_a2.rename(columns={'Arguments2': 'Argument2'}, inplace=True)
if 'Sense' not in df_a2.columns and 'Senses' in df_a2.columns:
    df_a2.rename(columns={'Senses': 'Sense'}, inplace=True)
if 'Sense2' not in df_a2.columns and 'Senses2' in df_a2.columns:
    df_a2.rename(columns={'Senses2': 'Sense2'}, inplace=True)

# --- Step 1.3: (Optional) Preserve Order from Annotator1 ---
df_a1['order'] = df_a1.index

# --- Step 2: Reshape Each Annotator's Data to Long Format ---
df_a1_long = pd.DataFrame({
    "Argument": pd.concat([df_a1["Argument"], df_a1["Argument2"]], ignore_index=True),
    "Sense_a1": pd.concat([df_a1["Sense"], df_a1["Sense2"]], ignore_index=True)
})
df_a2_long = pd.DataFrame({
    "Argument": pd.concat([df_a2["Argument"], df_a2["Argument2"]], ignore_index=True),
    "Sense_a2": pd.concat([df_a2["Sense"], df_a2["Sense2"]], ignore_index=True)
})

# --- Step 2.1: Remove blank or missing Argument entries before merging ---
df_a1_long = df_a1_long[df_a1_long["Argument"].notna() & df_a1_long["Argument"].str.strip().ne("")]
df_a2_long = df_a2_long[df_a2_long["Argument"].notna() & df_a2_long["Argument"].str.strip().ne("")]

# --- Step 3: Merge the Two Long DataFrames on "Argument" ---
merged_df = pd.merge(df_a1_long, df_a2_long, on="Argument", how="outer", sort=False)

# --- Step 4: Process Missing Values ---
merged_df["Sense_a1"] = merged_df["Sense_a1"].fillna("N/A")
merged_df["Sense_a2"] = merged_df["Sense_a2"].fillna("N/A")

# Remove rows where both annotations are "N/A"
merged_df = merged_df[~((merged_df["Sense_a1"] == "N/A") & (merged_df["Sense_a2"] == "N/A"))]

# --- Step 5: Compute Row-Level Agreement ---
def compute_agreement(row):
    s1 = row["Sense_a1"].lower().strip()
    s2 = row["Sense_a2"].lower().strip()
    if s1 == "n/a" or s2 == "n/a":
        return 0
    return 1 if s1 == s2 else 0

merged_df["Agreement"] = merged_df.apply(compute_agreement, axis=1)

# --- Step 6: Compute Global Metrics ---
A1_inter_A2 = merged_df['Agreement'].sum()
A1_union_A2 = len(merged_df)
accuracy = A1_inter_A2 / A1_union_A2 if A1_union_A2 > 0 else 0

precision = A1_inter_A2 / merged_df[merged_df['Sense_a1'] != "N/A"].shape[0] if merged_df['Sense_a1'].ne("N/A").any() else 0
recall = A1_inter_A2 / merged_df[merged_df['Sense_a2'] != "N/A"].shape[0] if merged_df['Sense_a2'].ne("N/A").any() else 0
f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

# --- Step 7: Print Results ---
print("Global Agreement Metrics:")
print(f"  Overall Agreement (row-level): {merged_df['Agreement'].mean():.2f}")
print(f"  Accuracy: {accuracy:.2f}")
print(f"  Precision: {precision:.2f}")
print(f"  Recall: {recall:.2f}")
print(f"  F1 Score: {f1:.2f}")

# --- Step 8: Save the Final Merged Comparison to Excel ---
output_filename = "Results/A1 VS FINAL_RV_Plumbing.xlsx"
merged_df.to_excel(output_filename, index=False)
print(f"Comparison results saved to: '{output_filename}'")


Global Agreement Metrics:
  Overall Agreement (row-level): 0.94
  Accuracy: 0.94
  Precision: 0.96
  Recall: 0.96
  F1 Score: 0.96
Comparison results saved to: 'Results/A1 VS FINAL_RV_Plumbing.xlsx'


In [4]:
import pandas as pd

# --- Step 1: Read Annotator Data ---
# Replace these file names with your actual Excel file names.
df_a1 = pd.read_excel('Annotator2_plumbing.xlsx')
df_a2 = pd.read_excel('Final Annotation of RV Plumbing.xlsx')

# --- Step 1.1: Clean Column Names ---
df_a1.columns = df_a1.columns.str.strip()
df_a2.columns = df_a2.columns.str.strip()

# --- Step 1.2: Rename Columns if Needed ---
# For Annotator 1:
if 'Argument' not in df_a1.columns and 'Arguments' in df_a1.columns:
    df_a1.rename(columns={'Arguments': 'Argument'}, inplace=True)
if 'Argument2' not in df_a1.columns and 'Arguments2' in df_a1.columns:
    df_a1.rename(columns={'Arguments2': 'Argument2'}, inplace=True)
if 'Sense' not in df_a1.columns and 'Senses' in df_a1.columns:
    df_a1.rename(columns={'Senses': 'Sense'}, inplace=True)
if 'Sense2' not in df_a1.columns and 'Senses2' in df_a1.columns:
    df_a1.rename(columns={'Senses2': 'Sense2'}, inplace=True)

# For Annotator 2:
if 'Argument' not in df_a2.columns and 'Arguments' in df_a2.columns:
    df_a2.rename(columns={'Arguments': 'Argument'}, inplace=True)
if 'Argument2' not in df_a2.columns and 'Arguments2' in df_a2.columns:
    df_a2.rename(columns={'Arguments2': 'Argument2'}, inplace=True)
if 'Sense' not in df_a2.columns and 'Senses' in df_a2.columns:
    df_a2.rename(columns={'Senses': 'Sense'}, inplace=True)
if 'Sense2' not in df_a2.columns and 'Senses2' in df_a2.columns:
    df_a2.rename(columns={'Senses2': 'Sense2'}, inplace=True)

# --- Step 1.3: (Optional) Preserve Order from Annotator1 ---
df_a1['order'] = df_a1.index

# --- Step 2: Reshape Each Annotator's Data to Long Format ---
df_a1_long = pd.DataFrame({
    "Argument": pd.concat([df_a1["Argument"], df_a1["Argument2"]], ignore_index=True),
    "Sense_a1": pd.concat([df_a1["Sense"], df_a1["Sense2"]], ignore_index=True)
})
df_a2_long = pd.DataFrame({
    "Argument": pd.concat([df_a2["Argument"], df_a2["Argument2"]], ignore_index=True),
    "Sense_a2": pd.concat([df_a2["Sense"], df_a2["Sense2"]], ignore_index=True)
})

# --- Step 2.1: Remove blank or missing Argument entries before merging ---
df_a1_long = df_a1_long[df_a1_long["Argument"].notna() & df_a1_long["Argument"].str.strip().ne("")]
df_a2_long = df_a2_long[df_a2_long["Argument"].notna() & df_a2_long["Argument"].str.strip().ne("")]

# --- Step 3: Merge the Two Long DataFrames on "Argument" ---
merged_df = pd.merge(df_a1_long, df_a2_long, on="Argument", how="outer", sort=False)

# --- Step 4: Process Missing Values ---
merged_df["Sense_a1"] = merged_df["Sense_a1"].fillna("N/A")
merged_df["Sense_a2"] = merged_df["Sense_a2"].fillna("N/A")

# Remove rows where both annotations are "N/A"
merged_df = merged_df[~((merged_df["Sense_a1"] == "N/A") & (merged_df["Sense_a2"] == "N/A"))]

# --- Step 5: Compute Row-Level Agreement ---
def compute_agreement(row):
    s1 = row["Sense_a1"].lower().strip()
    s2 = row["Sense_a2"].lower().strip()
    if s1 == "n/a" or s2 == "n/a":
        return 0
    return 1 if s1 == s2 else 0

merged_df["Agreement"] = merged_df.apply(compute_agreement, axis=1)

# --- Step 6: Compute Global Metrics ---
A1_inter_A2 = merged_df['Agreement'].sum()
A1_union_A2 = len(merged_df)
accuracy = A1_inter_A2 / A1_union_A2 if A1_union_A2 > 0 else 0

precision = A1_inter_A2 / merged_df[merged_df['Sense_a1'] != "N/A"].shape[0] if merged_df['Sense_a1'].ne("N/A").any() else 0
recall = A1_inter_A2 / merged_df[merged_df['Sense_a2'] != "N/A"].shape[0] if merged_df['Sense_a2'].ne("N/A").any() else 0
f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

# --- Step 7: Print Results ---
print("Global Agreement Metrics:")
print(f"  Overall Agreement (row-level): {merged_df['Agreement'].mean():.2f}")
print(f"  Accuracy: {accuracy:.2f}")
print(f"  Precision: {precision:.2f}")
print(f"  Recall: {recall:.2f}")
print(f"  F1 Score: {f1:.2f}")

# --- Step 8: Save the Final Merged Comparison to Excel ---
output_filename = "Results/A2 VS FINAL_RV_Plumbing.xlsx"
merged_df.to_excel(output_filename, index=False)
print(f"Comparison results saved to: '{output_filename}'")


Global Agreement Metrics:
  Overall Agreement (row-level): 0.67
  Accuracy: 0.67
  Precision: 0.76
  Recall: 0.75
  F1 Score: 0.75
Comparison results saved to: 'Results/A2 VS FINAL_RV_Plumbing.xlsx'


In [2]:
import pandas as pd
import re
from collections import defaultdict

def parse_spans(span_str):
    """Parse span strings like '1', '2-7', '10-11' into a list of integers."""
    if pd.isna(span_str) or span_str == '':
        return []
    spans = []
    span_str = span_str.replace('Arg1(', '').replace('Arg2(', '').replace(')', '')
    parts = span_str.split(',')
    for part in parts:
        part = part.strip()
        if '-' in part:
            try:
                start, end = map(int, part.split('-'))
                spans.extend(range(start, end + 1))
            except ValueError:
                continue
        else:
            try:
                spans.append(int(part))
            except ValueError:
                continue
    return sorted(set(spans))

def is_nonadjacent(arg1_spans, arg2_spans):
    """Check if Arg1 and Arg2 spans are nonadjacent (gap or non-consecutive)."""
    if not arg1_spans or not arg2_spans:
        return False
    all_spans = sorted(set(arg1_spans + arg2_spans))
    arg1_max, arg2_min = max(arg1_spans), min(arg2_spans)
    arg2_max, arg1_min = max(arg2_spans), min(arg1_spans)
    return (arg2_min > arg1_max + 1) or (arg1_min > arg2_max + 1)

def count_spans(arg_spans):
    """Count number of spans in an argument."""
    return len(arg_spans)

def parse_discourse_excel(file_path):
    # Read Excel file
    try:
        df = pd.read_excel(file_path, engine='openpyxl')
    except Exception as e:
        print(f"Error reading Excel file: {e}")
        return None

    # Ensure required columns exist
    required_cols = ['Sense', 'Explicit?', 'Arguments']
    if not all(col in df.columns for col in required_cols):
        print(f"Required columns {required_cols} not all found.")
        return None

    # Initialize counters
    implicit_count = 0
    explicit_count = 0
    nonadjacent_rows = []
    compound_rows = []
    long_span_rows = []

    # Process each row
    for idx, row in df.iterrows():
        # Implicit/Explicit for Sense
        explicit = str(row.get('Explicit?', '')).strip().lower()
        if explicit in ['implicit', ''] or 'altlex' in explicit:
            if explicit:  # Only count non-empty cells
                implicit_count += 1
        elif explicit and explicit != 'nan':
            explicit_count += 1

        # Implicit/Explicit for Sense2
        explicit2 = str(row.get('Explicit?.1', '')).strip().lower()
        if explicit2 in ['implicit', ''] or 'altlex' in explicit2:
            if explicit2:  # Only count non-empty cells
                implicit_count += 1
        elif explicit2 and explicit2 != 'nan':
            explicit_count += 1

        # Nonadjacent spans
        args = str(row.get('Arguments', ''))
        arg2_extra = str(row.get('Argument2', ''))
        arg1_match = re.search(r'Arg1\((.*?)\)', args)
        arg2_match = re.search(r'Arg2\((.*?)\)', args)
        arg1_spans = parse_spans(arg1_match.group(1) if arg1_match else '')
        arg2_spans = parse_spans(arg2_match.group(1) if arg2_match else '')
        arg2_extra_match = re.search(r'Arg2\((.*?)\)', arg2_extra)
        if arg2_extra_match:
            arg2_spans.extend(parse_spans(arg2_extra_match.group(1)))
            arg2_spans = sorted(set(arg2_spans))
        arg1_extra_match = re.search(r'Arg1\((.*?)\)', arg2_extra)
        if arg1_extra_match:
            arg1_spans.extend(parse_spans(arg1_extra_match.group(1)))
            arg1_spans = sorted(set(arg1_spans))
        if is_nonadjacent(arg1_spans, arg2_spans):
            nonadjacent_rows.append(row.get('No.', idx + 1))

        # Compound senses
        sense = row.get('Sense', '')
        sense2 = row.get('Sense2', '')
        if pd.notna(sense2) and sense2 != '':
            compound_rows.append(row.get('No.', idx + 1))

        # Long spans
        if count_spans(arg1_spans) >= 3 or count_spans(arg2_spans) >= 3:
            long_span_rows.append(row.get('No.', idx + 1))

    # Remove duplicates
    nonadjacent_rows = sorted(set(nonadjacent_rows))
    compound_rows = sorted(set(compound_rows))
    long_span_rows = sorted(set(long_span_rows))

    # Compile results
    results = {
        'Implicit Relations': implicit_count,
        'Explicit Relations': explicit_count,
        'Nonadjacent Spans': len(nonadjacent_rows),
        'Nonadjacent Rows': nonadjacent_rows,
        'Compound Senses': len(compound_rows),
        'Compound Rows': compound_rows,
        'Long Spans': len(long_span_rows),
        'Long Span Rows': long_span_rows
    }

    return results

def main():
    # Replace with your Excel file path
    file_path = 'Final Annotation of RV Plumbing.xlsx'
    
    results = parse_discourse_excel(file_path)
    if results:
        print("\nDiscourse Analysis Results:")
        print(f"Implicit Relations: {results['Implicit Relations']}")
        print(f"Explicit Relations: {results['Explicit Relations']}")
        print(f"Nonadjacent Spans: {results['Nonadjacent Spans']} (Rows: {results['Nonadjacent Rows']})")
        print(f"Compound Senses: {results['Compound Senses']} (Rows: {results['Compound Rows']})")
        print(f"Long Spans: {results['Long Spans']} (Rows: {results['Long Span Rows']})")
        
        # Save to CSV
        summary_df = pd.DataFrame({
            'Metric': ['Implicit Relations', 'Explicit Relations', 'Nonadjacent Spans', 'Compound Senses', 'Long Spans'],
            'Count': [results['Implicit Relations'], results['Explicit Relations'], 
                      results['Nonadjacent Spans'], results['Compound Senses'], results['Long Spans']],
            'Rows': ['', '', str(results['Nonadjacent Rows']), str(results['Compound Rows']), str(results['Long Span Rows'])]
        })
        summary_df.to_csv('discourse_metrics.csv', index=False)
       # print("\nResults saved to 'discourse_metrics.csv'")
        
        # Save to Excel
        summary_df.to_excel('discourse_metrics.xlsx', index=False, engine='openpyxl')
        print("Results also saved to 'discourse_metrics.xlsx'")

if __name__ == "__main__":
    main()

Required columns ['Sense', 'Explicit?', 'Arguments'] not all found.
