# Span Classification Model Training

This notebook trains a transformer model to classify the persuasive technique used in a marked text span within Russian articles.

In [32]:
import os
import re
import glob
import pandas as pd
import torch
import numpy as np
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModel, 
    TrainingArguments, 
    Trainer, 
    DataCollatorWithPadding,
    PreTrainedModel,
    XLMRobertaPreTrainedModel
)
from transformers.modeling_outputs import SequenceClassifierOutput
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from collections import defaultdict
from typing import List, Tuple, Dict, Set
from pathlib import Path
import gc

# --- Constants ---
BASE_DATA_PATH = "../data/processed/ru/"
LABEL_FILES_PATTERN = os.path.join(BASE_DATA_PATH, "train-labels-subtask-3-spans-*.txt")
WRAPPED_ARTICLES_DIR = os.path.join(BASE_DATA_PATH, "wrapped-articles")
MODEL_NAME = "xlm-roberta-base"
MAX_LENGTH = 512
TEST_SIZE = 0.1
RANDOM_STATE = 42
OUTPUT_DIR = "./span_classification_model"
LOGGING_DIR = "./span_classification_logs"

## 1. Load and Prepare Data

Load labels from all language files, assign a unique span index within each article, and create label mappings.

In [33]:
def load_all_labels(pattern: str) -> pd.DataFrame:
    """Loads labels from all files matching the pattern and assigns span indices."""
    all_files = glob.glob(pattern)
    df_list = []
    print(f"Found label files: {all_files}")
    for f in all_files:
        try:
            df_lang = pd.read_csv(f, sep="\t", header=None, names=["article_id", "label", "start", "end"], dtype={'article_id': str})
            df_list.append(df_lang)
        except Exception as e:
            print(f"Error reading {f}: {e}")
    
    if not df_list:
        raise ValueError("No label data loaded. Check LABEL_FILES_PATTERN.")
        
    full_df = pd.concat(df_list, ignore_index=True)
    
    # Sort by article_id and original position (start offset) to ensure consistent indexing
    full_df = full_df.sort_values(by=['article_id', 'start'], ascending=[True, True])
    
    # Assign 1-based span index within each article group
    full_df['span_idx'] = full_df.groupby('article_id').cumcount() + 1
    
    print(f"Loaded {len(full_df)} labels from {len(all_files)} files.")
    print(f"Unique articles: {full_df['article_id'].nunique()}")
    print(f"Unique labels: {full_df['label'].nunique()}")
    return full_df[['article_id', 'label', 'span_idx']]

# Load the data
label_df = load_all_labels(LABEL_FILES_PATTERN)

# Create label mappings
unique_labels = sorted(label_df['label'].unique())
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}
num_labels = len(unique_labels)

print(f"\nNumber of unique labels: {num_labels}")

# Add label_id column to DataFrame
label_df['label_id'] = label_df['label'].map(label2id)

print("\nSample of processed label data:")
print(label_df.head())

Found label files: ['../data/processed/ru/train-labels-subtask-3-spans-po.txt', '../data/processed/ru/train-labels-subtask-3-spans-it.txt', '../data/processed/ru/train-labels-subtask-3-spans-en.txt', '../data/processed/ru/train-labels-subtask-3-spans-ru.txt', '../data/processed/ru/train-labels-subtask-3-spans-ge.txt', '../data/processed/ru/train-labels-subtask-3-spans-fr.txt']
Loaded 36511 labels from 6 files.
Unique articles: 1550
Unique labels: 23

Number of unique labels: 23

Sample of processed label data:
      article_id                     label  span_idx  label_id
17956  111111111                     Doubt         1         9
17957  111111111       Appeal_to_Authority         2         0
17958  111111111                Repetition         3        19
17959  111111111  Appeal_to_Fear-Prejudice         4         1
17960  111111111  Appeal_to_Fear-Prejudice         5         1


## 2. Create PyTorch Dataset

Define a dataset class to handle loading wrapped articles, replacing markers, tokenizing, and identifying the start marker position.

In [34]:
class SpanClassificationDataset(Dataset):
    def __init__(self, data: pd.DataFrame, tokenizer, wrapped_articles_dir: str, max_length: int):
        self.data = data
        self.tokenizer = tokenizer
        self.wrapped_articles_dir = wrapped_articles_dir
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        article_id = item['article_id']
        label_id = item['label_id']
        span_idx = item['span_idx']

        # --- Find the article file using glob ---
        search_pattern = os.path.join(self.wrapped_articles_dir, f"*article{article_id}.txt")
        possible_files = glob.glob(search_pattern)
        
        article_path = ""
        if len(possible_files) == 1:
            article_path = possible_files[0]
        elif len(possible_files) == 0:
             raise FileNotFoundError(f"Missing article file for {article_id}. Searched pattern: {search_pattern}")
        else:
            non_prefixed_path = os.path.join(self.wrapped_articles_dir, f"article{article_id}.txt")
            if non_prefixed_path in possible_files:
                 article_path = non_prefixed_path
                 print(f"Warning: Found multiple files for article {article_id}, using non-prefixed: {article_path}")
            else:
                 article_path = possible_files[0]
                 print(f"Warning: Found multiple files for article {article_id}: {possible_files}. Using the first one: {article_path}")

        try:
            with open(article_path, 'r', encoding='utf-8') as f:
                text = f.read()
        except Exception as e:
            print(f"Error reading article {article_id} from {article_path}: {e}")
            raise e

        start_marker = f"<<S_{span_idx}>>"
        end_marker = f"<</S_{span_idx}>>"
        processed_text = text.replace(start_marker, "", 1)
        processed_text = text.replace(end_marker, "", 1)

        processed_text = re.sub(r'<<S_\d+>>', '', processed_text)
        processed_text = re.sub(r'<</S_\d+>>', '', processed_text)

        encoding = self.tokenizer(
            processed_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label_id, dtype=torch.long)
        }

## 3. Initialize Tokenizer

Load the tokenizer.

In [35]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

## 4. Define Custom Model

Create a model that uses the hidden state of the `[CLS]` token for classification.

In [36]:
class SpanClassifierModel(XLMRobertaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.roberta = AutoModel.from_config(config)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.classifier = torch.nn.Linear(config.hidden_size, config.num_labels)
        self.class_weights = None

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        cls_embedding = outputs[0][:, 0, :]
        pooled_output = self.dropout(cls_embedding)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_weights = self.class_weights.to(logits.device) if self.class_weights is not None else None
            loss_fct = torch.nn.CrossEntropyLoss(weight=loss_weights)
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1).to(logits.device))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

## 5. Prepare Data for Training

Split the data, create Dataset instances, and define a data collator.

In [37]:
# Split data into train and validation sets
train_df, val_df = train_test_split(
    label_df, 
    test_size=TEST_SIZE, 
    random_state=RANDOM_STATE, 
    stratify=label_df['label_id']
)

print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")

# --- Calculate Class Weights from Training Data ---
class_counts = train_df['label_id'].value_counts().sort_index()
total_samples = len(train_df)
weights = total_samples / (len(class_counts) * class_counts)
class_weights_tensor = torch.tensor(weights.values, dtype=torch.float)
print("\nCalculated class weights:")
for i, w in enumerate(weights):
    print(f"  Class {i} ({id2label[i]}): {w:.4f}")
# --- End Class Weight Calculation ---

# Create Dataset instances
train_dataset = SpanClassificationDataset(train_df, tokenizer, WRAPPED_ARTICLES_DIR, MAX_LENGTH)
val_dataset = SpanClassificationDataset(val_df, tokenizer, WRAPPED_ARTICLES_DIR, MAX_LENGTH)

# Data Collator - pads sequences dynamically per batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Test the dataset and collator with one item
try:
    sample_item = train_dataset[0]
    print("\nSample dataset item:")
    print({k: v.shape if hasattr(v, 'shape') else v for k, v in sample_item.items()})
    batch = data_collator([train_dataset[i] for i in range(4)])
    print("\nSample collated batch (first 4 items):")
    print({k: v.shape if hasattr(v, 'shape') else v for k, v in batch.items()})
except Exception as e:
    print(f"Error testing dataset/collator: {e}")
    raise e

Train set size: 32859
Validation set size: 3652

Calculated class weights:
  Class 0 (Appeal_to_Authority): 2.1646
  Class 1 (Appeal_to_Fear-Prejudice): 0.9205
  Class 2 (Appeal_to_Hypocrisy): 1.6143
  Class 3 (Appeal_to_Popularity): 4.3824
  Class 4 (Appeal_to_Time): 9.0421
  Class 5 (Appeal_to_Values): 2.2288
  Class 6 (Causal_Oversimplification): 2.6704
  Class 7 (Consequential_Oversimplification): 4.2268
  Class 8 (Conversation_Killer): 1.5927
  Class 9 (Doubt): 0.3335
  Class 10 (Exaggeration-Minimisation): 0.8907
  Class 11 (False_Dilemma-No_Choice): 3.2396
  Class 12 (Flag_Waving): 2.0037
  Class 13 (Guilt_by_Association): 2.3692
  Class 14 (Loaded_Language): 0.1725
  Class 15 (Name_Calling-Labeling): 0.2359
  Class 16 (Obfuscation-Vagueness-Confusion): 4.2902
  Class 17 (Questioning_the_Reputation): 0.6803
  Class 18 (Red_Herring): 7.5590
  Class 19 (Repetition): 1.2621
  Class 20 (Slogans): 2.0439
  Class 21 (Straw_Man): 4.9095
  Class 22 (Whataboutism): 10.2781

Sample datase

## 6. Configure Training

Define metrics function and training arguments.

In [38]:
from sklearn.metrics import classification_report

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    f1_macro = f1_score(labels, predictions, average='macro', zero_division=0)
    f1_per_class = f1_score(labels, predictions, average=None, zero_division=0)
    
    metrics = {
        'accuracy': acc,
        'f1_macro': f1_macro,
    }
    for i, f1 in enumerate(f1_per_class):
        if i in id2label:
             metrics[f'f1_{id2label[i]}'] = f1
        else:
             metrics[f'f1_class_{i}'] = f1
    return metrics

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=5,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=LOGGING_DIR,
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    report_to="tensorboard",
    remove_unused_columns=True,
    fp16=torch.cuda.is_available(),
    lr_scheduler_type='cosine_with_restarts',
)

## 7. Initialize Model and Trainer

In [39]:
# Ensure model config has correct label mappings
model = SpanClassifierModel.from_pretrained(
    MODEL_NAME, 
    num_labels=num_labels, 
    label2id=label2id, 
    id2label=id2label
)

# Check if GPU is available and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- Assign Class Weights to Model ---
model.class_weights = class_weights_tensor.to(device)
print("Assigned class weights to the model.")
# --- End Assign Class Weights ---

# Move model to the correct device
model.to(device)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Some weights of SpanClassifierModel were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
Assigned class weights to the model.


  trainer = Trainer(


## 8. Train the Model

In [None]:
# Before starting training, clear cache if needed
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    
print("Starting training...")
train_result = trainer.train()
print("Training finished.")

# Save training metrics
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)

# Save the final model and tokenizer
trainer.save_model() 
trainer.save_state()

Starting training...


Epoch,Training Loss,Validation Loss


## 9. Evaluate the Model

Evaluate the best model on the validation set.

In [None]:
print("Evaluating the best model on the validation set...")
eval_metrics = trainer.evaluate()

print("Evaluation finished.")
trainer.log_metrics("eval", eval_metrics)
trainer.save_metrics("eval", eval_metrics)

print("\nEvaluation Metrics:")
print(eval_metrics)

Evaluating the best model on the validation set...


Evaluation finished.
***** eval metrics *****
  epoch                                    =        5.0
  eval_accuracy                            =     0.1336
  eval_f1_Appeal_to_Authority              =     0.1618
  eval_f1_Appeal_to_Fear-Prejudice         =     0.1086
  eval_f1_Appeal_to_Hypocrisy              =     0.1322
  eval_f1_Appeal_to_Popularity             =     0.1029
  eval_f1_Appeal_to_Time                   =        0.0
  eval_f1_Appeal_to_Values                 =     0.0991
  eval_f1_Causal_Oversimplification        =      0.098
  eval_f1_Consequential_Oversimplification =     0.0606
  eval_f1_Conversation_Killer              =        0.0
  eval_f1_Doubt                            =       0.25
  eval_f1_Exaggeration-Minimisation        =     0.0996
  eval_f1_False_Dilemma-No_Choice          =     0.0388
  eval_f1_Flag_Waving                      =     0.0976
  eval_f1_Guilt_by_Association             =     0.1226
  eval_f1_Loaded_Language                  =      0.054
  

## Clear Model from Memory

Delete the model and trainer objects and clear the GPU cache to free up memory.

In [None]:
import torch
import gc
import numpy as np

print("Clearing model and trainer from memory...")

# Check if variables exist before deleting
if 'model' in locals() or 'model' in globals():
    del model
    print("  Deleted 'model' variable.")
if 'trainer' in locals() or 'trainer' in globals():
    del trainer 
    print("  Deleted 'trainer' variable.")
if 'train_dataset' in locals() or 'train_dataset' in globals():
    del train_dataset
    print("  Deleted 'train_dataset'.")
if 'val_dataset' in locals() or 'val_dataset' in globals():
    del val_dataset
    print("  Deleted 'val_dataset'.")
if 'label_df' in locals() or 'label_df' in globals():
    del label_df
    print("  Deleted 'label_df'.")

# Run Python's garbage collector
gc.collect()
print("  Ran garbage collector.")

# Clear PyTorch's CUDA cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("  Cleared PyTorch CUDA cache.")
else:
    print("  CUDA not available, skipping cache clearing.")

print("Memory clearing process finished.")

Clearing model and trainer from memory...
  Deleted 'model' variable.
  Deleted 'trainer' variable.
  Deleted 'train_dataset'.
  Deleted 'val_dataset'.
  Deleted 'label_df'.
  Ran garbage collector.
  Cleared PyTorch CUDA cache.
Memory clearing process finished.


---
## Obsolete Code (Commented Out)

The following cells contained previous data loading/analysis code which is no longer needed for the current span classification task.

In [None]:
# --- OBSOLETE CODE ---
# Original data loading utilities - replaced by new loading logic above
# import os, re
# from collections import defaultdict
# from typing import List, Tuple, Dict, Set
# from pathlib import Path # Added import
# 
# def read_article(path: str) -> str:
#     with open(path, "rb") as f:
#         raw = f.read()
#     return raw.decode("utf-8", "ignore")
# 
# def load_span_labels(label_file: str
#     ) -> Dict[str, List[Tuple[str, int, int]]]:
#     
#     spans = defaultdict(list)
#     with open(label_file, encoding="utf-8") as f:
#         for line in f:
#             # Handle potential empty lines or lines with incorrect format
#             parts = line.rstrip().split("\t")
#             if len(parts) == 4:
#                 art_id, lab, s, e = parts
#                 try:
#                     spans[art_id].append((lab, int(s), int(e)))
#                 except ValueError:
#                     print(f"Warning: Skipping malformed line in {label_file}: {line.rstrip()}")
#             elif line.strip(): # Print warning for non-empty, but malformed lines
#                  print(f"Warning: Skipping malformed line in {label_file}: {line.rstrip()}")
#     return spans
# 
# # Added function to extract base classes from the span file
# def get_base_classes_from_spans(label_file: str) -> Set[str]:
#     base_classes = set()
#     with open(label_file, encoding="utf-8") as f:
#         for line in f:
#             parts = line.rstrip().split("\t")
#             if len(parts) == 4:
#                 _, lab, _, _ = parts
#                 base_classes.add(lab)
#             elif line.strip():
#                  # Warnings handled in load_span_labels, no need to repeat here
#                  pass
#     return base_classes

In [None]:
# --- OBSOLETE CODE ---
# Original span printing utility
# def print_span(article_number, start_offset, end_offset, lang="en", base_path="data/raw"):
#     article_path = f"{base_path}/{lang}/train-articles-subtask-3/{lang}_article{article_number}.txt"
#     with open(article_path, "rb") as f:
#         raw = f.read()
#     text = raw.decode("utf-8", errors="ignore")
#     span = text[start_offset:end_offset]
#     print(span)
#     
# import pandas as pd
# 
# df = pd.read_csv("../data/processed/ru/train-labels-subtask-3-spans-en.txt", sep="\t", header=None, names=["article_id", "label", "start", "end"])
# 
# 


In [None]:
# --- OBSOLETE CODE ---
# Original span display functions
# import re
# # Function to display spans for a specific article
# def show_article_spans(article_id, lang="en", base_path="../data/processed/ru"):
#     # Get all spans for this specific article
#     article_spans = df[df['article_id'] == article_id].sort_values(by='start')
#     
#     # Check if we found any spans
#     if len(article_spans) == 0:
#         print(f"No spans found for article {article_id}")
#         return
#     
#     r = []
#     # Try to get the original article text
#     article_path = f"{base_path}/unwrapped-articles/{lang}_article{article_id}.txt"
#     try:
#         with open(article_path, "rb") as f:
#             raw = f.read()
#         article_text = raw.decode("utf-8", errors="ignore")
#         
#         print(f"\n{'='*60}")
#         print(f"Article ID: {article_id}")
#         print(f"Total spans: {len(article_spans)}")
#         print(f"{'='*60}")
#         
#         # Display each span
#         for index, row in article_spans.iterrows():
#             label = row['label']
#             start_offset = row['start']
#             end_offset = row['end']
#             span_text = article_text[start_offset:end_offset]
#             r.append((label, start_offset, end_offset, span_text))
#             
#             print(f"\nSpan {index}")
#             print(f"Label: {label}")
#             print(f"Position: {start_offset} to {end_offset}")
#             print(f"Text: '{span_text}'")
# 
#         return r
#     except Exception as e:
#         print(f"Error accessing article {article_id}: {e}")
# 
# # Function to show text between span markers in wrapped articles
# def show_wrapped_spans(article_id, lang="en", base_path="../data/processed/ru"):
#     # Try to get the wrapped article text
#     article_path = f"{base_path}/wrapped-articles/{lang}_article{article_id}.txt"
#     r = []
#     try:
#         with open(article_path, "r", encoding="utf-8") as f:
#             wrapped_text = f.read()
#         
#         print(f"\n{'='*60}")
#         print(f"Wrapped Article ID: {article_id}")
#         print(f"{'='*60}")
#         
#         # Find all span markers using regex
#         spans = re.finditer(r'<<S_(\d+)>>(.*?)<</S_\1>>', wrapped_text, re.DOTALL)
#         
#         found_spans = False
#         for i, span in enumerate(spans):
#             found_spans = True
#             span_number = span.group(1)
#             span_text = span.group(2)
#             print(f"\nMarked span #{span_number}")
#             print(f"Text: '{span_text}'")
#             r.append((span_number, span_text))
#         
#         if not found_spans:
#             print(f"No marked spans found in wrapped article {article_id}")
#         return r
#     except Exception as e:
#         print(f"Error accessing wrapped article {article_id}: {e}")
# 
# # Example usage - show spans for a sample article
# sample_article_id = df['article_id'].sample(1).iloc[0]
# article_spans = show_article_spans(sample_article_id)
# 
# # Show the wrapped version of the same article
# wrapped_spans = show_wrapped_spans(sample_article_id)
# 
# 


In [None]:
# --- OBSOLETE CODE ---
# Original span comparison function
# import difflib
# 
# def compare_article_and_wrapped_span_texts(article_id, lang="en", base_path="../data/processed/ru", min_ratio=0.8):
#     """
#     Compare the span texts from the article (offset-based) and wrapped (marker-based) versions,
#     reporting the closest matches and their differences. Order is not assumed to be the same.
#     """
#     # Get span texts from both sources
#     article_spans = show_article_spans(article_id, lang=lang, base_path=base_path)
#     wrapped_spans = show_wrapped_spans(article_id, lang=lang, base_path=base_path)
# 
#     if not article_spans or not wrapped_spans:
#         print("No spans found in one or both sources.")
#         return
# 
#     # Extract just the span texts
#     article_texts = [span[3].strip() for span in article_spans]
#     wrapped_texts = [span[1].strip() for span in wrapped_spans]
# 
#     # For each article span, find the best matching wrapped span (by similarity ratio)
#     print(f"\n{'Article Span Index':<18} {'Best Match Index':<16} {'Similarity':<10} {'Article Span Text':<40} {'Wrapped Span Text'}")
#     print("-" * 120)
#     used_wrapped = set()
#     for i, art_text in enumerate(article_texts):
#         # Find the best match in wrapped_texts
#         best_ratio = 0
#         best_j = -1
#         for j, wrap_text in enumerate(wrapped_texts):
#             if j in used_wrapped:
#                 continue
#             ratio = difflib.SequenceMatcher(None, art_text, wrap_text).ratio()
#             if ratio > best_ratio:
#                 best_ratio = ratio
#                 best_j = j
#         # Optionally, only consider matches above a threshold
#         if best_ratio >= min_ratio and best_j != -1:
#             used_wrapped.add(best_j)
#             print(f"{i:<18} {best_j:<16} {best_ratio:.2f} {art_text[:40]:<40} {wrapped_texts[best_j][:40]}")
#             # Show diff if not identical
#             if best_ratio < 1.0:
#                 diff = difflib.unified_diff(
#                     art_text.splitlines(), wrapped_texts[best_j].splitlines(),
#                     fromfile='article_span', tofile='wrapped_span', lineterm=''
#                 )
#                 print('\n'.join(diff))
#         else:
#             print(f"{i:<18} {'-':<16} {'0.00':<10} {art_text[:40]:<40} {'NO MATCH'}")
# 
#     # Optionally, report wrapped spans that were not matched
#     unmatched = [j for j in range(len(wrapped_texts)) if j not in used_wrapped]
#     if unmatched:
#         print("\nWrapped spans not matched to any article span:")
#         for j in unmatched:
#             print(f"Wrapped index {j}: {wrapped_texts[j][:60]}")
# 
# # Example usage:
# compare_article_and_wrapped_span_texts(sample_article_id)