1. Data and Libraries

In [None]:
# Load the translated EUSpeech CSV into `df`; the later cells read their
# text column from this frame.
from pandas import read_csv
df = read_csv("/content/EUSpeech_translated.csv")

In [None]:
# @title
import re
import pandas as pd
import spacy
#import coreferee
import concurrent.futures
from tqdm import tqdm  # For progress display

2. Pre-Processing Functions

2.1 Tokenization and key sentence extraction

In [None]:
import re
import pandas as pd
import nltk

# Ensure required NLTK resources are available: look up the punkt_tab
# tokenizer data first and download it only when the lookup fails.
try:
    nltk.data.find("tokenizers/punkt_tab/english")
except LookupError:
    nltk.download("punkt_tab")

import re
from nltk.tokenize import word_tokenize, sent_tokenize


# ----------------------------
# Function to get tokenized words and sentences
# ----------------------------
def tokenize_text(df, text_col="text.en", keywords=None):
    """
    Keep the rows whose text mentions a keyword and add tokenized views of them.

    Rows are kept when their text contains any keyword as a whole word
    (case-insensitive). Cells that are not strings (for example NaN produced
    by missing values) cannot contain a keyword and are dropped instead of
    raising. For the remaining rows the text is tokenized into words, split
    into sentences, and every sentence mentioning a keyword is returned
    together with its neighbouring sentences.

    Parameters:
        df (pd.DataFrame): Input DataFrame with a text column.
        text_col (str): Name of the column containing text (default "text.en").
        keywords (list of str): Keywords to look for. A single string is
                                treated as one keyword. An empty list matches
                                no row. If None, defaults to the EU Eastern
                                Partnership countries:
                                ["Armenia", "Azerbaijan", "Belarus", "Georgia", "Moldova", "Ukraine"].

    Returns:
        pd.DataFrame: A copy of the matching rows with new columns:
            - 'tokenized': List of tokens from the text.
            - 'sentences': List of sentences from the text.
            - 'context': List of context strings (each is a snippet including the matching sentence
                         plus its preceding and following sentences, if available).
    """
    if keywords is None:
        keywords = ["Armenia", "Azerbaijan", "Belarus", "Georgia", "Moldova", "Ukraine"]
    elif isinstance(keywords, str):
        # A bare string would otherwise be iterated character by character.
        keywords = [keywords]
    else:
        keywords = list(keywords)

    # One compiled, case-insensitive alternation serves both the row filter and
    # the sentence filter. With no keywords there is deliberately no pattern:
    # an empty alternation would match at every word boundary.
    pattern = None
    if keywords:
        regex_pattern = r"\b(?:{})\b".format("|".join(map(re.escape, keywords)))
        pattern = re.compile(regex_pattern, re.IGNORECASE)

    def has_keyword(text):
        """Return True when `text` is a string mentioning at least one keyword."""
        return (
            pattern is not None
            and isinstance(text, str)
            and pattern.search(text) is not None
        )

    def extract_context(sentences):
        """Return each keyword sentence joined with its previous and next sentence."""
        contexts = []
        for i, sentence in enumerate(sentences):
            if has_keyword(sentence):
                # Clamp the window so the first and last sentences keep only
                # the neighbour that actually exists.
                start = max(i - 1, 0)
                end = min(i + 2, len(sentences))
                contexts.append(" ".join(sentences[start:end]))
        return contexts

    # Force a boolean mask so that an empty frame is still filtered by rows.
    mask = df[text_col].apply(has_keyword).astype(bool)
    df_filtered = df[mask].copy()

    # Tokenize the text into words and store in a new column 'tokenized'
    df_filtered['tokenized'] = df_filtered[text_col].apply(word_tokenize)

    # Split the text into sentences and store in a new column 'sentences'
    df_filtered['sentences'] = df_filtered[text_col].apply(sent_tokenize)

    # Apply context extraction for each row based on its list of sentences
    df_filtered['context'] = df_filtered['sentences'].apply(extract_context)

    return df_filtered


2.2 Coreference Resolution

In [None]:
!pip install https://github.com/explosion/spacy-experimental/releases/download/v0.6.0/en_coreference_web_trf-3.4.0a0-py3-none-any.whl#egg=en_coreference_web_trf


In [None]:
import spacy
!pip show spacy
import spacy_experimental
# Load the coreference pipeline once; the functions below use it as the global `nlp`.
nlp = spacy.load("en_coreference_web_trf")


In [None]:
def append_corefs(doc):
    """
    Annotate every repeated coreferent mention with the head mention of its chain.

    The first span of each coreference chain is taken as the main mention.
    Every later span of that chain gets the main mention appended in square
    brackets, e.g. "he" becomes "he [Fidel Castro]" when the chain starts
    with "Fidel Castro". Text outside the mentions is left untouched.

    Args:
        doc (spacy.tokens.Doc): A spaCy document processed with a coreference
            model that stores its chains in doc.spans under keys starting
            with "coref_clusters".

    Returns:
        str: The document text with the bracketed annotations inserted.
    """
    source = doc.text

    # Gather (character offset, annotation) pairs for all non-head mentions.
    annotations = []
    for key in doc.spans:
        # Only span groups produced by the coreference component are relevant.
        if not key.startswith("coref_clusters"):
            continue

        mentions = doc.spans[key]
        head_text = mentions[0].text
        annotations.extend(
            (mention.end_char, f" [{head_text}]")
            for position, mention in enumerate(mentions)
            if position > 0
        )

    # Offsets refer to the untouched text, so walk them left to right and
    # stitch the original slices and the annotations together. The sort is
    # stable, which keeps annotations sharing an offset in collection order.
    annotations.sort(key=lambda item: item[0])
    pieces = []
    cursor = 0
    for end_char, note in annotations:
        pieces.append(source[cursor:end_char])
        pieces.append(note)
        cursor = end_char
    pieces.append(source[cursor:])

    return "".join(pieces)

# Example usage: run the coreference pipeline on a short biography and print
# the text before and after annotation.
import spacy
import spacy_experimental

# NOTE(review): the same model is already loaded into `nlp` in the cell above;
# this second load rebinds `nlp` — confirm whether it is still needed.
nlp = spacy.load("en_coreference_web_trf")
text = """Fidel Castro led a communist revolution that toppled the Cuban government in 1959, after which he declared himself prime minister. He held the title until 1976, when it was abolished and he became head of the Communist Party and president of the council of state and the council of ministers. With his health failing, Castro handed power to his brother, Raúl, in 2006. He died in 2016."""
doc = nlp(text)
resolved = append_corefs(doc)
print("Original text:")
print(doc.text)
print("\nText with coreferences appended:")
print(resolved)


3. Running Pre-Processing

3.1 Define Pre-Processing Functions

In [None]:

# ----------------------------
# Define processing function for each text entry
# ----------------------------
def process_text(text):
    """
    Process a single text with the module-level ``nlp`` pipeline.

    The text is run through ``nlp`` exactly once. Every non-empty sentence of
    the resulting document yields one record with its named entities and the
    first subject / object token, and the whole document is handed to
    ``append_corefs`` to produce the coreference-annotated text.

    Args:
        text (str): Raw text to analyse.

    Returns:
        tuple: (sentence_records, resolved_text)

        - sentence_records: A list of sentence-level dictionaries with keys:
            "sentence", "entities", "subject", and "object". "entities" is a
            list of (text, label) tuples; "subject" and "object" hold the text
            of the first matching token, or None when there is none.
        - resolved_text: The entire text with coreference annotations appended.
    """
    # Process the original text
    doc = nlp(text)

    # NOTE(review): entities and dependency labels are only filled in when the
    # loaded pipeline contains NER and parser components; verify that
    # "en_coreference_web_trf" provides them, otherwise these fields stay empty.
    sentence_records = []
    for sent in doc.sents:
        sent_text = sent.text.strip()
        if not sent_text:
            continue

        # Read the annotations from the sentence span of the already processed
        # document instead of pushing each sentence through the pipeline again.
        entities = [(ent.text, ent.label_) for ent in sent.ents]

        # First token carrying a subject / object dependency label, if any.
        subject = next(
            (token.text for token in sent if token.dep_ in ("nsubj", "nsubjpass")),
            None,
        )
        obj = next(
            (token.text for token in sent if token.dep_ in ("dobj", "pobj")),
            None,
        )

        sentence_records.append({
            "sentence": sent_text,
            "entities": entities,
            "subject": subject,
            "object": obj
        })

    # Apply coreference resolution using append_corefs
    resolved_text = append_corefs(doc)

    return sentence_records, resolved_text



3.2 Run the Parallelized Pre-Processing

In [None]:

# ----------------------------
# Define processing function for each text entry
# ----------------------------
def process_text(text):
    """
    Process a single text with the module-level ``nlp`` pipeline.

    The text is run through ``nlp`` exactly once. Every non-empty sentence of
    the resulting document yields one record with its named entities and the
    first subject / object token, and the whole document is handed to
    ``append_corefs`` to produce the coreference-annotated text.

    Args:
        text (str): Raw text to analyse.

    Returns:
        tuple: (sentence_records, resolved_text)

        - sentence_records: A list of sentence-level dictionaries with keys:
            "sentence", "entities", "subject", and "object". "entities" is a
            list of (text, label) tuples; "subject" and "object" hold the text
            of the first matching token, or None when there is none.
        - resolved_text: The entire text with coreference annotations appended.
    """
    # NOTE(review): this cell redefines process_text, which section 3.1 also
    # defines; whichever cell runs last provides the function that is used.

    # Process the original text
    doc = nlp(text)

    # NOTE(review): entities and dependency labels are only filled in when the
    # loaded pipeline contains NER and parser components; verify that
    # "en_coreference_web_trf" provides them, otherwise these fields stay empty.
    sentence_records = []
    for sent in doc.sents:
        sent_text = sent.text.strip()
        if not sent_text:
            continue

        # Read the annotations from the sentence span of the already processed
        # document instead of pushing each sentence through the pipeline again.
        entities = [(ent.text, ent.label_) for ent in sent.ents]

        # First token carrying a subject / object dependency label, if any.
        subject = next(
            (token.text for token in sent if token.dep_ in ("nsubj", "nsubjpass")),
            None,
        )
        obj = next(
            (token.text for token in sent if token.dep_ in ("dobj", "pobj")),
            None,
        )

        sentence_records.append({
            "sentence": sent_text,
            "entities": entities,
            "subject": subject,
            "object": obj
        })

    # Apply coreference resolution using append_corefs
    resolved_text = append_corefs(doc)

    return sentence_records, resolved_text

# ----------------------------
# Create a DataFrame of sentence-level records
# ----------------------------
# NOTE(review): `results_sentences` is not assigned anywhere in the visible
# notebook; presumably a driver cell maps process_text over the texts and
# collects the sentence records — confirm it runs before this cell.

df_sentences = pd.DataFrame(results_sentences)
# Write the records to CSV without the index column, then preview the first rows.
df_sentences.to_csv('df_sentences_test_EUSpeech.csv', index=False)
print(df_sentences.head())