1. Data and Libraries

In [None]:
# Load EUSpeech_translated.csv into `df`; the later cells read its "text.en"
# column by default.
from pandas import read_csv
df = read_csv("/content/EUSpeech_translated.csv")

In [None]:
# @title
import re
import pandas as pd
import spacy
#import coreferee
import concurrent.futures
from tqdm import tqdm  # For progress display

2. Pre-Processing Functions

2.1 Tokenization and key sentence extraction

In [None]:
import re
import pandas as pd
import nltk

# Ensure required NLTK resources are available.
# The lookup raises LookupError when the resource cannot be found locally;
# only in that case is the "punkt_tab" package downloaded.
try:
    nltk.data.find("tokenizers/punkt_tab/english")
except LookupError:
    nltk.download("punkt_tab")

import re
from nltk.tokenize import word_tokenize, sent_tokenize


# ----------------------------
# Function to get tokenized words and sentences
# ----------------------------
def tokenize_text(df, text_col="text.en", keywords=None):
    """
    Keep the rows whose text mentions a keyword and enrich them with tokens,
    sentences and keyword context.

    Matching is case-insensitive and restricted to whole words. Rows whose
    text is not a string (e.g. NaN) never match and are dropped.

    Parameters:
        df (pd.DataFrame): Input DataFrame with a text column.
        text_col (str): Name of the column containing text (default "text.en").
        keywords (list of str): Keywords to look for. If None, defaults to:
                                ["Armenia", "Azerbaijan", "Belarus", "Georgia", "Moldova", "Ukraine"].
                                An empty list matches nothing, so the result has no rows.

    Returns:
        pd.DataFrame: A filtered copy of the DataFrame (the input is not modified)
        with new columns:
            - 'matched_keyword': Comma-separated, sorted, de-duplicated matches. The
                                 casing found in the text is preserved, so "Ukraine"
                                 and "UKRAINE" are listed separately.
            - 'tokenized': List of word tokens from the text (NLTK word_tokenize).
            - 'sentences': List of sentences from the text (NLTK sent_tokenize).
            - 'context': List of context strings, one per sentence that mentions a
                         keyword: that sentence joined with its preceding and
                         following sentence, if available. Windows of neighbouring
                         matches may overlap.

    Raises:
        KeyError: If `text_col` is not a column of `df`.
    """
    # Default keywords if not provided
    if keywords is None:
        keywords = ["Armenia", "Azerbaijan", "Belarus", "Georgia", "Moldova", "Ukraine"]
    keywords = list(keywords)

    if keywords:
        regex_pattern = r"\b(?:{})\b".format("|".join(map(re.escape, keywords)))
    else:
        # An empty alternation would match at every word boundary and keep
        # every row; "(?!)" is a pattern that can never match.
        regex_pattern = r"(?!)"
    pattern = re.compile(regex_pattern, re.IGNORECASE)

    def mentions_keyword(text):
        return isinstance(text, str) and pattern.search(text) is not None

    # astype(bool) keeps the mask boolean even when `df` has no rows; an empty
    # object-dtype mask would otherwise be read as a list of column labels.
    keep = df[text_col].apply(mentions_keyword).astype(bool)
    df_filtered = df[keep].copy()

    # Add a new column with the matched keyword(s)
    def extract_keywords(text):
        if not isinstance(text, str):
            return None
        matches = pattern.findall(text)
        return ", ".join(sorted(set(matches))) if matches else None

    df_filtered["matched_keyword"] = df_filtered[text_col].apply(extract_keywords)

    # Tokenize the text into words and store in a new column 'tokenized'
    df_filtered['tokenized'] = df_filtered[text_col].apply(lambda x: word_tokenize(x) if isinstance(x, str) else [])

    # Split the text into sentences and store in a new column 'sentences'
    df_filtered['sentences'] = df_filtered[text_col].apply(lambda x: sent_tokenize(x) if isinstance(x, str) else [])

    # Extract context for each row based on its list of sentences
    def extract_context(sentences):
        contexts = []
        for i, sentence in enumerate(sentences):
            # The compiled pattern is the alternation of all keywords, so one
            # search replaces compiling and running one regex per keyword.
            if pattern.search(sentence):
                start = max(i - 1, 0)
                end = min(i + 2, len(sentences))
                contexts.append(" ".join(sentences[start:end]))
        return contexts

    df_filtered['context'] = df_filtered['sentences'].apply(extract_context)

    return df_filtered




2.2 Coreference Resolution

In [None]:
!pip install https://github.com/explosion/spacy-experimental/releases/download/v0.6.0/en_coreference_web_trf-3.4.0a0-py3-none-any.whl#egg=en_coreference_web_trf


In [None]:
import spacy
!pip show spacy
import spacy_experimental
# Load the coreference pipeline installed by the previous cell; `nlp` is the
# module-level pipeline that append_corefs() calls.
nlp = spacy.load("en_coreference_web_trf")


In [None]:
def append_corefs(text):
    """
    Returns a version of the text where for each non-main coreferent mention,
    the main mention is appended in square brackets.

    For example, if a mention is "he" and the main mention for that coreference chain is "Fidel Castro",
    the function will change "he" to "he [Fidel Castro]".

    Args:
        text (str or spacy.tokens.Doc): Raw text, which is processed with the
            module-level `nlp` pipeline, or a document that was already processed
            with a coreference model that sets doc.spans (used as is).

    Returns:
        str: The text with coreference annotations appended. Text without any
        "coref_clusters*" span group is returned unchanged.
    """
    if isinstance(text, str):
        doc = nlp(text)
        source = text
    else:
        # Already a processed document: do not run the pipeline a second time.
        doc = text
        source = doc.text

    insertions = []

    # Iterate over each coreference chain in doc.spans.
    for chain in doc.spans:
        # Process only chains that begin with the expected prefix.
        if not chain.startswith("coref_clusters"):
            continue

        mentions = list(doc.spans[chain])
        if not mentions:
            # Nothing to annotate in an empty chain (and no main mention to read).
            continue

        # The first mention is the main reference; every later mention gets it
        # appended right after the mention's last character.
        main_text = mentions[0].text
        insertions.extend((mention.end_char, f" [{main_text}]") for mention in mentions[1:])

    # Assemble the result in one left-to-right pass. sorted() is stable, so
    # annotations that share a position keep the order in which they were found.
    pieces = []
    cursor = 0
    for pos, annotation in sorted(insertions, key=lambda item: item[0]):
        pieces.append(source[cursor:pos])
        pieces.append(annotation)
        cursor = pos
    pieces.append(source[cursor:])

    return "".join(pieces)

# Example usage:
import spacy
import spacy_experimental

# NOTE(review): this loads "en_coreference_web_trf" again although an earlier
# cell already assigns the same pipeline to `nlp` — presumably so this cell can
# run on its own; confirm before removing.
nlp = spacy.load("en_coreference_web_trf")
text = """Fidel Castro led a communist revolution that toppled the Cuban government in 1959, after which he declared himself prime minister. He held the title until 1976, when it was abolished and he became head of the Communist Party and president of the council of state and the council of ministers. With his health failing, Castro handed power to his brother, Raúl, in 2006. He died in 2016."""
resolved = append_corefs(text)
# Print the input and the annotated output one after the other for comparison.
print("Original text:")
print(text)
print("\nText with coreferences appended:")
print(resolved)


3. Running Pre-Processing

3.1 Define Pre-Processing Functions

In [None]:

# ----------------------------
# Define the processing function with debug logging
# ----------------------------

def _text_preview(df, text_col="text.en", width=50):
    """Return the first `width` characters of the first text in `df` for log lines."""
    try:
        first_text = df[text_col].iloc[0]
    except (KeyError, IndexError, AttributeError, TypeError):
        # Not a frame with that column, or no rows: fall back to its string form.
        return str(df)[:width]
    return str(first_text)[:width]


def debug_process_text(df, resolve_corefs=False):
    """
    Run tokenize_text on `df` with debug logging before and after.

    Args:
        df (pd.DataFrame): Rows to process; the main workflow passes one-row frames.
        resolve_corefs (bool): When True, additionally store the output of
            append_corefs for each "text.en" string in a 'resolved_text' column.
            Defaults to False, which keeps the tokenization-only behaviour.

    Returns:
        pd.DataFrame: The enriched DataFrame returned by tokenize_text.

    Raises:
        Exception: Any error from processing is logged and then re-raised unchanged.
    """
    # Slicing the DataFrame itself would select rows, not characters, so the
    # "first 50 chars" preview is taken from the text column instead.
    preview = _text_preview(df)
    try:
        print(f"[DEBUG] Processing text (first 50 chars): {preview}...")
        df_filtered = tokenize_text(df)
        if resolve_corefs:
            # append_corefs runs the coreference pipeline itself, so it is
            # given the raw text.
            df_filtered["resolved_text"] = df_filtered["text.en"].apply(
                lambda t: append_corefs(t) if isinstance(t, str) else t
            )
        print(f"[DEBUG] Finished processing text (first 50 chars): {preview}...")
        return df_filtered
    except Exception as e:
        print(f"[ERROR] Exception while processing text (first 50 chars): {preview}... : {e}")
        raise




3.2 Run the Parallelized Pre-Processing

In [None]:
# ----------------------------
# MAIN WORKFLOW
# ----------------------------
# Process every row of df (expected to have a "text.en" column) in a thread pool.
# NOTE(review): tokenization looks like CPU-bound Python work, so threads may
# not give a speed-up — measure before tuning max_workers.
results_by_row = {}
max_workers = 30

with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Submit one task per row and remember which row each future belongs to.
    futures = {
        executor.submit(debug_process_text, df.iloc[[row]]): row
        for row in range(len(df))
    }
    print(f"[DEBUG] Submitted {len(futures)} tasks to the executor.")

    for future in tqdm(concurrent.futures.as_completed(futures),
                       total=len(futures),
                       desc="Processing texts"):
        try:
            # Each result is an enriched DataFrame for one row.
            results_by_row[futures[future]] = future.result()
        except Exception as exc:
            # Best effort: a failing row is reported and left out of the output.
            print(f"[ERROR] Error processing a text: {exc}")

# Futures finish in arbitrary order; restore the input row order so the output
# is the same on every run.
results_dfs = [results_by_row[row] for row in sorted(results_by_row)]

# Concatenate all single-row DataFrames into one final DataFrame.
if results_dfs:
    df_filtered_all = pd.concat(results_dfs, ignore_index=True)
else:
    # pd.concat raises on an empty list (no input rows, or every task failed).
    print("[DEBUG] No rows were processed successfully; the result is empty.")
    df_filtered_all = pd.DataFrame()
print("[DEBUG] Final enriched DataFrame:")
print(df_filtered_all.head())

In [None]:
# Write the enriched rows to output.csv in the working directory, without the
# DataFrame index column.
df_filtered_all.to_csv('output.csv', index=False)