In [21]:
from tqdm import tqdm
from conllu import parse_incr
import os, csv, re

In [None]:
wh_words = {"what", "who", "whom", "whose", "which", "where", "when", "why", "how"}
allowed_clause_deprels = {"ccomp", "xcomp", "csubj", "obj"}
RELATIVIZERS = {"who", "whom", "whose", "which", "when", "where", "why"}

In [30]:
def is_embedded_wh(tokenlist):
    """
    Detects embedded wh-questions in a CoNLL-U formatted sentence.

    This version is embedding verb agnostic, meaning it identifies structures
    based on syntactic patterns rather than a predefined list of embedding verbs.
    It checks if a wh-clause (identified by allowed_clause_deprels) is
    governed by any verb.

    Args:
        tokenlist (list): A list of dictionaries, where each dictionary represents
                          a token with CoNLL-U fields like "id", "form", "lemma",
                          "head", "deprel", "upos".
        wh_words (set): A set of wh-words (e.g., "what", "who").
        allowed_clause_deprels (set): A set of dependency relations that an
                                      embedded wh-clause might have relative to its
                                      governing (embedding) verb.

    Returns:
        bool: True if an embedded wh-question matching the criteria is found,
              False otherwise.
    """
    if not tokenlist:
        return False

    token_map = {t["id"]: t for t in tokenlist if isinstance(t, dict) and "id" in t}

    for wh_token_index, token in enumerate(tokenlist):
        
        if not isinstance(token, dict):
            continue

        # Check wh-word
        token_form = token.get("form", "").lower()
        if token_form not in wh_words:
            continue

        # Find wh-word root
        current_head_id = token.get("head")
        wh_clause_verb = None
        temp_current_id = current_head_id
        while temp_current_id and temp_current_id in token_map:
            candidate_node = token_map[temp_current_id]
            if candidate_node.get("upos") in {"VERB", "AUX"}:
                wh_clause_verb = candidate_node
                break
            temp_current_id = candidate_node.get("head")

        if not wh_clause_verb:
            continue

        # Make sure deprel with head is a clausal complement, subject, or object.
        wh_clause_deprel = wh_clause_verb.get("deprel", "")
        if wh_clause_deprel not in allowed_clause_deprels:
            continue

        # Find the embedding token, make sure it is a verb
        embedding_verb_id = wh_clause_verb.get("head")
        if not embedding_verb_id or embedding_verb_id not in token_map:
            continue
        
        embedding_verb_token = token_map[embedding_verb_id]
        if embedding_verb_token.get("upos") not in {"VERB", "AUX"}:
            continue

        return True

    return False

In [31]:
def is_matrix_wh(tokenlist):
    """
    Determines if a sentence contains a wh-question in the matrix clause.
    Args:
        tokenlist: A list of token dictionaries from a CoNLL-U parse
    Returns:
        bool: True if the sentence contains a matrix wh-question, False otherwise
    """
    if not tokenlist:
        return False
    
    # Check if the sentence has a question mark
    has_question_mark = any(
        isinstance(token, dict) and token.get("form") == "?" for token in tokenlist
    )
    if not has_question_mark:
        return False

    token_map = {t.get("id"): t for t in tokenlist if isinstance(t, dict) and "id" in t}

    # Find root
    root_token = None
    for token in tokenlist:
        if isinstance(token, dict) and token.get("head") == 0:
            root_token = token
            break
    if not root_token:
        return False

    # Look for wh-word that is a direct dependent of the root (main clause)
    for token in tokenlist:
        if not isinstance(token, dict) or "form" not in token:
            continue
        if token.get("form", "").lower() in wh_words:
            if token.get("id") == root_token.get("id"):
                return True

    return False


In [None]:
def is_cleft(tokenlist):
    """
    Detect if a sentence in CoNLL-U format is an it-cleft construction.
    Returns:
        dict: Information about whether the sentence is an it-cleft and its components.

    Based off https://universaldependencies.org/en/dep/acl-relcl.html#clefts
    """
    if not tokenlist:
        return {"is_it_cleft": False}

    # Look for "it" as an expletive
    it_token = next((token for token in tokenlist if token.get("form", "").lower() == "it" and token.get("deprel") == "expl"), None)
    if not it_token:
        return {"is_it_cleft": False}

    # Look for a copula verb (e.g., "is", "was") associated with "it"
    copula_token = next((token for token in tokenlist if token.get("deprel") == "cop"), None)
    if not copula_token:
        return {"is_it_cleft": False}

    # Look for a relative clause (advcl:relcl) dependent on the copula or another token in the sentence
    rel_clause_token = next((token for token in tokenlist if token.get("deprel") == "advcl:relcl"), None)
    if not rel_clause_token:
        return {"is_it_cleft": False}

    # Check for a relativizer (e.g., "that", "who") associated with the relative clause
    relativizer_token = next((token for token in tokenlist if token.get("head") == rel_clause_token.get("id") and token.get("deprel") == "mark"), None)

    return {
        "is_it_cleft": True,
        "it_token": it_token,
        "copula_token": copula_token,
        "rel_clause_token": rel_clause_token,
        "relativizer_token": relativizer_token
    }


In [33]:
def is_pseudocleft(tokenlist):
    """
    Determines if a sentence contains a pseudocleft construction.
    
    Based off: https://universaldependencies.org/en/dep/acl-relcl.html#clefts
    
    Args:
        tokenlist: A list of token dictionaries from a CoNLL-U parse
        
    Returns:
        bool: True if the sentence contains a pseudocleft construction, False otherwise
    """
    # Check if we have tokens to process
    if not tokenlist:
        return False
    
    # Build a map of token IDs to tokens for easy lookup
    token_map = {t.get("id"): t for t in tokenlist if isinstance(t, dict) and "id" in t}

    
    # find the root token (the one whose head is 0)
    root_token = next(
        (t for t in tokenlist 
         if isinstance(t, dict) and t.get("head") == 0),
        None
    )
    
    # Find a copula whose head is the root
    copula = next(
        (t for t in tokenlist
         if isinstance(t, dict)
           and t.get("deprel") == "cop"
           and t.get("head") == root_token.get("id")),
        None
    )
    if not copula:
        return False

    # Find a subject whose head is the root
    subj = next(
        (t for t in tokenlist
         if isinstance(t, dict)
           and t.get("deprel") == "nsubj"
           and t.get("head") == root_token.get("id")
           and t.get("id") <= copula.get("id")
           and t.get("form", "").lower() in wh_words),
        None
    )
    if not subj:
        return False

    # Find a relative clause (acl:relcl) whose head is that subject
    relcl = next(
        (t for t in tokenlist
         if isinstance(t, dict)
           and t.get("deprel") == "acl:relcl"
           and t.get("head") == subj.get("id")),
        None
    )
    return relcl is not None

In [None]:
def is_topicalization(tokenlist):
    """
    Detects argument topicalization (not adjuncts) in a sentence.
    Returns True if a core argument (obj, iobj, etc.) appears sentence-initially,
    possibly followed by a comma, and is not a subject or adjunct.
    """
    if not tokenlist:
        return False

    # TOpicalization is marked by dislocation in UD
    if not "dislocated" in [tokenlist[i].get("deprel", "") for i in range(len(tokenlist))]:
        return False
    
    # Make sure that the dislocared item is before the head, and not an adposition (would indicate adjunct)
    for i, token in enumerate(tokenlist):
        if not isinstance(token, dict) or "form" not in token:
            continue

        if "dislocated" in token.get("deprel"):
            head = token.get("head")
            if token.get("id") < head:
                if token.get("upos") == "ADP":
                    continue
                return True

    return False


In [35]:
def is_restrictive_relative(tokenlist):
    """
    Returns True if a restrictive relative clause with a relativizer is found, else False.
    """
    for t in tokenlist:
        # Check if a relatibe clase
        if t.get("deprel") == "acl:relcl":
            relcl_idx = t["id"]
            head_idx = t.get("head")
            if head_idx is None or head_idx <= 0:
                continue

            # check for a relativizer 
            has_relativizer = any(
                ref.get("form", "").lower() in RELATIVIZERS and ref.get("head") == t["id"]
                for ref in tokenlist
            )
            if not has_relativizer:
                continue  # Not a real relative clause

            # make sure no comma (comma is likely ascriptive)
            start = min(head_idx, relcl_idx)
            end = max(head_idx, relcl_idx)
            has_comma = any(
                token["form"].strip() == "," and start < token["id"] < end
                for token in tokenlist
            )

            if not has_comma:
                return True 

    return False  

In [None]:
ud_folder = "ud/"
if not os.path.exists(ud_folder):
    os.makedirs(ud_folder, exist_ok=True)
    print(f"Please download the UD 2.16 from https://universaldependencies.org/ and to folder '{ud_folder}'.")
    exit(1)

ud_file_paths = [ud_folder + path for path in
                ["ud-treebanks-v2.15/UD_English-EWT/en_ewt-ud-train.conllu",
                 "ud-treebanks-v2.15/UD_English-EWT/en_ewt-ud-dev.conllu",
                 "ud-treebanks-v2.15/UD_English-EWT/en_ewt-ud-test.conllu"]]

# Counters and example lists
total_sentences = 0
total_matrix = 0
total_embedded = 0
total_cleft = 0
total_pseudo_cleft = 0
total_topicalization = 0
total_restrictive_relative = 0

ud_examples_matrix = []
ud_examples_embedded = []
ud_examples_cleft = []
ud_examples_pseudo_cleft = []
ud_examples_topicalization = []
ud_examples_restrictive = []

for ud_file_path in tqdm(ud_file_paths):
    # Process the UD corpus.
    with open(ud_file_path, "r", encoding="utf-8") as f:
        for tokenlist in parse_incr(f):
            total_sentences += 1
            sentence_text = " ".join(token["form"] for token in tokenlist if token["form"])
            
            if is_matrix_wh(tokenlist):
                total_matrix += 1
                ud_examples_matrix.append(sentence_text)
            if is_embedded_wh(tokenlist):
                total_embedded += 1
                ud_examples_embedded.append(sentence_text)
            if is_cleft(tokenlist)['is_it_cleft']:
                total_cleft += 1
                ud_examples_cleft.append(sentence_text)
            if is_pseudocleft(tokenlist):
                total_pseudo_cleft += 1
                ud_examples_pseudo_cleft.append(sentence_text)
            if is_topicalization(tokenlist):
                total_topicalization += 1
                ud_examples_topicalization.append(sentence_text)
            if is_restrictive_relative(tokenlist):
                total_restrictive_relative += 1
                ud_examples_restrictive.append(sentence_text)

# Write the example sentences to CSV files.
os.makedirs("ud/ud_frequency", exist_ok=True)

def write_examples(file_path, examples):
    with open(file_path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["sentence"])
        for ex in examples:
            writer.writerow([ex])

write_examples("ud_frequency/matrix.csv", ud_examples_matrix)
write_examples("ud_frequency/embedded.csv", ud_examples_embedded)
write_examples("ud_frequency/cleft.csv", ud_examples_cleft)
write_examples("ud_frequency/pseudo_cleft.csv", ud_examples_pseudo_cleft)
write_examples("ud_frequency/topicalization.csv", ud_examples_topicalization)
write_examples("ud_frequency/restrictive_relative.csv", ud_examples_restrictive)

print(f"Total sentences: {total_sentences}")
print(f"Total matrix wh-questions: {total_matrix}, proportion: {total_matrix/total_sentences:.2%}")
print(f"Total embedded wh-questions: {total_embedded}, proportion: {total_embedded/total_sentences:.2%}")
print(f"Total cleft constructions: {total_cleft}, proportion: {total_cleft/total_sentences:.2%}")
print(f"Total pseudo-cleft constructions: {total_pseudo_cleft}, proportion: {total_pseudo_cleft/total_sentences:.2%}")
print(f"Total topicalization constructions: {total_topicalization}, proportion: {total_topicalization/total_sentences:.2%}")
print(f"Total restrictive relative clauses: {total_restrictive_relative}, proportion: {total_restrictive_relative/total_sentences:.2%}")

100%|██████████| 3/3 [00:03<00:00,  1.21s/it]

Total sentences: 16622
Total matrix wh-questions: 82, proportion: 0.49%
Total embedded wh-questions: 308, proportion: 1.85%
Total cleft constructions: 20, proportion: 0.12%
Total pseudo-cleft constructions: 6, proportion: 0.04%
Total topicalization constructions: 6, proportion: 0.04%
Total restrictive relative clauses: 504, proportion: 3.03%



