In [3]:
# STEP 1.1: Imports
import os
import xml.etree.ElementTree as ET
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
# Sentence-tokenizer models used by sent_tokenize. NLTK >= 3.8.2 loads the
# "punkt_tab" resource instead of the pickled "punkt" models, so fetch both to
# keep the notebook runnable on old and new NLTK releases alike.
nltk.download("punkt")
nltk.download("punkt_tab")

# STEP 1.2: Load Halliday-based dictionary
from halliday_dict import HALLIDAY_CONJUNCTIONS

# STEP 1.3: Flatten dictionary for lookup (Halliday → flat structure)
def flatten_conj_dict(d, path=None, result=None):
    """Collapse a nested conjunction dictionary into a flat lookup table.

    Nested dicts are walked depth-first in key order. Every string found in a
    list leaf is lower-cased and mapped to the chain of dict keys leading to
    that list, joined with " → ". Values that are neither dicts nor lists are
    ignored.

    Args:
        d: Nested dict / list structure to flatten.
        path: Keys already traversed above ``d`` (defaults to an empty list).
        result: Dict to fill in place (defaults to a new dict).

    Returns:
        The dict that was filled: ``{lower-cased conjunction: "key → key → ..."}``.
    """
    flat = {} if result is None else result
    trail = [] if path is None else path

    def _leaves(node, ancestors):
        # Yield (conjunction, category label) pairs in depth-first order.
        if isinstance(node, dict):
            for key, child in node.items():
                yield from _leaves(child, ancestors + [key])
        elif isinstance(node, list):
            for entry in node:
                yield entry.lower(), " → ".join(ancestors)

    for conjunction, label in _leaves(d, trail):
        flat[conjunction] = label
    return flat

# Build the flat lookup once; later cells pass it to the processing functions.
# Keys are the lower-cased conjunction strings, values are the " → "-joined
# category paths produced by flatten_conj_dict.
flat_conj_dict = flatten_conj_dict(HALLIDAY_CONJUNCTIONS)
# Report how many distinct lower-cased conjunction strings are in the lookup.
print(f"✅ Loaded {len(flat_conj_dict)} conjunctions from dictionary.")

✅ Loaded 382 conjunctions from dictionary.


[nltk_data] Downloading package punkt to /Users/joselema/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# STEP 2.1: Process a TEI XML file and return conjunction metadata at paragraph level
def process_paragraph_conjunctions(file_path, flat_conj_dict):
    """Summarise paragraph-initial conjunctions for one TEI XML file.

    Every non-empty ``<p>`` under ``<text>`` (TEI namespace) is reduced to its
    plain text. The first sentence of each paragraph is checked against the
    conjunction lookup; at most one conjunction is counted per paragraph.

    Matching rules:
      * the conjunction must be a whole-word prefix of the first sentence
        ("so" does not match "some ..."),
      * longer conjunctions are tried first, so "as a result" is preferred
        over "as",
      * runs of whitespace (including line breaks from the XML) are collapsed
        to single spaces before matching.

    Args:
        file_path: Path to the TEI XML file.
        flat_conj_dict: Mapping ``{lower-cased conjunction: category path}``.

    Returns:
        On success, a dict with ``file_name``, ``total_paragraphs``,
        ``total_words`` (whitespace-separated tokens), ``para_initial_conj_total``,
        ``mean_para_conj`` (rounded to 4 decimals, 0 when there are no
        paragraphs) and one ``"<category path> → Paragraph"`` count per
        category seen. On any exception, ``{"file_name": ..., "error": str(e)}``.
    """
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()
        ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

        # STEP 2.2: Extract and clean all <p> paragraph text (one pass per paragraph)
        paragraphs = root.findall(".//tei:text//tei:p", ns)
        paragraph_texts = ("".join(p.itertext()).strip() for p in paragraphs)
        clean_paragraphs = [text for text in paragraph_texts if text]

        # STEP 2.3: Initialize counters
        total_paragraphs = len(clean_paragraphs)
        total_words = sum(len(p.split()) for p in clean_paragraphs)
        para_conj_count = 0
        category_counts = {}

        # Try longer conjunctions first so a multi-word entry is not shadowed by
        # a shorter entry that happens to be its prefix. The sort is stable, so
        # equal-length entries keep their dictionary order. Empty entries are
        # dropped because every sentence "starts with" the empty string.
        candidates = sorted(
            ((conj, category) for conj, category in flat_conj_dict.items() if conj),
            key=lambda item: len(item[0]),
            reverse=True,
        )

        # STEP 2.4: Scan each paragraph's first sentence
        for paragraph in clean_paragraphs:
            sentences = sent_tokenize(paragraph)
            if not sentences:
                continue
            # Collapse internal whitespace so XML line breaks cannot split a
            # multi-word conjunction.
            first_sent = " ".join(sentences[0].lower().split())
            for conj, category in candidates:
                if not first_sent.startswith(conj):
                    continue
                # Require a word boundary after the conjunction: the next
                # character must be absent or not a letter/digit/underscore.
                next_char = first_sent[len(conj):len(conj) + 1]
                if next_char and (next_char.isalnum() or next_char == "_"):
                    continue
                para_conj_count += 1
                tag = category + " → Paragraph"
                category_counts[tag] = category_counts.get(tag, 0) + 1
                break

        # STEP 2.5: Return summary
        return {
            "file_name": os.path.basename(file_path),
            "total_paragraphs": total_paragraphs,
            "total_words": total_words,
            "para_initial_conj_total": para_conj_count,
            "mean_para_conj": round(para_conj_count / total_paragraphs, 4) if total_paragraphs else 0,
            **category_counts
        }

    except Exception as e:
        # Best-effort: a broken file is reported in an "error" field instead of
        # aborting the whole batch run.
        return {"file_name": os.path.basename(file_path), "error": str(e)}

In [5]:
import os
import pandas as pd

# STEP 3.1: Process all TEI files in a folder
def batch_process_paragraph_conjunctions(folder_path, flat_conj_dict):
    """Run process_paragraph_conjunctions on every ``.xml`` file in a folder.

    Files are processed in sorted filename order. ``os.listdir`` returns
    entries in arbitrary order, and both the row order and the column order of
    the resulting frame (columns appear in order of first occurrence) depend on
    the processing order, so sorting keeps the output reproducible.

    Args:
        folder_path: Directory containing the TEI XML files.
        flat_conj_dict: Mapping ``{lower-cased conjunction: category path}``.

    Returns:
        A DataFrame with one row per file. Category columns that a file did
        not produce are filled with 0. An ``error`` column, present only when
        at least one file failed, is left untouched so that successful rows
        stay missing there rather than showing a misleading ``0``.
    """
    summary_data = []

    for fname in sorted(os.listdir(folder_path)):
        if fname.endswith(".xml"):
            fpath = os.path.join(folder_path, fname)
            summary_data.append(process_paragraph_conjunctions(fpath, flat_conj_dict))

    # STEP 3.2: Convert to DataFrame
    df_summary = pd.DataFrame(summary_data)

    # STEP 3.3: Fill missing category columns with 0 (never the error message column)
    fill_values = {col: 0 for col in df_summary.columns if col != "error"}
    if fill_values:
        df_summary = df_summary.fillna(fill_values)

    return df_summary

In [7]:
# ✅ Run the batch over every .xml file in the TEI folder.
# NOTE(review): this absolute path spells the directory "Code_3_Inter_PAR",
# while the load/merge cell further down uses "Code_3_inter_PAR". Both resolve
# only on a case-insensitive filesystem — confirm the real spelling.
folder_path = "/Users/joselema/Desktop/ConjuntionArticle/Code_3_Inter_PAR/tei"
para_summary_df = batch_process_paragraph_conjunctions(folder_path, flat_conj_dict)

# ✅ Add numbered prefix: interPAR0_file_name, interPAR1_total_paragraphs, etc.
def prefix_para_summary_columns(df):
    """Return a copy of ``df`` whose columns carry an ``interPAR<i>_`` prefix.

    ``<i>`` is the zero-based position of the column in ``df.columns``, so the
    prefix depends on the column order of the input frame.
    """
    renamed = {
        name: f"interPAR{position}_{name}"
        for position, name in enumerate(df.columns)
    }
    return df.rename(columns=renamed)

# Apply the positional interPAR<i>_ prefix to every summary column.
para_summary_df_prefixed = prefix_para_summary_columns(para_summary_df)

# ✅ Export CSV
# NOTE(review): this relative path writes into the kernel's current working
# directory, whereas the load/merge cell further down reads
# "paragraph_conjunction_summary.csv" from an absolute folder — confirm both
# point at the same file.
para_summary_df_prefixed.to_csv("paragraph_conjunction_summary.csv", index=False)

print("✅ Exported paragraph summary with interPAR column prefixes.")

✅ Exported paragraph summary with interPAR column prefixes.


In [11]:
# Sanity check: (rows, columns) of the prefixed summary — one row per processed file.
para_summary_df_prefixed.shape

(2898, 37)

In [17]:
import pandas as pd

# ✅ Read the per-file inter-paragraph summary
conj_df = pd.read_csv("/Users/joselema/Desktop/ConjuntionArticle/Code_3_inter_PAR/paragraph_conjunction_summary.csv")

# ✅ Read the GIG metadata workbook
meta_df = pd.read_excel("/Users/joselema/Desktop/ConjuntionArticle/Code_3_inter_PAR/gig_metadata.xlsx")

# ✅ Derive the merge keys from 'interPAR0_file_name':
#    batch_id = leading digits before the first underscore (as int),
#    text_id  = everything between the first underscore and the ".xml" suffix.
conj_df = conj_df.assign(
    batch_id=lambda frame: frame["interPAR0_file_name"].str.extract(r"^(\d+)_", expand=False).astype(int),
    text_id=lambda frame: frame["interPAR0_file_name"].str.extract(r"_(.+)\.xml$", expand=False),
)

# ✅ Metadata text_id must be a string to compare equal to the extracted key
meta_df = meta_df.assign(text_id=meta_df["text_id"].astype(str))

# ✅ Inner join on the composite key (rows without a metadata match are dropped)
merged_df = conj_df.merge(meta_df, how="inner", on=["batch_id", "text_id"])

# ✅ Write the merged table next to the inputs
merged_df.to_csv("/Users/joselema/Desktop/ConjuntionArticle/Code_3_inter_PAR/interPAR_conj_results_merged_with_metadata.csv", index=False)

print("✅ Inter-paragraph results merged successfully using batch_id + text_id.")

✅ Inter-paragraph results merged successfully using batch_id + text_id.


In [21]:
# Sanity check: the row count should equal the summary's row count if every file found its metadata.
merged_df.shape

(2898, 54)