**Code for XML files that were downloaded as ZIP archives from GitHub repositories**

In [None]:
import zipfile
import os
import xml.etree.ElementTree as ET
from collections import Counter, defaultdict
from google.colab import files
import csv

# === 1. Upload ZIP files ===
import shutil  # used only by the clean-up step below

# Opens the Colab upload dialog; iterating the result yields the uploaded file names.
uploaded = files.upload()

# === 2. Create/clean working directory ===
work_dir = "tei_analysis"
# Start from an empty directory so files left by a previous run are not counted again.
if os.path.exists(work_dir):
    shutil.rmtree(work_dir)  # remove previous contents
os.makedirs(work_dir)

# === 3. Extract all uploaded ZIP files ===
extracted_archives = 0
for filename in uploaded:
    # Compare case-insensitively so "ARCHIVE.ZIP" is not silently ignored.
    if not filename.lower().endswith(".zip"):
        print(f"Skipping {filename}: not a .zip file")
        continue
    try:
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall(work_dir)
        extracted_archives += 1
    except zipfile.BadZipFile as e:
        # One corrupt upload should not prevent the other archives from being processed.
        print(f"Skipping {filename}: {e}")

print(f"Extracted {extracted_archives} ZIP archive(s) into {work_dir}/")

# === 4. Helper to get all XML files ===
def get_all_xml_files(directory):
    """Return the paths of all XML files found anywhere under ``directory``.

    The directory tree is walked recursively. A file counts as XML when its
    name ends in ``.xml``; the comparison is case-insensitive so that names
    such as ``TEXT.XML`` are not silently skipped.

    Args:
        directory: Root directory to search.

    Returns:
        A sorted list of paths, each formed by joining the containing
        directory (as reported by ``os.walk``) with the file name. The list
        is empty when nothing matches.
    """
    xml_files = []
    # Named `filenames`, not `files`, to avoid shadowing the google.colab
    # `files` module that this cell imports.
    for root, _dirs, filenames in os.walk(directory):
        for name in filenames:
            if name.lower().endswith(".xml"):
                xml_files.append(os.path.join(root, name))
    # os.walk yields entries in filesystem order; sort for reproducible runs.
    xml_files.sort()
    return xml_files

# Gather every XML path below the working directory and report how many there are.
xml_files = get_all_xml_files(work_dir)
print(f"Found {len(xml_files)} XML files.")

# === 5. Parse only tags inside <body> ===
# Accumulators filled in by extract_tags_from_body():
#   all_tags            - every distinct tag name seen in any file
#   tag_counter_by_file - one Counter of tag occurrences per file
all_tags = set()
tag_counter_by_file = defaultdict(Counter)

def extract_tags_from_body(file_path):
    """Count the element tags that occur inside the <body> of one XML file.

    Every descendant element of the first <body> found is tallied under its
    namespace-stripped tag name; the <body> element itself is not counted.
    Results are written to the module-level ``tag_counter_by_file`` and
    ``all_tags``; nothing is returned.

    The TEI-namespaced <body> is looked up first. A <body> without any
    namespace is accepted as a fallback so that un-namespaced documents are
    not reported as having no body.

    Counts are keyed by the file's base name. When that name has already been
    recorded for an earlier file, the full path is used instead, so two files
    that share a name in different folders are not merged into a single row.

    Args:
        file_path: Path of the XML file to parse.

    Any exception raised while reading or parsing is printed and swallowed so
    that one bad file does not stop the surrounding loop.
    """
    def local_name(tag):
        # ElementTree writes qualified names as "{namespace-uri}name".
        return tag.split("}", 1)[1] if "}" in tag else tag

    try:
        root = ET.parse(file_path).getroot()

        # Find <body> element: TEI namespace first, then no namespace.
        body = root.find(".//{http://www.tei-c.org/ns/1.0}body")
        if body is None:
            body = root.find(".//body")
        if body is None:
            print(f"No <body> found in {file_path}")
            return

        file_key = os.path.basename(file_path)
        if file_key in tag_counter_by_file:
            # Same base name already recorded: fall back to the full path
            # rather than adding this file's counts to the other file's row.
            file_key = file_path

        for elem in body.iter():
            if elem is body:
                continue  # skip the <body> tag itself
            tag = local_name(elem.tag)
            tag_counter_by_file[file_key][tag] += 1
            all_tags.add(tag)

    except Exception as e:
        print(f"Error parsing {file_path}: {e}")

# Tally the <body> tags of every discovered XML file into the shared accumulators.
for path in xml_files:
    extract_tags_from_body(path)

# === 6. Write output to CSV ===
output_file = "tei_tag_frequencies.csv"
all_tags = sorted(all_tags)  # alphabetical, so the column order is stable

with open(output_file, mode='w', newline='', encoding='utf-8') as csv_out:
    writer = csv.writer(csv_out)
    # Header row: the file-name column followed by one column per tag.
    writer.writerow(['filename', *all_tags])
    # One row per file; a tag that never occurs in a file is written as 0.
    writer.writerows(
        [name, *(counts.get(tag, 0) for tag in all_tags)]
        for name, counts in tag_counter_by_file.items()
    )

# === 7. Download the file ===
files.download(output_file)

Saving wright_tei.zip to wright_tei.zip
Found 2876 XML files.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Code for the XML files that can be downloaded from each item page of the Cody Archive (https://codyarchive.org/): books.**

In [None]:
# Install dependencies
!pip install beautifulsoup4 requests tqdm

import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from collections import Counter, defaultdict
import csv
from google.colab import files
from tqdm import tqdm
import time

from urllib.parse import urljoin  # resolves hrefs against the URL of the page they came from

# === CONFIGURATION ===
genre_to_scrape = "books"  # "books" or "correspondence"
base_url = f"https://codyarchive.org/texts/{genre_to_scrape}"

headers = {"User-Agent": "Mozilla/5.0"}
batch_size = 10
request_timeout = 30  # seconds; without a timeout a stalled connection blocks the cell indefinitely

# === STEP 1: Get all item links from books/letters list ===
# NOTE(review): only this single index page is fetched. If the listing is
# paginated, items on later pages are not picked up here.
print(f"Crawling {base_url} ...")
r = requests.get(base_url, headers=headers, timeout=request_timeout)
r.raise_for_status()  # an error page would otherwise be parsed and yield zero items
soup = BeautifulSoup(r.text, "html.parser")
links = soup.select("a[href^='/item/wfc.']")
# dict.fromkeys removes duplicates while keeping the order found on the page.
item_urls = list(dict.fromkeys(urljoin(base_url, a["href"]) for a in links))
print(f"Found {len(item_urls)} item pages.")

# === STEP 2: Extract XML URLs ===
xml_urls = []
for item in tqdm(item_urls, desc="Fetching XML links"):
    try:
        time.sleep(0.5)  # pause between requests
        r = requests.get(item, headers=headers, timeout=request_timeout)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        # First anchor whose href ends in ".xml".
        a = soup.find("a", href=lambda h: h and h.endswith(".xml"))
        if a:
            # urljoin leaves absolute URLs untouched and resolves root-relative,
            # relative and protocol-relative hrefs correctly.
            xml_urls.append((genre_to_scrape, urljoin(item, a["href"])))
        else:
            print(f"No XML link found on {item}")
    except Exception as e:
        print(f"Error parsing {item}: {e}")

print(f"Found {len(xml_urls)} XML files.")

# === STEP 3: Extract TEI tags ===
# Accumulators filled in by extract_tags(): per-file tag counts and the set of all tag names.
tag_counter_by_file = defaultdict(Counter)
all_tags = set()

def extract_tags(xml_url, genre, timeout=30):
    """Download one XML file and count the element tags inside its <body>.

    Every descendant element of the first <body> found is tallied under its
    namespace-stripped tag name; the <body> element itself is not counted.
    Results are written to the module-level ``tag_counter_by_file`` (keyed by
    ``"<genre>:<last path segment of the URL>"``) and ``all_tags``; nothing is
    returned.

    The TEI-namespaced <body> is looked up first. A <body> without any
    namespace is accepted as a fallback so that un-namespaced documents are
    not reported as having no body.

    Args:
        xml_url: URL of the XML document.
        genre: Label used as the prefix of the per-file key.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run.

    Any exception (network error, HTTP error status, malformed XML) is printed
    and swallowed so that one bad document does not stop the surrounding loop.
    """
    try:
        time.sleep(0.5)  # pause before every request
        r = requests.get(xml_url, headers=headers, timeout=timeout)
        r.raise_for_status()
        root = ET.fromstring(r.content)
        ns = "{http://www.tei-c.org/ns/1.0}"
        body = root.find(f".//{ns}body")
        if body is None:
            body = root.find(".//body")
        if body is None:
            print(f"No <body> in {xml_url}")
            return
        fname = xml_url.split("/")[-1]
        file_id = f"{genre}:{fname}"
        for elem in body.iter():
            if elem is body:
                continue
            # ElementTree writes qualified names as "{namespace-uri}name".
            tag = elem.tag.split("}", 1)[-1]
            tag_counter_by_file[file_id][tag] += 1
            all_tags.add(tag)
    except Exception as e:
        print(f"Error reading {xml_url}: {e}")

# Work through the XML URLs in consecutive slices of `batch_size`, announcing each slice.
batch_total = (len(xml_urls) - 1) // batch_size + 1
batch_starts = range(0, len(xml_urls), batch_size)
for batch_no, start in enumerate(batch_starts, start=1):
    print(f"Processing batch {batch_no}/{batch_total}")
    for genre, url in xml_urls[start:start + batch_size]:
        extract_tags(url, genre)

# === STEP 4: Save CSV ===
all_tags = sorted(all_tags)
filename = f"cody_{genre_to_scrape}_tei_tags.csv"

with open(filename, "w", newline="", encoding="utf-8") as csv_out:
    writer = csv.writer(csv_out)
    # Header row: the file identifier column followed by one column per tag.
    writer.writerow(["filename", *all_tags])
    # One row per document; a tag that never occurs in it is written as 0.
    writer.writerows(
        [file_id, *(tag_counts.get(tag, 0) for tag in all_tags)]
        for file_id, tag_counts in tag_counter_by_file.items()
    )

print(f"Saved CSV: {filename}")
files.download(filename)

🔍 Crawling https://codyarchive.org/texts/correspondence ...
✅ Found 50 item pages.


Fetching XML links: 100%|██████████| 50/50 [00:37<00:00,  1.32it/s]


✅ Found 50 XML files.

📦 Processing batch 1/5

📦 Processing batch 2/5

📦 Processing batch 3/5

📦 Processing batch 4/5

📦 Processing batch 5/5

✅ Saved CSV: cody_correspondence_tei_tags.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Code for the XML files that can be downloaded from each item page of the Cody Archive (https://codyarchive.org/): letters (correspondence).**

In [None]:
!pip install beautifulsoup4 requests tqdm

import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from collections import Counter, defaultdict
import csv
from google.colab import files
from tqdm import tqdm
import time
import re

from urllib.parse import urljoin  # resolves hrefs against the URL of the page they came from

# === CONFIGURATION ===
genre_to_scrape = "correspondence"  # or "books"
batch_size = 10
max_pages = None   # or set to integer for testing, e.g. 5
request_timeout = 30  # seconds; without a timeout a stalled connection blocks the cell indefinitely

base_list_url = f"https://codyarchive.org/texts/{genre_to_scrape}"
headers = {"User-Agent": "Mozilla/5.0"}

# === STEP 1: Crawl all paginated index pages ===
item_urls = []
seen_item_urls = set()
page = 1
while True:
    print(f"🔍 Fetching index page {page}")
    r = requests.get(f"{base_list_url}?page={page}", headers=headers, timeout=request_timeout)
    if r.status_code != 200:
        print(f"Failed or no more pages at page {page}")
        break
    soup = BeautifulSoup(r.text, "html.parser")
    links = soup.select("a[href^='/item/wfc.']")
    # dict.fromkeys de-duplicates while keeping page order, so the crawl order
    # (and with it the CSV row order) is the same on every run.
    page_urls = list(dict.fromkeys(urljoin(base_list_url, a["href"]) for a in links))
    if not page_urls:
        print("No item links found on this page.")
        break
    new_urls = [u for u in page_urls if u not in seen_item_urls]
    if not new_urls:
        # A server that answers out-of-range page numbers with a repeated page
        # would otherwise keep this loop running forever.
        print(f"No new item links on page {page}; stopping.")
        break
    item_urls.extend(new_urls)
    seen_item_urls.update(new_urls)
    page += 1
    time.sleep(1)  # pause between index requests
    if max_pages and page > max_pages:
        break

print(f"Total item pages found: {len(item_urls)}")

# === STEP 2: Extract XML links ===
xml_urls = []
for item in tqdm(item_urls, desc="Fetching XML links"):
    try:
        time.sleep(0.5)  # pause between item requests
        r = requests.get(item, headers=headers, timeout=request_timeout)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        # Preferred: the first anchor whose href ends in ".xml".
        a = soup.find("a", href=lambda h: h and h.endswith(".xml"))
        xml_href = a['href'] if a else None
        if not xml_href:
            # Fallback: the first token in the page text that looks like an XML file name.
            m = re.search(r'([\w\.\-]+\.xml)', soup.text)
            xml_href = m.group(1) if m else None
        if xml_href:
            # urljoin leaves absolute URLs untouched and resolves relative ones.
            # Plain concatenation glued a bare file name directly onto the host name.
            # NOTE(review): a bare name from the regex fallback resolves relative to
            # the item page URL; confirm that is where the archive serves its XML.
            xml_urls.append((genre_to_scrape, urljoin(item, xml_href)))
        else:
            print(f"No XML link found on {item}")
    except Exception as e:
        print(f"Error parsing {item}: {e}")

print(f"Found {len(xml_urls)} XML files.")

# === STEP 3: Extract TEI tags from <body> ===
# Accumulators filled in by extract_tags(): per-file tag counts and the set of all tag names.
tag_counter_by_file = defaultdict(Counter)
all_tags = set()

def extract_tags(xml_url, genre, timeout=30):
    """Download one XML file and count the element tags inside its <body>.

    Every descendant element of the first <body> found is tallied under its
    namespace-stripped tag name; the <body> element itself is not counted.
    Results are written to the module-level ``tag_counter_by_file`` (keyed by
    ``"<genre>:<last path segment of the URL>"``) and ``all_tags``; nothing is
    returned.

    The TEI-namespaced <body> is looked up first. A <body> without any
    namespace is accepted as a fallback so that un-namespaced documents are
    not reported as having no body.

    Args:
        xml_url: URL of the XML document.
        genre: Label used as the prefix of the per-file key.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run.

    Any exception (network error, HTTP error status, malformed XML) is printed
    and swallowed so that one bad document does not stop the surrounding loop.
    """
    try:
        time.sleep(0.5)  # pause before every request
        r = requests.get(xml_url, headers=headers, timeout=timeout)
        r.raise_for_status()
        root = ET.fromstring(r.content)
        ns = "{http://www.tei-c.org/ns/1.0}"
        body = root.find(f".//{ns}body")
        if body is None:
            body = root.find(".//body")
        if body is None:
            print(f"No <body> in {xml_url}")
            return
        fname = xml_url.split("/")[-1]
        file_id = f"{genre}:{fname}"
        for elem in body.iter():
            if elem is body:
                continue
            # ElementTree writes qualified names as "{namespace-uri}name".
            tag = elem.tag.split("}",1)[-1]
            tag_counter_by_file[file_id][tag] += 1
            all_tags.add(tag)
    except Exception as e:
        print(f"Error reading {xml_url}: {e}")

# Walk the XML URLs in consecutive chunks of `batch_size`, printing a progress line per chunk.
n_batches = (len(xml_urls) - 1) // batch_size + 1
for offset in range(0, len(xml_urls), batch_size):
    current = offset // batch_size + 1
    print(f"Processing batch {current}/{n_batches}")
    chunk = xml_urls[offset:offset + batch_size]
    for genre, url in chunk:
        extract_tags(url, genre)

# === STEP 4: Save results to CSV ===
all_tags = sorted(all_tags)
filename = f"cody_{genre_to_scrape}_tei_tags.csv"

with open(filename, "w", newline="", encoding="utf-8") as out:
    writer = csv.writer(out)
    # Header row: the file identifier column followed by one column per tag.
    header = ["filename"]
    header.extend(all_tags)
    writer.writerow(header)
    # One row per document; a tag that never occurs in it is written as 0.
    for file_id, counts in tag_counter_by_file.items():
        writer.writerow([file_id, *(counts.get(t, 0) for t in all_tags)])

print(f"Saved CSV: {filename}")
files.download(filename)

🔍 Fetching index page 1
🔍 Fetching index page 2
🔍 Fetching index page 3
🔍 Fetching index page 4
🔍 Fetching index page 5
🔍 Fetching index page 6
🔍 Fetching index page 7
🔍 Fetching index page 8
⚠️ No item links found on this page.
✅ Total item pages found: 347


Fetching XML links: 100%|██████████| 347/347 [03:51<00:00,  1.50it/s]


✅ Found 347 XML files.

📦 Processing batch 1/35

📦 Processing batch 2/35

📦 Processing batch 3/35

📦 Processing batch 4/35

📦 Processing batch 5/35

📦 Processing batch 6/35

📦 Processing batch 7/35

📦 Processing batch 8/35

📦 Processing batch 9/35

📦 Processing batch 10/35

📦 Processing batch 11/35

📦 Processing batch 12/35

📦 Processing batch 13/35

📦 Processing batch 14/35

📦 Processing batch 15/35

📦 Processing batch 16/35

📦 Processing batch 17/35

📦 Processing batch 18/35

📦 Processing batch 19/35

📦 Processing batch 20/35

📦 Processing batch 21/35

📦 Processing batch 22/35

📦 Processing batch 23/35

📦 Processing batch 24/35

📦 Processing batch 25/35

📦 Processing batch 26/35

📦 Processing batch 27/35

📦 Processing batch 28/35

📦 Processing batch 29/35

📦 Processing batch 30/35

📦 Processing batch 31/35

📦 Processing batch 32/35

📦 Processing batch 33/35

📦 Processing batch 34/35

📦 Processing batch 35/35

✅ Saved CSV: cody_correspondence_tei_tags.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>