In [1]:
# Latvian Communist Leaflet Corpus — Character, Word and Line Counts
# Run in Google Colab. Upload a ZIP archive containing .txt leaflet files when prompted.
# Output: per-file counts (characters with spaces, characters without spaces, words, lines)
# and a CSV for download.

import zipfile
import io
import os
import re
import csv

# pandas is optional: it is only used to render the results as a table.
# Any import failure falls back to plain-text output.
try:
    import pandas as pd
except Exception:
    pd = None

# The Colab file helpers (upload / download) are required by this script;
# record whether they could be imported.
try:
    from google.colab import files
except Exception:
    COLAB = False
else:
    COLAB = True

# Stop early outside Colab: the upload/download steps below need `files`.
if not COLAB:
    raise RuntimeError("This script is written for Google Colab. Please run it in Colab.")

# 1) Upload a ZIP file
print("Please upload a ZIP file containing .txt leaflet files (one ZIP).")
# The rest of this script treats the upload result as a mapping of
# uploaded file name -> file content (bytes).
uploaded = files.upload()
if not uploaded:
    raise SystemExit("No file uploaded.")

# Use the first upload that really is a ZIP archive. Validating here gives a
# clear message instead of a BadZipFile traceback later when the archive is opened.
zip_filename = next(
    (name for name, data in uploaded.items() if zipfile.is_zipfile(io.BytesIO(data))),
    None,
)
if zip_filename is None:
    raise SystemExit("The uploaded file is not a valid ZIP archive.")
zip_bytes = uploaded[zip_filename]

# Utility: extract the text body after the 'text:' marker
def extract_text_body(raw_bytes: bytes) -> str:
    """Return the leaflet body that follows the first 'text:' marker.

    The bytes are decoded as UTF-8; undecodable bytes become U+FFFD and a
    leading byte-order mark is dropped, so it is never counted as a character
    of the text.

    Args:
        raw_bytes: Raw content of one leaflet file.

    Returns:
        The stripped text after the marker. A line consisting only of
        'text:' (any case) is preferred; otherwise the first inline
        'text:' occurrence is used. If no marker exists, the whole
        decoded content is returned, stripped.
    """
    # 'utf-8-sig' behaves like 'utf-8' but discards a leading BOM, which
    # str.strip() would not remove because U+FEFF is not whitespace.
    s = raw_bytes.decode('utf-8-sig', errors='replace')

    # Preferred form: the marker alone on its line, body on the following lines.
    m = re.search(r'(?im)^\s*text:\s*\n', s)
    if m:
        return s[m.end():].strip()

    # Fallback: inline marker; take everything after it up to the end of the file.
    m = re.search(r'\btext:\s*(.*)', s, flags=re.IGNORECASE | re.DOTALL)
    if m:
        return m.group(1).strip()

    return s.strip()

# Utility: count words
# A word token is a run of word characters, apostrophes (straight or curly)
# and hyphens that begins and ends on a word boundary.
WORD_RE = re.compile(r"\b[\w'’\-]+\b", flags=re.UNICODE)


def count_metrics(text):
    """
    Compute the size statistics of one text.

    Returns a dict with four integer entries:
    - chars_with_spaces: length of the string
    - chars_without_spaces: length after discounting every whitespace character
    - words: number of WORD_RE matches
    - lines: number of lines as split by str.splitlines()
    """
    total_chars = len(text)
    # Sum the lengths of all whitespace runs and subtract them from the total.
    whitespace_chars = sum(len(run) for run in re.findall(r'\s+', text))
    word_tokens = WORD_RE.findall(text)
    line_list = text.splitlines()

    metrics = {}
    metrics['chars_with_spaces'] = total_chars
    metrics['chars_without_spaces'] = total_chars - whitespace_chars
    metrics['words'] = len(word_tokens)
    metrics['lines'] = len(line_list)
    return metrics

# Process the ZIP archive

# Metadata patterns, compiled once instead of on every loop iteration.
_ID_RE = re.compile(r'^\s*id:\s*(\d+)', flags=re.IGNORECASE | re.MULTILINE)
_FILE_NAME_RE = re.compile(r'^\s*file_name:\s*(.+)$', flags=re.IGNORECASE | re.MULTILINE)


def _is_leaflet_member(info):
    """Return True if a ZIP entry looks like a real leaflet .txt file.

    Entries that macOS adds to archives ('__MACOSX/...' folders and
    '._name.txt' AppleDouble files) also end in '.txt' but hold binary
    metadata, so they are excluded to keep them out of the counts.
    """
    name = info.filename
    if not name.lower().endswith('.txt'):
        return False
    parts = name.split('/')  # ZIP member names always use '/' as separator
    return '__MACOSX' not in parts and not parts[-1].startswith('._')


results = []

# The context manager closes the archive once every member has been read.
with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zip_file:
    txt_members = [info for info in zip_file.infolist() if _is_leaflet_member(info)]

    if not txt_members:
        raise SystemExit("No .txt files found in the uploaded ZIP archive.")

    for member in sorted(txt_members, key=lambda info: info.filename):
        # Read via the ZipInfo object so exactly this entry is read, even if
        # the archive contains duplicate names. Unreadable members are
        # reported and skipped so one bad file does not abort the whole run.
        try:
            raw = zip_file.read(member)
        except Exception as e:
            print(f"Warning: could not read {member.filename}: {e}")
            continue

        text_body = extract_text_body(raw)
        metrics = count_metrics(text_body)

        # Metadata is searched in the whole decoded file. 'utf-8-sig' drops a
        # leading BOM, which would otherwise keep '^\s*id:' from matching
        # when the id is on the very first line.
        decoded = raw.decode('utf-8-sig', errors='replace')
        id_match = _ID_RE.search(decoded)
        fname_match = _FILE_NAME_RE.search(decoded)

        results.append({
            'meta_id': id_match.group(1) if id_match else '',
            'meta_file_name': fname_match.group(1).strip() if fname_match else '',
            'chars_with_spaces': metrics['chars_with_spaces'],
            'chars_without_spaces': metrics['chars_without_spaces'],
            'words': metrics['words'],
            'lines': metrics['lines'],
        })

# Corpus-wide totals over every successfully processed file.
total_chars_with = sum(row['chars_with_spaces'] for row in results)
total_chars_without = sum(row['chars_without_spaces'] for row in results)
total_words = sum(row['words'] for row in results)
total_lines = sum(row['lines'] for row in results)

# Prepare DataFrame / table
if pd is None:
    # Plain-text fallback: one line per processed file.
    for row in results:
        print(f"{row['meta_id'] or '-'} | {row['meta_file_name']} | with_spaces={row['chars_with_spaces']} | without_spaces={row['chars_without_spaces']} | words={row['words']} | lines={row['lines']}")
else:
    cols = ['meta_id', 'meta_file_name', 'chars_with_spaces', 'chars_without_spaces', 'words', 'lines']
    df = pd.DataFrame(results)
    # Add any column that is absent (e.g. when there are no results) as empty
    # strings, so the column selection below cannot fail.
    absent = [c for c in cols if c not in df.columns]
    for c in absent:
        df[c] = ''
    df = df[cols]
    display(df)

# Print totals
print("\n--- TOTALS ---")
summary = [
    ("Files processed", len(results)),
    ("Total characters (with spaces)", total_chars_with),
    ("Total characters (without spaces)", total_chars_without),
    ("Total words", total_words),
    ("Total lines", total_lines),
]
for label, value in summary:
    print(f"{label}: {value}")

# Save CSV
out_csv_path = "leaflet_counts_with_and_without_spaces.csv"
fieldnames = ['meta_id', 'meta_file_name', 'chars_with_spaces', 'chars_without_spaces', 'words', 'lines']

# newline='' lets the csv module control line endings itself.
with open(out_csv_path, 'w', encoding='utf-8', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    # One row per processed file; a key missing from a result becomes an empty cell.
    writer.writerows(
        {key: row.get(key, '') for key in fieldnames}
        for row in results
    )

print(f"\nCSV saved to: {out_csv_path}")
# Hand the finished file to the Colab download helper.
files.download(out_csv_path)


Please upload a ZIP file containing .txt leaflet files (one ZIP).


Saving latvian_communist_leaflets_1934-1935-partly-1936.zip to latvian_communist_leaflets_1934-1935-partly-1936.zip


Unnamed: 0,meta_id,meta_file_name,chars_with_spaces,chars_without_spaces,words,lines
0,1,revl-LKP_LKJS_Vidienas_org-[1934-01-11…].txt,3503,3008,482,13
1,2,revl-n002-LKP_soldiers_org-[1934-01-11…].txt,2432,2093,323,15
2,3,revl-n003-SP_CK-[…1934-01-21].txt,5578,4807,749,27
3,4,revl-n004a-LKP_CK-3000-[…1934-01-30].txt,7325,6333,976,11
4,4,revl-n004b-LKP_CK-10000-1934-02.txt,7325,6333,976,11
...,...,...,...,...,...,...
192,187,revl-n187-LKP_Riga_committee-5000-[…1936-07-29...,1110,955,143,12
193,188,revl-n188-LKP_Riga_committee-2000-1936-08.txt,4947,4287,632,25
194,189,revl-n189-SP_CK_SP_Riga_committee-3000-[...193...,2090,1788,283,15
195,190,revl-n190a-LKP_CK_LSSZP_CK-5000-1936-09.txt,5177,4463,697,15



--- TOTALS ---
Files processed: 197
Total characters (with spaces): 879675
Total characters (without spaces): 758859
Total words: 116244
Total lines: 3969

CSV saved to: leaflet_counts_with_and_without_spaces.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>