In [4]:
!pip install -q spacy stanza


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.7/1.7 MB[0m [31m63.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/608.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [7]:
import re
import math
import zipfile
from pathlib import Path
from collections import Counter
import pandas as pd
import stanza

# =========================
# CONFIG
# =========================

# Location of the uploaded corpus archive and the folder it is unpacked into.
ZIP_PATH = "/content/latvian_communist_leaflets_1934-1940.zip"
BASE_DIR = Path("/content/leaflets_unzipped")

# Create the target folder (no error if it already exists) and unpack
# every archive member into it.
BASE_DIR.mkdir(exist_ok=True)
with zipfile.ZipFile(ZIP_PATH, mode="r") as archive:
    archive.extractall(path=BASE_DIR)

# Latvian Stanza pipeline restricted to tokenization and lemmatization.
# tokenize_no_ssplit=True asks the tokenizer not to run its own sentence
# splitter (see the Stanza tokenizer options); verbose=False silences
# the pipeline's start-up logging.
nlp = stanza.Pipeline(
    lang="lv",
    processors="tokenize,lemma",
    verbose=False,
    tokenize_no_ssplit=True,
)

# =========================
# HELPERS
# =========================

def extract_leaflet_text(raw: str) -> str:
    """Return the body of a leaflet file, without the metadata header.

    The body is everything that follows the first ``text:`` / ``text::``
    marker (case-insensitive, ``text`` must start at a word boundary).
    When no marker is present the whole input is treated as the body.
    Leading and trailing whitespace is stripped in both cases.
    """
    found = re.search(
        r"\btext\s*::?\s*(.*)\Z",
        raw,
        flags=re.DOTALL | re.IGNORECASE,
    )
    if found is None:
        # No marker: fall back to the complete file content.
        return raw.strip()
    return found.group(1).strip()

def lemmatize_lv(text: str):
    """Lemmatize Latvian text with the module-level Stanza pipeline ``nlp``.

    Parameters
    ----------
    text : str
        Raw leaflet text.

    Returns
    -------
    list[str]
        Lower-cased lemmas of all words whose surface form is purely
        alphabetic (``str.isalpha``); numbers, punctuation and mixed
        tokens are dropped. Empty or whitespace-only input yields ``[]``.
    """
    # Nothing to analyse: skip the pipeline call altogether.
    if not text or not text.strip():
        return []

    doc = nlp(text)
    return [
        # Defensive: if the lemmatizer left no lemma on a word (None/empty),
        # fall back to the surface form instead of failing on `.lower()`.
        (w.lemma or w.text).lower()
        for sent in doc.sentences
        for w in sent.words
        if w.text.isalpha()
    ]

def parse_metadata(raw: str):
    """Extract the ``key: value`` header fields of a leaflet file.

    Each field is looked up case-insensitively as the first line that
    starts with ``<key>:`` (optional indentation and spaces around the
    colon are allowed). The value is the rest of that same line, stripped.

    Parameters
    ----------
    raw : str
        Complete file content (header plus body).

    Returns
    -------
    dict
        Keys ``id``, ``title``, ``author``, ``date``, ``print_run`` and
        ``typography_name``; a value is ``None`` when the field is absent
        or its line carries no value.
    """
    def get(key):
        # `[^\S\r\n]` is "whitespace except line breaks". Using it instead of
        # `\s` keeps the match on one line, so an empty field such as
        # "title:" cannot swallow the following line as its value.
        m = re.search(
            rf"^[^\S\r\n]*{re.escape(key)}[^\S\r\n]*:[^\S\r\n]*([^\r\n]+)",
            raw, flags=re.IGNORECASE | re.MULTILINE)
        if m is None:
            return None
        value = m.group(1).strip()
        # A line holding only blanks after the colon counts as missing.
        return value or None
    return {
        "id": get("id"),
        "title": get("title"),
        "author": get("author"),
        "date": get("date"),
        "print_run": get("print_run"),
        "typography_name": get("typography_name"),
    }

def lexical_diversity(tokens):
    """Compute lexical-diversity measures for a sequence of lemmas.

    Parameters
    ----------
    tokens : sequence of str
        Lemmas (or any hashable tokens); must support ``len``.

    Returns
    -------
    dict
        Empty dict for empty input. Otherwise, with N = number of tokens
        and V = number of distinct tokens:

        - ``N_tokens``, ``V_lemmas``: N and V
        - ``TTR``: V / N
        - ``RTTR``: V / sqrt(N)
        - ``CTTR``: V / sqrt(2N)
        - ``Herdan_C``: log V / log N
        - ``Maas_a2``: (log N - log V) / (log N)^2
        - ``Hapax``: number of tokens occurring exactly once
        - ``HapaxShare``: Hapax / V

        ``Herdan_C`` and ``Maas_a2`` are NaN when N == 1, because
        log N == 0 makes both ratios undefined.
    """
    N = len(tokens)
    if N == 0:
        return {}

    freq = Counter(tokens)
    V = len(freq)
    hapax = sum(1 for c in freq.values() if c == 1)

    logN = math.log(N)
    logV = math.log(V)

    if logN > 0:
        herdan_c = logV / logN
        maas_a2 = (logN - logV) / (logN ** 2)
    else:
        # N == 1: the log-based measures would divide by zero.
        herdan_c = maas_a2 = math.nan

    return {
        "N_tokens": N,
        "V_lemmas": V,
        "TTR": V / N,
        "RTTR": V / math.sqrt(N),
        "CTTR": V / math.sqrt(2 * N),
        "Herdan_C": herdan_c,
        "Maas_a2": maas_a2,
        "Hapax": hapax,
        "HapaxShare": hapax / V
    }

# =========================
# PROCESS CORPUS
# =========================

rows = []        # one record per leaflet: file name, metadata, metrics
all_lemmas = []  # lemmas of every leaflet, pooled for the corpus-level metrics

# Sorted so that the row order is the same on every run.
txt_files = sorted(BASE_DIR.rglob("*.txt"))
if not txt_files:
    # Message reads: "There are no .txt files in the ZIP".
    raise RuntimeError("В ZIP нет .txt файлов")

for fp in txt_files:
    # Undecodable bytes become U+FFFD instead of aborting the run.
    raw = fp.read_text(encoding="utf-8", errors="replace")

    text = extract_leaflet_text(raw)
    lemmas = lemmatize_lv(text)

    # Metadata is parsed from the full file, metrics from the body's lemmas.
    meta = parse_metadata(raw)
    metrics = lexical_diversity(lemmas)

    # A leaflet without any lemmas contributes no metric keys here, so its
    # metric columns end up as missing values in the DataFrame.
    rows.append({
        "file": fp.name,
        **meta,
        **metrics
    })

    all_lemmas.extend(lemmas)

df_docs = pd.DataFrame(rows)

# =========================
# CORPUS-LEVEL METRICS
# =========================

# The same measures computed over the pooled lemmas of all leaflets.
corpus_metrics = lexical_diversity(all_lemmas)
df_corpus = pd.DataFrame([{
    "scope": "Latvian Communist Leaflet Corpus (1934–1940)",
    "documents": len(df_docs),
    **corpus_metrics
}])

# =========================
# SAVE
# =========================

df_docs.to_csv("/content/lexdiv_by_leaflet_lemmatized.csv", index=False)
df_corpus.to_csv("/content/lexdiv_corpus_lemmatized.csv", index=False)

# Render both tables with their rich (HTML) representation; a bare tuple
# as the last expression would only show the wrapped plain-text repr.
display(df_docs.head(), df_corpus)


(                                                file  id  \
 0  revl-n001-LKP_LKJS_Vidienas_org-5000-[1934-01-...   1   
 1  revl-n002-LKP_soldiers_org-1500-[1934-01-11…].txt   2   
 2              revl-n003-SP_CK-unk-[…1934-01-21].txt   3   
 3           revl-n004a-LKP_CK-3000-[…1934-01-30].txt  4a   
 4          revl-n004b-LKP_CK-10000-[…1934-01-30].txt  4b   
 
                                                title  \
 0  LKP un LKJS Vidienas organizācijas lapiņa par ...   
 1  No LKP kareivju organizācijas lapiņas ar karei...   
 2  Latvijas Sarkanās Palīdzības CK lapiņa, kas ve...   
 3  LKP CK aicinājums bezdarbniekiem organizēties ...   
 4  LKP CK aicinājums bezdarbniekiem organizēties ...   
 
                                               author           date print_run  \
 0  LKP Vidienas organizācija un LKJS Vidienas org...  [1934-01-11…]      5000   
 1                          LKP kareivju organizācija  [1934-01-11…]      1500   
 2     Latvijas Sarkanās Palīdzības Centrā