In [2]:
# ============================================================
# Collocations (PMI, t-score) around target words (no lemmatization)
# Input: a ZIP with .txt files (each contains a 'text:' or 'text::' section)
# Output: CSVs with top collocates and a ZIP to download
# ============================================================

import io
import math
import os
import re
import shutil
import unicodedata
import zipfile
from collections import Counter

import pandas as pd
from google.colab import files
from IPython.display import display

# ---- 1) Upload ZIP ----
print("⬆️ Choose a ZIP containing .txt leaflets (each file has a 'text:' section).")
uploaded = files.upload()
# The mapping may be empty (e.g. the upload dialog was dismissed); fail with a
# clear message instead of an IndexError on the first-key lookup.
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run the cell and choose a ZIP archive.")
# Only the first uploaded file is used as the corpus archive.
zip_name = next(iter(uploaded))

extract_dir = "/content/leaflets"
# Start from an empty directory so leaflets extracted by an earlier run
# (possibly from a different ZIP) are not silently mixed into this corpus.
shutil.rmtree(extract_dir, ignore_errors=True)
os.makedirs(extract_dir, exist_ok=True)
# The upload is held in memory as bytes, so wrap it in BytesIO for zipfile.
with zipfile.ZipFile(io.BytesIO(uploaded[zip_name]), 'r') as z:
    z.extractall(extract_dir)
print(f"Extracted to: {extract_dir}")

# ---- 2) Parameters ----
WINDOW = 5         # tokens inspected on each side of a target occurrence
MIN_COOC = 3       # pairs observed fewer times than this are discarded
MIN_FREQ_W = 5     # neighbours rarer than this in the whole corpus are discarded
MIN_TOKEN_LEN = 2  # tokens shorter than this are treated as noise

# Targets: display name -> {"forms": set of lowercased surface forms}.
# Inflected forms are enumerated by hand because no lemmatization is applied.
TARGETS = {
    "fasisms": {
        "forms": {
            "fašisms", "fašisma", "fašismu", "fašismā",
            "fašistu", "fašistam", "fašistiem",
            "fašistisks", "fašistiska", "fašistiski", "fašistiskie",
            "fašistiskos", "fašistiskā", "fašistisko", "fašistiskās",
            "fašistiskums",
        },
    },
    "Ulmanis": {
        "forms": {
            "ulmanis", "ulmani", "ulmaņa", "ulmanim", "ulmanī",
            "ulmanītis", "ulmanītim", "ulmanīti",
        },
    },
}

# Small Latvian stopword list, split on whitespace; repeated entries collapse
# in the set. Extend as needed.
STOPWORDS = set("""
un bet vai ka kā par ar no uz pie lai tas tā šis šie šīs šo šajā tad arī gan vien
tiek tika būs būtu bijis biju jūs mēs viņi viņš viņa tās tie to tam tajā tās
es man mani manis mums jums viņiem viņam viņas viņu tās tās
nav ir bija bijām bijāt esam esat būt
""".split())

# Token pattern: maximal runs of ASCII letters, digits and Latvian diacritic letters.
TOKEN_RE = re.compile(r"[0-9A-Za-zĀĒĪŪŌĢĶĻŅČŖŠŪŽāēīūōģķļņčŗšūžĻŅĀĒĪŪČĢĶŠŽŖ]+", re.UNICODE)

def read_leaflet_text(path: str) -> str:
    """Return the leaflet body that follows the first 'text:' / 'text::' label.

    The file is read as UTF-8; undecodable bytes are dropped. The label is
    matched case-insensitively as a whole word, with optional whitespace
    before and after the colon(s).

    Args:
        path: Path of the .txt file to read.

    Returns:
        The stripped text after the label, or the whole file content
        (unstripped) when no label is present.
    """
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        content = f.read()
    # ':+' consumes every colon of the label, so the 'text::' variant does not
    # leave a stray ':' at the start of the returned body.
    m = re.search(r"\btext\s*:+\s*", content, flags=re.IGNORECASE)
    return content[m.end():].strip() if m else content

def normalize_text(txt: str) -> str:
    """Compose the text to Unicode NFC form and return it lowercased."""
    return unicodedata.normalize('NFC', txt).lower()

def tokenize(text: str):
    """Split text into tokens with TOKEN_RE and discard noisy ones.

    A token is kept only if it is purely alphabetic (so anything containing a
    digit, e.g. '1934', is dropped) and at least MIN_TOKEN_LEN characters long.
    Returns the kept tokens as a list, in order of appearance.
    """
    kept = []
    for match in TOKEN_RE.finditer(text):
        candidate = match.group()
        if len(candidate) < MIN_TOKEN_LEN or not candidate.isalpha():
            continue
        kept.append(candidate)
    return kept

# ---- 3) Read and tokenize all documents ----
docs_tokens = []
for root, dirs, files_in in os.walk(extract_dir):
    # Archives created by macOS Finder carry a '__MACOSX' folder of metadata;
    # prune it in place so os.walk never descends into it. Sorting makes the
    # traversal (and therefore document order) reproducible across runs.
    dirs[:] = sorted(d for d in dirs if d != "__MACOSX")
    for fn in sorted(files_in):
        # '._name.txt' AppleDouble companions end in .txt but hold binary
        # metadata, not leaflet text, so they must not count as documents.
        if fn.startswith("._") or not fn.lower().endswith(".txt"):
            continue
        p = os.path.join(root, fn)
        t = read_leaflet_text(p)
        if t.strip():
            toks = tokenize(normalize_text(t))
            # Documents that yield no usable tokens are left out of the corpus.
            if toks:
                docs_tokens.append(toks)

print(f"Documents found: {len(docs_tokens)}")

# Flat frequency over the whole corpus
flat = [w for doc in docs_tokens for w in doc]
N_TOKENS = len(flat)
freq_w = Counter(flat)
print(f"Tokens: {N_TOKENS:,}, unique types: {len(freq_w):,}")

# ---- 4) Collocations for each target ----
def collocations_for_forms(target_forms, docs_tokens, window=5):
    """Tally the neighbours of every target occurrence within ±window tokens.

    Args:
        target_forms: Iterable of surface forms that count as the target.
        docs_tokens: Iterable of token lists, one per document; windows never
            cross document boundaries.
        window: Number of tokens inspected on each side of an occurrence.

    Returns:
        (cooc, f_target): a Counter mapping neighbour word -> number of times
        it appeared inside a window, and the total count of target occurrences.
        Other target forms, stopwords, tokens not fully matching TOKEN_RE,
        non-alphabetic tokens and tokens shorter than MIN_TOKEN_LEN are not
        counted as neighbours.
    """
    forms = set(target_forms)

    def _is_neighbour(word):
        # Same checks, in the same order, as the tokenizer-level filters;
        # repeated here so the function is robust to unfiltered input.
        if word in forms or word in STOPWORDS or not TOKEN_RE.fullmatch(word):
            return False
        return word.isalpha() and len(word) >= MIN_TOKEN_LEN

    cooc = Counter()
    f_target = 0
    for doc in docs_tokens:
        doc_len = len(doc)
        for pos, token in enumerate(doc):
            if token not in forms:
                continue
            f_target += 1
            start = max(0, pos - window)
            stop = min(doc_len, pos + window + 1)
            # Left context first, then right context; the target itself is skipped.
            context = doc[start:pos] + doc[pos + 1:stop]
            cooc.update(word for word in context if _is_neighbour(word))
    return cooc, f_target

def compute_stats_table(cooc, f_target, N_tokens, freq_w, window=5,
                        min_cooc=3, min_freq_w=5):
    """
    Build the collocation table with expected counts, t-score and PMI (log2).

    For each neighbour w with observed count O and corpus frequency f_w:
      E       = f_w * f_target * (2*window) / N_tokens   (independence assumption)
      t-score = (O - E) / sqrt(O)
      PMI     = log2( O * N_tokens / (f_w * f_target * 2*window) )

    Neighbours with O < min_cooc or f_w < min_freq_w are left out.

    Returns a tuple (df, df_t, df_p): the table in cooc iteration order, the
    same table sorted by t-score (ties by O_obs) and sorted by PMI (ties by
    O_obs), all descending. Columns: word, O_obs, E_exp, t_score, PMI, freq_w.
    """
    span = 2 * window
    safe_total = max(N_tokens, 1)  # avoids division by zero for an empty corpus
    records = []
    for word, observed in cooc.items():
        word_freq = freq_w.get(word, 0)
        if observed >= min_cooc and word_freq >= min_freq_w:
            joint = word_freq * f_target * span
            expected = joint / safe_total
            if observed > 0:
                t_value = (observed - expected) / (observed ** 0.5)
            else:
                t_value = 0.0
            if joint > 0 and observed > 0:
                pmi_value = math.log2((observed * N_tokens) / joint)
            else:
                pmi_value = float('-inf')
            records.append((word, observed, expected, t_value, pmi_value, word_freq))

    columns = ["word","O_obs","E_exp","t_score","PMI","freq_w"]
    table = pd.DataFrame(records, columns=columns)
    descending = [False, False]
    ranked_by_t = table.sort_values(["t_score","O_obs"], ascending=descending)
    ranked_by_pmi = table.sort_values(["PMI","O_obs"], ascending=descending)
    return table, ranked_by_t, ranked_by_pmi

# Run the collocation count and statistics for each configured target,
# keeping the three table views plus the target's occurrence count.
results = {}
for tgt_name in TARGETS:
    cfg = TARGETS[tgt_name]
    cooc, f_tgt = collocations_for_forms(cfg["forms"], docs_tokens, window=WINDOW)
    df, df_t, df_pmi = compute_stats_table(
        cooc, f_tgt, N_TOKENS, freq_w,
        window=WINDOW, min_cooc=MIN_COOC, min_freq_w=MIN_FREQ_W,
    )
    results[tgt_name] = dict(raw=df, by_t=df_t, by_pmi=df_pmi, f_target=f_tgt)
    print(f"\n{tgt_name}: target occurrences = {f_tgt}, kept candidates = {len(df)}")

# ---- 5) Save and preview ----
out_dir = "/content/collocations_out"
os.makedirs(out_dir, exist_ok=True)

# Per target: the full table plus the 200 best rows of each ranking.
for name, tabs in results.items():
    top_by_t = tabs["by_t"].head(200)
    top_by_pmi = tabs["by_pmi"].head(200)
    tabs["raw"].to_csv(f"{out_dir}/collocations_{name}_raw.csv", index=False)
    top_by_t.to_csv(f"{out_dir}/collocations_{name}_top_tscore.csv", index=False)
    top_by_pmi.to_csv(f"{out_dir}/collocations_{name}_top_pmi.csv", index=False)

print(f"\nCSVs saved to {out_dir}")

# Inline preview: first 50 rows of each ranking, re-indexed from 0.
for name, tabs in results.items():
    preview_t = tabs["by_t"].head(50).reset_index(drop=True)
    preview_pmi = tabs["by_pmi"].head(50).reset_index(drop=True)
    print(f"\n=== {name}: TOP-50 by t-score (window ±{WINDOW}) ===")
    display(preview_t)
    print(f"\n=== {name}: TOP-50 by PMI (window ±{WINDOW}) ===")
    display(preview_pmi)

# Pack every file in the output directory into one flat ZIP for download.
out_zip = "/content/collocations_results.zip"
with zipfile.ZipFile(out_zip, 'w', compression=zipfile.ZIP_DEFLATED) as z:
    for fn in os.listdir(out_dir):
        src_path = os.path.join(out_dir, fn)
        z.write(src_path, arcname=fn)

print("\n⬇️ Download the results ZIP below.")
files.download(out_zip)


⬆️ Choose a ZIP containing .txt leaflets (each file has a 'text:' section).


Saving latvian_communist_leaflets_1934-1935-partly-1936.zip to latvian_communist_leaflets_1934-1935-partly-1936.zip
Extracted to: /content/leaflets
Documents found: 197
Tokens: 114,131, unique types: 17,602

fasisms: target occurrences = 1466, kept candidates = 850

Ulmanis: target occurrences = 558, kept candidates = 302

CSVs saved to /content/collocations_out

=== fasisms: TOP-50 by t-score (window ±5) ===


Unnamed: 0,word,O_obs,E_exp,t_score,PMI,freq_w
0,pret,439,167.240452,12.970376,1.392297,1302
1,ulmaņa,201,47.39764,10.834275,2.084308,369
2,nost,145,35.323444,9.108142,2.037355,275
3,valdība,106,30.442386,7.338804,1.799911,237
4,karu,99,27.359613,7.20013,1.855381,213
5,diktatūru,69,14.000929,6.62111,2.301074,109
6,valdību,64,12.844889,6.394389,2.316878,100
7,diktatūras,38,6.422444,5.122556,2.564805,50
8,valdības,50,16.056111,4.800391,1.638806,125
9,teroru,35,10.275911,4.179134,1.768089,80



=== fasisms: TOP-50 by PMI (window ±5) ===


Unnamed: 0,word,O_obs,E_exp,t_score,PMI,freq_w
0,uzbrūkošo,8,0.899142,2.510532,3.153379,7
1,komisāriem,6,0.770693,2.134856,2.960734,6
2,despotiju,6,0.770693,2.134856,2.960734,6
3,nodibināšanās,5,0.642244,1.948848,2.960734,5
4,ungārijas,5,0.642244,1.948848,2.960734,5
5,aģentūru,5,0.642244,1.948848,2.960734,5
6,zvērībām,8,1.15604,2.419705,2.790809,9
7,boikotēsim,6,0.899142,2.082416,2.738341,7
8,reakcija,5,0.770693,1.891403,2.697699,6
9,īsto,5,0.770693,1.891403,2.697699,6



=== Ulmanis: TOP-50 by t-score (window ±5) ===


Unnamed: 0,word,O_obs,E_exp,t_score,PMI,freq_w
0,valdība,90,11.587211,8.265434,2.957392,237
1,fašistisko,81,11.098299,7.766856,2.867583,227
2,nost,83,13.445076,7.634645,2.626033,275
3,valdību,59,4.889119,7.044637,3.593069,100
4,fašistiskā,62,8.751522,6.762563,2.824662,179
5,hitlera,33,5.231357,4.8339,2.657209,107
6,latvijā,38,8.898196,4.720936,2.094415,182
7,diktatūru,30,5.329139,4.504262,2.492988,109
8,valdības,31,6.111398,4.470125,2.342694,125
9,baloža,22,1.124497,4.450672,4.290151,23



=== Ulmanis: TOP-50 by PMI (window ±5) ===


Unnamed: 0,word,O_obs,E_exp,t_score,PMI,freq_w
0,aģentūru,5,0.244456,2.126744,4.354282,5
1,baloža,22,1.124497,4.450672,4.290151,23
2,rokaspuiši,7,0.488912,2.46096,3.839709,10
3,aģents,8,0.586694,2.620999,3.769319,12
4,tirāniju,4,0.293347,1.853326,3.769319,6
5,kalps,4,0.293347,1.853326,3.769319,6
6,pakalpīgā,3,0.244456,1.590914,3.617316,5
7,avantūrām,3,0.244456,1.590914,3.617316,5
8,valdību,59,4.889119,7.044637,3.593069,100
9,kliķe,8,0.782259,2.551857,3.354282,16



⬇️ Download the results ZIP below.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>