In [1]:
# =======================
# 1) Gensim instalēšana
# =======================
!pip install gensim --quiet

# =======================
# 2) ZIP faila ielāde
# =======================
from google.colab import files
import zipfile
import re
from gensim.models import Word2Vec

# Ask the user for the archive; choose latvian_communist_leaflets_1934-1940.zip
uploaded = files.upload()
if not uploaded:
    # An empty result (presumably a cancelled dialog) would otherwise surface
    # as an uninformative IndexError on the next line.
    raise RuntimeError("No file was uploaded; re-run this cell and select the ZIP archive.")
# The upload result is keyed by file name; the first key is the archive to read.
zip_filename = next(iter(uploaded))
print("Uploaded:", zip_filename)

# =======================
# 3) Palīgfunkcijas
# =======================

def normalize_token(tok: str) -> str:
    """
    Crude stand-in for lemmatization of the two terms under study.

    Any token beginning with ``sociāldemokr`` is collapsed to
    ``sociāldemokrāti`` and any token beginning with ``fašist`` is collapsed
    to ``fašisti``. Every other token is returned unchanged. Matching is
    case-sensitive and anchored at the start of the token.
    """
    stem_to_lemma = (
        ("sociāldemokr", "sociāldemokrāti"),
        ("fašist", "fašisti"),
    )
    # The first matching stem wins; tokens matching no stem pass through.
    return next(
        (lemma for stem, lemma in stem_to_lemma if tok.startswith(stem)),
        tok,
    )

def extract_text_block(raw: str) -> str:
    """
    Return the main leaflet text of a raw file, dropping leading metadata.

    Strategy:
    1) Look for a marker line: the word ``text`` in any letter case, either
       alone on the line or followed by one or more colons (``text``,
       ``text:``, ``TEXT::``). The result is whatever follows the colons on
       that line (if anything) plus every later line. Lines that merely
       begin with "text", such as a ``text_type: ...`` metadata key, are
       not treated as the marker.
    2) If there is no marker, skip the leading run of ``key: value`` lines
       and return everything from the first line that is not of that form.
       When every line looks like ``key: value`` (or the input is empty),
       the input lines are returned unchanged.

    Args:
        raw: Full decoded contents of one leaflet file.

    Returns:
        The selected lines joined with ``"\\n"``.
    """
    lines = raw.splitlines()

    # 1) Explicit marker: "text", optionally followed by colons and inline text.
    marker_re = re.compile(r"\s*text\s*(?::+\s*(.*))?$", re.IGNORECASE)
    for i, line in enumerate(lines):
        found = marker_re.match(line)
        if found is None:
            continue
        body = lines[i + 1:]
        # Keep text that shares a line with the marker ("text: Biedri!").
        inline = (found.group(1) or "").strip()
        if inline:
            body.insert(0, inline)
        return "\n".join(body)

    # 2) No marker: start at the first line that is not "key: value"
    #    (e.g. "id: ...", "file_name: ...", "date: ...").
    metadata_re = re.compile(r"^\s*\w+\s*:")
    start = next(
        (i for i, line in enumerate(lines) if not metadata_re.match(line)),
        0,
    )
    return "\n".join(lines[start:])

# =======================
# 4) Tekstu lasīšana no ZIP
# =======================
# One token list per leaflet; this is the corpus handed to Word2Vec below.
texts = []

# Compiled once, outside the per-file loop.
ABBREV_SD_RE = re.compile(r"s\.\-d\.?", flags=re.IGNORECASE)  # "s.-d." abbreviation
TOKEN_RE = re.compile(r"\b\w+\b", flags=re.UNICODE)  # \w keeps Latvian diacritics

with zipfile.ZipFile(zip_filename, "r") as zf:
    all_names = zf.namelist()
    # Accept .txt in any letter case and ignore the "__MACOSX/" resource-fork
    # entries that macOS adds to archives (they are not leaflet texts).
    txt_names = [
        n for n in all_names
        if n.lower().endswith(".txt") and not n.startswith("__MACOSX/")
    ]
    print(f"Found {len(txt_names)} .txt files in ZIP")
    print("First 5 files:", txt_names[:5])

    for name in txt_names:
        raw_bytes = zf.read(name)
        try:
            # "utf-8-sig" also strips a leading byte-order mark, which would
            # otherwise stick to the first line and hide a marker/metadata key.
            raw = raw_bytes.decode("utf-8-sig")
        except UnicodeDecodeError:
            # Best effort for files that are not valid UTF-8: drop the bad
            # bytes, but say so instead of losing characters silently.
            print(f"Warning: {name} is not valid UTF-8; undecodable bytes were dropped")
            raw = raw_bytes.decode("utf-8-sig", errors="ignore")

        # Keep only the main leaflet text, lowercased.
        text_body = extract_text_block(raw).lower()

        # Expand the abbreviation s.-d. -> sociāldemokrāti (padded with spaces
        # so it becomes a separate token).
        text_body = ABBREV_SD_RE.sub(" sociāldemokrāti ", text_body)

        # Tokenize, then collapse inflected forms of the studied terms.
        tokens = [normalize_token(t) for t in TOKEN_RE.findall(text_body)]

        # Files that yield no tokens are left out of the corpus.
        if tokens:
            texts.append(tokens)

leaflet_count = len(texts)
print(f"Parsed {leaflet_count} leaflet texts with non-empty tokens")

if not texts:
    # The Russian part of the message reads: "need to look at the real file structure."
    raise RuntimeError("No texts parsed from ZIP — нужно посмотреть реальную структуру файлов.")

# =======================
# 5) Word2Vec apmācība
# =======================
# Skip-gram (sg=1) model over the tokenized leaflets; min_count=1 keeps every token.
# NOTE(review): gensim documents that training with more than one worker
# thread is not exactly reproducible between runs — confirm that is acceptable.
w2v_settings = dict(vector_size=100, window=5, min_count=1, sg=1, workers=4)
model = Word2Vec(sentences=texts, **w2v_settings)

# =======================
# 6) Vocabulary check
# =======================
TERMS = ("sociāldemokrāti", "fašisti")
vocab = model.wv.key_to_index

for term in TERMS:
    print(term, "in vocab:", term in vocab)

# =======================
# 7) Cosine similarity
# =======================
if not all(t in vocab for t in TERMS):
    print("\nOne of the terms is missing in the vocabulary")
else:
    similarity = model.wv.similarity(*TERMS)
    print("\nCosine similarity sociāldemokrāti \u2194 fašisti:", similarity)

# =======================
# 8) Top-10 nearest words
# =======================
for term in TERMS:
    if term not in vocab:
        print(f"\nNo vector for '{term}' in the model.")
        continue
    print(f"\nMost similar to '{term}':")
    for w, score in model.wv.most_similar(term, topn=10):
        print(f"  {w:<20} {score:.3f}")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
[?25h

Saving latvian_communist_leaflets_1934-1940.zip to latvian_communist_leaflets_1934-1940.zip
Uploaded: latvian_communist_leaflets_1934-1940.zip
Found 266 .txt files in ZIP
First 5 files: ['revl-n251-LKP_Riga_committee-unk-1940-06-19.txt', 'revl-n001-LKP_LKJS_Vidienas_org-5000-[1934-01-11…].txt', 'revl-n002-LKP_soldiers_org-1500-[1934-01-11…].txt', 'revl-n003-SP_CK-unk-[…1934-01-21].txt', 'revl-n004a-LKP_CK-3000-[…1934-01-30].txt']
Parsed 266 leaflet texts with non-empty tokens
sociāldemokrāti in vocab: True
fašisti in vocab: True

Cosine similarity sociāldemokrāti ↔ fašisti: 0.6282911

Most similar to 'sociāldemokrāti':
  bezpartejiskie       0.893
  komunistiskie        0.890
  līderi               0.886
  jūs                  0.884
  bijušie              0.883
  jūsu                 0.882
  strādnieces          0.881
  darbaļaudis          0.878
  vefa                 0.878
  sociālistiskie       0.878

Most similar to 'fašisti':
  ulmaņa               0.918
  baloža               0.8