# Visualize terms document frequency, term frequency, and relative frequency over time

## Custom translations and colors for original terms

In [1]:
# let's create a dictionary of translations for all the keys from Latvian to English
original_terms_translations = {
    "auto": "car",	
    "autobuss": "bus",
    "automobilis": "automobile",
    "divjūgs": "pair carriage",
    "divritenis": "bicycle",
    "droška": "droshky",
    "dzelzceļš": "railway",
    "fūrmanis": "hired waggoner",
    "kamanas": "sleigh",
    "kariete": "coach",
    "linijdroška": "line droshky",
    "mašīna": "machine",
    "motocikls": "motorcycle",
    "ore": "farm wagon",
    "ormanis": "horse-drawn cab",
    "pajūgs": "rig",
    "ragavas": "sledge",
    # "rati": "carriage",
    "taksometrs": "taxi",
    "tramvajs": "tram",
    "velosipēds": "velocipede",
    "vezums": "wagon",
    "važonis": "coachman",
}
# how many keys
print(f"original_terms_translations keys: {len(original_terms_translations)}")

original_terms_translations keys: 22


## Loading Libraries and showing hardware used

In [1]:
# Show Python version
import sys
print(f"Python version: {sys.version}")
from datetime import datetime
print(f"Run date: {datetime.now()}")
from pathlib import Path
import os

# Get the project root by going one level up from the current notebook directory
project_root = Path().resolve().parent
print(f"Project root: {project_root}")
# what computer are we on?
import socket
print(f"Computer name: {socket.gethostname()}")
# CPU architecture
import platform
print(f"CPU architecture: {platform.machine()}")
# CPU type
print(f"CPU type {platform.processor()}")
# CPU count
print(f"CPU count: {os.cpu_count()}")
# OS name and version
print(f"OS name: {platform.system()}")
print(f"OS version: {platform.version()}")
# memory and disk space
import psutil
print(f"Memory: {psutil.virtual_memory().total / (1024 ** 3):.2f} GB : free - {psutil.virtual_memory().available / (1024 ** 3):.2f} GB")
print(f"Swap memory: {psutil.swap_memory().total / (1024 ** 3):.2f} GB : free - {psutil.swap_memory().free / (1024 ** 3):.2f} GB")
print(f"Disk space: {psutil.disk_usage('/').total / (1024 ** 3):.2f} GB : free - {psutil.disk_usage('/').free / (1024 ** 3):.2f} GB")

# try importing the libraries we need
print("EXTERNAL libraries")

try:
    from tqdm import tqdm
    from tqdm import __version__ as tqdm_version
    print(f"tqdm version: {tqdm_version}")
except ImportError:
    print("tqdm not installed")
    print("Please install tqdm with 'pip install tqdm'")

#Pandas
try:
    import pandas as pd
    from pandas import __version__ as pandas_version
    print(f"Pandas version: {pandas_version}")
except ImportError:
    print("Pandas not installed")
    print("""Please install pandas with 'pip install "pandas[excel,parquet]"'""")

# now plotly
try:
    from plotly import express as px
    from plotly import graph_objects as go
    from plotly import __version__ as plotly_version
    print(f"Plotly version: {plotly_version}")
except ImportError:
    print("Plotly not installed")
    print("Please install plotly with 'pip install plotly'")


Python version: 3.12.7 (tags/v3.12.7:0b05ead, Oct  1 2024, 03:06:41) [MSC v.1941 64 bit (AMD64)]
Run date: 2025-05-26 11:14:47.592502
Project root: C:\Users\Valdis\Github\lnb_transports
Computer name: Val
CPU architecture: AMD64
CPU type Intel64 Family 6 Model 154 Stepping 3, GenuineIntel
CPU count: 20
OS name: Windows
OS version: 10.0.26100
Memory: 63.73 GB : free - 49.92 GB
Swap memory: 4.00 GB : free - 4.00 GB
Disk space: 951.65 GB : free - 215.47 GB
EXTERNAL libraries
tqdm version: 4.66.4
Pandas version: 2.2.2
Plotly version: 5.22.0


## Loading Main Data File

In [2]:
src = Path("../../not_repo/latsenrom_2025_05_09.parquet")

# assert src.exists()
assert src.is_file(), f"File not found: {src}"
# loading
print(f"Loading from {src}")
df = pd.read_parquet(src)
# check the dataframe
# shape
print(f"df.shape: {df.shape}")
# head
df.head()

Loading from ..\..\not_repo\latsenrom_2025_05_09.parquet
df.shape: (37605476, 17)


Unnamed: 0,deprel,form,index,lemma,parent,pos,tag,ufeats,upos,sent_ndx,author,title,dom_id,file_stem,file_stem_short,firstEdition,term
0,nmod,Mīlas,1,mīla,2.0,ncfsg_,ncfsg4,Case=Gen|Gender=Fem|Number=Sing,NOUN,0,AizsV,MilaU,1049452,AizsV_MilaU_1049452,AizsV_MilaU,1933,mīla
1,nmod,ārprāta,2,ārprāts,3.0,ncmsg_,ncmsg1,Case=Gen|Gender=Masc|Number=Sing,NOUN,0,AizsV,MilaU,1049452,AizsV_MilaU_1049452,AizsV_MilaU,1933,ārprāts
2,obl,varā,3,vara,6.0,ncfsl_,ncfsl4,Case=Loc|Gender=Fem|Number=Sing,NOUN,0,AizsV,MilaU,1049452,AizsV_MilaU_1049452,AizsV_MilaU,1933,vara
3,nmod,ROMĀNS,4,Romāns,6.0,npmsn_,npmsn1,Case=Nom|Gender=Masc|Number=Sing,PROPN,0,AizsV,MilaU,1049452,AizsV_MilaU_1049452,AizsV_MilaU,1933,Romāns
4,punct,„,5,"""",6.0,zq,zq,_,PUNCT,0,AizsV,MilaU,1049452,AizsV_MilaU_1049452,AizsV_MilaU,1933,""""


## Loading moto and horse terms

In [58]:
horse_moto_file = Path("../csv/Sauszemes-transporta-termini.csv")
# assert horse_moto_file.exists()
assert horse_moto_file.is_file(), f"File not found: {horse_moto_file}"
horse_moto_df = pd.read_csv(horse_moto_file, sep=";")
# check the dataframe
# shape
print(f"horse_moto_df.shape: {horse_moto_df.shape}")
# head
horse_moto_df.head()

horse_moto_df.shape: (160, 2)


Unnamed: 0,Word,Zirgu / Motorizēts
0,alfs romeo,m
1,atsperrati,z
2,auto,m
3,auto nāve limuzinis,m
4,autobuss,m


In [59]:
# let's change column names
# Word will be ngram, and Zirgu / Moto transporta termini will be horse_moto
horse_moto_df.rename(columns={
    "Zirgu / Motorizēts": "horse_moto",
    "Word": "ngram"
}, inplace=True)
# head now
horse_moto_df.head()

Unnamed: 0,ngram,horse_moto
0,alfs romeo,m
1,atsperrati,z
2,auto,m
3,auto nāve limuzinis,m
4,autobuss,m


In [60]:
# now let's make a function that creates tuple of terms from ngram
# the logic will be as follows we split by whitespace unless the ngram contains period then we keep it as a single term
def create_terms_tuple(ngram):
    # split by whitespace unless the ngram contains period
    if "." in ngram:
        return (ngram,)
    else:
        return tuple(ngram.split())
    
# now let's creat a new column with the terms tuple
horse_moto_df["terms_tuple"] = horse_moto_df["ngram"].apply(create_terms_tuple)
# shape
print(f"horse_moto_df.shape after adding terms_tuple: {horse_moto_df.shape}")
# head
horse_moto_df.head()

horse_moto_df.shape after adding terms_tuple: (160, 3)


Unnamed: 0,ngram,horse_moto,terms_tuple
0,alfs romeo,m,"(alfs, romeo)"
1,atsperrati,z,"(atsperrati,)"
2,auto,m,"(auto,)"
3,auto nāve limuzinis,m,"(auto, nāve, limuzinis)"
4,autobuss,m,"(autobuss,)"


In [61]:
# let's rename the column to lemma_tuple
horse_moto_df.rename(columns={"terms_tuple": "lemma_tuple"}, inplace=True)
# head now
horse_moto_df.head(20)

Unnamed: 0,ngram,horse_moto,lemma_tuple
0,alfs romeo,m,"(alfs, romeo)"
1,atsperrati,z,"(atsperrati,)"
2,auto,m,"(auto,)"
3,auto nāve limuzinis,m,"(auto, nāve, limuzinis)"
4,autobuss,m,"(autobuss,)"
5,autokārs,m,"(autokārs,)"
6,auto-limuzīna,m,"(auto-limuzīna,)"
7,automašina,m,"(automašina,)"
8,automašīna,m,"(automašīna,)"
9,automobilis,m,"(automobilis,)"


In [62]:
# now let's go through ngram tuples one by one and count how many times they appear in corpus in df dataframe
# we will use df.lemma column for that
# the extra difficulty is that lemma contains single words but we could have multiple sequential words in ngram

# our first approach will involve creating a single mega string from all lemma by joining them with whitespace
mega_string = " ".join(df["lemma"].astype(str).tolist())
# how long is the mega string?
print(f"Length of mega_string: {len(mega_string)} characters")
# how many uppercase letters?
uppercase_count = sum(1 for c in mega_string if c.isupper())
print(f"Uppercase letters in mega_string: {uppercase_count}")


Length of mega_string: 193504859 characters
Uppercase letters in mega_string: 1537949


In [63]:
# lowercase the mega string
mega_string = mega_string.lower()
# how many lowercase letters?

In [64]:
# now we can count how many times each ngram appears in the mega string
def count_ngram_in_mega_string(ngram, text):
    return text.count(ngram)

# let's test it on first ngram
first_ngram = horse_moto_df["ngram"].iloc[0]
print(f"First ngram: {first_ngram}")
# count how many times it appears in mega_string
count = count_ngram_in_mega_string(first_ngram, mega_string)
print(f"Count of first ngram '{first_ngram}' in mega_string: {count}")


First ngram: alfs romeo
Count of first ngram 'alfs romeo' in mega_string: 4


In [65]:
# how about 4 more ngrams?
for i in range(1, 5):
    ngram = horse_moto_df["ngram"].iloc[i]
    count = count_ngram_in_mega_string(ngram, mega_string)
    print(f"Count of ngram '{ngram}' in mega_string: {count}")

Count of ngram 'atsperrati' in mega_string: 13
Count of ngram 'auto' in mega_string: 5557
Count of ngram 'auto nāve limuzinis' in mega_string: 1
Count of ngram 'autobuss' in mega_string: 230


In [66]:
# let's find 20 characters before and after each instance of term "romeo"
def find_term_context(term, text, context_length=20):
    indices = []
    start = 0
    while True:
        start = text.find(term, start)
        if start == -1:
            break
        indices.append(start)
        start += len(term)  # move past the current term
    contexts = []
    for index in indices:
        start_index = max(0, index - context_length)
        end_index = min(len(text), index + len(term) + context_length)
        contexts.append(text[start_index:end_index])
    return contexts

# let's test it on romeo
term = "Romeo"
contexts = find_term_context(term, mega_string)
# how many contexts we found?
print(f"Found {len(contexts)} contexts for term '{term}'")
# print the first 5 contexts
for i, context in enumerate(contexts[:5]):
    print(f"Context {i+1}: {context}")

Found 0 contexts for term 'Romeo'


In [67]:
# let's create a new column ngram_count in horse_moto_df that will contain the count of each ngram in mega_string
horse_moto_df["ngram_count"] = horse_moto_df["ngram"].apply(lambda x: count_ngram_in_mega_string(x.lower(), mega_string))
# show the first 20 rows
horse_moto_df.head(20)

Unnamed: 0,ngram,horse_moto,lemma_tuple,ngram_count
0,alfs romeo,m,"(alfs, romeo)",4
1,atsperrati,z,"(atsperrati,)",13
2,auto,m,"(auto,)",5557
3,auto nāve limuzinis,m,"(auto, nāve, limuzinis)",1
4,autobuss,m,"(autobuss,)",230
5,autokārs,m,"(autokārs,)",9
6,auto-limuzīna,m,"(auto-limuzīna,)",1
7,automašina,m,"(automašina,)",3
8,automašīna,m,"(automašīna,)",38
9,automobilis,m,"(automobilis,)",1316


In [68]:
# let's save the horse_moto_df to xlsx file
output_file = Path("../xlsx/sauszemes_transporta_termini.xlsx")
# save to xlsx
horse_moto_df.to_excel(output_file, index=False)
# show the output file path
print(f"Saved horse_moto_df to {output_file}")

Saved horse_moto_df to ..\xlsx\sauszemes_transporta_termini.xlsx


In [69]:
# show which ngrams have count 0
zero_count_df = horse_moto_df[horse_moto_df["ngram_count"] == 0]
# shape of zero_count_df
print(f"zero_count_df shape: {zero_count_df.shape}")
# show the first 20 rows of zero_count_df
zero_count_df.head(20)

zero_count_df shape: (10, 4)


Unnamed: 0,ngram,horse_moto,lemma_tuple,ngram_count
15,autovāģis,m,"(autovāģis,)",0
51,dzelzsceļš rats,m,"(dzelzsceļš, rats)",0
61,essex roadster,m,"(essex, roadster)",0
67,fūrmaņs kamanas,z,"(fūrmaņs, kamanas)",0
68,goda-rati,z,"(goda-rati,)",0
69,HarleysDavidsons,m,"(HarleysDavidsons,)",0
98,motorrats,m,"(motorrats,)",0
116,redelains ore,z,"(redelains, ore)",0
121,Roll-Rois,m,"(Roll-Rois,)",0
123,sanitars automobilis,m,"(sanitars, automobilis)",0


In [39]:
"šībervilcien" in mega_string
# find contexts for "šībervilcien"
contexts = find_term_context("šībervilcien", mega_string, context_length=50)
# print the first 5 contexts
for i, context in enumerate(contexts[:5]):
    print(f"Context {i+1}: {context}")


Context 1:  . tamdēļ vilciens iet stunda un seši minūte . ār šībervilciena tā pats gabals varēt nobraukt četrdesmit seši mi


In [41]:
# let's save zero_count_df to xlsx file
zero_count_output_file = Path("../xlsx/sauszemes_transporta_termini_zero_count.xlsx")   
zero_count_df.to_excel(zero_count_output_file, index=False)

In [None]:
# # let's check kurvjrat
# kurvjrat = "kurvjrat"
# # find contexts for "kurvjrat"
# contexts = find_term_context(kurvjrat, mega_string, context_length=50)
# # how many contexts we found?
# print(f"Found {len(contexts)} contexts for term '{kurvjrat}'")
# # print the first 5 contexts
# for i, context in enumerate(contexts[:5]):
#     print(f"Context {i+1}: {context}")

Found 7 contexts for term 'kurvjrat'
Context 1: bet liels bailes gan būt , jo turpat pie dzeltēns kurvjrats sta - vējš arī pats bārdiņš . laiks , atvest pac
Context 2: ta pie slita stāvēt sirms zirgs , sajūgt dzeltens kurvjrats . tas šķist redzēt . pārcilāt pēdējais diena atm
Context 3: jš riksis , sirms zirgs vilkt , nobraukt dzeltēns kurvjrats . līt un reinis viņš pazīt . " zemdega braukt ..
Context 4: varēt un apkaunot , viņš atraisīt zirgs un ielēkt kurvjrats . pātaga vairākas reize ķert sirmis mugura . tas
Context 5: ņot prāvnieks . priekša rikšot zemdega šķimelis . kurvjrats sēdēt augusts un šādums . gabaliņš aiz tas sīki 


In [None]:
# let's create a function that will take a term and mega_string and return contexts
# it will also print how many contexts were found and by default will print the first 5 contexts
def print_term_contexts(term, text, context_length=50, max_print=5):
    contexts = find_term_context(term, text, context_length)
    print(f"Found {len(contexts)} contexts for term '{term}'")
    for i, context in enumerate(contexts[:max_print]):
        print(f"Context {i+1}: {context}")
# let's test it on "kurvjrat"
#print_term_contexts("autovāģ", mega_string, context_length=50, max_print=5)

Found 0 contexts for term 'autovāģ'


In [None]:
# now ātrs palīdzība rat
#print_term_contexts("ātrs palīdzība rat", mega_string, context_length=50, max_print=5)

Found 7 contexts for term 'ātrs palīdzība rat'
Context 1: iņa personība . bet neviens viņš pazīt . atbraukt ātrs palīdzība rats un ierasties ārsts . ) kk " viņa būt mirt . > te
Context 2: ugunsdzēsējs , tuvējs bekinghema pils sar - dzi , ātrs palīdzība rats , vairākus veterina - rus un četri galvenā konfe
Context 3: : viņš aizvest uz slimnīca . bet kamēr , atbraukt ātrs palīdzība rats , es pamanīt viņš mētelis iekškabata jūs grāmata
Context 4: ka blakša gandrīz zaudēt samaņa . krišiņš izsaukt ātrs palīdzība rats un aiz - vest sieva uz slimnīca . tas pats ārsts
Context 5: s . lūdzu , priekšnieks kungs , kas mēs darīt ? - ātrs palīdzība rats ! sasodīta būšana ! iznākt nepatikšanas . varēt 


In [None]:
# now bagažas vilcien
#print_term_contexts("bagaža vilcien", mega_string, context_length=50, max_print=5)

Found 1 contexts for term 'bagaža vilcien'
Context 1: laist uz stacija pēc manta , kas nakts pienākt ar bagaža vilciens , varbūt pats braukt pārvest ? tikai dzirdēt par


In [None]:
# now goda-rat
#print_term_contexts("gods rats", mega_string, context_length=50, max_print=5)

Found 50 contexts for term 'gods rats'
Context 1: as izskatīties . un kad putra stāstīt , kāds viņš gods rats beigas izskatīšoties , tad būt arī kas klausīties
Context 2: s uz buka pat tad , ja viens zirgs un dažreiz arī gods rats būt palienēt no kaimiņš . par polis saukties arī 
Context 3: cauri liels muiša , vins panākt silarajs , braukt gods rats un ar labako sirgt ilkss . padot labdiena un sirg
Context 4: diesin kas silarajs domats , ja redsets rute fēšs gods rats blakus tāds brasfhs jauns puifim . . . pa gatuve 
Context 5: a patraukls labaks sirgt jau stāvēt iejūgt viegls gods rats . drīt abi sēdas tanẽs iekš un pa garš , nelīdsen
