# Visualize terms document frequency, term frequency, and relative frequency over time

## Custom translations and colors for original terms

In [1]:
# let's create a dictionary of translations for all the keys from Latvian to English
# original_terms_translations = {
#     "auto": "car",	
#     "autobuss": "bus",
#     "automobilis": "automobile",
#     "divjūgs": "pair carriage",
#     "divritenis": "bicycle",
#     "droška": "droshky",
#     "dzelzceļš": "railway",
#     "fūrmanis": "hired waggoner",
#     "kamanas": "sleigh",
#     "kariete": "coach",
#     "linijdroška": "line droshky",
#     "mašīna": "machine",
#     "motocikls": "motorcycle",
#     "ore": "farm wagon",
#     "ormanis": "horse-drawn cab",
#     "pajūgs": "rig",
#     "ragavas": "sledge",
#     # "rati": "carriage",
#     "taksometrs": "taxi",
#     "tramvajs": "tram",
#     "velosipēds": "velocipede",
#     "vezums": "wagon",
#     "važonis": "coachman",
# }
# # how many keys
# print(f"original_terms_translations keys: {len(original_terms_translations)}")

## Loading Libraries and showing hardware used

In [2]:
# Record the interpreter, run time, host and hardware in the cell output, then
# import the third-party libraries the rest of the notebook relies on.
# Show Python version
import sys
print(f"Python version: {sys.version}")
from datetime import datetime
print(f"Run date: {datetime.now()}")  # datetime.now() is naive local time (no timezone attached)
from pathlib import Path
import os
import re

# Get the project root by going one level up from the current notebook directory
# (Path() is the current working directory, so this assumes the kernel was started
# in the notebook's own folder - TODO confirm for other launch setups)
project_root = Path().resolve().parent
print(f"Project root: {project_root}")
# what computer are we on?
import socket
print(f"Computer name: {socket.gethostname()}")
# CPU architecture
import platform
print(f"CPU architecture: {platform.machine()}")
# CPU type
print(f"CPU type {platform.processor()}")
# CPU count
print(f"CPU count: {os.cpu_count()}")
# OS name and version
print(f"OS name: {platform.system()}")
print(f"OS version: {platform.version()}")
# memory and disk space
# NOTE: psutil is third-party but, unlike the libraries further down, it is imported
# without a try/except guard, so a missing psutil stops this cell with ImportError.
import psutil
# psutil reports byte counts; dividing by 1024 ** 3 converts them to the values printed as "GB"
print(f"Memory: {psutil.virtual_memory().total / (1024 ** 3):.2f} GB : free - {psutil.virtual_memory().available / (1024 ** 3):.2f} GB")
print(f"Swap memory: {psutil.swap_memory().total / (1024 ** 3):.2f} GB : free - {psutil.swap_memory().free / (1024 ** 3):.2f} GB")
# only the filesystem that contains the path '/' is reported
print(f"Disk space: {psutil.disk_usage('/').total / (1024 ** 3):.2f} GB : free - {psutil.disk_usage('/').free / (1024 ** 3):.2f} GB")

# try importing the libraries we need
# NOTE: a failed import below only prints an install hint and execution continues,
# so later cells that use the missing name will fail with NameError instead.
print("EXTERNAL libraries")

# tqdm (progress bars)
try:
    from tqdm import tqdm
    from tqdm import __version__ as tqdm_version
    print(f"tqdm version: {tqdm_version}")
except ImportError:
    print("tqdm not installed")
    print("Please install tqdm with 'pip install tqdm'")

# pandas (imported as pd; used for all dataframe work in this notebook)
try:
    import pandas as pd
    from pandas import __version__ as pandas_version
    print(f"Pandas version: {pandas_version}")
except ImportError:
    print("Pandas not installed")
    print("""Please install pandas with 'pip install "pandas[excel,parquet]"'""")

# now plotly (express as px, graph_objects as go)
try:
    from plotly import express as px
    from plotly import graph_objects as go
    from plotly import __version__ as plotly_version
    print(f"Plotly version: {plotly_version}")
except ImportError:
    print("Plotly not installed")
    print("Please install plotly with 'pip install plotly'")


Python version: 3.12.6 (tags/v3.12.6:a4a2d2b, Sep  6 2024, 20:11:23) [MSC v.1940 64 bit (AMD64)]
Run date: 2025-05-28 10:33:50.751087
Project root: C:\Users\vsaules\Github\lnb_transports
Computer name: 11P00694
CPU architecture: AMD64
CPU type Intel64 Family 6 Model 60 Stepping 3, GenuineIntel
CPU count: 8
OS name: Windows
OS version: 10.0.19045
Memory: 31.80 GB : free - 23.47 GB
Swap memory: 4.75 GB : free - 4.04 GB
Disk space: 222.96 GB : free - 54.12 GB
EXTERNAL libraries
tqdm version: 4.66.2
Pandas version: 2.2.1
Plotly version: 5.19.0


## Loading Main Data File

In [3]:
# src = Path("../../not_repo/latsenrom_2025_05_09.parquet")

# # assert src.exists()
# assert src.is_file(), f"File not found: {src}"
# # loading
# print(f"Loading from {src}")
# df = pd.read_parquet(src)
# # check the dataframe
# # shape
# print(f"df.shape: {df.shape}")
# # head
# df.head()

## Loading moto and horse terms

In [4]:
# horse_moto_file = Path("../csv/Sauszemes-transporta-termini.csv")
# # assert horse_moto_file.exists()
# assert horse_moto_file.is_file(), f"File not found: {horse_moto_file}"
# horse_moto_df = pd.read_csv(horse_moto_file, sep=";")
# # check the dataframe
# # shape
# print(f"horse_moto_df.shape: {horse_moto_df.shape}")
# # head
# horse_moto_df.head()

In [5]:
# let's change column names
# Word will be ngram, and Zirgu / Moto transporta termini will be horse_moto
# horse_moto_df.rename(columns={
#     "Zirgu / Motorizēts": "horse_moto",
#     "Word": "ngram"
# }, inplace=True)
# # head now
# horse_moto_df.head()

In [6]:
# now let's make a function that creates tuple of terms from ngram
# the logic will be as follows we split by whitespace unless the ngram contains period then we keep it as a single term
# def create_terms_tuple(ngram):
#     # split by whitespace unless the ngram contains period
#     if "." in ngram:
#         return (ngram,)
#     else:
#         return tuple(ngram.split())
    
# # now let's creat a new column with the terms tuple
# horse_moto_df["terms_tuple"] = horse_moto_df["ngram"].apply(create_terms_tuple)
# # shape
# print(f"horse_moto_df.shape after adding terms_tuple: {horse_moto_df.shape}")
# # head
# horse_moto_df.head()

In [7]:
# let's rename the column to lemma_tuple
# horse_moto_df.rename(columns={"terms_tuple": "lemma_tuple"}, inplace=True)
# # head now
# horse_moto_df.head(20)

## Cleaning out zero-occurrence terms

Because the NoSketch tagger for Vert and the NLP-PIPE tagger produce different output, some terms differ between them and have to be cleaned up semi-manually.

In [8]:
# now let's go through ngram tuples one by one and count how many times they appear in corpus in df dataframe
# we will use df.lemma column for that
# the extra difficulty is that lemma contains single words but we could have multiple sequential words in ngram

# our first approach will involve creating a single mega string from all lemma by joining them with whitespace
# mega_string = " ".join(df["lemma"].astype(str).tolist())
# # how long is the mega string?
# print(f"Length of mega_string: {len(mega_string)} characters")
# # how many uppercase letters?
# uppercase_count = sum(1 for c in mega_string if c.isupper())
# print(f"Uppercase letters in mega_string: {uppercase_count}")


In [9]:
# lowercase the mega string
# mega_string = mega_string.lower()
# how many lowercase letters?

In [10]:
# now we can count how many times each ngram appears in the mega string
# def count_ngram_in_mega_string(ngram, text):
#     return text.count(ngram)

# # let's test it on first ngram
# first_ngram = horse_moto_df["ngram"].iloc[0]
# print(f"First ngram: {first_ngram}")
# # count how many times it appears in mega_string
# count = count_ngram_in_mega_string(first_ngram, mega_string)
# print(f"Count of first ngram '{first_ngram}' in mega_string: {count}")


In [11]:
# # how about 4 more ngrams?
# for i in range(1, 5):
#     ngram = horse_moto_df["ngram"].iloc[i]
#     count = count_ngram_in_mega_string(ngram, mega_string)
#     print(f"Count of ngram '{ngram}' in mega_string: {count}")

In [12]:
# let's find 20 characters before and after each instance of term "romeo"
# def find_term_context(term, text, context_length=20):
#     indices = []
#     start = 0
#     while True:
#         start = text.find(term, start)
#         if start == -1:
#             break
#         indices.append(start)
#         start += len(term)  # move past the current term
#     contexts = []
#     for index in indices:
#         start_index = max(0, index - context_length)
#         end_index = min(len(text), index + len(term) + context_length)
#         contexts.append(text[start_index:end_index])
#     return contexts

# # let's test it on romeo
# term = "Romeo"
# contexts = find_term_context(term, mega_string)
# # how many contexts we found?
# print(f"Found {len(contexts)} contexts for term '{term}'")
# # print the first 5 contexts
# for i, context in enumerate(contexts[:5]):
#     print(f"Context {i+1}: {context}")

In [13]:
# let's create a new column ngram_count in horse_moto_df that will contain the count of each ngram in mega_string
# horse_moto_df["ngram_count"] = horse_moto_df["ngram"].apply(lambda x: count_ngram_in_mega_string(x.lower(), mega_string))
# # show the first 20 rows
# horse_moto_df.head(20)

In [14]:
# let's save the horse_moto_df to xlsx file
# output_file = Path("../xlsx/sauszemes_transporta_termini.xlsx")
# # save to xlsx
# horse_moto_df.to_excel(output_file, index=False)
# # show the output file path
# print(f"Saved horse_moto_df to {output_file}")

In [15]:
# show which ngrams have count 0
# zero_count_df = horse_moto_df[horse_moto_df["ngram_count"] == 0]
# # shape of zero_count_df
# print(f"zero_count_df shape: {zero_count_df.shape}")
# # show the first 20 rows of zero_count_df
# zero_count_df.head(20)

In [16]:
# "šībervilcien" in mega_string
# # find contexts for "šībervilcien"
# contexts = find_term_context("šībervilcien", mega_string, context_length=50)
# # print the first 5 contexts
# for i, context in enumerate(contexts[:5]):
#     print(f"Context {i+1}: {context}")


In [17]:
# let's save zero_count_df to xlsx file
# zero_count_output_file = Path("../xlsx/sauszemes_transporta_termini_zero_count.xlsx")   
# zero_count_df.to_excel(zero_count_output_file, index=False)

In [18]:
# # let's check kurvjrat
# kurvjrat = "kurvjrat"
# # find contexts for "kurvjrat"
# contexts = find_term_context(kurvjrat, mega_string, context_length=50)
# # how many contexts we found?
# print(f"Found {len(contexts)} contexts for term '{kurvjrat}'")
# # print the first 5 contexts
# for i, context in enumerate(contexts[:5]):
#     print(f"Context {i+1}: {context}")

In [19]:
# let's create a function that will take a term and mega_string and return contexts
# it will also print how many contexts were found and by default will print the first 5 contexts
# def print_term_contexts(term, text, context_length=50, max_print=5):
#     contexts = find_term_context(term, text, context_length)
#     print(f"Found {len(contexts)} contexts for term '{term}'")
#     for i, context in enumerate(contexts[:max_print]):
#         print(f"Context {i+1}: {context}")
# let's test it on "kurvjrat"
#print_term_contexts("autovāģ", mega_string, context_length=50, max_print=5)

In [20]:
# now ātrs palīdzība rat
#print_term_contexts("ātrs palīdzība rat", mega_string, context_length=50, max_print=5)

In [21]:
# now bagažas vilcien
#print_term_contexts("bagaža vilcien", mega_string, context_length=50, max_print=5)

In [22]:
# now goda-rat
#print_term_contexts("gods rats", mega_string, context_length=50, max_print=5)

In [23]:
# zero_ngrams = zero_count_df["ngram"].tolist()
# print(f"Number of zero count ngrams: {len(zero_ngrams)}")
# print(f"Those are: {zero_ngrams}")

In [24]:
# let's find dzelzsceļs rati
# print_term_contexts("dzelzceļš rats", mega_string, context_length=50, max_print=15)

In [25]:
# # now essex
# print_term_contexts("essex", mega_string, context_length=50, max_print=15)
# # roadster
# print_term_contexts("roadster", mega_string, context_length=50, max_print=15)

In [26]:
# now kamanas
# print_term_contexts("kamanas", mega_string, context_length=50, max_print=15)

In [27]:
# let's use regex to find kamanas where previous word starts with f
# import re
# def find_kamanas_with_previous_f(text):
#     pattern = r'f\S+\s+kamanas'
#     matches = re.findall(pattern, text)
#     return matches  
# # let's find kamanas with previous word starting with f
# matches = find_kamanas_with_previous_f(mega_string)
# print(f"Found {len(matches)} matches for 'kamanas' with previous word starting with 'f'")
# # print all matches
# for i, match in enumerate(matches):
#     print(f"Match {i+1}: {match}")

In [28]:
# let's make a generic function that will return all matches given text, needle and previous word pattern
# def find_matches_with_previous_word(text, needle, previous_word_pattern):
#     pattern = rf'{previous_word_pattern}\S+\s+{needle}'
#     matches = re.findall(pattern, text)
#     return matches


In [29]:
# # first print context for Davidsons
# print_term_contexts("davidsons", mega_string, context_length=50, max_print=15)
# # how about harley
# print_term_contexts("harley", mega_string, context_length=50, max_print=15)

In [30]:
# # now context for motorrats
# print_term_contexts("motorrats", mega_string, context_length=50, max_print=15)
# # how about motorrati
# print_term_contexts("motorrati", mega_string, context_length=50, max_print=15)
# # motorats
# print_term_contexts("motorats", mega_string, context_length=50, max_print=15)

In [31]:
# # now try redelains 
# print_term_contexts("redelains", mega_string, context_length=50, max_print=15)
# # how about redelain
# print_term_contexts("redelain", mega_string, context_length=50, max_print=15)
# # redela
# print_term_contexts("redela", mega_string, context_length=50, max_print=15)

In [32]:
# # now "roll-rois"
# print_term_contexts("roll-rois", mega_string, context_length=50, max_print=15)
# # how about just roll
# print_term_contexts("roll", mega_string, context_length=50, max_print=15)
# # hmm how about just rois
# print_term_contexts("rois", mega_string, context_length=50, max_print=15)
# # now rol-rois
# print_term_contexts("rol-rois", mega_string, context_length=50, max_print=15)

In [33]:
# now finally sanitars
# print_term_contexts("sanitars", mega_string, context_length=50, max_print=15)
# how about automobil where word previous to it starts with s
# matches = find_matches_with_previous_word(mega_string, "automobil", r'san\S+')
# print(f"Found {len(matches)} matches for 'automobil' with previous word starting with 's'")
# # print all matches
# for i, match in enumerate(matches):
#     print(f"Match {i+1}: {match}")
# # now let's try context for sanitārs automobil
# print_term_contexts("sanitārs automobil", mega_string, context_length=50, max_print=15)

## Separating horse_moto into horse and moto terms


In [34]:
# horse_moto_df.head()

In [35]:
# let's create separate lists of terms that are moto and horse
# moto_terms = horse_moto_df[horse_moto_df["horse_moto"] == "m"]["ngram"].tolist()
# horse_terms = horse_moto_df[horse_moto_df["horse_moto"] == "z"]["ngram"].tolist()
# # how many total terms we have?
# total_terms = len(moto_terms) + len(horse_terms)
# print(f"Total terms: {total_terms}")
# # assert number is equal to horse_moto_df shape
# assert total_terms == horse_moto_df.shape[0], f"Total terms {total_terms} does not match horse_moto_df shape {horse_moto_df.shape[0]}"

## Create lower case lemma texts for each document

In [36]:
# we will create a dictionary where key will be unique file_stem_short + firstEdition
# value will be lowercase lemma joined by whitespace
# we can actually do this using pandas using group by and then aggregate
# let's create a new column with file_stem_short + firstEdition
# df["file_stem_short_firstEdition"] = df["file_stem_short"] + "_" + df["firstEdition"].astype(str)
# # how many unique file_stem_short_firstEdition we have?
# unique_file_stem_firstEdition_count = df["file_stem_short_firstEdition"].nunique()
# print(f"Unique file_stem_short_firstEdition count: {unique_file_stem_firstEdition_count}")
# # TODO why is count 470 when korpuss.lnb.lv shows 458 ?
# # TODO create set difference between korpuss.lnb.lv and our dataframe
# # assert that it is equal to the number of unique file_stem_short
# unique_file_stem_count = df["file_stem_short"].nunique()
# print(f"Unique file_stem_short count: {unique_file_stem_count}")
# assert unique_file_stem_firstEdition_count == unique_file_stem_count, \
#     f"Unique file_stem_short_firstEdition count {unique_file_stem_firstEdition_count} does not match unique file_stem_short count {unique_file_stem_count}"

In [37]:
# which file_stem_short have more than one firstEdition?
# multiple_firstEdition = df.groupby("file_stem_short")["firstEdition"].nunique()
# multiple_firstEdition = multiple_firstEdition[multiple_firstEdition > 1]
# print(f"File stems with multiple first editions: {len(multiple_firstEdition)}")

In [38]:
# multiple_firstEdition.head(10)

In [39]:
# print the years for those with multiple first editions
# for file_stem, editions in multiple_firstEdition.items():
#     years = df[df["file_stem_short"] == file_stem]["firstEdition"].unique()
#     print(f"{file_stem}: {years}")

In [40]:
# # so let's add years later and just use file_stem_short for now
# # we will group by file_stem_short and aggregate lemma using whitespace join then lowercase it
# df_grouped = df.groupby("file_stem_short")["lemma"].apply(lambda x: " ".join(x).lower()).reset_index()
# # rename the column to lemma_joined
# df_grouped.rename(columns={"lemma": "lemma_joined"}, inplace=True)
# # index name to file_stem_short
# df_grouped.set_index("file_stem_short", inplace=True)
# # now we have a dataframe with file_stem_short as index and lemma_joined as column
# # shape of df_grouped
# print(f"df_grouped shape: {df_grouped.shape}")

In [41]:
# # let's add year column to df_grouped
# # we will use the min firstEdition for each file_stem_short
# df_grouped["year"] = df.groupby("file_stem_short")["firstEdition"].min().values
# # now we have a dataframe with file_stem_short as index, lemma_joined and year as columns
# # shape of df_grouped after adding year
# print(f"df_grouped shape after adding year: {df_grouped.shape}")

In [42]:
# let's see year for index DambV_GaitC
# print(f"Year for index 'DambV_GaitC': {df_grouped.loc['DambV_GaitC', 'year']}")

In [43]:
# let's save this df_grouped to a parquet file
# output_grouped_file = Path("../../not_repo/latsenrom_file_stem_short_grouped_lemma_lowercase.parquet")
# df_grouped.to_parquet(output_grouped_file)
# print(f"Saved df_grouped to {output_grouped_file}")

In [44]:
# # loading the grouped dataframe from parquet file
# df_grouped_loaded = pd.read_parquet(output_grouped_file)
# # shape of loaded dataframe
# print(f"Loaded df_grouped shape: {df_grouped_loaded.shape}")

## Counting absolute and relative frequencies in individual works

In [45]:
# now that we have plaintexts for each file stem short and we have moto and horse terms
# we can count each term in each file stem short
# we will create a new dictionary where key will be file_stem_short and value will be another dictionary
# this inner dictionary will contain year and counts for each term
# let's create a function that will do that
# def count_terms_in_file_stems(df_grouped, terms):
#     counts_dict = {}
#     for file_stem in tqdm(df_grouped.index):
#         # get the lemma_joined for this file_stem
#         lemma_joined = df_grouped.loc[file_stem, "lemma_joined"]
#         # create a dictionary to hold counts for this file_stem
#         counts = {term: lemma_joined.count(term.lower()) for term in terms}
#         # add year
#         counts["year"] = df_grouped.loc[file_stem, "year"]
#         # add to the main dictionary
#         counts_dict[file_stem] = counts
#     return counts_dict

In [46]:
# let's create moto_dict and horse_dict
# moto_dict = count_terms_in_file_stems(df_grouped_loaded, moto_terms)    
# horse_dict = count_terms_in_file_stems(df_grouped_loaded, horse_terms)

In [47]:
# let's create two new dataframes from these dictionaries
# moto_df = pd.DataFrame.from_dict(moto_dict, orient='index')
# # we want year column to be first column right after index
# moto_df.reset_index(inplace=True)
# moto_df.rename(columns={"index": "file_stem_short"}, inplace=True)
# # reorder columns to have year first
# moto_df = moto_df[["file_stem_short", "year"] + [col for col in moto_df.columns if col not in ["file_stem_short", "year"]]]
# # shape of moto_df
# print(f"moto_df shape: {moto_df.shape}")
# # head of moto_df
# moto_df.head()

In [48]:
# # let's do the same for horse_dict
# horse_df = pd.DataFrame.from_dict(horse_dict, orient='index')
# # we want year column to be first column right after index
# horse_df.reset_index(inplace=True)
# horse_df.rename(columns={"index": "file_stem_short"}, inplace=True)
# # reorder columns to have year first
# horse_df = horse_df[["file_stem_short", "year"] + [col for col in horse_df.columns if col not in ["file_stem_short", "year"]]]
# # shape of horse_df
# print(f"horse_df shape: {horse_df.shape}")
# # head of horse_df
# horse_df.head()

In [49]:
# # before saving let's check if any columns have all zeros
def check_zero_columns(df):
    """Return the names of the numeric columns of ``df`` that hold no non-zero value.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect, e.g. the per-document term-count frames.

    Returns
    -------
    list
        Column labels, in ``df.columns`` order, whose entries are all zero
        (missing values are ignored). An empty list means every numeric column
        has at least one non-zero entry.

    Notes
    -----
    Only columns with a numeric (or boolean) dtype are examined. Text columns
    such as ``file_stem_short`` can never be "all zero", and summing them would
    concatenate every string (or raise for mixed/NA object data), so they are
    skipped instead.
    The check is "no entry is non-zero" rather than "the sum is zero", so a
    column whose positive and negative values cancel out is not reported.
    """
    return [
        col
        for col in df.columns
        # Series.any() skips missing values and is True as soon as one entry is non-zero
        if pd.api.types.is_numeric_dtype(df[col]) and not df[col].any()
    ]
# # check zero columns in moto_df
# zero_columns_moto = check_zero_columns(moto_df)
# print(f"Zero columns in moto_df: {zero_columns_moto}")
# # check zero columns in horse_df
# zero_columns_horse = check_zero_columns(horse_df)
# print(f"Zero columns in horse_df: {zero_columns_horse}")

In [50]:
# Locations of the cached per-document term-count tables (moto and horse terms).
# The paths are relative, so they resolve against the kernel's working directory.
output_moto_file = Path("../parquet/latsenrom_moto_terms_counts.parquet")
output_horse_file = Path("../parquet/latsenrom_horse_terms_counts.parquet")
# The writes are left commented out: in this version of the notebook the frames are
# only read back from these files, not rebuilt and saved.
# moto_df.to_parquet(output_moto_file)
# horse_df.to_parquet(output_horse_file)

In [51]:
# Read both cached count tables back from disk (moto first, then horse).
moto_df_loaded, horse_df_loaded = (
    pd.read_parquet(counts_file)
    for counts_file in (output_moto_file, output_horse_file)
)
# The round-trip shape checks below need the freshly built frames, which this
# session does not create, so they stay disabled:
# assert moto_df_loaded.shape == moto_df.shape, f"Loaded moto_df shape {moto_df_loaded.shape} does not match original {moto_df.shape}"
# assert horse_df_loaded.shape == horse_df.shape, f"Loaded horse_df shape {horse_df_loaded.shape} does not match original {horse_df.shape}"
# From here on the loaded tables are the working moto_df / horse_df.
moto_df, horse_df = moto_df_loaded, horse_df_loaded
# report the table sizes
print(f"moto_df shape: {moto_df.shape}")
print(f"horse_df shape: {horse_df.shape}")

moto_df shape: (463, 89)
horse_df shape: (463, 72)


In [52]:
# Sanity check: check_zero_columns() must have nothing to report for either
# table, i.e. every column has at least one non-zero entry.
assert not check_zero_columns(moto_df), "moto_df has zero columns"
assert not check_zero_columns(horse_df), "horse_df has zero columns"
print("No zero columns in moto_df and horse_df")

No zero columns in moto_df and horse_df


In [53]:
# Preview the first rows of both count tables, each under a printed label.
# NOTE(review): display is not imported in this notebook - presumably the
# built-in that IPython/Jupyter injects into the namespace; confirm if run elsewhere.
print("moto_df head:")
display(moto_df.head())
print("horse_df head:")
display(horse_df.head())

moto_df head:


Unnamed: 0,file_stem_short,year,alfs romeo,auto,auto nāve limuzinis,autobuss,autokārs,auto-limuzīna,automašina,automašīna,...,traktors,trams,tramvajs,tramvajvāģs,transports,ugunsdzēsējs auto,velomotors,velosipēds,vilciens,vilciens sastāvs
0,AizsV_MilaU,1933,0,6,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,6,0
1,AkurJ_DegoS,1912,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
2,AkurJ_PeteD,1921,0,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,9,0
3,AkurJ_UgunZ,1925,0,7,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,10,0
4,Andra_Elita,1930,0,5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7,0


horse_df head:


Unnamed: 0,file_stem_short,year,atsperrati,ātrs palīdzība rats,ātrs palīdzība vāģis,bēris,četrjūgs,darbs vāģis,diližanss,divjūgs,...,važonis,velosipeds,vezumnieks,vezumnieks ragavas,vezums,vienjūgs,zirdziņš,zirgs,zirgs tramvajs,zirģelis
0,AizsV_MilaU,1933,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,AkurJ_DegoS,1912,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,5,0,0
2,AkurJ_PeteD,1921,0,0,0,0,0,0,0,0,...,0,0,0,0,5,0,0,15,0,0
3,AkurJ_UgunZ,1925,0,0,0,0,0,0,0,0,...,0,0,12,0,10,0,0,18,0,0
4,Andra_Elita,1930,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0


In [54]:
# let's order dataframes by year and save to xlsx files
# output_moto_xlsx = Path("../xlsx/latsenrom_moto_terms_counts.xlsx")
# output_horse_xlsx = Path("../xlsx/latsenrom_horse_terms_counts.xlsx")
# moto_df.sort_values(by="year").to_excel(output_moto_xlsx, index=False)
# horse_df.sort_values(by="year").to_excel(output_horse_xlsx, index=False)
# print(f"Saved moto_df to {output_moto_xlsx}")
# print(f"Saved horse_df to {output_horse_xlsx}")

## Creating relative document frequency for each term by year

In [None]:
# TODO now we want to create a table of relative document frequency for each term 
# relative document frequency will be calculated as follows:
# relative_document_frequency = (documents_in_year_containing_term / total_documents_in_year) * 100

# so let's group by year use following aggregation:
# count how many documents we have in each year
# for each term count how many non-zero counts we have in each year
# we will use moto_df and horse_df for that
# let's start with moto_df
def calculate_relative_document_frequency(df):
    """Compute, per year, the percentage of documents in which each term occurs.

    For every term column the result holds
    ``documents_in_year_with_count_above_zero / documents_in_year * 100``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per document. Must contain a ``year`` column. Every column from
        the third one onwards is treated as a term-count column; the first two
        columns are skipped by position (in this notebook they are
        ``file_stem_short`` and ``year``).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``year`` in ascending order, with a leading
        ``total_documents`` column followed by one percentage column per term.
        The input frame is not modified.
    """
    # number of documents in each year
    documents_per_year = df.groupby("year").size()

    term_columns = df.columns[2:]  # skip 'file_stem_short' and 'year'
    # A document "contains" a term when its count is above zero; summing those
    # boolean flags per year counts the documents containing each term. One
    # grouped pass covers all terms instead of one groupby.apply per column.
    documents_with_term = df[term_columns].gt(0).groupby(df["year"]).sum()

    # same arithmetic order as count / total * 100, row-aligned on the year index
    relative_df = documents_with_term.div(documents_per_year, axis=0) * 100
    relative_df.insert(0, "total_documents", documents_per_year)
    relative_df.index.name = "year"
    # defensive: any undefined ratio is reported as 0
    return relative_df.fillna(0)

    

# # calculate relative document frequency for moto_df
# moto_relative_df = calculate_relative_document_frequency(moto_df)
# # calculate relative document frequency for horse_df
# horse_relative_df = calculate_relative_document_frequency(horse_df)
# # shape of relative dataframes
# print(f"moto_relative_df shape: {moto_relative_df.shape}")
# print(f"horse_relative_df shape: {horse_relative_df.shape}")
# # display the first few rows of relative dataframes
# print("moto_relative_df head:")
# display(moto_relative_df.head())
# print("*"*80)
# print("horse_relative_df head:")
# display(horse_relative_df.head())

moto_relative_df shape: (44, 88)
horse_relative_df shape: (44, 71)
moto_relative_df head:


Unnamed: 0_level_0,total_documents,alfs romeo,auto,auto nāve limuzinis,autobuss,autokārs,auto-limuzīna,automašina,automašīna,automobilis,...,traktors,trams,tramvajs,tramvajvāģs,transports,ugunsdzēsējs auto,velomotors,velosipēds,vilciens,vilciens sastāvs
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1879,2,0.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,0.0
1890,1,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1891,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0
1892,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0
1893,1,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0


********************************************************************************
horse_relative_df head:


Unnamed: 0_level_0,total_documents,atsperrati,ātrs palīdzība rats,ātrs palīdzība vāģis,bēris,četrjūgs,darbs vāģis,diližanss,divjūgs,divjūgs kulba,...,važonis,velosipeds,vezumnieks,vezumnieks ragavas,vezums,vienjūgs,zirdziņš,zirgs,zirgs tramvajs,zirģelis
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1879,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,50.0,0.0,100.0,0.0,50.0,100.0,0.0,50.0
1890,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,...,0.0,0.0,0.0,0.0,100.0,0.0,0.0,100.0,0.0,0.0
1891,3,0.0,0.0,0.0,33.333333,0.0,0.0,0.0,66.666667,0.0,...,0.0,0.0,33.333333,0.0,100.0,0.0,100.0,100.0,0.0,33.333333
1892,1,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,100.0,0.0,100.0,100.0,0.0,0.0
1893,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,0.0


In [None]:
# Add a simple (unweighted) average of all columns after total_documents.
def add_average_relative_frequency(df):
    """Return a copy of ``df`` with an ``average_relative_frequency`` column.

    The average is the row-wise mean of every column after the first one (the
    first column is expected to be ``total_documents``). The new column is
    placed directly after that first column.

    Parameters
    ----------
    df : pandas.DataFrame
        Relative-frequency table: first column ``total_documents``, remaining
        columns one value per term.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``[first column, 'average_relative_frequency',
        remaining columns...]``. The input frame is left untouched, so calling
        the function without reassigning does not add a stray column to it.
        An ``average_relative_frequency`` column already present in ``df`` is
        replaced and does not take part in the new average.
    """
    average_column = "average_relative_frequency"
    # drop() returns a new frame, so the caller's frame is never written to
    result = df.drop(columns=average_column, errors="ignore")
    # row-wise mean over everything except the first (total_documents) column
    average = result.iloc[:, 1:].mean(axis=1)
    # position 1 = right after total_documents; clamp for a frame with no columns
    result.insert(min(1, result.shape[1]), average_column, average)
    return result

# # shape of relative dataframes before adding average
# print(f"moto_relative_df shape before adding average: {moto_relative_df.shape}")
# print(f"horse_relative_df shape before adding average: {horse_relative_df.shape}")

# # add average relative frequency to moto_relative_df
# moto_relative_df = add_average_relative_frequency(moto_relative_df)
# # add average relative frequency to horse_relative_df
# horse_relative_df = add_average_relative_frequency(horse_relative_df)
# # shape of relative dataframes after adding average
# print(f"moto_relative_df shape after adding average: {moto_relative_df.shape}")
# print(f"horse_relative_df shape after adding average: {horse_relative_df.shape}")

# # display the first few rows of relative dataframes after adding average
# print("moto_relative_df head after adding average:")
# display(moto_relative_df.head())
# print("*" * 80)
# print("horse_relative_df head after adding average:")
# display(horse_relative_df.head())


moto_relative_df shape before adding average: (44, 88)
horse_relative_df shape before adding average: (44, 71)
moto_relative_df shape after adding average: (44, 89)
horse_relative_df shape after adding average: (44, 72)
moto_relative_df head after adding average:


Unnamed: 0_level_0,total_documents,average_relative_frequency,alfs romeo,auto,auto nāve limuzinis,autobuss,autokārs,auto-limuzīna,automašina,automašīna,...,traktors,trams,tramvajs,tramvajvāģs,transports,ugunsdzēsējs auto,velomotors,velosipēds,vilciens,vilciens sastāvs
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1879,2,2.873563,0.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,0.0
1890,1,1.149425,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1891,3,1.915709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0
1892,1,1.149425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0
1893,1,3.448276,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0


********************************************************************************
horse_relative_df head after adding average:


Unnamed: 0_level_0,total_documents,average_relative_frequency,atsperrati,ātrs palīdzība rats,ātrs palīdzība vāģis,bēris,četrjūgs,darbs vāģis,diližanss,divjūgs,...,važonis,velosipeds,vezumnieks,vezumnieks ragavas,vezums,vienjūgs,zirdziņš,zirgs,zirgs tramvajs,zirģelis
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1879,2,19.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,50.0,0.0,100.0,0.0,50.0,100.0,0.0,50.0
1890,1,8.571429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,...,0.0,0.0,0.0,0.0,100.0,0.0,0.0,100.0,0.0,0.0
1891,3,14.761905,0.0,0.0,0.0,33.333333,0.0,0.0,0.0,66.666667,...,0.0,0.0,33.333333,0.0,100.0,0.0,100.0,100.0,0.0,33.333333
1892,1,18.571429,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,100.0,0.0,100.0,100.0,0.0,0.0
1893,1,8.571429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,0.0


## Saving and Loading relative document frequency data

In [None]:
# Parquet locations for the two relative document frequency tables.
# The path definitions stay active because the loading cell below reads from them.
output_moto_relative_file = Path("../parquet/latsenrom_moto_terms_relative_counts.parquet")
output_horse_relative_file = Path("../parquet/latsenrom_horse_terms_relative_counts.parquet")
# The writes themselves are currently disabled; uncomment the lines below to
# save moto_relative_df / horse_relative_df to the paths above again.
# moto_relative_df.to_parquet(output_moto_relative_file)
# horse_relative_df.to_parquet(output_horse_relative_file)
# print(f"Saved moto_relative_df to {output_moto_relative_file}")
# print(f"Saved horse_relative_df to {output_horse_relative_file}")

Saved moto_relative_df to ..\parquet\latsenrom_moto_terms_relative_counts.parquet
Saved horse_relative_df to ..\parquet\latsenrom_horse_terms_relative_counts.parquet


In [64]:
# Read the relative-frequency tables back from their parquet files.
moto_relative_df_loaded = pd.read_parquet(output_moto_relative_file)
horse_relative_df_loaded = pd.read_parquet(output_horse_relative_file)

# Shape checks against the in-memory originals; kept for reference, currently disabled.
# assert moto_relative_df_loaded.shape == moto_relative_df.shape, \
#     f"Loaded moto_relative_df shape {moto_relative_df_loaded.shape} does not match original {moto_relative_df.shape}"
# assert horse_relative_df_loaded.shape == horse_relative_df.shape, \
#     f"Loaded horse_relative_df shape {horse_relative_df_loaded.shape} does not match original {horse_relative_df.shape}"
# print("Loaded relative dataframes have the same shape as original")

# From here on the loaded tables, rounded to 4 decimal places, are the
# working moto_relative_df / horse_relative_df.
moto_relative_df = moto_relative_df_loaded.round(4)
horse_relative_df = horse_relative_df_loaded.round(4)

# Report both shapes first ...
print(f"moto_relative_df shape: {moto_relative_df.shape}")
print(f"horse_relative_df shape: {horse_relative_df.shape}")

# ... then preview the first rows of each table, separated by a ruler line.
print("moto_relative_df head:")
display(moto_relative_df.head())
print("*" * 80)
print("horse_relative_df head:")
display(horse_relative_df.head())

moto_relative_df shape: (44, 89)
horse_relative_df shape: (44, 72)
moto_relative_df head:


Unnamed: 0_level_0,total_documents,average_relative_frequency,alfs romeo,auto,auto nāve limuzinis,autobuss,autokārs,auto-limuzīna,automašina,automašīna,...,traktors,trams,tramvajs,tramvajvāģs,transports,ugunsdzēsējs auto,velomotors,velosipēds,vilciens,vilciens sastāvs
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1879,2,2.8736,0.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,0.0
1890,1,1.1494,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1891,3,1.9157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0
1892,1,1.1494,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0
1893,1,3.4483,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0


********************************************************************************
horse_relative_df head:


Unnamed: 0_level_0,total_documents,average_relative_frequency,atsperrati,ātrs palīdzība rats,ātrs palīdzība vāģis,bēris,četrjūgs,darbs vāģis,diližanss,divjūgs,...,važonis,velosipeds,vezumnieks,vezumnieks ragavas,vezums,vienjūgs,zirdziņš,zirgs,zirgs tramvajs,zirģelis
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1879,2,19.2857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,50.0,0.0,100.0,0.0,50.0,100.0,0.0,50.0
1890,1,8.5714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,...,0.0,0.0,0.0,0.0,100.0,0.0,0.0,100.0,0.0,0.0
1891,3,14.7619,0.0,0.0,0.0,33.3333,0.0,0.0,0.0,66.6667,...,0.0,0.0,33.3333,0.0,100.0,0.0,100.0,100.0,0.0,33.3333
1892,1,18.5714,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,100.0,0.0,100.0,100.0,0.0,0.0
1893,1,8.5714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,0.0


In [None]:
# # let's save to excel files as well
# output_moto_relative_xlsx = Path("../xlsx/latsenrom_moto_terms_relative_counts.xlsx")
# output_horse_relative_xlsx = Path("../xlsx/latsenrom_horse_terms_relative_counts.xlsx")
# # before saving to excel let's round to 4 decimal places
# moto_relative_df = moto_relative_df.round(4)
# horse_relative_df = horse_relative_df.round(4)
# moto_relative_df.to_excel(output_moto_relative_xlsx)
# horse_relative_df.to_excel(output_horse_relative_xlsx)

## Visualizing the results

In [56]:
## TODO