# Analyzing Responses of LLM to prompt

In [2]:
# first we need to import the basic libraries
# date
from datetime import datetime
now = datetime.now()
print(f"Date: {now}")
# python version
import sys
print(f"Python version: {sys.version}")
from pathlib import Path
import json
# import time for delay
import time
import requests
# print version
print(f"Requests version: {requests.__version__}")

from tqdm import tqdm

import pandas as pd
print(f"Pandas version: {pd.__version__}")

Date: 2025-03-01 12:43:58.715529
Python version: 3.12.7 (tags/v3.12.7:0b05ead, Oct  1 2024, 03:06:41) [MSC v.1941 64 bit (AMD64)]
Requests version: 2.32.3
Pandas version: 2.2.2


In [2]:
# let's see what folders are in our ../data/responses folder
data_folder = Path("../data/responses")
print(f"Data folder: {data_folder}")
print(f"Data folder exists: {data_folder.exists()}")
print(f"Data folder is dir: {data_folder.is_dir()}")
# let's see what subfolders are in our data folder
subfolders = [f for f in data_folder.iterdir() if f.is_dir()]
print(f"Subfolders:")
for subfolder in subfolders:
    print(subfolder)

Data folder: ..\data\responses
Data folder exists: True
Data folder is dir: True
Subfolders:
..\data\responses\2025_01_28_gemini_2_experimental
..\data\responses\2025_01_29_google_gemini-flash-1.5-8b_no_terms
..\data\responses\2025_01_29_google_gemini-flash-1.5-8b_with_terms
..\data\responses\2025_02_04_google_gemini-flash-1.5-8b_with_terms
..\data\responses\2025_02_26_google_gemini-flash-1.5_land_prompt_1
..\data\responses\2025_02_26_google_gemini-flash-1.5_land_prompt_2
..\data\responses\2025_02_27_google_gemini-2.0-flash-001_land_prompt
..\data\responses\2025_02_27_google_gemini-2.0-flash-001_land_prompt_2
..\data\responses\2025_02_27_google_gemini-flash-1.5_maritime_prompt
..\data\responses\2025_02_28_google_gemini-flash-1.5_air_prompt
..\data\responses\consolidated_2025_02_26_openai_gpt-4o-2024-11-20_land_prompt
..\data\responses\consolidated_2025_02_26_openai_gpt-4o-2024-11-20_land_prompt_2
..\data\responses\temp_responses_2025_02_26
..\data\responses\unconsolidated_2025_02_26_op

## Consolidate openai responses

OpenAI prompts required us to break down files into smaller chunks, now we need to consolidate them back into a single file.


```python

In [4]:
# subfolders that contain openai in their name
openai_folders = [f for f in data_folder.iterdir() if f.is_dir() and "openai" in f.name]
print(f"OpenAI folders:")
for openai_folder in openai_folders:
    print(openai_folder)
    

OpenAI folders:
..\data\responses\2025_02_26_openai_gpt-4o-2024-11-20_land_prompt
..\data\responses\2025_02_26_openai_gpt-4o-2024-11-20_land_prompt_2


In [5]:
# we want to create a function that given a subfolder will return a dictionary 
# keys will be first three parts of file name when split by _
# values will be actual file names
def get_files(subfolder):
    files = {}
    for file in subfolder.iterdir():
        if file.is_file():
            parts = file.name.split("_")
            key = "_".join(parts[:3])
            if key in files:
                files[key].append(file)
            else:
                files[key] = [file]
    return files

# let's run this function on one of the openai folders
openai_files = get_files(openai_folders[0])
print(f"OpenAI files:")
for key, value in openai_files.items():
    print(f"{key}: {value}")

OpenAI files:
AustA_KaspG_948026: [WindowsPath('../data/responses/2025_02_26_openai_gpt-4o-2024-11-20_land_prompt/AustA_KaspG_948026_0.txt')]
AustA_Puisk_1047362: [WindowsPath('../data/responses/2025_02_26_openai_gpt-4o-2024-11-20_land_prompt/AustA_Puisk_1047362_0.txt'), WindowsPath('../data/responses/2025_02_26_openai_gpt-4o-2024-11-20_land_prompt/AustA_Puisk_1047362_1.txt'), WindowsPath('../data/responses/2025_02_26_openai_gpt-4o-2024-11-20_land_prompt/AustA_Puisk_1047362_2.txt')]
FimbK_KadNa_1049450: [WindowsPath('../data/responses/2025_02_26_openai_gpt-4o-2024-11-20_land_prompt/FimbK_KadNa_1049450_0.txt'), WindowsPath('../data/responses/2025_02_26_openai_gpt-4o-2024-11-20_land_prompt/FimbK_KadNa_1049450_1.txt')]
FimbK_TiltP_1049479: [WindowsPath('../data/responses/2025_02_26_openai_gpt-4o-2024-11-20_land_prompt/FimbK_TiltP_1049479_0.txt'), WindowsPath('../data/responses/2025_02_26_openai_gpt-4o-2024-11-20_land_prompt/FimbK_TiltP_1049479_1.txt')]
GulbA_Gaidi_1350352: [WindowsPath('.

In [11]:
# let's run get_files on all openai folders
# the key will be folder name and values will be dictionaries returned by get_files
openai_files = {}
for openai_folder in openai_folders:
    openai_files[openai_folder.name] = get_files(openai_folder)

# how many files are in each folder
for key, value in openai_files.items():
    print(f"{key}: {len(value)}")

2025_02_26_openai_gpt-4o-2024-11-20_land_prompt: 20
2025_02_26_openai_gpt-4o-2024-11-20_land_prompt_2: 20


In [15]:
# now let's write a function that given a file name and file list and new_subfolder will write consolidated file to new_subfolder
# logic is as follows:
# we want to read all content of files in file list up to empty line
# we want to write all this content to new file in new_subfolder
# then we want to separately read all lines starting with line that starts with "System prompt:"
# we want to write this content only once to new file in new_subfolder
# we want use utf-8 encoding
def consolidate_files(file_name, file_list, new_subfolder):
    # create new subfolder if it does not exist
    new_subfolder.mkdir(parents=True, exist_ok=True)
    with open(new_subfolder / f"{file_name}.txt", "w", encoding="utf-8") as new_file:
        system_prompts = []
        for file in file_list:
            with open(file, "r", encoding="utf-8") as old_file:
                text = old_file.read()
                # let's split on "System prompt:"
                parts = text.split("System prompt:")
                # let's write first part
                new_file.write(parts[0].strip()+"\n")
                # append second part to system_prompts
                system_prompts.append(parts[1])
        # let's write system prompts only once
        # first check if system prompts are identical
        if len(set(system_prompts)) == 1:
            new_file.write("\nSystem prompt:" + system_prompts[0])
        else:
            for system_prompt in system_prompts:
                new_file.write("System prompt:\n" + system_prompt)

# test it on second key of openai_files
# we will create a new subfolder in the data respones folder
# new_subfolder = data_folder / "consolidated"
# consolidate_files(list(openai_files.keys())[1], openai_files[list(openai_files.keys())[1]], new_subfolder)



In [16]:
# now let's write a function that will consolidate all files in all openai folders
# we will use consolidate_files function
# new subfolder will be in data folder
# it will be called consolidated_ + key of openai_files
def consolidate_all_files(openai_files, data_folder):
    for key, value in openai_files.items():
        new_subfolder = data_folder / ("consolidated_" + key)
        # value is a dictionary that contains keys that are first three parts of file name and values that are lists of files
        for key2, value2 in value.items():
            consolidate_files(key2, value2, new_subfolder)

# let's run this function
consolidate_all_files(openai_files, data_folder)

## Getting the subfolders for analysis

In [19]:
# now we want to get all folders that we want to analyze
# they are in data_folder 
# we want those that start with consolidated_2025_02_26 or consolidated_2025_02_27
# we also want those that start with 2025_02_26 or 2025_02_27 and also contain words land_prompt
# these will be the folders that we want to analyze
folders_to_analyze = [f for f in data_folder.iterdir() if f.is_dir() and (f.name.startswith("consolidated_2025_02_26") or f.name.startswith("consolidated_2025_02_27"))]
folders_to_analyze += [f for f in data_folder.iterdir() if f.is_dir() and (f.name.startswith("2025_02_26") or f.name.startswith("2025_02_27")) and "land_prompt" in f.name]
print(f"Folders to analyze:")
for folder in folders_to_analyze:
    print(folder)

Folders to analyze:
..\data\responses\consolidated_2025_02_26_openai_gpt-4o-2024-11-20_land_prompt
..\data\responses\consolidated_2025_02_26_openai_gpt-4o-2024-11-20_land_prompt_2
..\data\responses\2025_02_26_google_gemini-flash-1.5_land_prompt_1
..\data\responses\2025_02_26_google_gemini-flash-1.5_land_prompt_2
..\data\responses\2025_02_27_google_gemini-2.0-flash-001_land_prompt
..\data\responses\2025_02_27_google_gemini-2.0-flash-001_land_prompt_2


## Reading Plaintext into  memory

In [9]:
# Plaintexts are in another repo - private in our parent folder
# let's list all text files in data/docs folder
# data_folder = Path("../data/docs")
# data_folder = Path("../../lnb_lat_sen_rom_releases/lat_sen_rom_2025_01_28")
data_folder = Path("../../lnb_lat_sen_rom_releases/lat_sen_rom_2025_02_04")
# assert folder exists
assert data_folder.exists(), f"Folder {data_folder} does not exist"
                   
# list all files
files = list(data_folder.glob("*.txt"))
# print all files
# how many files do we have?
print(f"Number of files: {len(files)}")
# let's load the files into a dictionary with filename stem as key and text as value
# remember to decode the text as utf-8
texts = {}
for file in tqdm(files):
  with open(file, "r", encoding="utf-8") as f:
    texts[file.stem] = f.read()
# how many texts do we have?
print(f"Number of texts: {len(texts)}")
# how many characters do we have in total?
total_chars = sum([len(text) for text in texts.values()])
print(f"Total characters: {total_chars}")
# what is the smallest text?
min_text = min(texts, key=lambda x: len(texts[x]))
print(f"Key for smallest text: {min_text}")
# how many characters does the smallest text have?
min_chars = len(texts[min_text])
print(f"Number of characters in smallest text: {min_chars}")
# what is the largest text?
max_text = max(texts, key=lambda x: len(texts[x]))
print(f"Key for largest text: {max_text}")
# how many characters does the largest text have?
max_chars = len(texts[max_text])
print(f"Number of characters in largest text: {max_chars}")


Number of files: 458


100%|██████████| 458/458 [00:07<00:00, 61.34it/s]

Number of texts: 458
Total characters: 191069647
Key for smallest text: VentA_DepuT_1293527
Number of characters in smallest text: 18648
Key for largest text: DeglA_LabaF_1053655
Number of characters in largest text: 2375090





## Comparing responses

In [23]:
# first let's assert that all our folders have identical file names
# we will use the first folder as reference
reference_files = Path(folders_to_analyze[0]).iterdir()
reference_files = [file.name for file in reference_files]
for folder in folders_to_analyze[1:]:
    files = Path(folder).iterdir()
    files = [file.name for file in files]
    assert reference_files == files, f"Files in {folders_to_analyze[0]} and {folder} are not identical"

print("All files are identical")

All files are identical


In [8]:
# let's write a function that given a file will extract all response lines
# response lines are those that come before empty line
# we will return a list of response lines
def get_response_lines(file):
    response_lines = []
    with open(file, "r", encoding="utf-8") as f:
        lines = f.readlines()
        for line in lines:
            if line.strip() == "":
                break
            response_lines.append(line.strip())
    return response_lines

# test on first file in reference folder
# response_lines = get_response_lines(Path(folders_to_analyze[0]) / reference_files[0])
# print(f"Response lines: {response_lines}")

In [27]:
folders_to_analyze

[WindowsPath('../data/responses/consolidated_2025_02_26_openai_gpt-4o-2024-11-20_land_prompt'),
 WindowsPath('../data/responses/consolidated_2025_02_26_openai_gpt-4o-2024-11-20_land_prompt_2'),
 WindowsPath('../data/responses/2025_02_26_google_gemini-flash-1.5_land_prompt_1'),
 WindowsPath('../data/responses/2025_02_26_google_gemini-flash-1.5_land_prompt_2'),
 WindowsPath('../data/responses/2025_02_27_google_gemini-2.0-flash-001_land_prompt'),
 WindowsPath('../data/responses/2025_02_27_google_gemini-2.0-flash-001_land_prompt_2')]

In [12]:
# now let's write a function that given a file and texts dictionary will return a dataframe with two columns
# first column will be terms sorted from get_response_lines (could be duplicates)
# second column will count of occurences of term in text from matching key in texts dictionary
# key will be file name stem
def get_response_df(file, texts):
    response_lines = get_response_lines(file)
    data = []
    plaintext = texts.get(file.stem, "")
    # our term column name will be parent folder name of file
    term_column = file.parent.name
    # term_column = file.stem
    if plaintext == "":
        print(f"Plaintext not found for {file.stem}")
    for line in sorted(response_lines):
        data.append({term_column: line, "count": plaintext.count(line)})
    return pd.DataFrame(data)

# test on first file in reference folder
# df = get_response_df(Path(folders_to_analyze[0]) / reference_files[0], texts)
# print(f"Response dataframe:")
# df.head()

In [28]:
reference_files[0]

'AustA_KaspG_948026.txt'

In [6]:
# now let's write a function tht given a file and list of subfolders will return a combined dataframe
# columns will obtained by horizontally concatenating dataframes obtained by get_response_df
# index will be numerical
def get_combined_df(file, texts, subfolders):
    dfs = []
    for subfolder in subfolders:
        df = get_response_df(subfolder / file, texts)
        dfs.append(df)
    return pd.concat(dfs, axis=1)

# let's test it on first file in reference folder
# df = get_combined_df(reference_files[0], texts, folders_to_analyze)
# print(f"Combined dataframe:")
# df.head()



In [7]:
# now let's create a function that will create a CSV file for each file in reference folder
# we will use get_combined_df to get the dataframe
# we will supply target folder where we want to save the CSV files
def create_csv_files(reference_files, texts, subfolders, target_folder, save_excel=True):
    # create target folder if it does not exist
    target_folder.mkdir(parents=True, exist_ok=True)
    
    for file in reference_files:
        df = get_combined_df(file, texts, subfolders)
        df.to_csv(target_folder / f"{Path(file).stem}.csv", index=False)
        if save_excel:
            df.to_excel(target_folder / f"{Path(file).stem}.xlsx", index=False)

# let's test it on reference files
# target folder will be data folder with name analysis and datetime stamp
# target_folder = Path("../data") / "analysis" / now.strftime("%Y_%m_%d_%H_%M_%S")
# create_csv_files(reference_files, texts, folders_to_analyze, target_folder)

In [32]:
print(target_folder)

..\..\lnb_lat_sen_rom_releases\lat_sen_rom_2025_02_04\analysis\2025_02_27_21_05_04


## Maritime Analysis

In [36]:
# let's get a list of folders that contain words maritime_prompt
data_folder = Path("../data/responses")
maritime_folders = [f for f in data_folder.iterdir() if f.is_dir() and "maritime_prompt" in f.name]
print(f"Maritime folders:")
for folder in maritime_folders:
    print(folder)

Maritime folders:
..\data\responses\2025_02_27_google_gemini-flash-1.5_maritime_prompt


In [40]:
# let's get list of files in first maritime folder
maritime_files = Path(maritime_folders[0]).iterdir()
maritime_files = [file.name for file in maritime_files]
print(f"Maritime files:")
# how many files do we have?
print(f"Number of files: {len(maritime_files)}")
# first 5 files
for file in maritime_files[:5]:
    print(file)

Maritime files:
Number of files: 458
AizsV_MilaU_1049452.txt
AkurJ_DegoS_771400.txt
AkurJ_PeteD_886346.txt
AkurJ_UgunZ_1049441.txt
Andra_Elita_1053573.txt


In [41]:
# let's create_csv_files for maritime files
# target folder will be data folder with name analysis_maritime and datetime stamp
target_folder = Path("../data") / "analysis_maritime" / now.strftime("%Y_%m_%d_%H_%M_%S")   
create_csv_files(maritime_files, texts, maritime_folders, target_folder)

## Air transport Analysis

In [3]:
# let's get subfolder that contains words air_prompt
data_folder = Path("../data/responses")
air_folders = [f for f in data_folder.iterdir() if f.is_dir() and "air_prompt" in f.name]
print(f"Air folders:")
for folder in air_folders:
    print(folder)

Air folders:
..\data\responses\2025_02_28_google_gemini-flash-1.5_air_prompt


In [10]:
# we need to get a list of files in first air folder
air_files = Path(air_folders[0]).iterdir()
air_files = [file.name for file in air_files]

In [13]:
# now let's create csv files for air folders
# target folder will be data folder with name analysis_air and datetime stamp
target_folder = Path("../data") / "analysis_air" / now.strftime("%Y_%m_%d_%H_%M_%S")
create_csv_files(air_files, texts, air_folders, target_folder, save_excel=False)

## Loading parquet file with lemma and pos tags

In [3]:
# let's load parquet from outside our repository
src = Path("../../not_repo/latsenrom_2025_02_05.parquet")
# assert file exists
assert src.exists(), f"File {src} does not exist"
# memory usage before loading
import os
import psutil
process = psutil.Process(os.getpid())
print(f"Memory usage before loading: {process.memory_info().rss / 1024**2} MB")
# load parquet
df = pd.read_parquet(src)
# memory usage after loading
print(f"Memory usage after loading: {process.memory_info().rss / 1024**2} MB")
# shape
print(f"Shape: {df.shape}")
# sample
df.sample(5)

Memory usage before loading: 129.90234375 MB
Memory usage after loading: 12099.02734375 MB
Shape: (37605476, 17)


Unnamed: 0,deprel,form,index,lemma,parent,pos,tag,ufeats,upos,sent_ndx,author,title,dom_id,file_stem,file_stem_short,firstEdition,term
17154210,,rokā,6,roka,,ncfsl_,ncfsl4,,,775,LaciJ,MuzaM,963944,LaciJ_MuzaM_963944,LaciJ_MuzaM,1936,roka
35637918,,rokas,4,roka,,ncfpn_,ncfpn4,,,927,ZiemV,ZemBe,1049480,ZiemV_ZemBe_1049480,ZiemV_ZemBe,1934,roka
13721680,conj,Kristine,9,Kristine,7.0,npfsn_,npfsn5,Case=Nom|Gender=Fem|Number=Sing,PROPN,387,JekaK,KrisU,1296792,JekaK_KrisU_1296792,JekaK_KrisU,1930,Kristine
37883302,punct,!,11,!,10.0,zs,zs,_,PUNCT,140,NiedAi,PiekB,1049492,NiedAi_PiekB_1049492,NiedAi_PiekB,1931,!
11644067,,.,67,.,,zs,zs,,,79,JansJ,Ligav,1053661,JansJ_Ligav_1053661,JansJ_Ligav,1932,.


In [97]:
# how many unique form and how many lemma do we have?
unique_forms = df["form"].nunique()
unique_lemmas = df["lemma"].nunique()
print(f"Unique forms: {unique_forms}")
print(f"Unique lemmas: {unique_lemmas}")

Unique forms: 1249151
Unique lemmas: 777635


In [99]:
needle = "kuģitī"
# print sample of 10 rows where lemma is needle
df[df["lemma"] == needle].head(10)

Unnamed: 0,deprel,form,index,lemma,parent,pos,tag,ufeats,upos,sent_ndx,author,title,dom_id,file_stem,file_stem_short,firstEdition,term
872755,,kuģitī,2,kuģitī,,np000_,nc___0,,,91,AustA,GaraJ,1025406,AustA_GaraJ_1025406,AustA_GaraJ,1926,kuģitī


In [100]:
# how about where form is needle?
df[df["form"] == needle].head(10)

Unnamed: 0,deprel,form,index,lemma,parent,pos,tag,ufeats,upos,sent_ndx,author,title,dom_id,file_stem,file_stem_short,firstEdition,term
872755,,kuģitī,2,kuģitī,,np000_,nc___0,,,91,AustA,GaraJ,1025406,AustA_GaraJ_1025406,AustA_GaraJ,1926,kuģitī
6070454,obl,kuģitī,3,kuģitis,2.0,ncmsl_,ncmsl2,Case=Loc|Gender=Masc|Number=Sing,NOUN,997,EgliV,SkolK,771017,EgliV_SkolK_771017,EgliV_SkolK,1921,kuģitis
6101521,obl,kuģitī,4,kuģitis,3.0,ncmsl_,ncmsl2,Case=Loc|Gender=Masc|Number=Sing,NOUN,601,EgliV,SkolK,771017,EgliV_SkolK_771017,EgliV_SkolK,1921,kuģitis
10752258,,kuģitī,20,kuģitis,,npmsl_,ncmsl2,,,638,IeviK,SievM,1051668,IeviK_SievM_1051668,IeviK_SievM,1926,kuģitis
16626927,,kuģitī,8,kuģitis,,ncmsl_,ncmsl2,,,713,KukuJ,Laimi,1058165,KukuJ_Laimi_1058165,KukuJ_Laimi,1939,kuģitis


In [None]:
# we see that some lemmas are not in the form we are looking for

In [101]:
# let's check the shape of different needle = "kuģītis"
needle = "kuģītis"
print(f"Shape where lemma is {needle}: {df[df['lemma'] == needle].shape}")
print(f"Shape where form is {needle}: {df[df['form'] == needle].shape}")

Shape where lemma is kuģītis: (205, 17)
Shape where form is kuģītis: (27, 17)


## Combining responses into unique response document frequency counts

Next we will combine all responses from specific prompt into a single dataframe. We will use a parquet file with lemmatized results to compare against.

In [None]:
# let's compare set of file stems in our anaylsis folder with set of stems in our parquet file
# we will use set comprehension
# we will use set comprehension

# list of files in our target folder
air_analysis_files = list(target_folder.iterdir())
# set of stems in our target folder
air_analysis_stems = {file.stem for file in air_analysis_files}
# set of stems in our parquet file
parquet_stems = set(df["file_stem"].unique())
# how many total of each?
print(f"Total stems in air analysis folder: {len(air_analysis_stems)}")
print(f"Total stems in parquet file: {len(parquet_stems)}")
# let's see the difference
print(f"Stems in air analysis folder but not in parquet file: {air_analysis_stems - parquet_stems}")
print(f"Stems in parquet file but not in air analysis folder: {parquet_stems - air_analysis_stems}")

# we can see that there have been some changes in the file names but it should not meaningfully affect our analysis


Total stems in air analysis folder: 458
Total stems in parquet file: 485
Stems in air analysis folder but not in parquet file: {'RoziP_DivaS_1053490', 'JansJ_Dzimt_1051747', 'ZariK_DzivU_1051770', 'MateJ_PatrU_1293506', 'Anoni_BandK_419229', 'CeplA_Zeme_882015', 'JaunJ_Aija_656227', 'KaijI_IedzG_1296798', 'NiedA_LiduD_907728', 'VecoJ_ViktH_1296782', 'DeglA_Zelte_414397', 'JaunJ_BaltG_413159', 'KaijI_Juga_886317', 'UpitA_JaunA_103001', 'NiedA_Siksp_1544155', 'KaudR_MernL_771080', 'EldgH_ZvaiN_771102', 'DeglA_Patri_66131', 'JatnG_TevPi_1049477', 'ZeibJ_BaroB_1293562', 'KaijI_Dzint_1053686', 'UpitA_ZidaT_869211', 'KaijI_Sfink_886333', 'SpriJ_NaveL_1053548', 'MateJ_SadzV_416277'}
Stems in parquet file but not in air analysis folder: {'JansJ_Dzim_1051747', 'JatnG_TevPi_049477', 'KaudR_MernL_413085', 'KabeV_DzelD_1049482', 'UpitA_ZemNa_102999', 'UpitA_ZidaT_869228', 'FimbK_PecPu_1051725', 'UpitA_JaunA_1040993', 'RudzE_CaurE_886344', 'CeplA_Zeme1_882015', 'RoziP_DivaS_957619', 'NiedAn_LiduD_4

In [20]:
target_folder

WindowsPath('../data/analysis_air/2025_02_28_17_50_54')

In [39]:
from pandas.errors import EmptyDataError # we need to import this error
# now let's create a function that takes a subfolder with responses and returns a dataframe with unique terms and their combined counts
# to do so we will go through all csv files in subfolder
# we will extract rows with unique terms and their counts
# and then we will merge them into a single dataframe by adding counts
# we will return this dataframe
def get_combined_response_df(subfolder, verbose=False):
    dfs = []
    for file in tqdm(subfolder.iterdir()):
        if file.suffix == ".csv":
            try:
                df = pd.read_csv(file)
            except EmptyDataError:
                print(f"Empty data error for {file}")
                continue
            # drop duplicates
            df = df.drop_duplicates()
            # name first column term
            df.columns = ["term", "term_freq"]
            # we also want to add doc_freq column with value 1
            df["doc_freq"] = 1
            # df = df.groupby("term").sum().reset_index()
            dfs.append(df)
    if verbose:
        # print shape of first 5 dataframes
        for df in dfs[:5]:
            print(df.shape)
    df = pd.concat(dfs).groupby("term").sum().reset_index()
    # sort alphabetically
    # df = df.sort_values("term_freq", ascending=False)
    df = df.sort_values("doc_freq", ascending=False)
    return df



In [37]:
# let's test it on first air folder
air_df = get_combined_response_df(target_folder)
# shape
print(f"Shape: {air_df.shape}")
#  head 10
air_df.head(10)

458it [00:00, 948.89it/s]

Shape: (1744, 3)





Unnamed: 0,term,term_freq,doc_freq
1504,vilciens,566,102
771,laivas,925,95
160,auto,1875,83
582,kamanas,149,71
1176,rati,2222,63
727,kuģis,480,60
786,laivu,502,56
825,lidmašīna,546,52
722,kuģi,1513,52
1386,tvaikonis,158,50


In [28]:
# let's save this dataframe to a csv file in parent of target folder
# the file name will be air_combined with datetime stamp and then .csv
# we will save it without index

# target file
target_file = Path(target_folder).parent / f"air_combined_{now.strftime('%Y_%m_%d_%H_%M_%S')}.csv"
# save
air_df.to_csv(target_file, index=False)

## Extracting tf and df for maritime responses

In [91]:
# let's do the same for maritime folders
maritime_target_parent = Path("../data") / "analysis_maritime" 
# get subfolders
maritime_folders = [f for f in maritime_target_parent.iterdir() if f.is_dir()]
# print
print(f"Maritime folders:")
for folder in maritime_folders:
    print(folder)

Maritime folders:
..\data\analysis_maritime\2025_02_27_21_05_04


In [92]:

maritime_target_folder = Path(maritime_folders[0])
maritime_target_folder

WindowsPath('../data/analysis_maritime/2025_02_27_21_05_04')

In [40]:
maritime_df = get_combined_response_df(maritime_target_folder, verbose=True)
# shape
print(f"Shape: {maritime_df.shape}")
#  head 10
maritime_df.head(10)

604it [00:00, 1956.77it/s]

Empty data error for ..\data\analysis_maritime\2025_02_27_21_05_04\RozeL_StipK_964055.csv


916it [00:01, 498.51it/s] 

(11, 3)
(5, 3)
(7, 3)
(14, 3)
(2, 3)
Shape: (1240, 3)





Unnamed: 0,term,term_freq,doc_freq
560,laivas,1146,183
492,kuģis,770,138
485,kuģi,2492,127
843,rati,2821,86
580,laivu,590,78
581,laivā,521,65
1003,tvaikonis,196,64
1118,vilciens,362,63
397,kamanas,141,60
558,laiva,944,50


In [41]:
# let's save results to maritime_target_parent
# target file will have name maritime_term_tf_doc_tf with datetime stamp and then .csv
# we will save it without index
target_file = maritime_target_parent / f"maritime_term_tf_doc_tf_{now.strftime('%Y_%m_%d_%H_%M_%S')}.csv"
# save
maritime_df.to_csv(target_file, index=False)

## Analysing combined LLM results from air and maritime prompts

Next we will load the saved CSV files and see if we can combine some terms to obtain candidates for time series analysis.



In [4]:
# martime analysis folder
maritime_src_folder = Path("../data/analysis_maritime")
# get all csv files
maritime_files = list(Path(maritime_src_folder).glob("*.csv"))
# sort by date created
maritime_files = sorted(maritime_files, key=lambda x: x.stat().st_ctime) # st_ctime is depreceated but still works for time being
# how many files do we have?
print(f"Number of files: {len(list(maritime_files))}")
# print names
for file in maritime_files:
    print(file)

Number of files: 1
..\data\analysis_maritime\maritime_term_tf_doc_tf_2025_02_28_17_50_54_gemini_flash_15.csv


In [5]:
# let's load the latest file into df
maritime_df = pd.read_csv(maritime_files[-1])
# shape
print(f"Shape: {maritime_df.shape}")
# head
maritime_df.head()

Shape: (1240, 3)


Unnamed: 0,term,term_freq,doc_freq
0,laivas,1146,183
1,kuģis,770,138
2,kuģi,2492,127
3,rati,2821,86
4,laivu,590,78


In [6]:
# let's sort by term see if we can combine some terms
maritime_df = maritime_df.sort_values("term")
# let's see if we can combine some terms
# we will use fuzzy matching
# we will use rapidfuzz library
import rapidfuzz
from rapidfuzz import process
# let's see how many unique terms we have
unique_terms = maritime_df["term"].unique()
print(f"Number of unique terms: {len(unique_terms)}")

Number of unique terms: 1240


In [7]:
# we would like to combine terms that are similar
# we will use process.extract to get similar terms
# we will use threshold of 90
# we will use limit of 5
# we will use scorer fuzz.ratio
# we will use rapidfuzz.fuzz.ratio
# we will use rapidfuzz.fuzz.token_sort_ratio
# we will use rapidfuzz.fuzz.token_set_ratio

# let's test it on first term
term = unique_terms[0]
# get similar terms
similar_terms = process.extract(term, unique_terms, scorer=rapidfuzz.fuzz.ratio, 
                                limit=10, 
                                score_cutoff=90)
                                
# print
print(f"Term: {term}")
for similar_term in similar_terms:
    print(similar_term)

Term: "Ausekli"
('"Ausekli"', 100.0, 0)


In [22]:
# how about kuģis?
term = "kuģis"
# get similar terms
similar_terms = process.extract(term, unique_terms, scorer=rapidfuzz.fuzz.ratio, 
                                limit=50, 
                                score_cutoff=50)
# print
print(f"Term: {term}")
for similar_term in similar_terms:
    print(similar_term)

Term: kuģis
('kuģis', 100.0, 492)
('kuģits', 90.9090909090909, 499)
('kušģis', 90.9090909090909, 545)
('kuģi', 88.88888888888889, 485)
('kuģelis', 83.33333333333334, 481)
('kuģitis', 83.33333333333334, 498)
('kuģītis', 83.33333333333334, 522)
('kuģim', 80.0, 487)
('kuģit', 80.0, 496)
('kuģos', 80.0, 510)
('kuģus', 80.0, 515)
('kuģģi', 80.0, 516)
('kuģši', 80.0, 530)
('kuģģitis', 76.92307692307692, 517)
('* kuģi', 72.72727272727273, 24)
('kuģcus', 72.72727272727273, 479)
('kuģeli', 72.72727272727273, 480)
('kuģiem', 72.72727272727273, 486)
('kuģiti', 72.72727272727273, 497)
('kuģitī', 72.72727272727273, 500)
('kuģiša', 72.72727272727273, 501)
('kuģiši', 72.72727272727273, 502)
('kuģišu', 72.72727272727273, 503)
('kuģīti', 72.72727272727273, 520)
('kuģīši', 72.72727272727273, 525)
('bruņkuģis', 71.42857142857143, 152)
('kaŗakuģis', 71.42857142857143, 432)
('kuģinieks', 71.42857142857143, 490)
('tankkuģis', 71.42857142857143, 938)
('gaisakuģis', 66.66666666666667, 315)
('kumēlis', 66.6666

In [23]:
# let's try the same for laiva
term = "laiva"
# get similar terms
similar_terms = process.extract(term, unique_terms, scorer=rapidfuzz.fuzz.ratio, 
                                limit=50, 
                                score_cutoff=50)
# print
print(f"Term: {term}")
for similar_term in similar_terms:
    print(similar_term)

Term: laiva
('laiva', 100.0, 558)
('laimva', 90.9090909090909, 551)
('laivam', 90.9090909090909, 559)
('laivas', 90.9090909090909, 560)
('laiv', 88.88888888888889, 557)
('laiviņa', 83.33333333333334, 574)
('laivāam', 83.33333333333334, 582)
('laivu', 80.0, 580)
('laivā', 80.0, 581)
('lajva', 80.0, 586)
('* laivas', 76.92307692307692, 25)
('laiviņas', 76.92307692307692, 575)
('lielaiva', 76.92307692307692, 610)
('laipas', 72.72727272727273, 555)
('laivām', 72.72727272727273, 583)
('laivās', 72.72727272727273, 584)
('burulaiva', 71.42857142857143, 190)
('liellaiva', 71.42857142857143, 618)
('zvejlaiva', 71.42857142857143, 1180)
('Liellaivas', 66.66666666666667, 44)
('burulaivas', 66.66666666666667, 191)
('laivelē', 66.66666666666667, 562)
('laivinieka', 66.66666666666667, 566)
('laiviņu', 66.66666666666667, 576)
('laiviņā', 66.66666666666667, 577)
('lielalivas', 66.66666666666667, 611)
('liellaivas', 66.66666666666667, 619)
('lielļaivas', 66.66666666666667, 623)
('motorlaiva', 66.6666666

In [None]:
# as we can see from above example a reasonable cutoff would be 70 that would still include various forms of the same word
# also we might want to consider later filtering out terms that do not start with same letter as those might be compound words 
# we will want to treat compound words separately

In [13]:
df.columns

Index(['deprel', 'form', 'index', 'lemma', 'parent', 'pos', 'tag', 'ufeats',
       'upos', 'sent_ndx', 'author', 'title', 'dom_id', 'file_stem',
       'file_stem_short', 'firstEdition', 'term'],
      dtype='object')

## Normalizing lemma set from big parquet data in latsenrom

In [25]:
unique_lemma_set = set(df["lemma"].unique())
# how many unique lemmas do we have from large df?
print(f"Number of unique lemmas: {len(unique_lemma_set)}")
# this is much larger number than we would expect in Latvian language
# mostly due to mispellings and ocr errors and other issues
# let's convert this set to set of all lowercase lemmas 
unique_lemma_lower_set = {lemma.lower() for lemma in unique_lemma_set}
# how many unique lemmas do we have in lowercase?
print(f"Number of unique lemmas in lowercase: {len(unique_lemma_lower_set)}")

Number of unique lemmas: 777635
Number of unique lemmas in lowercase: 731566


In [27]:
# now let's remove all non-letter characters from lemmas,except for hyphens and regular whitespace
# we will use regular expressions
import re
# let's test it on first lemma
lemma = "kuģis  ! *3 vasara"
# remove all non-letter characters
lemma = re.sub(r"[^a-zāčēģīķļņšūž -]", "", lemma.lower())
# we also want to replace occurrences of multiple spaces with single space
lemma = re.sub(r"\s+", " ", lemma)
# print
print(f"Lemma: {lemma}")


Lemma: kuģis vasara


In [68]:
# now let's create a function that will take a lemma and return a cleaned lemma
# we will remove all non-letter characters except hyphens and regular whitespace
# we will also replace multiple spaces with single space
def clean_lemma(lemma):
    # lowercase
    lemma = lemma.lower()
    # replace ŗ with r
    lemma = lemma.replace("ŗ", "r")
    # remove all non-letter characters
    lemma = re.sub(r"[^a-zāčēģīķļņšūž -]", "", lemma.lower())
    # replace multiple spaces with single space
    lemma = re.sub(r"\s+", " ", lemma)
    return lemma
# now let's run it on set of unique unique_lemma_lower_set
# how many unique lemmas do we have in uncleaned set?
print(f"Number of unique lemmas in uncleaned set: {len(unique_lemma_lower_set)}")
# let's create cleaned set
cleaned_lemma_set = {clean_lemma(lemma) for lemma in unique_lemma_lower_set}
# how many unique lemmas do we have in cleaned set?
print(f"Number of unique lemmas in cleaned set: {len(cleaned_lemma_set)}")

Number of unique lemmas in uncleaned set: 731566
Number of unique lemmas in cleaned set: 701916


In [102]:
# since we have many erroronous lemma, idea is to filter out those lemma that occur less than some threshold
# we will use threshold of 5 - assumption being that if lemma occurs less than 5 times it is likely an error
# we will use collections.Counter to count occurrences of each lemma
from collections import Counter
# let's test it on first 10 lemmas
lemma_counts = Counter(df["lemma"].str.lower())
# print first 10 lemma counts
for lemma, count in list(lemma_counts.items())[:10]:
    print(f"{lemma}: {count}")

mīla: 7060
ārprāts: 650
vara: 7726
romāns: 1028
": 836408
redzēt: 58049
,: 3041049
vents: 669
cik: 34624
līksmi: 442


In [103]:
# how many lemma do we have in total?
print(f"Total number of lemma: {len(lemma_counts)}")

Total number of lemma: 731566


In [104]:
# how many lemma do we have that occur 5 or more times?
threshold = 5
# we will use dictionary comprehension
# we will use key value pair if value is greater than or equal to threshold
# we will use items method to get key value pairs
# we will use dict method to convert key value pairs to dictionary
lemma_counts_filtered = dict({key: value for key, value in lemma_counts.items() if value >= threshold}.items())
# how many lemma do we have that occur 5 or more times?
print(f"Total number of lemma that occur {threshold} or more times: {len(lemma_counts_filtered)}")

Total number of lemma that occur 5 or more times: 121086


In [105]:
# let's ordre lemma_counts_filtered by value
# we will use sorted function
# we will use lambda function
# we will use itemgetter
from operator import itemgetter
lemma_counts_sorted = dict(sorted(lemma_counts_filtered.items(), key=itemgetter(1), reverse=True))
# let's print first 10
for lemma, count in list(lemma_counts_sorted.items())[:10]:
    print(f"{lemma}: {count}")

,: 3041049
.: 2176740
un: 1088962
būt: 920018
": 836408
-: 711881
tas: 503867
viņš: 500675
es: 358826
kas: 318374


In [None]:
# how about last 10
for lemma, count in list(lemma_counts_sorted.items())[-10:]:
    print(f"{lemma}: {count}")
# we can see thatr some of the lemma are proper nouns but that is fine
# we've made a reduction in lemma count by about 600% which is good
# the current count of 120k lemma is still too high but we will leave it for now

laupītajs: 5
mārietiņ: 5
vēps: 5
uriels: 5
landze: 5
miroņrate: 5
rījnieks: 5
grāvele: 5
bogurski: 5
mche-che-che: 5


In [107]:
# so cleaned_lemma_set will be keys of lemma_counts_filtered in a set
cleaned_lemma_set = set(lemma_counts_filtered.keys())
# how many unique lemmas do we have in cleaned_lemma_set?
print(f"Number of unique lemmas in cleaned set: {len(cleaned_lemma_set)}")

Number of unique lemmas in cleaned set: 121086


In [108]:
# sample 10 random lemmas from cleaned set
import random
random_lemmas = random.sample(sorted(cleaned_lemma_set), 20)
random.seed(2025)
# print
for lemma in random_lemmas:
    print(lemma)
# we can see that most are variations of misspellings and ocr errors

atsacitot
kāsa
lesim
biljards
apspis
būiu
tirpics
fpēdams
kuraã
bizons
logeem
arbuzs
priecadams
andreas
pamudinats
nokļūsēt
atslaucīt
turpat
šķeltnis
fagrābt


In [109]:
"kuģis" in cleaned_lemma_set # lookup should be in O(1) time for existance in set

True

In [110]:
# convert unique_terms to set
unique_terms_set = set(unique_terms)
# how many unique terms do we have?
print(f"Number of unique terms: {len(unique_terms_set)}")
# let's keep only lowercase terms
unique_terms_set = {term.lower() for term in unique_terms_set}
# how many unique terms do we have in lowercase?
print(f"Number of unique terms in lowercase: {len(unique_terms_set)}")

Number of unique terms: 1240
Number of unique terms in lowercase: 1237


In [111]:
# now let's clean our unique terms using the same approach of lowercasing and removing non-letter characters
# we will use the same function as well
# how many unique terms do we have?
print(f"Number of unique terms: {len(unique_terms_set)}")
# let's create cleaned set
cleaned_term_set = {clean_lemma(term) for term in unique_terms_set}
# how many unique terms do we have in cleaned set?
print(f"Number of unique terms in cleaned set: {len(cleaned_term_set)}")
# what is difference between unique terms and cleaned terms?
print(f"Difference between unique terms and cleaned terms: {len(unique_terms_set - cleaned_term_set)}")
# let's get a sample of 10 terms that are in unique terms but not in cleaned terms
random.seed(2025)
sample_terms = random.sample(sorted(unique_terms_set - cleaned_term_set), 10)

# print
for term in sample_terms:
    print(term)
# we can see that our cleaning function got rid of some slavic characters and other non-letter characters
# we also have replace ŗ with r

Number of unique terms: 1237
Number of unique terms in cleaned set: 1227
Difference between unique terms and cleaned terms: 82
земūdens laivu
"krīvu"
ratí
* baļķi
«kurmis»
"ausekli"
kaŗakuģis
kaŗakuģu
куģiem
* spārnu ratu


In [112]:
# let's find only terms that are lemmas 
# to do so we will filter with a requirment that the term can be found in lemma column in the large df that we loaded earlier
# it is not 100% accurate (as it is theoretically possible some lemma is not in the term) but it should be a good approximation



lemmas = [term.lower() for term in tqdm(unique_terms_set) if term.lower() in cleaned_lemma_set]
# how many lemmas do we have from our maritime terms?
print(f"Number of lemmas: {len(lemmas)}")
# first five sorted
lemmas = sorted(lemmas)
for lemma in lemmas[:25]:
    print(lemma)
# we see there are some lemma that are still a variation of the same word

100%|██████████| 1237/1237 [00:00<00:00, 857437.46it/s]

Number of lemmas: 206
aero
aeroplana
aeroplans
aeroplāns
aizjūgs
arkls
artilerija
artilērija
auto
autobuss
autokārs
automašīna
automobilis
automobils
automobīlis
autotaksis
bajars
baltiņš
barka
barkase
barkass
barža
berbep
bobs
brauceji





In [113]:
# let's create a dictionary with lemmas as keys and list of fuzzy similar terms as values
# we will use a threshold of 75
lemma_dict = {}
for lemma in tqdm(lemmas):
    similar_terms = process.extract(lemma, cleaned_term_set, scorer=rapidfuzz.fuzz.ratio, 
                                    limit=50, 
                                    score_cutoff=75)
    lemma_dict[lemma] = [term for term, score, _ in similar_terms]

# let's see first 5 keys and values
for key, value in list(lemma_dict.items())[:15]:
    print(f"{key}: {value}")

100%|██████████| 206/206 [00:00<00:00, 10798.98it/s]

aero: ['aero']
aeroplana: ['aeroplana', 'aeroplanu', 'aeroplani', 'aeroplanā', 'aeroplans', 'aeroplanis', 'aeroplāns']
aeroplans: ['aeroplans', 'aeroplanis', 'aeroplanu', 'aeroplani', 'aeroplāns', 'aeroplana', 'aeroplanā']
aeroplāns: ['aeroplāns', 'aeroplans', 'aeroplanis', 'aeroplanu', 'aeroplani', 'aeroplana', 'aeroplanā']
aizjūgs: ['aizjūgs', 'aizjūgos', 'aizjūgus', 'aizjūgi', 'aizjūgā', 'aizjūga', 'aizjūgam', 'pajūgs']
arkls: ['arkls']
artilerija: ['artilerija', 'artilērija']
artilērija: ['artilērija', 'artilerija']
auto: ['auto', 'rato']
autobuss: ['autobuss', 'autobusos', 'autobusā', 'autobusi', 'autobusa', 'autobusu']
autokārs: ['autokārs']
automašīna: ['automašīna', 'automašīnas', 'automašīnu', 'mašīna']
automobilis: ['automobilis', 'automobils', 'automobili', 'automobilim', 'automobīlis', 'automobiļis', 'automobiļi', 'automobilī', 'automobiļos', 'automobiļus', 'automobiļiem', 'automobīlī', 'automobīļi', 'automobiļa', 'automobiļu']
automobils: ['automobils', 'automobilis', 'aut




In [64]:
# how about last 5 items
for key, value in list(lemma_dict.items())[-15:]:
    print(f"{key}: {value}")

zirgs: ['zirgs', 'zirgi', 'zirgu', 'zirgelis']
zirģelis: ['zirģelis', 'zirgelis']
zvejlaiva: ['zvejlaiva', 'zvejlaivas', 'zvejas laivas', 'zvejnieku laiva']
zārks: ['zārks']
zēģele: ['zēģele', 'zēģeles', 'zēģelnieka']
zēģelkuģis: ['zēģelkuģis', 'zēģelkuģišiem', 'zēģelniekus']
ātrvilciens: ['ātrvilciens', 'ātrvilcienu', 'ātrvilcienā', 'vilciens', 'vilcienus', 'vilcienos', ' vilciens', 'sanitārvilciens', 'bruņuvilciens']
četrjūgs: ['četrjūgs', 'četrjūgā', 'trijjūgs']
čoliņa: ['čoliņa', 'čoliņu', 'čoliņā']
ērzelis: ['ērzelis']
ķēve: ['ķēve', 'ķēvi']
šalons: ['šalons', 'ešalons', 'ešalonu', 'ešelons', 'ešaloni', 'galeons']
škipers: ['škipers']
šoneris: ['šoneris', 'šoneri', 'šonerus', 'šoneriem']
štīmers: ['štīmers', 'štīmeri', 'stīmeris']


In [114]:
# let's filter the values in our lemma_dict so that only terms that start with same letter as lemma are included
# also we want to exclude terms that have difference in length from original lemma by more than two characters
# we will create a new dictionary with these requirements
filtered_lemma_dict = {}
for key, value in lemma_dict.items():
    filtered_value = [term for term in value if term.startswith(key[0]) and abs(len(term) - len(key)) <= 2]
    filtered_lemma_dict[key] = filtered_value

# let's see first 5 keys and values
for key, value in list(filtered_lemma_dict.items())[:15]:
    print(f"{key}: {value}")

aero: ['aero']
aeroplana: ['aeroplana', 'aeroplanu', 'aeroplani', 'aeroplanā', 'aeroplans', 'aeroplanis', 'aeroplāns']
aeroplans: ['aeroplans', 'aeroplanis', 'aeroplanu', 'aeroplani', 'aeroplāns', 'aeroplana', 'aeroplanā']
aeroplāns: ['aeroplāns', 'aeroplans', 'aeroplanis', 'aeroplanu', 'aeroplani', 'aeroplana', 'aeroplanā']
aizjūgs: ['aizjūgs', 'aizjūgos', 'aizjūgus', 'aizjūgi', 'aizjūgā', 'aizjūga', 'aizjūgam']
arkls: ['arkls']
artilerija: ['artilerija', 'artilērija']
artilērija: ['artilērija', 'artilerija']
auto: ['auto']
autobuss: ['autobuss', 'autobusos', 'autobusā', 'autobusi', 'autobusa', 'autobusu']
autokārs: ['autokārs']
automašīna: ['automašīna', 'automašīnas', 'automašīnu']
automobilis: ['automobilis', 'automobils', 'automobili', 'automobilim', 'automobīlis', 'automobiļis', 'automobiļi', 'automobilī', 'automobiļos', 'automobiļus', 'automobiļiem', 'automobīlī', 'automobīļi', 'automobiļa', 'automobiļu']
automobils: ['automobils', 'automobilis', 'automobili', 'automobilī', 'aut

In [115]:
# last 15 items
for key, value in list(filtered_lemma_dict.items())[-15:]:
    print(f"{key}: {value}")

vērsis: ['vērsis']
vēzums: ['vēzums', 'vēzumus', 'vezums', 'vezumos']
zemūdene: ['zemūdene', 'zemūdenes', 'zemūdenē', 'zemūdeņu']
zemūdenslaiva: ['zemūdenslaiva', 'zemūdenslaivas', 'zemūdens laivas', 'zemūdenslaivu', 'zemūdens laivu']
zirdziņš: ['zirdziņš', 'zirdziņi']
zirgs: ['zirgs', 'zirgi', 'zirgu']
zirģelis: ['zirģelis', 'zirgelis']
zārks: ['zārks']
zēģele: ['zēģele', 'zēģeles']
ātrvilciens: ['ātrvilciens', 'ātrvilcienu', 'ātrvilcienā']
četrjūgs: ['četrjūgs', 'četrjūgā']
čoliņa: ['čoliņa', 'čoliņu', 'čoliņā']
ērzelis: ['ērzelis']
ķēve: ['ķēve', 'ķēvi']
šoneris: ['šoneris', 'šoneri', 'šonerus', 'šoneriem']


In [117]:
# shape
print(f"Shape: {maritime_df.shape}")
maritime_df.head()


Shape: (1227, 2)


Unnamed: 0_level_0,term_freq,doc_freq
term,Unnamed: 1_level_1,Unnamed: 2_level_1
laivas,1146,183
kuģis,770,138
kuģi,2492,127
rati,2821,86
laivu,590,78


In [118]:
# let's create index for maritime_df that is term lowercased and contains only letters and hyphens
# reset index first
maritime_df = maritime_df.reset_index()
maritime_df["term"] = maritime_df["term"].apply(clean_lemma)
# use term as index
maritime_df = maritime_df.set_index("term")
# let's see first 5 index values
maritime_df.head()
# do we have any duplicate index values?
print(f"Number of duplicate index values: {maritime_df.index.duplicated().sum()}")
# let's see the duplicates
duplicates = maritime_df.index[maritime_df.index.duplicated()]
# print
for duplicate in duplicates:
    print(duplicate)

Number of duplicate index values: 0


In [81]:
# let's combine the rows with same index by summing term_freq and doc_freq
# to do so we will first reset index
maritime_df = maritime_df.reset_index()
# now we will group by term and sum
maritime_df = maritime_df.groupby("term").sum().reset_index()
# let's see first 5 rows
maritime_df.head()

Unnamed: 0,term,term_freq,doc_freq
0,,0,7
1,baļķi,0,1
2,divrati,0,1
3,kuģi,0,1
4,laivas,0,2


In [119]:
# shape
print(f"Shape: {maritime_df.shape}")
# sort by most doc_freq
maritime_df = maritime_df.sort_values("doc_freq", ascending=False)
# let's see first 10 rows  
maritime_df.head(10)

Shape: (1227, 2)


Unnamed: 0_level_0,term_freq,doc_freq
term,Unnamed: 1_level_1,Unnamed: 2_level_1
laivas,1146,183
kuģis,770,138
kuģi,2492,127
rati,2821,86
laivu,590,78
laivā,522,66
tvaikonis,196,64
vilciens,362,63
kamanas,141,60
laiva,944,50


In [85]:
# make term the index
maritime_df = maritime_df.set_index("term")
# let's see first 5 index values
maritime_df.head()

Unnamed: 0_level_0,term_freq,doc_freq
term,Unnamed: 1_level_1,Unnamed: 2_level_1
laivas,1146,183
kuģis,770,138
kuģi,2492,127
rati,2821,86
laivu,590,78


In [120]:
# now let's create a a new dataframe using filtered_lemma_dict as a guide
# rows will be lemmas - keyis from filtered_lemma_dict
# there will be following columns:
# terms - values from the filtered_lemma_dict
# term_freq - sum of term_freq for terms
# doc_freq - sum of doc_freq for terms
# we will sort by doc_freq

# first let's create a list of dictionaries
data = []
for key, value in tqdm(filtered_lemma_dict.items()):
    term_freq = 0
    # we need to sum all term_freq for items in value
    for term in value:
        term_freq += maritime_df.loc[term, "term_freq"]
    doc_freq = 0
    # we need to sum all doc_freq for items in value
    for term in value:
        doc_freq += maritime_df.loc[term, "doc_freq"]
    data.append({"lemma": key, "terms": value, "term_freq": term_freq, "doc_freq": doc_freq})
# let's create a dataframe
lemma_df = pd.DataFrame(data)
# sort by doc_freq
lemma_df = lemma_df.sort_values("doc_freq", ascending=False)
# let's see first 10 rows
lemma_df.head(50)



100%|██████████| 206/206 [00:00<00:00, 32091.32it/s]


Unnamed: 0,lemma,terms,term_freq,doc_freq
99,laiv,"[laiv, laivā, laiva, laivu, laivās, laivām, la...",3364,418
100,laiva,"[laiva, laimva, laivas, laivam, laiv, laivāam,...",3283,405
90,kuģis,"[kuģis, kuģits, kušģis, kuģi, kuģelis, kuģītis...",3410,318
102,laiviņa,"[laiviņa, laiviņas, laiviņā, laiviņu, laiva, l...",2262,301
86,kuģa,"[kuģa, kuģga, kuģiša, kuģīša, kuģu, kuģī, kuģi...",3547,228
91,kuģitis,"[kuģitis, kuģģitis, kuģits, kuģiti, kuģītis, k...",825,176
93,kuģītis,"[kuģītis, kuģīti, kuģītim, kuģitis, kuģis, kuģ...",818,175
98,kuši,"[kuši, kuģši, kugiši, kuģiši, kušģis, kuģīši, ...",2844,148
173,tvaikonis,"[tvaikonis, tvaikoni, tvaikonītis, tvaikonitis...",422,147
87,kuģelis,"[kuģelis, kuģeli, kuģis, kuģits, kušģis]",772,142


In [121]:
# let's save this dataframe to maritime_target_parent
# target file will have name maritime_lemma_term_tf_doc_tf with datetime stamp and then .csv
# we will save it without index
target_file = maritime_target_folder.parent / f"maritime_lemma_term_tf_doc_tf_{now.strftime('%Y_%m_%d_%H_%M_%S')}.csv"
# save
lemma_df.to_csv(target_file, index=False)


In [None]:
# let's double check is really "kuģitī" in df.lemma column
needle = "kuģitī"

print(f"Is {needle} in lemma column: {needle in df['lemma'].values}")
# this means the lemma column has a lot of dubious entries meaning the lemmatizer did not work as well as we would have hoped

Is kuģitī in lemma column: True


## Extracting actual maritime terms for further analysis

We have to make a judgment call on what terms we want to analyze further.
Our choice is to manually select terms from top 50 that are related to maritime industry.

We will keep demuinative terms and remove terms that are not related to maritime industry.

In [126]:
# print top 50 terms from index
for term in lemma_df.lemma[:50]:
    print(term)

laiv
laiva
kuģis
laiviņa
kuģa
kuģitis
kuģītis
kuši
tvaikonis
kuģelis
kuŗi
rats
tvaikonītis
vilciens
pajūgs
kamans
tvaikonitis
automobilis
ratiņš
automobīlis
kamanas
automobils
vezums
plosts
kuša
kuga
kuģga
lidmašīna
ragavas
jachta
auto
lidmašina
vagons
motorlaiva
važonis
tramvajs
lokomotive
lokomotīve
lokomobīle
ormanis
liellaiva
ormans
ratus
plostiņš
vāģis
lāviņa
velkonis
barka
mašīna
barkase


In [None]:
# # terms to keep
maritime_terms_to_keep = ["laiva", "laiviņa", "kuģis", "kuģītis", "jachta", "plosts", "motorlaiva", "liellaiva", "tvaikonis", "gondola"]
# # let's save these files into a csv file in parent of target folder
# # the file name will be maritime_terms_to_keep with datetime stamp and then .csv
# # we will save it without index
# # target file
target_file = maritime_target_folder.parent / f"maritime_terms_to_keep_{now.strftime('%Y_%m_%d_%H_%M_%S')}.csv"
with open(target_file, "w", encoding="utf-8") as f:
    f.write("\n".join(maritime_terms_to_keep))

## Extracting actual air transport terms for further analysis

In [131]:
# airtime analysis folder
air_target_parent = Path("../data/analysis_air")
# get list of csv files
air_files = list(air_target_parent.glob("*.csv"))
# sort by date created
air_files = sorted(air_files, key=lambda x: x.stat().st_ctime) # st_ctime is depreceated but still works for time being
# how many files do we have?
print(f"Number of files: {len(list(air_files))}")
# print names
for file in air_files:
    print(file)

Number of files: 1
..\data\analysis_air\air_term_tf_doc_tf_2025_02_28_17_50_54_gemini_flash_15.csv


In [133]:
# let's load the latest file into df
air_df = pd.read_csv(air_files[-1])
# shape
print(f"Shape: {air_df.shape}")
# head
air_df.head(25)

Shape: (1744, 3)


Unnamed: 0,term,term_freq,doc_freq
0,vilciens,566,102
1,laivas,925,95
2,auto,1875,83
3,kamanas,149,71
4,rati,2222,63
5,kuģis,480,60
6,laivu,502,56
7,lidmašīna,546,52
8,kuģi,1513,52
9,tvaikonis,158,50


In [150]:
# cleaned air terms
cleaned_air_terms_list = air_df["term"].apply(clean_lemma).to_list()
# how clean air terms do we have?
print(f"Number of cleaned air terms: {len(cleaned_air_terms_list)}")

Number of cleaned air terms: 1744


In [147]:
# now we want to keep only lemma that are in cleaned_lemma_set
# we will use isin method
# we will use str.lower method
air_lemmas = air_df["term"].str.lower().isin(cleaned_lemma_set)
# how many lemmas do we have?
print(f"Number of lemmas: {air_lemmas.sum()}")
# convert air_lemmas to list
air_lemmas = air_df[air_lemmas]
# shape
print(f"Shape: {air_lemmas.shape}")
# we just want term as set
air_lemmas_set = set(air_lemmas["term"])
# how many unique lemmas do we have?
print(f"Number of unique lemmas: {len(air_lemmas_set)}")

Number of lemmas: 269
Shape: (269, 3)
Number of unique lemmas: 269


In [148]:
# now let's create cleaned air_term set using same approach as for maritime terms
# we will use clean_lemma function
# we will use set comprehension
# we will use str.lower method

cleaned_air_term_set = {clean_lemma(term) for term in air_lemmas_set}
# how many unique terms do we have in cleaned air term set?
print(f"Number of unique terms in cleaned air term set: {len(cleaned_air_term_set)}")


Number of unique terms in cleaned air term set: 262


In [151]:
type(cleaned_air_term_set), type(cleaned_air_terms_list)

(set, list)

In [155]:
# now we want to create air_lemma_dict where keys are lemmas and values are similar terms
# we will use the same approach as for maritime terms
# we will use threshold of 75
air_lemma_dict = {}
for lemma in tqdm(cleaned_air_term_set):
    similar_terms = process.extract(lemma, sorted(cleaned_air_terms_list), scorer=rapidfuzz.fuzz.ratio, 
                                    limit=50, 
                                    score_cutoff=75)
    air_lemma_dict[lemma] = [term for term, score, _ in similar_terms]
# let's see first 5 keys and values
for key, value in list(air_lemma_dict.items())[:15]:
    print(f"{key}: {value}")
print("Last 15 items")
# last 15 items
for key, value in list(air_lemma_dict.items())[-15:]:
    print(f"{key}: {value}")


  0%|          | 0/262 [00:00<?, ?it/s]

100%|██████████| 262/262 [00:00<00:00, 2220.60it/s]

ešalona: ['ešalona', 'ešaloni', 'ešalons', 'ešalonu', 'ešalonus', 'šalons', 'ešaloniem']
straujš: ['straujš']
ekipāža: ['ekipāža', 'ekipāžai', 'ekipāžas', 'ekipāžā', 'ekipažas', 'ekipāžām', 'ekipāžās']
federrati: ['federrati']
dūkanis: ['dūkanis', 'dūkanais']
šoneris: ['šoneris', 'šoneri']
karuselis: ['karuselis', 'kamelis']
sikspārnis: ['sikspārnis']
salnis: ['salnis']
puskariete: ['puskariete', 'puskariete', 'puskarietei', 'puskarieti', 'puskarieti', 'puskarietē', 'puskarīte', 'kariete', 'kariete', 'karietei', 'karietes', 'karietes']
autobuss: ['autobuss', 'autobusos', 'autobus', 'autobusa', 'autobusi', 'autobusu', 'autobusā', 'autobusam', 'autobusiem']
kūlejs: ['kūlejs', 'kūleju']
groži: ['groži', 'grožu']
lode: ['lode']
ragaviņas: ['ragaviņas', 'ragaviņās', 'ragavas', 'ragavas', 'ragaviņām', 'ragutiņas', 'ragavās']
Last 15 items
motorlaiva: ['motorlaiva', 'motorlaivas', 'motorlaivu', 'motorlaivā', 'motorlaivā', 'motorlaiviņas', 'motorlaiva kaija', 'motora']
telegrafeja: ['telegrafe




In [167]:
# if term is not index make it index
if "term" in air_df.columns:
    air_df = air_df.set_index("term")
# air_df shape
print(f"Shape: {air_df.shape}")
# head()
air_df.head()

Shape: (1744, 2)


Unnamed: 0_level_0,term_freq,doc_freq
term,Unnamed: 1_level_1,Unnamed: 2_level_1
vilciens,566,102
laivas,925,95
auto,1875,83
kamanas,149,71
rati,2222,63


In [168]:
# let's normalize term
# first reset index if term is index
if "term" in air_df.index.name:
    air_df = air_df.reset_index()
# now let's normalize term
air_df["term"] = air_df["term"].apply(clean_lemma)
# make term the index
air_df = air_df.set_index("term")
# let's see first 5 index values
air_df.head()

Unnamed: 0_level_0,term_freq,doc_freq
term,Unnamed: 1_level_1,Unnamed: 2_level_1
vilciens,566,102
laivas,925,95
auto,1875,83
kamanas,149,71
rati,2222,63


In [170]:
# let's get rid of duplicate rows
# first reset index
air_df = air_df.reset_index()
# drop duplicates
air_df = air_df.drop_duplicates()
# make term the index
air_df = air_df.set_index("term")
# let's see first 5 index values
air_df.head()

Unnamed: 0_level_0,term_freq,doc_freq
term,Unnamed: 1_level_1,Unnamed: 2_level_1
vilciens,566,102
laivas,925,95
auto,1875,83
kamanas,149,71
rati,2222,63


In [174]:
# any duplicate index values?
print(f"Number of duplicate index values: {air_df.index.duplicated().sum()}")
# let's drop the duplicates
air_df = air_df[~air_df.index.duplicated()]
# any duplicate index values?
print(f"Number of duplicate index values: {air_df.index.duplicated().sum()}")

Number of duplicate index values: 30
Number of duplicate index values: 0


In [177]:
# now let's create a a new dataframe using air_lemma_dict as a guide
# rows will be lemmas - keyis from air_lemma_dict
# there will be following columns:
# terms - values from the air_lemma_dict
# term_freq - sum of term_freq for terms
# doc_freq - sum of doc_freq for terms
# we will sort by doc_freq

# first let's create a list of dictionaries
data = []
for key, value in tqdm(air_lemma_dict.items()):
    term_freq = 0
    # we need to sum all term_freq for items in value
    for term in value:
        try:
            term_freq += air_df.loc[term, "term_freq"]
        except KeyError:
            print(f"term_freq KeyError for {term}")
    doc_freq = 0
    # we need to sum all doc_freq for items in value
    for term in value:
        try:
            doc_freq += air_df.loc[term, "doc_freq"]
        except KeyError:
            print(f"doc_freq KeyError for {term}")  
    data.append({"lemma": key, "terms": value, "term_freq": term_freq, "doc_freq": doc_freq})
# let's create a dataframe
air_lemma_df = pd.DataFrame(data)
# sort by doc_freq
air_lemma_df = lemma_df.sort_values("doc_freq", ascending=False)
# let's see first 10 rows
air_lemma_df.head(50)

100%|██████████| 262/262 [00:00<00:00, 28320.17it/s]


Unnamed: 0,lemma,terms,term_freq,doc_freq
99,laiv,"[laiv, laivā, laiva, laivu, laivās, laivām, la...",3364,418
100,laiva,"[laiva, laimva, laivas, laivam, laiv, laivāam,...",3283,405
90,kuģis,"[kuģis, kuģits, kušģis, kuģi, kuģelis, kuģītis...",3410,318
102,laiviņa,"[laiviņa, laiviņas, laiviņā, laiviņu, laiva, l...",2262,301
86,kuģa,"[kuģa, kuģga, kuģiša, kuģīša, kuģu, kuģī, kuģi...",3547,228
91,kuģitis,"[kuģitis, kuģģitis, kuģits, kuģiti, kuģītis, k...",825,176
93,kuģītis,"[kuģītis, kuģīti, kuģītim, kuģitis, kuģis, kuģ...",818,175
98,kuši,"[kuši, kuģši, kugiši, kuģiši, kušģis, kuģīši, ...",2844,148
173,tvaikonis,"[tvaikonis, tvaikoni, tvaikonītis, tvaikonitis...",422,147
87,kuģelis,"[kuģelis, kuģeli, kuģis, kuģits, kušģis]",772,142


In [180]:
# let's save this dataframe to air_target_parent
air_target_csv_file = air_target_parent / f"air_lemma_term_tf_doc_tf_{now.strftime('%Y_%m_%d_%H_%M_%S')}.csv"
# save
air_lemma_df.to_csv(air_target_csv_file, index=False)

In [181]:
# now let's  have a manual list of air terms from air_lemma_df
# we have very few air terms so we can do it manually
air_target_terms = ["lidmašīna", "dirižablis", "cepelīns", "aeroplāns", "hidroplāns"]

In [182]:
# let's save those terms to a csv file in parent of target folder
# the file name will be air_target_terms with datetime stamp and then .csv
# we will save it without index
# target file
target_file = air_target_parent / f"air_target_terms_{now.strftime('%Y_%m_%d_%H_%M_%S')}.csv"
with open(target_file, "w", encoding="utf-8") as f:
    f.write("\n".join(air_target_terms))