# Analyzing LLM Responses to Prompts

In [1]:
# Imports for the whole notebook, plus a printed record of the run environment.
# Timestamp of this run; `now` is reused further down to name the analysis output folder.
from datetime import datetime
now = datetime.now()
print(f"Date: {now}")
# Python interpreter version
import sys
print(f"Python version: {sys.version}")
from pathlib import Path
import json
# NOTE(review): json, time and requests are not used in the cells visible here — confirm before removing.
import time
import requests
# requests version
print(f"Requests version: {requests.__version__}")

from tqdm import tqdm

import pandas as pd
print(f"Pandas version: {pd.__version__}")

Date: 2025-02-27 21:05:04.367645
Python version: 3.12.7 (tags/v3.12.7:0b05ead, Oct  1 2024, 03:06:41) [MSC v.1941 64 bit (AMD64)]
Requests version: 2.32.3
Pandas version: 2.2.2


In [3]:
# Locate the responses folder and report whether it is present and a directory
data_folder = Path("../data/responses")
for label, value in (
    ("Data folder", data_folder),
    ("Data folder exists", data_folder.exists()),
    ("Data folder is dir", data_folder.is_dir()),
):
    print(f"{label}: {value}")

# Collect every direct child of the responses folder that is itself a directory
subfolders = []
for entry in data_folder.iterdir():
    if entry.is_dir():
        subfolders.append(entry)

print("Subfolders:")
for subfolder in subfolders:
    print(subfolder)

Data folder: ..\data\responses
Data folder exists: True
Data folder is dir: True
Subfolders:
..\data\responses\2025_01_28_gemini_2_experimental
..\data\responses\2025_01_29_google_gemini-flash-1.5-8b_no_terms
..\data\responses\2025_01_29_google_gemini-flash-1.5-8b_with_terms
..\data\responses\2025_02_04_google_gemini-flash-1.5-8b_with_terms
..\data\responses\2025_02_26_google_gemini-flash-1.5_land_prompt_1
..\data\responses\2025_02_26_google_gemini-flash-1.5_land_prompt_2
..\data\responses\2025_02_26_openai_gpt-4o-2024-11-20_land_prompt
..\data\responses\2025_02_26_openai_gpt-4o-2024-11-20_land_prompt_2
..\data\responses\2025_02_27_google_gemini-2.0-flash-001_land_prompt
..\data\responses\2025_02_27_google_gemini-2.0-flash-001_land_prompt_2
..\data\responses\2025_02_27_google_gemini-flash-1.5_land_prompt
..\data\responses\2025_02_27_google_gemini-flash-1.5_maritime_prompt
..\data\responses\temp_responses_2025_02_26


## Consolidate openai responses

The OpenAI prompts required us to split the input files into smaller chunks, so the responses arrived as several files per document. Now we need to consolidate them back into a single file per document.

In [4]:
# Raw OpenAI response folders: directories whose name contains "openai".
# Folders starting with "consolidated_" are excluded because they are the OUTPUT of the
# consolidation step below (named "consolidated_" + source folder name) and therefore
# also contain "openai"; without this guard a second run would consolidate its own output.
# Sorted so that openai_folders[0] is the same folder on every run.
openai_folders = sorted(
    f
    for f in data_folder.iterdir()
    if f.is_dir() and "openai" in f.name and not f.name.startswith("consolidated_")
)
print("OpenAI folders:")
for openai_folder in openai_folders:
    print(openai_folder)
    

OpenAI folders:
..\data\responses\2025_02_26_openai_gpt-4o-2024-11-20_land_prompt
..\data\responses\2025_02_26_openai_gpt-4o-2024-11-20_land_prompt_2


In [5]:
def get_files(subfolder):
    """Group the files directly inside ``subfolder`` by document key.

    The key is the first three ``_``-separated parts of the file name joined
    back with ``_`` (e.g. ``AustA_Puisk_1047362_0.txt`` -> ``AustA_Puisk_1047362``).

    Parameters
    ----------
    subfolder : pathlib.Path
        Folder to scan; only regular files are considered, subdirectories are skipped.

    Returns
    -------
    dict[str, list[pathlib.Path]]
        Maps each key to the files sharing it. Keys appear in sorted file-name
        order. Files within a key are ordered by the number after the last ``_``
        of the stem (which looks like the chunk index), so ``_2`` precedes ``_10``.
        Stems without a numeric tail sort first, by name.
    """

    def _chunk_order(path):
        # Numeric tail of the stem; -1 keeps files without one ahead of the numbered ones.
        tail = path.stem.rsplit("_", 1)[-1]
        return (int(tail) if tail.isdecimal() else -1, path.name)

    files = {}
    # iterdir() yields entries in arbitrary order, so sort for reproducible results
    for file in sorted(subfolder.iterdir()):
        if file.is_file():
            key = "_".join(file.name.split("_")[:3])
            files.setdefault(key, []).append(file)
    for group in files.values():
        group.sort(key=_chunk_order)
    return files

# Try the grouping on the first OpenAI folder and show every key with its files
openai_files = get_files(openai_folders[0])
print("OpenAI files:")
for key in openai_files:
    print(f"{key}: {openai_files[key]}")

OpenAI files:
AustA_KaspG_948026: [WindowsPath('../data/responses/2025_02_26_openai_gpt-4o-2024-11-20_land_prompt/AustA_KaspG_948026_0.txt')]
AustA_Puisk_1047362: [WindowsPath('../data/responses/2025_02_26_openai_gpt-4o-2024-11-20_land_prompt/AustA_Puisk_1047362_0.txt'), WindowsPath('../data/responses/2025_02_26_openai_gpt-4o-2024-11-20_land_prompt/AustA_Puisk_1047362_1.txt'), WindowsPath('../data/responses/2025_02_26_openai_gpt-4o-2024-11-20_land_prompt/AustA_Puisk_1047362_2.txt')]
FimbK_KadNa_1049450: [WindowsPath('../data/responses/2025_02_26_openai_gpt-4o-2024-11-20_land_prompt/FimbK_KadNa_1049450_0.txt'), WindowsPath('../data/responses/2025_02_26_openai_gpt-4o-2024-11-20_land_prompt/FimbK_KadNa_1049450_1.txt')]
FimbK_TiltP_1049479: [WindowsPath('../data/responses/2025_02_26_openai_gpt-4o-2024-11-20_land_prompt/FimbK_TiltP_1049479_0.txt'), WindowsPath('../data/responses/2025_02_26_openai_gpt-4o-2024-11-20_land_prompt/FimbK_TiltP_1049479_1.txt')]
GulbA_Gaidi_1350352: [WindowsPath('.

In [11]:
# Apply get_files to every OpenAI folder:
# outer key = folder name, value = the per-document grouping returned by get_files
openai_files = {folder.name: get_files(folder) for folder in openai_folders}

# Number of grouped documents (keys) per folder — not the number of individual chunk files
for folder_name, groups in openai_files.items():
    print(f"{folder_name}: {len(groups)}")

2025_02_26_openai_gpt-4o-2024-11-20_land_prompt: 20
2025_02_26_openai_gpt-4o-2024-11-20_land_prompt_2: 20


In [15]:
def consolidate_files(file_name, file_list, new_subfolder):
    """Merge the chunk files of one document into ``<new_subfolder>/<file_name>.txt``.

    Each input file is split at the first ``"System prompt:"`` marker. The
    stripped text before the marker is written first, one chunk after another
    in ``file_list`` order. After a blank line, every distinct system prompt is
    written once (first-seen order), each preceded by a blank line.

    The blank line before the first prompt matters: the response block of the
    consolidated file is read back as "everything up to the first empty line".

    Parameters
    ----------
    file_name : str
        Stem of the consolidated file to create.
    file_list : iterable of path-like
        Chunk files to merge. A file without the marker contributes its whole
        text as response text and no system prompt.
    new_subfolder : pathlib.Path
        Target folder; created (including parents) if it does not exist.

    All files are read and written as UTF-8.
    """
    marker = "System prompt:"
    # create new subfolder if it does not exist
    new_subfolder.mkdir(parents=True, exist_ok=True)
    system_prompts = []
    with open(new_subfolder / f"{file_name}.txt", "w", encoding="utf-8") as new_file:
        for file in file_list:
            with open(file, "r", encoding="utf-8") as old_file:
                text = old_file.read()
            # partition() splits at the FIRST marker only and never raises when it is absent
            responses, found, prompt = text.partition(marker)
            responses = responses.strip()
            # skip empty chunks: a bare "\n" would end the response block prematurely
            if responses:
                new_file.write(responses + "\n")
            if found:
                system_prompts.append(prompt)
        # dict.fromkeys de-duplicates while keeping first-seen order
        for prompt in dict.fromkeys(system_prompts):
            new_file.write("\n" + marker + prompt)

# test it on second key of openai_files
# we will create a new subfolder in the data respones folder
# new_subfolder = data_folder / "consolidated"
# consolidate_files(list(openai_files.keys())[1], openai_files[list(openai_files.keys())[1]], new_subfolder)



In [16]:
def consolidate_all_files(openai_files, data_folder):
    """Consolidate every grouped document of every folder in ``openai_files``.

    ``openai_files`` maps a folder name to a dict of ``{document key: [files]}``.
    For each folder name the consolidated documents are written, via
    ``consolidate_files``, to ``data_folder / ("consolidated_" + folder name)``.
    """
    for folder_name in openai_files:
        target_dir = data_folder / ("consolidated_" + folder_name)
        grouped = openai_files[folder_name]
        # one consolidated file per document key
        for document_key, chunk_files in grouped.items():
            consolidate_files(document_key, chunk_files, target_dir)

# Run the consolidation for every folder collected in openai_files;
# results are written to "consolidated_<folder name>" subfolders of data_folder.
consolidate_all_files(openai_files, data_folder)

## Getting the subfolders for analysis

In [19]:
# Select the response folders to compare (all are direct children of data_folder):
#   1. consolidated folders for the two run dates, followed by
#   2. raw folders for the same dates whose name contains "land_prompt".
# A raw folder is skipped when a "consolidated_<name>" sibling exists: its chunked files
# (e.g. "..._0.txt") are superseded by the consolidated copy and their names would not
# match the other folders in the comparison below.
date_prefixes = ("2025_02_26", "2025_02_27")
consolidated_prefixes = tuple(f"consolidated_{prefix}" for prefix in date_prefixes)

# sorted() because iterdir() yields entries in arbitrary order
response_dirs = sorted(f for f in data_folder.iterdir() if f.is_dir())
consolidated_names = {f.name for f in response_dirs if f.name.startswith("consolidated_")}

folders_to_analyze = [f for f in response_dirs if f.name.startswith(consolidated_prefixes)]
folders_to_analyze += [
    f
    for f in response_dirs
    if f.name.startswith(date_prefixes)
    and "land_prompt" in f.name
    and f"consolidated_{f.name}" not in consolidated_names
]
print("Folders to analyze:")
for folder in folders_to_analyze:
    print(folder)

Folders to analyze:
..\data\responses\consolidated_2025_02_26_openai_gpt-4o-2024-11-20_land_prompt
..\data\responses\consolidated_2025_02_26_openai_gpt-4o-2024-11-20_land_prompt_2
..\data\responses\2025_02_26_google_gemini-flash-1.5_land_prompt_1
..\data\responses\2025_02_26_google_gemini-flash-1.5_land_prompt_2
..\data\responses\2025_02_27_google_gemini-2.0-flash-001_land_prompt
..\data\responses\2025_02_27_google_gemini-2.0-flash-001_land_prompt_2


## Reading plaintexts into memory

In [20]:
# Plaintexts are in another (private) repo located in our parent folder.
# The folder gets its own name, plaintext_folder: data_folder keeps pointing at
# ../data/responses, so the earlier cells that iterate data_folder can be re-run safely.
# (Earlier runs used ../data/docs and ../../lnb_lat_sen_rom_releases/lat_sen_rom_2025_01_28.)
plaintext_folder = Path("../../lnb_lat_sen_rom_releases/lat_sen_rom_2025_02_04")
assert plaintext_folder.exists(), f"Folder {plaintext_folder} does not exist"

# all .txt files directly inside the folder, in a reproducible order
files = sorted(plaintext_folder.glob("*.txt"))
print(f"Number of files: {len(files)}")
# min()/max() below would fail with an unhelpful error on an empty collection
assert files, f"No .txt files found in {plaintext_folder}"

# file name stem -> full text, decoded as UTF-8
texts = {}
for file in tqdm(files):
    texts[file.stem] = file.read_text(encoding="utf-8")
print(f"Number of texts: {len(texts)}")

# size statistics, measured in characters
total_chars = sum(len(text) for text in texts.values())
print(f"Total characters: {total_chars}")

min_text = min(texts, key=lambda x: len(texts[x]))
print(f"Key for smallest text: {min_text}")
min_chars = len(texts[min_text])
print(f"Number of characters in smallest text: {min_chars}")

max_text = max(texts, key=lambda x: len(texts[x]))
print(f"Key for largest text: {max_text}")
max_chars = len(texts[max_text])
print(f"Number of characters in largest text: {max_chars}")


Number of files: 458


100%|██████████| 458/458 [00:08<00:00, 57.20it/s]

Number of texts: 458
Total characters: 191069647
Key for smallest text: VentA_DepuT_1293527
Number of characters in smallest text: 18648
Key for largest text: DeglA_LabaF_1053655
Number of characters in largest text: 2375090





## Comparing responses

In [23]:
# Check that every folder to analyze contains exactly the same entry names.
# The first folder is the reference. Names are sorted before comparing because
# iterdir() yields entries in arbitrary order, so two folders with identical
# content could otherwise compare as different.
reference_files = sorted(file.name for file in Path(folders_to_analyze[0]).iterdir())
for folder in folders_to_analyze[1:]:
    files = sorted(file.name for file in Path(folder).iterdir())
    assert reference_files == files, f"Files in {folders_to_analyze[0]} and {folder} are not identical"

print("All files are identical")

All files are identical


In [24]:
def get_response_lines(file):
    """Return the leading block of lines of ``file``, stripped of surrounding whitespace.

    The file is read as UTF-8. Collection stops at the first line that is empty
    or whitespace-only; that line and everything after it are ignored.
    """
    with open(file, "r", encoding="utf-8") as handle:
        rows = list(handle)

    collected = []
    for raw in rows:
        stripped = raw.strip()
        if not stripped:
            # first blank line marks the end of the response block
            break
        collected.append(stripped)
    return collected

# Try it on the first reference file inside the first folder to analyze
first_reference_path = Path(folders_to_analyze[0]).joinpath(reference_files[0])
response_lines = get_response_lines(first_reference_path)
print(f"Response lines: {response_lines}")

Response lines: ['zirgi', 'kamanas', 'trijjūgi', 'kropli', 'rati', 'zirgu slidas']


In [27]:
# Display the full list of folders that will be compared
folders_to_analyze

[WindowsPath('../data/responses/consolidated_2025_02_26_openai_gpt-4o-2024-11-20_land_prompt'),
 WindowsPath('../data/responses/consolidated_2025_02_26_openai_gpt-4o-2024-11-20_land_prompt_2'),
 WindowsPath('../data/responses/2025_02_26_google_gemini-flash-1.5_land_prompt_1'),
 WindowsPath('../data/responses/2025_02_26_google_gemini-flash-1.5_land_prompt_2'),
 WindowsPath('../data/responses/2025_02_27_google_gemini-2.0-flash-001_land_prompt'),
 WindowsPath('../data/responses/2025_02_27_google_gemini-2.0-flash-001_land_prompt_2')]

In [26]:
def get_response_df(file, texts):
    """Build a two-column table of response terms and their counts in the plaintext.

    Parameters
    ----------
    file : pathlib.Path
        Response file. Its terms come from ``get_response_lines(file)``.
    texts : dict[str, str]
        Plaintexts keyed by file name stem; the entry for ``file.stem`` is used.

    Returns
    -------
    pandas.DataFrame
        First column is named after the file's parent folder and holds the terms
        in sorted order (duplicates kept). Second column, ``"count"``, holds
        ``plaintext.count(term)``: a case-sensitive, non-overlapping substring count.

        Both columns are always present, even when the file has no response
        lines, so the folder still shows up in a horizontally concatenated table.

    If no plaintext is found for ``file.stem`` a message is printed and every
    count is 0.
    """
    # the term column is named after the parent folder of the file
    term_column = file.parent.name
    plaintext = texts.get(file.stem, "")
    if plaintext == "":
        print(f"Plaintext not found for {file.stem}")
    rows = [
        {term_column: term, "count": plaintext.count(term)}
        for term in sorted(get_response_lines(file))
    ]
    # explicit columns keep the frame's shape stable when rows is empty
    return pd.DataFrame(rows, columns=[term_column, "count"])

# Try it on the first reference file inside the first folder to analyze
sample_path = Path(folders_to_analyze[0]) / reference_files[0]
df = get_response_df(sample_path, texts)
print("Response dataframe:")
df.head()

Response dataframe:


Unnamed: 0,consolidated_2025_02_26_openai_gpt-4o-2024-11-20_land_prompt,count
0,kamanas,1
1,kropli,1
2,rati,2
3,trijjūgi,3
4,zirgi,4


In [28]:
# Name of the first reference file, the sample input used when trying out the functions here
reference_files[0]

'AustA_KaspG_948026.txt'

In [29]:
def get_combined_df(file, texts, subfolders):
    """Place the response tables of ``file`` from every subfolder side by side.

    For each folder in ``subfolders`` the table for ``folder / file`` is built
    with ``get_response_df``; the tables are then concatenated horizontally
    (``axis=1``), so rows are aligned on their numerical index.
    """
    frames = [get_response_df(folder / file, texts) for folder in subfolders]
    return pd.concat(frames, axis=1)

# Try the side-by-side table on the first reference file across all folders
first_reference = reference_files[0]
df = get_combined_df(first_reference, texts, folders_to_analyze)
print("Combined dataframe:")
df.head()



Combined dataframe:


Unnamed: 0,consolidated_2025_02_26_openai_gpt-4o-2024-11-20_land_prompt,count,consolidated_2025_02_26_openai_gpt-4o-2024-11-20_land_prompt_2,count.1,2025_02_26_google_gemini-flash-1.5_land_prompt_1,count.2,2025_02_26_google_gemini-flash-1.5_land_prompt_2,count.3,2025_02_27_google_gemini-2.0-flash-001_land_prompt,count.4,2025_02_27_google_gemini-2.0-flash-001_land_prompt_2,count.5
0,kamanas,1.0,kamanas,1.0,auļos,1,kamanam,1.0,dzelzsceļa,2.0,automobiļi,0.0
1,kropli,1.0,krievu trijjūgi,1.0,fūrmaņu kamanas,0,ragavas,1.0,fūrmaņu kamanas,0.0,kamanām,0.0
2,rati,2.0,mērnieka kājas,0.0,kamanam,1,slitas,1.0,kamanas,1.0,pātagas,2.0
3,trijjūgi,3.0,ragavas,1.0,kamanas,1,trijjūgiem,1.0,trijjūgi,3.0,zirgiem,4.0
4,zirgi,4.0,trijjūgi,3.0,kamanās,8,vilcienu,1.0,vezumnieku ragavas,1.0,,


In [34]:
def create_csv_files(reference_files, texts, subfolders, target_folder, save_excel=True):
    """Write one combined comparison table per reference file into ``target_folder``.

    For every name in ``reference_files`` the table from ``get_combined_df`` is
    saved as ``<stem>.csv`` and, when ``save_excel`` is true, also as
    ``<stem>.xlsx``. Both are written without the index. ``target_folder`` is
    created (including parents) if it does not exist.
    """
    target_folder.mkdir(parents=True, exist_ok=True)

    for file in reference_files:
        combined = get_combined_df(file, texts, subfolders)
        stem = Path(file).stem
        combined.to_csv(target_folder / f"{stem}.csv", index=False)
        if not save_excel:
            continue
        combined.to_excel(target_folder / f"{stem}.xlsx", index=False)

# Write the comparison tables for all reference files.
# Output goes to ../data/analysis/<timestamp>, where the timestamp is the notebook start time (`now`).
run_stamp = now.strftime("%Y_%m_%d_%H_%M_%S")
target_folder = Path("../data").joinpath("analysis", run_stamp)
create_csv_files(reference_files, texts, folders_to_analyze, target_folder)

In [32]:
# Show where the analysis files are written.
# NOTE(review): this cell's execution count (In [32]) is lower than that of the cell that
# assigns target_folder (In [34]), so any recorded output may predate the current
# assignment — re-run this cell after that one.
print(target_folder)

..\..\lnb_lat_sen_rom_releases\lat_sen_rom_2025_02_04\analysis\2025_02_27_21_05_04
