In [9]:
import pandas as pd

from ssrq_retro_lab.config import PROJECT_ROOT, ZG_DATA_ROOT
from ssrq_retro_lab.pipeline.templates.utils import render_template

In [10]:
test = pd.read_pickle("./pkl_cache/openai_finetune_ocr_validation_v2.pkl")

In [11]:
import json

from ssrq_retro_lab.train.messages import SYSTEM_ROLE_V2
from ssrq_retro_lab.pipeline.llm import chat
from ssrq_retro_lab.pipeline.components.ocr_corrector import CorrectedOCRText

schema = json.dumps(CorrectedOCRText.model_json_schema(), indent=2)

In [12]:
prompt = chat.create_chat_completion_param(
    system=SYSTEM_ROLE_V2,
    user=render_template(
        template_name="openai_ocr_training_user_v2.jinja2",
        schema=schema,
        text_input=json.dumps(test[:40]["source"].to_list(), ensure_ascii=False)
    ),
    assistant=None,
)

In [14]:
response = chat.generate(
    prompt=prompt,
    model_name="ft:gpt-3.5-turbo-1106:personal:ssrq-ocr-cor:8tgnqalq",
    extract_language=True,
    language="json",
)

[32m2024-02-19 11:08:44.807[0m | [34m[1mDEBUG   [0m | [36mssrq_retro_lab.pipeline.llm.chat[0m:[36m_chat_with_open_ai[0m:[36m82[0m - [34m[1mRequestion chat completion with model ft:gpt-3.5-turbo-1106:personal:ssrq-ocr-cor:8tgnqalq for prompt:
 [{'role': 'system', 'content': 'You are an helpful research assistant named Walther, who helps researchers to correct texts. You have an extremly good knowledge in scholarly editing.'}, {'role': 'user', 'content': 'You are an expert in Text Classification with proven knowledge in Textual Scholarship.\n\nYour task is to correct the following lines of text, which were created with OCR. The text is part of a scholarly edition and is written in old and middle German. The input is a JSON array containing one or more lines of text. Strictly follow this rules:\n1. Do not modernize the text.\n2. Correct each line of contained in the input array.\n3. Do not change the order of the lines.\n4. Do not add or remove lines.\n5. Do not mix any lines

In [16]:
result = CorrectedOCRText.model_validate_json(response.unwrap())

In [18]:
from ssrq_retro_lab.validate import general, ocr

expected_results: list[str ] = test[:40]["target"].to_list()

In [19]:
accuracy, f1 = general.calc_ml_metrics(expected_results, result.text)

print(f"Accuracy: {accuracy}, F1: {f1}")

Accuracy: 0.75, F1: 0.75


In [31]:
import textdistance


def calculate_wer(s1, s2):
    s1_words = s1.split()
    s2_words = s2.split()

    lev_distance = textdistance.levenshtein.normalized_distance(s1_words, s2_words) * 100

    wer = lev_distance / len(s1_words)

    return wer

In [32]:
wer_before = calculate_wer(
    "\n".join(expected_results).strip(), "\n".join(test[:40]["source"].tolist()).strip()
)
wer_after = calculate_wer(
    "\n".join(expected_results).strip(), "\n".join(result.text).strip()
)
error_rate_before = ocr.calc_error_rate(
    "\n".join(expected_results).strip(), "\n".join(test[:40]["source"].tolist()).strip()
)
error_rate_after = ocr.calc_error_rate(
    "\n".join(expected_results).strip(), "\n".join(result.text).strip()
)

print(
    f"Error Rate Before \n CER: {error_rate_before.cer}, WER: {wer_before}, Cosine Similarity: {error_rate_after.similarity}"
)
print(
    f"After Correction \n CER: {error_rate_after.cer}, WER: {wer_after}, Cosine Similarity: {error_rate_after.similarity}"
)

Error Rate Before 
 CER: 3.145424836601307, WER: 0.0191766045424582, Cosine Similarity: 0.983545704294228
After Correction 
 CER: 1.4043783560512184, WER: 0.013884780469547055, Cosine Similarity: 0.983545704294228
