# Creation of Training Data for OCR Post-Correction

This notebook is used to create a training data for OCR post-correction. In contrast to the previous [approach](./create_training_data.ipynb) where the OCR training data was based on the complete text of a single page, we will be using line-level data in this notebook. 


In [56]:
import json

import pandas as pd
from ssrq_retro_lab.config import PROJECT_ROOT
from ssrq_retro_lab.pipeline.components.text_extractor import (
    ExtractionInput,
    TextExtractor,
)
from ssrq_retro_lab.repository import reader, writer
from ssrq_retro_lab.train import data, messages
from thefuzz import process

In [6]:
txt_pdf_conversion_table = json.loads(
    reader.TextReader((PROJECT_ROOT / "data/ZG/txt_to_pdf.json")).read()
)

In [11]:
master_transcriptions = [
    txt for txt in (PROJECT_ROOT / "data/ZG/master").glob("*[0-9].txt")
]
volumes = [pdf for pdf in (PROJECT_ROOT / "data/ZG/pdf").glob("*.pdf")]

In [33]:
from collections import namedtuple
from parsel import Selector

LineToLineMatch = namedtuple(
    "LineToLineMatch",
    [
        "source",
        "target",
    ],
)

line_to_line_matches = []

for volume in volumes:
    doc = reader.PDFReader(volume).read()
    volume_name = volume.name.removesuffix(".pdf").replace(".", "_")
    transcriptions = [
        transcription
        for transcription in master_transcriptions
        if transcription.name.startswith(volume_name)
    ]

    for transcription in transcriptions:
        page_number = int(
            txt_pdf_conversion_table[volume_name][
                transcription.name.removesuffix(".txt")
            ]
        )
        # extract all lines from the transcription
        transcription_lines = (
            reader.TextReader(transcription).read().strip().split("\n")
        )
        # extract text at the given page
        text_extraction = TextExtractor().invoke(
            ExtractionInput(1, page_number + 1, page_number + 1)
        )
        page_paragraphs = Selector(
            text_extraction.unwrap()["pages"][0], type="xml"
        ).xpath("//p")
        text_lines = [p.xpath("string()").get() for p in page_paragraphs]

        for transcription_line in transcription_lines:
            match_from_extraction = process.extractOne(
                transcription_line,
                text_lines,
            )
            line_to_line_matches.append(
                LineToLineMatch(match_from_extraction[0], transcription_line)
            )

In [46]:
pd.set_option("display.max_rows", 50)

line_to_line_df = pd.DataFrame(line_to_line_matches)

In [48]:
from thefuzz import fuzz

# Calculate similarity for each row

line_to_line_df["similarity"] = line_to_line_df.apply(
    lambda row: fuzz.ratio(row["source"], row["target"]), axis=1
)

In [51]:
high_similarity_df = line_to_line_df[line_to_line_df["similarity"] >= 75]
high_similarity_df.sort_values(by="similarity", ascending=False, inplace=True)
training_data_df = high_similarity_df.drop("similarity", axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high_similarity_df.sort_values(by="similarity", ascending=False, inplace=True)


In [54]:
# The result is a training set with 617 entries
training_data_df.reset_index(drop=True, inplace=True)
training_data_df

Unnamed: 0,source,target
0,"ein herr uff, ald der, dem er denn den gwalt g...","ein herr uff, ald der, dem er denn den gwalt g..."
1,"standen dem, den er beklaget vor dem amman des...","standen dem, den er beklaget vor dem amman des..."
2,"wegen, gantz ernstlich und fründtlich ermant u...","wegen, gantz ernstlich und fründtlich ermant u..."
3,Signum [MF] Hludowici serenissimi regis.,Signum [MF] Hludowici serenissimi regis.
4,tation wol bewart syn.,tation wol bewart syn.
...,...,...
612,E. Staatswesen • 572-583,E. Staatswesen : 572-183 377
613,"3 s. Ist öch, dc ein gast ze deweder müli melt...","3 s. Ist oͮch, dc ein gâst zê dêwêder múli mêl..."
614,"QW 1 ,1, Nr. 17.","QW I, I, Nr. 17."
615,Hünenberg.,zue Hünenberg


In [55]:
training_data_df.to_pickle("./pkl_cache/ocr_line_based_training.pkl")