In [63]:
import transformers
import torch
from transformers import AutoTokenizer
import gzip
import json
from random import randint
import unicodedata
import re

from eval_metrics import calculate_metrics
from Bio.Align import PairwiseAligner

In [49]:
def align(input_text, output_text):
    """Return the first pairwise alignment between two strings.

    A fresh ``PairwiseAligner`` is configured in ``'global'`` mode with the
    end-gap score of both the target and the query set to 0.0; every other
    scoring parameter is left at the aligner's defaults.

    Args:
        input_text: Sequence passed as the first argument to ``aligner.align``.
        output_text: Sequence passed as the second argument to ``aligner.align``.

    Returns:
        The item at index 0 of the alignments produced by the aligner.
    """
    pairwise = PairwiseAligner()
    pairwise.mode = 'global'
    # Gaps at the ends of either sequence get a score of 0.0.
    pairwise.query_end_gap_score = 0.0
    pairwise.target_end_gap_score = 0.0
    return pairwise.align(input_text, output_text)[0]

def read_data(fname, max_examples=None):
    """Load JSON Lines records from a gzip-compressed UTF-8 text file.

    Args:
        fname: Path of the gzip-compressed JSONL file to read.
        max_examples: Stop once this many records have been collected.
            Any falsy value (``None`` or ``0``) means "no limit" and the
            whole file is read.

    Returns:
        list: One decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    examples = []
    with gzip.open(fname, "rt", encoding="utf-8") as f:
        for line in f:
            # Blank or whitespace-only lines (e.g. a trailing empty line)
            # are not records; json.loads would raise on them.
            if not line.strip():
                continue
            examples.append(json.loads(line))
            if max_examples and len(examples) >= max_examples:
                break
    return examples

def sliding_window(tokens, window_size, prompt_size=0):
    """Cut a randomly positioned window out of a tokenized batch.

    The window start is drawn with ``randint`` (both bounds inclusive) from
    ``[prompt_size, max(prompt_size, seq_len - window_size)]``, where
    ``seq_len`` is the size of dimension 1 of ``tokens["input_ids"]``.
    When the sequence is too short for a full window the start is simply
    ``prompt_size`` and the slice may hold fewer than ``window_size`` columns.

    Args:
        tokens: Mapping with ``"input_ids"`` and ``"attention_mask"`` entries
            that support ``[:, a:b]`` indexing and ``.to(...)``
            (presumably 2-D ``(batch, seq)`` tensors - confirm with caller).
        window_size: Number of columns to keep along dimension 1.
        prompt_size: Smallest allowed start index.

    Returns:
        dict: ``"input_ids"`` and ``"attention_mask"`` sliced to the same
        window and moved to the module-level ``device``.
    """
    seq_len = tokens["input_ids"].size()[1]
    latest_start = max(prompt_size, seq_len - window_size)
    start = randint(prompt_size, latest_start)
    window = slice(start, start + window_size)
    # `device` is a global expected to be defined elsewhere in the notebook.
    return {
        key: tokens[key][:, window].to(device)
        for key in ("input_ids", "attention_mask")
    }


# Inputs for this notebook: the gzip-compressed JSONL data file and the
# Hugging Face model id whose tokenizer is loaded.
EXAMPLES_PATH = "/scratch/project_2000539/jenna/ocr-correction/by_page_dev_slim.jsonl.gz"
TOKENIZER_NAME = "Qwen/Qwen1.5-7B-Chat"

# Load every record of the file (no max_examples cap).
examples = read_data(EXAMPLES_PATH)

# Pad on the left and reuse the end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [66]:
# Take the "input" and "output" fields of the first loaded example.
first_example = examples[0]
input_test = first_example["input"]
output_test = first_example["output"]

# Align the two texts with their newlines removed and keep the alignment's
# string rendering.
a = str(
    align(
        input_test.replace("\n", ""),
        output_test.replace("\n", ""),
    )
)

# Show the rendering as a list of its lines.
print(a.split("\n"))

['certainty as to compos-ers in the fifteenth century;-for not only the madrigals that were invented after-the new notation were at that time printed, but many-of the old ones were made to ass--ume this more per-s-ect form, and, therefore, are pres-erved even to this-day. "; Su7mer is Icumen, ------------------a celebrated madrigal forf--ix voices, the manus-cript of which is now in the-Britis-h Mus-eum, was compos-ed about 1460. SKEL-TON, in the reign of HENRY the s-eventh, wrotes--ongs, which were compos-ed in parts by CORNISI1-,-and many others might be mentioned.---------------FRANCHINUS, who wrote a work which wa~--printed at MILAN, gives s-ome of the firs-t examples-for the improvement of mus-ical notation, but thes-e-characters were cut out in blocks; the Gern-ans,-however, improved upon this practis-e, and that arts--eems to have arrived to s-omething like perfection-about the year 1500, s-o that this improvements--eemed ready for the us-e it was pur- to afterwards in-ENGLAND; 

In [62]:
def extract_aligned_text(base_text, alignment_string, query_text):
    """Return the span of ``base_text`` bracketed by verified match runs.

    Every maximal run of ``'|'`` in ``alignment_string`` is a candidate. A
    candidate is kept only when ``base_text`` and ``query_text`` hold the
    same characters at the run's index range. The result is the slice of
    ``base_text`` from the start of the first kept run to the end of the
    last kept run, including any unverified text in between.

    The list of kept match objects is printed before returning.

    Args:
        base_text: Text the result is sliced from.
        alignment_string: Marker string indexed like the two texts, with
            ``'|'`` marking candidate matching columns.
        query_text: Text compared against ``base_text`` at each run.

    Returns:
        str: The extracted slice, or ``""`` when there is no ``'|'`` run
        or no run survives the comparison.
    """
    confirmed = [
        run
        for run in re.finditer(r'\|+', alignment_string)
        if base_text[run.start():run.end()] == query_text[run.start():run.end()]
    ]
    if not confirmed:
        return ""

    print(confirmed)
    # finditer yields non-overlapping runs from left to right, so the first
    # kept run has the smallest start and the last one the largest end.
    span_start = confirmed[0].start()
    span_end = confirmed[-1].end()
    return base_text[span_start:span_end]

# Example usage.
# The three strings are indexed column by column: position i of
# alignment_string refers to character i of both base_text and query_text.
# Both '|' runs (columns 12-16 and 19-23) cover identical characters in the
# two texts, so the extracted span runs from column 12 to column 24 of
# base_text, i.e. "im john from".
base_text = "hello there im john from"
alignment_string = "------------|||||--|||||-------"
query_text = "            im josh from london"
aligned_text = extract_aligned_text(base_text, alignment_string, query_text)
print("Aligned Text:", aligned_text)

[<re.Match object; span=(12, 17), match='|||||'>, <re.Match object; span=(19, 24), match='|||||'>]
Aligned Text: im john from
