In [1]:
from datasets import Dataset
from IPython.core.getipython import get_ipython
from peft import AutoPeftModelForCausalLM
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig, pipeline
import Levenshtein
import pandas as pd
import torch

In [None]:
# Compute character error rate (CER)
def cer(prediction, target):
    """Return the character error rate of `prediction` against `target`.

    CER is the Levenshtein (edit) distance between the two strings divided
    by the number of characters in `target`.

    Args:
        prediction: The string produced by the model (or the raw OCR text).
        target: The ground-truth string the prediction is compared against.

    Returns:
        The error rate as a float. When `target` is empty the ratio is
        undefined, so the raw edit distance is returned instead (the same as
        normalising by a length of 1): 0.0 if `prediction` is also empty,
        otherwise the number of characters in `prediction`.
    """
    if len(target) == 0:
        # The edit distance to an empty string is just the number of
        # characters that would have to be deleted from the prediction.
        # Returning it directly avoids a ZeroDivisionError.
        return float(len(prediction))
    distance = Levenshtein.distance(prediction, target)
    return distance / len(target)

# Helper function to assemble the per-sample results table (callers save it as a CSV)
def get_results(data, preds):
    """Combine the test data with model predictions and CER statistics.

    Args:
        data: Object exposing `to_pandas()`; the resulting frame is expected
            to contain the columns 'CER' and 'Ground Truth'.
        preds: Model corrections, one per row of `data`, in row order.

    Returns:
        A DataFrame holding the original columns (with 'CER' renamed to
        'old_CER') plus 'Model Correction', 'new_CER' and 'CER_reduction'
        (the relative CER improvement in percent). The division by
        'old_CER' is not guarded, so rows whose 'old_CER' is 0 do not get a
        finite 'CER_reduction'.
    """
    frame = data.to_pandas()
    frame['Model Correction'] = preds
    frame = frame.rename(columns={'CER': 'old_CER'})

    def _corrected_cer(row):
        # CER of the model output measured against the ground truth
        return cer(row['Model Correction'], row['Ground Truth'])

    frame['new_CER'] = frame.apply(_corrected_cer, axis=1)

    before = frame['old_CER']
    after = frame['new_CER']
    frame['CER_reduction'] = ((before - after) / before) * 100
    return frame

## BART

Generate post-OCR corrections with BART and save them to the `results` folder of the project.

In [None]:
# Model identifier handed to `from_pretrained` for both model and tokenizer
model_dir = 'pykale/bart-large-ocr'

# Read the test split and wrap it in a `Dataset` so that iterating yields one dict per row
test = Dataset.from_pandas(pd.read_csv('data/test.csv'))

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

# Text-to-text pipeline running on the GPU; generated sequences are capped at 1024 tokens
generator = pipeline(
    'text2text-generation',
    model=model.to('cuda'),
    tokenizer=tokenizer,
    device='cuda',
    max_length=1024,
)

# Correct every OCR sample in turn, keeping the predictions in row order
preds = []
for sample in tqdm(test):
    generated = generator(sample['OCR Text'])
    preds.append(generated[0]['generated_text'])

# Attach predictions and CER statistics, then save the table for later inspection
results = get_results(test, preds)
results.to_csv('results/bart-large.csv', index=False)

## Llama 2

Generate post-OCR corrections with Llama 2 and save them to the `results` folder of the project.

In [None]:
# Model identifier handed to `from_pretrained` for both model and tokenizer
model_dir = 'pykale/llama-2-13b-ocr'

test = pd.read_csv('data/test.csv')
test = Dataset.from_pandas(test)

# 4-bit NF4 quantisation with double quantisation; matmuls are computed in bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoPeftModelForCausalLM.from_pretrained(
    model_dir,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Generation below samples (do_sample=True), so seed the torch RNG once to
# make a fresh "Restart & Run All" repeatable.
# NOTE(review): GPU kernels may still introduce small nondeterminism.
torch.manual_seed(42)


def correct_with_llama(ocr_text):
    """Return the model's corrected version of a single OCR string.

    The text is embedded in the instruction prompt, the prompt is tokenised
    (truncated to 1024 tokens), and up to 1024 new tokens are sampled.

    Args:
        ocr_text: The raw OCR text to be corrected.

    Returns:
        The decoded, whitespace-stripped continuation produced by the model
        (the prompt itself is not included).
    """
    prompt = f"""### Instruction:
Fix the OCR errors in the provided text.

### Input:
{ocr_text}

### Response:
"""

    # NOTE(review): with truncation=True an over-long prompt is cut to 1024
    # tokens; presumably this removes the end of the prompt (including the
    # "### Response:" marker) - confirm the tokenizer's truncation side.
    input_ids = tokenizer(prompt, max_length=1024, return_tensors='pt', truncation=True).input_ids.cuda()
    with torch.inference_mode():
        outputs = model.generate(input_ids=input_ids, max_new_tokens=1024, do_sample=True, temperature=0.7, top_p=0.1, top_k=40)

    # The returned sequence starts with the prompt tokens. Drop them by token
    # count rather than by `len(prompt)` characters: the decoded prompt is not
    # guaranteed to have the same length as the original string (truncation,
    # tokenizer round-trip), which would cut into or leak into the prediction.
    new_token_ids = outputs[0, input_ids.shape[1]:].tolist()
    return tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()


# Call the model in an ordinary loop so that any exception stops the run
# immediately instead of silently leaving `preds` shorter than `test`.
preds = []
for sample in tqdm(test):
    preds.append(correct_with_llama(sample['OCR Text']))

results = get_results(test, preds)
results.to_csv('results/llama-2-13b.csv', index=False)

## Corrections

Examine post-OCR corrections of a given model on the test set.

In [2]:
# Load the saved results table of every model, keyed by model name
model_names = ('bart-base', 'bart-large', 'llama-2-7b', 'llama-2-13b')
results = {name: pd.read_csv(f'results/{name}.csv') for name in model_names}

# Pick the model whose corrections are examined, and preview its first ten rows
corrections = results['llama-2-13b']
corrections.head(10)

Unnamed: 0,Sample ID,Date,Publication,OCR Text,Ground Truth,old_CER,Model Correction,new_CER,CER_reduction
0,3200797032,1882-05-27,Illustrated Police News,CHARGE OF SETTING FIRE TO A HOUSE.,CHARGE OF SETTING FIRE TO A HOUSE.,0.0,CHARGE OF SETTING FIRE TO A HOUSE.,0.0,
1,3200797032,1882-05-27,Illustrated Police News,"AT the Thames police-eourt, Charles Mawzi was ...","AT the Thames police-court, Charles Mawzi was ...",0.019737,"AT the Thames police-court, Charles Mawzi was ...",0.006579,66.666667
2,3200797032,1882-05-27,Illustrated Police News,At a quarter to four o'clock that 'c�morniink ...,At a quarter to four o'clock that morning Lill...,0.105611,At a quarter to four o'clock that morning Lill...,0.009901,90.625
3,3200797032,1882-05-27,Illustrated Police News,"*Lilly caught hold of him, 'ai found that his ...","Lilly caught hold of him, and found that his c...",0.11399,"Lilly caught hold of him, and found that his c...",0.0,100.0
4,3200797032,1882-05-27,Illustrated Police News,"On reaclhing No. 2, whiG1e is an unoc6njid 'ao...","On reaching No. 2, which is an unoccupied hous...",0.107784,"On reaching No. 2, which is an unoccupied hous...",0.0,100.0
5,3200797032,1882-05-27,Illustrated Police News,"He iqrihng his rattle, and William Gadd, sorge...","He sprang his rattle, and William Gadd, sergea...",0.090909,"He was crying his rattle, and William Gadd, se...",0.068182,25.0
6,3200797032,1882-05-27,Illustrated Police News,"Ford got in the windoxw, and on opening the ba...","Ford got in the window, and on opening the bac...",0.055944,"Ford got in the window, and on opening the bac...",0.0,100.0
7,3200797032,1882-05-27,Illustrated Police News,"Lilly ran for the turnuock, and in the meantim...","Lilly ran for the turncock, and in the meantim...",0.076087,"Lilly ran for the turncock, and in the meantim...",0.016304,78.571429
8,3200797032,1882-05-27,Illustrated Police News,"In ten minutes' time a fire engine arrived, an...","In ten minutes' time a fire engine arrived, an...",0.07377,"In ten minutes' time a fire engine arrived, an...",0.0,100.0
9,3200797032,1882-05-27,Illustrated Police News,"The prisoner said he 'ent to put out the fire,",The prisoner said he went to put out the fire.,0.043478,The prisoner said he went to put out the fire.,0.0,100.0


In [3]:
# Print each sample's OCR text, ground truth and model correction, numbered from 1
rows = zip(corrections['OCR Text'], corrections['Ground Truth'], corrections['Model Correction'])
for i, (ocr_text, ground_truth, model_correction) in enumerate(rows):
    print(i+1)
    print(f"OCR Text:\n{ocr_text}\n")
    print(f"Ground Truth:\n{ground_truth}\n")
    print(f"Model Correction:\n{model_correction}\n\n")

1
OCR Text:
CHARGE OF SETTING FIRE TO A HOUSE.

Ground Truth:
CHARGE OF SETTING FIRE TO A HOUSE.

Model Correction:
CHARGE OF SETTING FIRE TO A HOUSE.


2
OCR Text:
AT the Thames police-eourt, Charles Mawzi was charged with wilfully and maliciously setting fire to the house, 2 Caroline-9treet, Commercial-road East.

Ground Truth:
AT the Thames police-court, Charles Mawzi was charged with willfully and maliciously setting fire to the house, 2 Caroline-street, Commercial-road East.

Model Correction:
AT the Thames police-court, Charles Mawzi was charged with wilfully and maliciously setting fire to the house, 2 Caroline-street, Commercial-road East.


3
OCR Text:
At a quarter to four o'clock that 'c�morniink Lilly, a constable, No. 332 E, wias'oh datyin Carblind-stc'eet, . when he met the prisoeer, and he said, "I hive just come from ,afire," alluclingtoaiconflagration in.Philpot- street, Commercial-rbad, " and therb is anotlher one up the street, close to the brewery."

Ground Truth:
At