In [1]:
from sklearn.model_selection import train_test_split
import json
import Levenshtein
import os
import pandas as pd

In [2]:
# Compute character error rate (CER)
def cer(prediction, target):
    """Return the character error rate of `prediction` measured against `target`.

    CER is the Levenshtein edit distance between the two strings divided by the
    number of characters in `target`. The value is not capped at 1.0: a
    prediction much longer than the target yields a CER above 1.

    Args:
        prediction: The hypothesis string (e.g. OCR output).
        target: The reference string (e.g. human transcription).

    Returns:
        The CER as a float. For an empty `target` the ratio is undefined, so the
        edit distance is normalised by 1 instead of 0: the result is 0.0 when
        `prediction` is also empty, otherwise `len(prediction)`.
    """
    if len(target) == 0:
        # The distance to an empty string is len(prediction); returning it directly
        # avoids a ZeroDivisionError on empty references.
        return float(len(prediction))
    distance = Levenshtein.distance(prediction, target)
    return distance / len(target)

# Helper function to preprocess text
def preprocess(c):
    """Normalise typographic punctuation and whitespace in a Series of strings.

    Curly single/double quotes become their straight ASCII forms, an em dash
    becomes a hyphen, every run of whitespace collapses to one space, and
    leading/trailing whitespace is removed.

    Args:
        c: A pandas Series of strings.

    Returns:
        A new Series with the normalised text.
    """
    # Literal (non-regex) substitutions, applied in this order.
    literal_swaps = (
        ("‘", "'"),
        ("’", "'"),
        ("“", '"'),
        ("”", '"'),
        ("—", "-"),
    )
    for fancy, plain in literal_swaps:
        c = c.str.replace(fancy, plain, regex=False)
    collapsed = c.str.replace(r'\s+', ' ', regex=True)
    return collapsed.str.strip()

## BLN600


The following cells load **BLN600**, a parallel corpus of machine (OCR) and human transcriptions of 19th-century newspaper text.

The dataset can be accessed from the DOI below and should be placed in the `data` folder of the project, as `data/BLN600`.

DOI: https://doi.org/10.15131/shef.data.25439023


In [3]:
# Publication titles as they appear in metadata.json, mapped to the names used in
# this notebook. Titles that are not listed are kept unchanged.
PUBLICATION_NAMES = {
    'Lloyd&apos;s Weekly London Newspaper': 'Lloyd\'s Illustrated Newspaper',
    'Lloyd&apos;s Weekly Newspaper': 'Lloyd\'s Illustrated Newspaper',
    'The Illustrated Police News etc': 'Illustrated Police News',
    'The Morning Chronicle': 'Morning Chronicle',
    'The Era': 'The Era',
    'The Charter': 'Charter',
    'Daily News': 'Daily News',
}


def _read_collapsed(path):
    """Read a text file and collapse every run of whitespace into a single space."""
    # Explicit encoding so the result does not depend on the platform default;
    # assumes the corpus files are UTF-8 - TODO confirm against the dataset docs.
    with open(path, 'r', encoding='utf-8') as f:
        return ' '.join(f.read().split())


with open('data/BLN600/metadata.json', 'r', encoding='utf-8') as f:
    metadata = json.load(f)

# One record per document listed in the metadata file.
records = []
for doc in metadata:
    short_id = doc['short_id']
    raw_date = doc['date']  # sliced below as YYYYMMDD
    records.append({
        'Sample ID': short_id,
        'Date': f'{raw_date[:4]}-{raw_date[4:6]}-{raw_date[6:]}',
        'Publication': PUBLICATION_NAMES.get(doc['publication'], doc['publication']),
        'OCR Text': _read_collapsed(os.path.join('data/BLN600/OCR Text', short_id + '.txt')),
        'Ground Truth': _read_collapsed(os.path.join('data/BLN600/Ground Truth', short_id + '.txt')),
    })

# Passing `columns` keeps the expected schema even if the metadata list is empty.
bln600 = pd.DataFrame(records, columns=['Sample ID', 'Date', 'Publication', 'OCR Text', 'Ground Truth'])
bln600['Date'] = pd.to_datetime(bln600['Date'])
bln600.head(10)

Unnamed: 0,Sample ID,Date,Publication,OCR Text,Ground Truth
0,3200797029,1882-05-27,Illustrated Police News,"__RO BBERY AT A BARONET'S. v EDWARn PRING, twe...","ROBBERY AT A BARONETS. EDWARD PRING, twenty-se..."
1,3200797032,1882-05-27,Illustrated Police News,CHARGE OF SETTING FIRE TO A HOUSE. AT the Tham...,CHARGE OF SETTING FIRE TO A HOUSE. AT the Tham...
2,3200797034,1882-05-27,Illustrated Police News,SERVANTS AND THEIR SWEETHEARTS. . A I I . 1 _ ...,"SERVANTS AND THEIR SWEETHEARTS. ON Saturday, a..."
3,3200797037,1882-05-27,Illustrated Police News,CAPTURE OF PICKPOCKETS. AT Stratford petty ses...,CAPTURE OF PICKPOCKETS. AT Stratford petty ses...
4,3200801612,1885-06-20,Illustrated Police News,"DOUBLE MURDER BY A MOTHER AT | I P9,USHOLMIE. ...",DOUBLE MURDER BY A MOTHER AT RUSHOLME. [SUBJEC...
5,3200801613,1885-06-20,Illustrated Police News,* ATTACK ON CONSTABLES AT WIDNES. er Da EfSUDM...,ATTACK ON CONSTABLES AT WIDNES. [SUBJECT OF IL...
6,3200801615,1885-06-20,Illustrated Police News,"ST-TOT FIRING IN MINES. M. AN, important ease ...",SHOT FIRING IN MINES. AN important case came o...
7,3200801619,1885-06-20,Illustrated Police News,THE POISONING CASE IN FRASNCE. Tll. trial oi P...,THE POISONING CASE IN FRANCE. THE trial of Pel...
8,3200801622,1885-06-20,Illustrated Police News,COMPENSATION TO A PUBLICAN. AT the Man sion Ho...,COMPENSATION TO A PUBLICAN. AT the Mansion Hou...
9,3200801629,1885-06-20,Illustrated Police News,"MIDDLESEX SESSIONS. Sl~~s/ - . - - - T ,,1 PAr...",MIDDLESEX SESSION'S. PARISH PROSECUTIONS AT TH...


In [4]:
# Number of documents per (normalised) publication title.
bln600['Publication'].value_counts()

Publication
Lloyd's Illustrated Newspaper    360
Illustrated Police News          212
Morning Chronicle                 13
The Era                            8
Charter                            6
Daily News                         1
Name: count, dtype: int64

In [5]:
# Round each publication year down to its decade (e.g. 1882 -> 1880), then count documents per decade.
decade = bln600['Date'].dt.year.floordiv(10).mul(10)
decade.value_counts()

Date
1890    196
1880    110
1870     94
1860     85
1850     72
1840     26
1830     17
Name: count, dtype: int64

## Sequences

The following cells load sequence pairs from **BLN600**.

Unzip the `Sequences.zip` file in the `data` folder of the project so that the per-sample files are available under `data/Sequences`.

In [6]:
OCR_PREFIX = 'OCR Text: '
GT_PREFIX = 'Ground Truth: '


def _parse_sequence_pairs(lines):
    """Extract (OCR text, ground truth) pairs from the lines of a sequence file.

    Lines starting with 'OCR Text: ' or 'Ground Truth: ' carry one half of a
    pair; all other lines are ignored. A pair is emitted once both halves are
    non-empty, in whichever order they appear.

    Args:
        lines: An iterable of text lines.

    Returns:
        A list of `(ocr_text, ground_truth)` tuples with the prefixes removed
        and surrounding whitespace stripped.
    """
    pairs = []
    ocr_text, ground_truth = '', ''
    seen_ocr = seen_gt = False
    for line in lines:
        if line.startswith(OCR_PREFIX):
            if seen_ocr:
                # A second OCR line before a pair was completed means the previous
                # record was incomplete; drop its leftover half so it cannot be
                # paired with this record.
                ground_truth, seen_gt = '', False
            # removeprefix strips only the leading marker, never text inside the line.
            ocr_text, seen_ocr = line.removeprefix(OCR_PREFIX).strip(), True
        elif line.startswith(GT_PREFIX):
            if seen_gt:
                ocr_text, seen_ocr = '', False
            ground_truth, seen_gt = line.removeprefix(GT_PREFIX).strip(), True
        else:
            continue
        if ocr_text and ground_truth:
            pairs.append((ocr_text, ground_truth))
            ocr_text, ground_truth = '', ''
            seen_ocr = seen_gt = False
    return pairs


# Walk the document table row by row so each sequence inherits its document's
# date and publication without re-filtering `bln600` for every pair.
records = []
for s, doc_date, doc_publication in zip(bln600['Sample ID'], bln600['Date'], bln600['Publication']):
    # Explicit encoding so the result does not depend on the platform default;
    # assumes the sequence files are UTF-8 - TODO confirm against the dataset docs.
    with open(os.path.join('data/Sequences', f'{s}.txt'), 'r', encoding='utf-8') as f:
        pairs = _parse_sequence_pairs(f)
    for ocr_text, ground_truth in pairs:
        records.append({
            'Sample ID': s,
            'Date': doc_date,
            'Publication': doc_publication,
            'OCR Text': ocr_text,
            'Ground Truth': ground_truth,
        })

# Passing `columns` keeps the expected schema even if no pairs were found.
seq = pd.DataFrame(records, columns=['Sample ID', 'Date', 'Publication', 'OCR Text', 'Ground Truth'])
seq['OCR Text'] = preprocess(seq['OCR Text'])
seq['Ground Truth'] = preprocess(seq['Ground Truth'])
# Per-pair CER of the OCR text against its ground truth.
seq['CER'] = [cer(o, g) for o, g in zip(seq['OCR Text'], seq['Ground Truth'])]
seq.head(10)

Unnamed: 0,Sample ID,Date,Publication,OCR Text,Ground Truth,CER
0,3200797029,1882-05-27,Illustrated Police News,__RO BBERY AT A BARONET'S.,ROBBERY AT A BARONETS.,0.181818
1,3200797029,1882-05-27,Illustrated Police News,"v EDWARn PRING, twenty-seven, caipentbr, was b...","EDWARD PRING, twenty-seven, carpenter, was bro...",0.095238
2,3200797029,1882-05-27,Illustrated Police News,Chief Inspector Phillips said there were a num...,Chief Inspector Phillips said there were a num...,0.020979
3,3200797029,1882-05-27,Illustrated Police News,"The prisoner was taken into custody, it appear...","The prisoner was taken into custody, it appear...",0.034884
4,3200797029,1882-05-27,Illustrated Police News,ThP prisoner called at the house and represent...,The prisoner called at the house and represent...,0.027778
5,3200797029,1882-05-27,Illustrated Police News,"He *was given access to the bedrooms, and afte...","He was given access to the bedrooms, and after...",0.065421
6,3200797029,1882-05-27,Illustrated Police News,His statement of having been sent to the house...,His statement of having been sent to the house...,0.0
7,3200797029,1882-05-27,Illustrated Police News,The prisoner was ultimately arrested on this c...,The prisoner was ultimately arrested on this c...,0.028926
8,3200797029,1882-05-27,Illustrated Police News,"Robert Yfoellam, footman to Sir Robert, depose...","Robert Wollam, footman to Sir Robert, deposed ...",0.058394
9,3200797029,1882-05-27,Illustrated Police News,"Witn6ss said new blinds were not rdqoiifed, ,w...","Witness said new blinds were not required, whe...",0.083333


In [7]:
# Summary statistics of the per-pair CER over all sequence pairs.
seq['CER'].describe()

count    13192.000000
mean         0.077117
std          0.121551
min          0.000000
25%          0.013986
50%          0.038095
75%          0.089938
max          2.000000
Name: CER, dtype: float64

## Split

The following cells split the sequence pairs into train/test sets for model development (the validation split is currently commented out).

In [8]:
# Split on unique Sample IDs rather than on rows, so all sequences that share a
# Sample ID land in the same subset.
unique_ids = seq['Sample ID'].unique()
train_ids, test_ids = train_test_split(unique_ids, test_size=0.2, random_state=600)
# train_ids, val_ids = train_test_split(train_ids, test_size=0.125, random_state=600)

in_train = seq['Sample ID'].isin(train_ids)
in_test = seq['Sample ID'].isin(test_ids)

train = seq[in_train]
# val = seq[seq['Sample ID'].isin(val_ids)]
test = seq[in_test]

In [9]:
# Write the training split to CSV (without the index column), then preview its first 10 rows.
train.to_csv('data/train.csv', index=False)
train.head(10)

Unnamed: 0,Sample ID,Date,Publication,OCR Text,Ground Truth,CER
0,3200797029,1882-05-27,Illustrated Police News,__RO BBERY AT A BARONET'S.,ROBBERY AT A BARONETS.,0.181818
1,3200797029,1882-05-27,Illustrated Police News,"v EDWARn PRING, twenty-seven, caipentbr, was b...","EDWARD PRING, twenty-seven, carpenter, was bro...",0.095238
2,3200797029,1882-05-27,Illustrated Police News,Chief Inspector Phillips said there were a num...,Chief Inspector Phillips said there were a num...,0.020979
3,3200797029,1882-05-27,Illustrated Police News,"The prisoner was taken into custody, it appear...","The prisoner was taken into custody, it appear...",0.034884
4,3200797029,1882-05-27,Illustrated Police News,ThP prisoner called at the house and represent...,The prisoner called at the house and represent...,0.027778
5,3200797029,1882-05-27,Illustrated Police News,"He *was given access to the bedrooms, and afte...","He was given access to the bedrooms, and after...",0.065421
6,3200797029,1882-05-27,Illustrated Police News,His statement of having been sent to the house...,His statement of having been sent to the house...,0.0
7,3200797029,1882-05-27,Illustrated Police News,The prisoner was ultimately arrested on this c...,The prisoner was ultimately arrested on this c...,0.028926
8,3200797029,1882-05-27,Illustrated Police News,"Robert Yfoellam, footman to Sir Robert, depose...","Robert Wollam, footman to Sir Robert, deposed ...",0.058394
9,3200797029,1882-05-27,Illustrated Police News,"Witn6ss said new blinds were not rdqoiifed, ,w...","Witness said new blinds were not required, whe...",0.083333


In [10]:
# Summary statistics of the per-pair CER within the training split.
train['CER'].describe()

count    10400.000000
mean         0.075276
std          0.117504
min          0.000000
25%          0.013973
50%          0.037975
75%          0.088889
max          2.000000
Name: CER, dtype: float64

In [11]:
# Write the test split to CSV (without the index column), then preview its first 10 rows.
test.to_csv('data/test.csv', index=False)
test.head(10)

Unnamed: 0,Sample ID,Date,Publication,OCR Text,Ground Truth,CER
18,3200797032,1882-05-27,Illustrated Police News,CHARGE OF SETTING FIRE TO A HOUSE.,CHARGE OF SETTING FIRE TO A HOUSE.,0.0
19,3200797032,1882-05-27,Illustrated Police News,"AT the Thames police-eourt, Charles Mawzi was ...","AT the Thames police-court, Charles Mawzi was ...",0.019737
20,3200797032,1882-05-27,Illustrated Police News,At a quarter to four o'clock that 'c�morniink ...,At a quarter to four o'clock that morning Lill...,0.105611
21,3200797032,1882-05-27,Illustrated Police News,"*Lilly caught hold of him, 'ai found that his ...","Lilly caught hold of him, and found that his c...",0.11399
22,3200797032,1882-05-27,Illustrated Police News,"On reaclhing No. 2, whiG1e is an unoc6njid 'ao...","On reaching No. 2, which is an unoccupied hous...",0.107784
23,3200797032,1882-05-27,Illustrated Police News,"He iqrihng his rattle, and William Gadd, sorge...","He sprang his rattle, and William Gadd, sergea...",0.090909
24,3200797032,1882-05-27,Illustrated Police News,"Ford got in the windoxw, and on opening the ba...","Ford got in the window, and on opening the bac...",0.055944
25,3200797032,1882-05-27,Illustrated Police News,"Lilly ran for the turnuock, and in the meantim...","Lilly ran for the turncock, and in the meantim...",0.076087
26,3200797032,1882-05-27,Illustrated Police News,"In ten minutes' time a fire engine arrived, an...","In ten minutes' time a fire engine arrived, an...",0.07377
27,3200797032,1882-05-27,Illustrated Police News,"The prisoner said he 'ent to put out the fire,",The prisoner said he went to put out the fire.,0.043478


In [12]:
# Summary statistics of the per-pair CER within the test split.
test['CER'].describe()

count    2792.000000
mean        0.083974
std         0.135372
min         0.000000
25%         0.014122
50%         0.038462
75%         0.092946
max         1.700000
Name: CER, dtype: float64