In [1]:
import tqdm
import pandas as pd
from collections import Counter
from unicodedata import normalize

# Width of each CER bin: bin edges run from 0 to 100 in steps of BIN_SIZE.
# Also used as the divisor that turns a bin's left edge into a bin index.
BIN_SIZE = 10

def get_ngrams(string: str, _sizes = (3, ), prefix="$"):
    """Return the relative frequency of each character n-gram of ``string``.

    Args:
        string: Text to cut into overlapping character n-grams.
        _sizes: Iterable of n-gram lengths to extract.
        prefix: String prepended to every n-gram to form its key.

    Returns:
        A dict mapping ``prefix + ngram`` (with spaces in the n-gram written
        as ``_``) to the number of occurrences of that n-gram divided by the
        number of n-grams of that size in ``string``. A size larger than
        ``len(string)`` contributes no keys.
    """
    ret = {}
    # NOTE(review): Unicode normalisation (normalize("NFC", string)) was
    # disabled in the original code; the input is still used as-is.
    for size in _sizes:
        nb = len(string) - (size - 1)
        if nb <= 0:
            # String too short for this size: no n-gram, nothing to divide.
            continue
        # Replace spaces *before* counting so that raw n-grams which map to
        # the same key (e.g. "a b" and "a_b") are summed instead of the later
        # one silently overwriting the earlier one.
        counts = Counter(
            prefix + string[start:start + size].replace(" ", "_")
            for start in range(nb)
        )
        ret.update({key: value / nb for key, value in counts.items()})
    return ret


# Read the gzip-compressed CER table; the first CSV column becomes the index.
df = pd.read_csv(
    "cer.csv.gzip",
    index_col=0,
    compression="gzip",
)

In [2]:
# Collapse rows that share the same manuscript, page, line and transcription
# (the first occurrence is kept), printing the shape before and after.
# Reassignment is used instead of inplace=True so the step reads as an
# explicit "new frame from old frame" transformation.
print(df.shape)
df = df.drop_duplicates(subset=["manuscript", "page_id", "line_id", "transcription"])
print(df.shape)
df.head()

(322903, 8)
(300269, 8)


Unnamed: 0,model,lang,manuscript,page_id,line_id,transcription,CER,CER_NS
0,data-lat_only_3.mlmodel,lat,SBB_PK_Hdschr25,0,0,a ⁊utl,82.3529,81.25
1,data-lat_only_3.mlmodel,lat,SBB_PK_Hdschr25,0,1,t ut t̃ ps p,72.7273,83.3333
2,data-lat_only_3.mlmodel,lat,SBB_PK_Hdschr25,0,2,tas t̃,75.0,76.1905
3,data-lat_only_3.mlmodel,lat,SBB_PK_Hdschr25,0,3,e ult ut p̃ t,68.0,77.2727
4,data-lat_only_3.mlmodel,lat,SBB_PK_Hdschr25,0,4,ba tt ss ttta,70.9677,76.9231


# Compute Bins

In [3]:
# Assign every line to a right-closed CER_NS interval of width BIN_SIZE
# between 0 and 100 (include_lowest lets a value of exactly 0 fall into the
# first interval), then count the lines per interval.
df['CER_BINS'] = pd.cut(
    df['CER_NS'],
    bins=[*range(0, 101, BIN_SIZE)],
    include_lowest=True,
)
df.groupby('CER_BINS').size()

CER_BINS
(-0.001, 10.0]    74658
(10.0, 20.0]      33020
(20.0, 30.0]      24344
(30.0, 40.0]      24586
(40.0, 50.0]      30691
(50.0, 60.0]      29665
(60.0, 70.0]      24833
(70.0, 80.0]      20602
(80.0, 90.0]      15283
(90.0, 100.0]     22587
dtype: int64

In [4]:
# List the distinct CER intervals that occur in the data.
df.CER_BINS.unique()

[(80.0, 90.0], (70.0, 80.0], (90.0, 100.0], (40.0, 50.0], (60.0, 70.0], (50.0, 60.0], (30.0, 40.0], (20.0, 30.0], (10.0, 20.0], (-0.001, 10.0]]
Categories (10, interval[float64, right]): [(-0.001, 10.0] < (10.0, 20.0] < (20.0, 30.0] < (30.0, 40.0] ... (60.0, 70.0] < (70.0, 80.0] < (80.0, 90.0] < (90.0, 100.0]]

# Compute ngrams

In [5]:
# Build one record per sufficiently long line: the line's metadata, its CER
# bin index and the relative frequency of each of its n-grams.
new_df = []

# Running sum of the n-gram frequencies over all kept lines.
total = Counter()


for idx, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    text = str(row.transcription)
    # Lines of 15 characters or fewer are skipped.
    if len(text) <= 15:
        continue
    cnt = get_ngrams(text.lower())
    # Counter.update adds the values in place. Unlike `total += cnt`, it does
    # not re-scan the whole accumulated counter for non-positive entries on
    # every row, which made the loop needlessly quadratic. Every value here
    # is positive, so the resulting counter is the same.
    total.update(cnt)
    new_df.append({
        "idx": idx,
        "lang": row.lang,
        # Left edge of the interval divided by the bin width gives the bin
        # index; int() maps the first bin's slightly negative edge to 0.
        "bin": int(row.CER_BINS.left) // BIN_SIZE,
        "manuscript": row.manuscript,
        "page_id": row.page_id,
        "line_id": row.line_id,
        "transcription": row.transcription,
        "CER": row.CER_NS,
        **cnt
    })

del df  # For memory sake

100%|█████████████████████████████████████████████████████████| 300269/300269 [04:46<00:00, 1048.87it/s]


## Keep only the most widespread n-grams (the 4096 present in the most lines)

In [6]:
# For every n-gram feature (record keys starting with "$"), count the number
# of records in which it occurs.
total_presence = Counter([
    key
    for row in tqdm.tqdm(new_df)
    for key in row
    if key.startswith("$")
])
print(len(total_presence))

# Keep only the 4096 n-grams present in the most records. Membership is
# tested against a set (constant-time lookups) rather than the list, while
# the surviving keys keep their original order in `total_presence`.
most_common = [key for key, _ in total_presence.most_common(1024*4)]
kept_keys = set(most_common)
total_presence = {
    key: value
    for key, value in tqdm.tqdm(total_presence.items())
    if key in kept_keys
}
print(len(total_presence))

100%|███████████████████████████████████████████████████████| 265025/265025 [00:00<00:00, 472881.89it/s]


34555


100%|██████████████████████████████████████████████████████████| 34555/34555 [00:01<00:00, 24909.38it/s]

4096





# Sentences

In [7]:
import numpy as np
import random

# Shuffle the records with a dedicated, seeded generator so that the row
# order (and therefore the files written from this frame) is identical on
# every run, without touching the global `random` state.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(new_df)

# Columns of the final frame: metadata first, then one column per kept n-gram.
KEPT = ["idx", "bin", "lang", "transcription", "manuscript", "page_id", "line_id", "CER", *total_presence.keys()]

# Compact dtypes to limit memory use; columns not listed keep pandas' default.
# NOTE(review): int16 for page_id/line_id assumes the values fit in 16 bits,
# and float16 stores CER / frequencies with reduced precision — confirm that
# both are acceptable.
DTYPES = {
    "idx": np.int32,
    "bin": np.int8,
    "page_id": np.int16,
    "line_id": np.int16,
    "CER": np.float16,
    **{
        feat: np.float16
        for feat in total_presence
    }
}

# Build the frame column by column so each Series is created directly with
# its target dtype. An n-gram absent from a record is stored as NaN.
df = pd.DataFrame({
    key: pd.Series([
        row.get(key, np.nan)
        for row in new_df
    ], dtype=DTYPES.get(key))
    for key in KEPT})
df.head()

Unnamed: 0,idx,bin,lang,transcription,manuscript,page_id,line_id,CER,$__q,$_q̃,...,$.qe,$:⁊s,$eiz,$z.⁊,$ulx,$:ca,$ͥl_,$:si,$:qe,$:qi
0,125720,6,fro,a pos man cee t mesore,BnF_fr_25516,3,50,62.96875,,,...,,,,,,,,,,
1,267356,7,fro,ũde ił peisetut due due,bnf__arsenal3516_imagedumonde,5,116,72.75,,,...,,,,,,,,,,
2,201232,2,fro,se partist desi. Et qnaint ui,bnf_fr_22549_sept_sages,11,53,28.0,,,...,,,,,,,,,,
3,184835,0,fro,uoir sopouture.Etse uos beaus signeurs,bnf_fr_412_wauchier,30,3,0.0,,,...,,,,,,,,,,
4,183024,2,fro,du monde qm̃t il est fus,bnf__arsenal3516_imagedumonde,7,179,23.8125,,,...,,,,,,,,,,


In [8]:
# Check the dtype each column ended up with.
df.dtypes

idx                int32
bin                 int8
lang              object
transcription     object
manuscript        object
                  ...   
$:ca             float16
$ͥl_             float16
$:si             float16
$:qe             float16
$:qi             float16
Length: 4104, dtype: object

In [9]:
# Summarise the frame (row/column counts, dtypes) together with its memory usage.
df.info(memory_usage=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 265025 entries, 0 to 265024
Columns: 4104 entries, idx to $:qi
dtypes: float16(4097), int16(2), int32(1), int8(1), object(3)
memory usage: 2.0+ GB


In [10]:
# Write the full feature table to an HDF5 file under the key 'df', then preview it.
df.to_hdf("features.hdf5", key='df')
df.head()

Unnamed: 0,idx,bin,lang,transcription,manuscript,page_id,line_id,CER,$__q,$_q̃,...,$.qe,$:⁊s,$eiz,$z.⁊,$ulx,$:ca,$ͥl_,$:si,$:qe,$:qi
0,125720,6,fro,a pos man cee t mesore,BnF_fr_25516,3,50,62.96875,,,...,,,,,,,,,,
1,267356,7,fro,ũde ił peisetut due due,bnf__arsenal3516_imagedumonde,5,116,72.75,,,...,,,,,,,,,,
2,201232,2,fro,se partist desi. Et qnaint ui,bnf_fr_22549_sept_sages,11,53,28.0,,,...,,,,,,,,,,
3,184835,0,fro,uoir sopouture.Etse uos beaus signeurs,bnf_fr_412_wauchier,30,3,0.0,,,...,,,,,,,,,,
4,183024,2,fro,du monde qm̃t il est fus,bnf__arsenal3516_imagedumonde,7,179,23.8125,,,...,,,,,,,,,,


In [11]:
# Spot-check a single n-gram feature: how often each frequency value of "$abi" occurs.
df["$abi"].value_counts()

0.026321    82
0.027771    75
0.029419    73
0.041656    66
0.025635    64
            ..
0.013336     1
0.037750     1
0.010307     1
0.026672     1
0.012817     1
Name: $abi, Length: 88, dtype: int64

In [12]:
# List the distinct bin indices present in the final frame.
df["bin"].unique()

array([6, 7, 2, 0, 8, 4, 3, 5, 9, 1], dtype=int8)

In [13]:
# Write only the text/metadata columns (no n-gram features) to a separate
# HDF5 file. The columns are listed explicitly instead of being recovered by
# splitting a pasted, tab-separated header string.
# NOTE(review): "lang" is not part of this export — confirm that is intended.
df[[
    "idx",
    "bin",
    "transcription",
    "manuscript",
    "page_id",
    "line_id",
    "CER",
]].to_hdf("texts.hdf5", key='df')