In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_hdf("features.hdf5", key="df", index_col=0)
#df.drop(['manuscript', 'page_id', "line_id", "transcription"], axis=1, inplace=True)
df["K"] = 0
print(sorted(df.bin.unique()))

['Acceptable', 'Bad', 'Good', 'Very bad']


In [3]:
df.head()

Unnamed: 0,idx,bin,lang,transcription,manuscript,page_id,line_id,CER,$_q̃,$q̃_,...,$iez,$gie,$loi,$uei,$aen,$nz_,$esc,$inz,$r_⁊,K
0,184114,Good,fro,diuine chose.Apres ce bien lespace de.u.,bnf_fr_412_wauchier,50,44,4.878906,,,...,,,,,,,,,,0
1,81772,Very bad,fro,t ts rs si t aẽl,bnf__arsenal3516_imagedumonde,6,163,58.625,,,...,,,,,,,,,,0
2,348290,Good,fro,signor ⁊agemir ⁊aplorer.por ce qe tant,bnf_fr_412_wauchier,38,36,0.0,,,...,,,,,,,,,0.055542,0
3,51822,Acceptable,lat,¶louatibi due.qui natus,SBB_PK_Hdschr25,7,1,23.078125,,,...,,,,,,,,,,0
4,52653,Acceptable,lat,ẽ ĩsinc malitie ⁊dic. Et cũẽ uũd aut spum...,CLM13027,0,36,17.734375,,,...,,,,,,,,,,0


In [4]:
df.bin.unique()

array(['Good', 'Very bad', 'Acceptable', 'Bad'], dtype=object)

# Generate K-Folds class

## Memory footprint

In [5]:
print(f"Memory {df.info(memory_usage='deep')}")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 265025 entries, 0 to 265024
Columns: 1009 entries, idx to K
dtypes: float16(1001), int16(2), int32(1), int64(1), object(4)
memory usage: 593.9 MB
Memory None


## Function to retrieve a K-Fold set

In [6]:
from nenequitia.contrib import get_manuscripts_and_lang_kfolds

# get_manuscripts_and_lang_kfolds(df, k=1)

(           idx         bin lang                             transcription  \
 0       184114        Good  fro  diuine chose.Apres ce bien lespace de.u.   
 2       348290        Good  fro    signor ⁊agemir ⁊aplorer.por ce qe tant   
 3        51822  Acceptable  lat                   ¶louatibi due.qui natus   
 5        98068         Bad  fro            mour layaole geudus auoit due.   
 6       339931        Good  fro            u ltre li met lenseinie aliant   
 ...        ...         ...  ...                                       ...   
 265019   24552        Good  lat       nũc exceamur. nc gestemur. nc pran   
 265020   54456         Bad  lat   sts Otiñ de diisirate Ri dilisificac̃e   
 265021  243905        Good  fro                gñt bien venir. Elle dist   
 265022   99787         Bad  fro    quotreleou les rechez illor serourrele   
 265024  219581         Bad  fro               eus mr hidite hion ił  eou   
 
                      manuscript  page_id  line_id        CER 

## Model

In [12]:
import json
import random

import tqdm
import torch.nn as nn
import torch
from torch.autograd import Variable
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, accuracy_score
from typing import List, Optional

        
def var(X):
    return torch.from_numpy(X).cuda()


class NgramModel(nn.Module):
    def __init__(self, features, output_dim, device="cuda:0"):
        super(NgramModel, self).__init__()
        self.out = output_dim
        self.features = features or ()
        self.inp = len(self.features)
        self.net = nn.Sequential(
            nn.Linear(self.inp, 128),
            nn.Dropout(.1),
            #nn.Linear(512, 128),
            #nn.Dropout(.1),
            nn.Linear(128, self.out)
        )
        self.to(device)
        
    def get_batches(
        self,
        dataframe,
        indexes: Optional[List[int]],
        has_truth: bool = False,
        batch_size=256
    ):
        if has_truth:
            random.shuffle(indexes)
        samples = len(indexes)
        
        for batch_start in range(0, samples, batch_size):
            batch_end = min(batch_start+batch_size, samples)
            loc_indexes = indexes[batch_start:batch_end]
            if has_truth:
                yield (
                    var(dataframe.loc[loc_indexes, self.features].fillna(.0).to_numpy()),
                    var(dataframe.loc[loc_indexes, ["bin"]].to_numpy(dtype="l")).squeeze(),
                )
            else:
                yield (
                    var(dataframe.loc[loc_indexes, self.features].fillna(.0).to_numpy()),
                    None
                )
                
    def forward(self, x):
        return self.net(x.float())
        
    def fit(
        self, 
        dataframe: pd.DataFrame,
        train_indexes: List[int], dev_indexes: List[int],
        epochs=1000, max_bad_epochs=20, batch_size=512, 
        lr=5e-3, delta=.005, use_loss: bool = True
    ):
        criterion = torch.nn.CrossEntropyLoss()
        dev_loss = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=lr)

        best = float("-inf")
        if use_loss:
            best = float("inf")

        bad_epochs = 0
        best_params = self.state_dict()

        nb_batches = df.shape[0] // batch_size
        for epoch in (pbar := tqdm.tqdm(range(epochs))):

            for batch_idx, (xs, ys) in (pbar2 := tqdm.autonotebook.tqdm(enumerate(
                self.get_batches(dataframe, train_indexes, batch_size=batch_size, has_truth=True)
            ), leave=True, position=0)):
                outputs = self(xs)
                loss = criterion(outputs, ys)
                loss.backward()
                pbar2.set_description(f'Loss:{loss.item():.3f}')

            # update parameters
            optimizer.step()
            optimizer.zero_grad()

            self.eval()
            predicted = []
            total_ys = []
            epoch_loss = torch.tensor(.0).cpu().item()

            for batch_idx, (xs, ys) in tqdm.autonotebook.tqdm(enumerate(
                self.get_batches(dataframe, dev_indexes, batch_size=batch_size, has_truth=True)
            ), leave=True, position=1):
                outputs = self(xs)
                loss = dev_loss(outputs, ys)
                epoch_loss += loss.item()

                predicted.extend(outputs.argmax(dim=-1).cpu().flatten().tolist())
                total_ys.extend(ys.tolist())


            acc = accuracy_score(predicted, total_ys)
            epoch_loss = epoch_loss / (batch_idx + 1)
            self.train()

            factor = 100

            if use_loss:
                factor = 1
                if abs(epoch_loss - best) > delta and epoch_loss < best:
                    best = epoch_loss
                    bad_epochs = 0
                    best_params = self.state_dict()
                else:
                    bad_epochs += 1
                    if bad_epochs == max_bad_epochs + 1:
                        break
            else:
                if abs(acc - best) > delta and acc > best:
                    best = acc
                    bad_epochs = 0
                    best_params = self.state_dict()
                else:
                    bad_epochs += 1
                    if bad_epochs == max_bad_epochs + 1:
                        break

            pbar.set_description(f'BAD:{bad_epochs:0>2} LOSS:{epoch_loss:.2f} ACC:{acc*100:.1f} BEST:{best*factor:.1f}')

            #if accum_loss < 2e-5:
            #    break
        print("Loading best params...")
        self.load_state_dict(best_params)
        self.eval()
        
    def pred_dataframe(self, dataframe: pd.DataFrame, indexes: List[int], batch_size=256, _verbose=False):
        out = []
        if _verbose:
            deco = tqdm.tqdm
        else:
            deco = lambda x: x
        for x, _ in deco(self.get_batches(dataframe, indexes, batch_size=batch_size)):
            out.extend(self(x).argmax(dim=-1).cpu().flatten().tolist())
        return np.array(out)
        
    def pred(self, inputs, batch_size=256, _verbose=False):
        out = []
        if _verbose:
            deco = tqdm.autonotebook.tqdm
        else:
            deco = lambda x: x
        for x, _ in deco(self.get_batches(inputs, batch_size=batch_size)):
            out.extend(self(x.float()).argmax(dim=-1).cpu().flatten().tolist())
        return np.array(out)
    
    def save(self, name):
        torch.save(self.state_dict(), f"{name}.pt")
        with open(f"{name}.json", "w") as f:
            json.dump(self.features, f)

## Function for training model

In [None]:
import matplotlib.pyplot as plt
import random

def get_k_iterators(Ks, df):
    def ret():
        random.shuffle(Ks)
        for k in ks:
            ids, YCs, YNCs, Xs, XTranscriptions = get_features(train)
            yield Xs, YCs
    return ret

def make_for_K(K, df):
    train, dev, test = get_kfold_train_test(df, K)
    
    model = NgramModel(
        output_dim=len(df.bin.unique()),
        features=tuple([col for col in df.columns if col.startswith("$")])
    )
    print(model)
    model.fit(
        df,
        train_indexes=train,
        dev_indexes=dev,
        batch_size=4096*2,
        delta=.01,
        max_bad_epochs=10,
        epochs=100,
        lr=1e-1
    )
    model.net.eval()
    
    out = model.pred_dataframe(df, indexes=test, _verbose=True, batch_size=2048)
    test_truthes = df.loc[test, "bin"].tolist()
    
    # This should plot...
    figure, ax = plt.subplots(figsize=(10, 10), dpi=300)
    ConfusionMatrixDisplay.from_predictions(test_truthes, out, ax=ax)
    plt.show()
    print(classification_report(test_truthes, out))
    e_df = pd.DataFrame(zip(test_truthes, out), columns=["bin", "pred"])
    print(e_df.plot.box(by="bin"))
    for i in range(5):
        e_df[f"RectPred{i}"] = abs(e_df["pred"] - e_df["bin"]) <= i
        counts = e_df[f"RectPred{i}"].value_counts()
        print(f"Accuracy of CER predicted within {int(i*5)} "
              f"of the GT: {counts.get(True, 0)/(counts.get(False, 0)+counts.get(True, 0))*100:.2f}%")

    e_df[f"RectPredReadable"] = (e_df["pred"] < 2) & (e_df["bin"] < 2)  # That's wrong ?
    counts = e_df[(e_df["bin"] < 2)]["RectPredReadable"].value_counts()
    print(f"Accuracy of predicted CER < 10% (Readable) "
          f"of the GT: {counts.get(True, 0)/(counts.get(False, 0)+counts.get(True, 0))*100:.2f}%")
    
    e_df[f"RectPred85"] = (e_df["pred"] < 3) & (e_df["bin"] < 3)  # That's wrong ?
    counts = e_df[(e_df["bin"] < 2)]["RectPred85"].value_counts()
    print(f"Accuracy of predicted CER < 15% (Readable) "
          f"of the GT: {counts.get(True, 0)/(counts.get(False, 0)+counts.get(True, 0))*100:.2f}%")
    
    model.save(f"ngram-{K}")
    del model

In [15]:
NB_EXPS = 5
for i in range(KS):
    print(f"Dealing with {i}")
    make_for_K(K=i, df=df)
    if i+1 == NB_EXPS:
        break

Dealing with 0
NgramModel(
  (net): Sequential(
    (0): Linear(in_features=3567, out_features=512, bias=True)
    (1): Dropout(p=0.1, inplace=False)
    (2): Linear(in_features=512, out_features=128, bias=True)
    (3): Dropout(p=0.1, inplace=False)
    (4): Linear(in_features=128, out_features=20, bias=True)
  )
)


  0%|                                                                           | 0/100 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.96 ACC:19.1 BEST:3.0:   1%|▎                            | 1/100 [00:36<1:00:34, 36.71s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.87 ACC:19.1 BEST:2.9:   2%|▌                            | 2/100 [01:13<1:00:03, 36.77s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.78 ACC:19.1 BEST:2.8:   3%|▉                              | 3/100 [01:50<59:30, 36.80s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:01 LOSS:2.81 ACC:19.1 BEST:2.8:   4%|█▏                             | 4/100 [02:27<58:58, 36.86s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.73 ACC:19.1 BEST:2.7:   5%|█▌                             | 5/100 [03:04<58:20, 36.84s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.69 ACC:19.1 BEST:2.7:   6%|█▊                             | 6/100 [03:41<57:57, 37.00s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.68 ACC:19.2 BEST:2.7:   7%|██▏                            | 7/100 [04:18<57:27, 37.07s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.66 ACC:19.7 BEST:2.7:   8%|██▍                            | 8/100 [04:55<56:54, 37.12s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.62 ACC:20.1 BEST:2.6:   9%|██▊                            | 9/100 [05:33<56:22, 37.17s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.55 ACC:20.4 BEST:2.6:  10%|███                           | 10/100 [06:10<55:57, 37.31s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.49 ACC:20.8 BEST:2.5:  11%|███▎                          | 11/100 [06:48<55:35, 37.48s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.45 ACC:21.6 BEST:2.4:  12%|███▌                          | 12/100 [07:26<55:16, 37.69s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.40 ACC:22.9 BEST:2.4:  13%|███▉                          | 13/100 [08:04<54:50, 37.82s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.37 ACC:23.9 BEST:2.4:  14%|████▏                         | 14/100 [08:42<54:11, 37.81s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.35 ACC:24.2 BEST:2.4:  15%|████▌                         | 15/100 [09:20<53:36, 37.84s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.32 ACC:24.2 BEST:2.3:  16%|████▊                         | 16/100 [09:59<53:13, 38.01s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.30 ACC:24.5 BEST:2.3:  17%|█████                         | 17/100 [10:37<52:44, 38.13s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.28 ACC:25.1 BEST:2.3:  18%|█████▍                        | 18/100 [11:16<52:18, 38.27s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:01 LOSS:2.27 ACC:25.0 BEST:2.3:  19%|█████▋                        | 19/100 [11:54<51:44, 38.33s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.26 ACC:25.1 BEST:2.3:  20%|██████                        | 20/100 [12:32<51:03, 38.29s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.24 ACC:25.5 BEST:2.2:  21%|██████▎                       | 21/100 [13:11<50:26, 38.31s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.21 ACC:25.7 BEST:2.2:  22%|██████▌                       | 22/100 [13:49<49:54, 38.39s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:01 LOSS:2.20 ACC:25.9 BEST:2.2:  23%|██████▉                       | 23/100 [14:27<49:11, 38.33s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.18 ACC:26.1 BEST:2.2:  24%|███████▏                      | 24/100 [15:06<48:32, 38.32s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.16 ACC:26.3 BEST:2.2:  25%|███████▌                      | 25/100 [15:44<47:49, 38.25s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:01 LOSS:2.16 ACC:26.4 BEST:2.2:  26%|███████▊                      | 26/100 [16:22<47:10, 38.26s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.15 ACC:26.4 BEST:2.1:  27%|████████                      | 27/100 [17:00<46:25, 38.15s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.14 ACC:26.6 BEST:2.1:  28%|████████▍                     | 28/100 [17:38<45:55, 38.28s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:01 LOSS:2.14 ACC:26.6 BEST:2.1:  29%|████████▋                     | 29/100 [18:17<45:30, 38.46s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:02 LOSS:2.14 ACC:26.8 BEST:2.1:  30%|█████████                     | 30/100 [18:56<44:58, 38.55s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:03 LOSS:2.13 ACC:27.0 BEST:2.1:  31%|█████████▎                    | 31/100 [19:34<44:08, 38.38s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.11 ACC:27.1 BEST:2.1:  32%|█████████▌                    | 32/100 [20:12<43:15, 38.17s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:01 LOSS:2.11 ACC:27.3 BEST:2.1:  33%|█████████▉                    | 33/100 [20:49<42:23, 37.97s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:02 LOSS:2.11 ACC:27.4 BEST:2.1:  34%|██████████▏                   | 34/100 [21:27<41:46, 37.98s/it]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

BAD:00 LOSS:2.10 ACC:27.4 BEST:2.1:  35%|██████████▌                   | 35/100 [22:05<41:04, 37.91s/it]

0it [00:00, ?it/s]

BAD:00 LOSS:2.10 ACC:27.4 BEST:2.1:  35%|██████████▌                   | 35/100 [22:16<41:22, 38.19s/it]


KeyboardInterrupt: 