In [1]:
import pandas as pd
import numpy as np
import re
from transformers import BertTokenizer, BertForMaskedLM
import torch
from transformers import AdamW
from tqdm import tqdm
import gensim
from gensim.models import Word2Vec 
from gensim.models import KeyedVectors
import pandas as pd
from nltk.tokenize import RegexpTokenizer

In [2]:
df = pd.read_csv('preprocessed_df100.csv')

In [3]:
# select relevant columns
print(df.columns)
df = df[['identifier', 'gt text org', 'ocr text org', 'gt text', 'ocr text', 'source', 'century', 'gt sentences matched', 'aligned_GT_sentences', 'aligned_OCR_sentences']]

Index(['identifier', 'gt text', 'ocr text', 'CER', 'WER', 'source',
       'word count gt', 'word count ocr', 'century', 'gt sentences matched',
       'ocr sentences matched', 'CER matched sentences',
       'WER matched sentences', 'avg sentence length gt (fuzzy matched)',
       'avg sentence length ocr (fuzzy matched)',
       'sentences gt (fuzzy matched)', 'sentences ocr (fuzzy matched)',
       'word count gt (fuzzy matched)', 'word count ocr (fuzzy matched)',
       'aligned_GT_sentences', 'aligned_OCR_sentences', 'good_alignments',
       'bad_alignments'],
      dtype='object')


In [4]:
# preprocess for models
def replace_num(text, character):
    numbers = re.findall(r'\d+', text) 
    res = list(map(int, numbers))
    for numText in res:
        text = text.replace(str(numText), character)
    return text

def replace_proper_nouns(text, character, names): 
    res = list(map(str, names))
    for nounText in res:
        text = text.replace(str(nounText), character)
    return text
    
    


df['gt for training'] = df['gt sentences matched']
# replace numbers
df['gt for training'] = df['gt for training'].apply(lambda x: replace_num(x, '%NUMBER%'))
# replace proper nouns
df['gt for training'] = df['gt for training'].replace_proper_nouns(x, "%NNP%", gt_names))

In [5]:
# create parts for cross-validation, train, validation oand test set
df_17th = df[df['century'] == '1600s']
df_18th = df[df['century'] == '1700s']
df_19th = df[df['century'] == '1800s']
df_20th = df[df['century'] == '1900s']

datasets = [df_17th, df_18th, df_19th, df_20th]
# create a list of train, validation and test set

train = pd.DataFrame()
val = pd.DataFrame()
test = pd.DataFrame()
for dataset in datasets:
    splits = np.array_split(dataset, 5)
    train_sub = splits[:3]
    train_sub = pd.concat(train_sub)
    val_sub = splits[3]
    test_sub = splits[4]
    train = pd.concat([train, train_sub])
    val = pd.concat([val, val_sub])
    test = pd.concat([test, test_sub])

In [6]:
# fine_tuning the models

word2vec_models = []
BERT_models = []

def finetune_word2vec(train, window=5):
    sentences = train.split('.')
    tokenizer = RegexpTokenizer(r'\w+')
    sentences = [tokenizer.tokenize(i) for i in sentences]
    total_examples = len(sentences)
    
    model_w2v = Word2Vec(size=160, min_count=1, window=window)
    model_w2v.build_vocab(sentences)
    total_examples = model_w2v.corpus_count
    model = KeyedVectors.load_word2vec_format(r"combined-160.txt", binary=False)
    model_w2v.build_vocab([list(model.vocab.keys())], update=True)
    model_w2v.intersect_word2vec_format(r"combined-160.txt", binary=False, lockf=1.0)
    model_w2v.train(sentences, total_examples=total_examples, epochs=model_w2v.iter)
    return model_w2v
    
def finetune_BERT(train):
    
    class TrainingDataset(torch.utils.data.Dataset):
        def __init__(self, encodings):
            self.encodings = encodings
        def __getitem__(self, idx):
            return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        def __len__(self):
            return len(self.encodings.input_ids)
    
    text = train.split('.')
    tokenizer = BertTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased")
    model = BertForMaskedLM.from_pretrained("GroNLP/bert-base-dutch-cased")
    # BERT tuning code from https://github.com/jamescalam/transformers/blob/main/course/training/03_mlm_training.ipynb
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    inputs['labels'] = inputs.input_ids.detach().clone()
    # create random array of floats with equal dimensions to input_ids tensor
    rand = torch.rand(inputs.input_ids.shape)
    # create mask array
    mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)
    # we take take the indices of each True value, within each individual vector.
    selection = []
    for i in range(inputs.input_ids.shape[0]):
        selection.append(
            torch.flatten(mask_arr[i].nonzero()).tolist()
        )
    # apply these indices to each respective row in input_ids, assigning each of the values at these indices as 103.
    for i in range(inputs.input_ids.shape[0]):
        inputs.input_ids[i, selection[i]] = 103
    # initialize data
    dataset = TrainingDataset(inputs)
    
    # start training loop
    loader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)
    #device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    # and move our model over to the selected device
    #model.to(device)
    # activate training mode
    model.train()
    # initialize optimizer
    optim = AdamW(model.parameters(), lr=5e-5)
    
    epochs = 2

    for epoch in range(epochs):
        # setup loop with TQDM and dataloader
        loop = tqdm(loader, leave=True)
        for batch in loop:
            # initialize calculated gradients (from prev step)
            optim.zero_grad()
            # pull all tensor batches required for training
            #input_ids = batch['input_ids'].to(device)
            #attention_mask = batch['attention_mask'].to(device)
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            #labels = batch['labels'].to(device)
            labels = batch['labels']
            # process
            outputs = model(input_ids, attention_mask=attention_mask,
                            labels=labels)
            # extract loss
            loss = outputs.loss
            # calculate loss for every parameter that needs grad update
            loss.backward()
            # update parameters
            optim.step()
            # print relevant info to progress bar
            loop.set_description(f'Epoch {epoch}')
            loop.set_postfix(loss=loss.item())
            
    return model

In [7]:
# train models used in cross validations
word2vec_models = []
BERT_models = []

train = '.'.join(list(train['gt for training']))
word2vec_model = finetune_word2vec(train)
BERT_model = finetune_BERT(train)
word2vec_models.append(word2vec_model)
BERT_models.append(BERT_model)

    
    

fold: 1


KeyboardInterrupt: 

In [None]:
models_test = []

print('start')
train = "P L B. p l b." * 1000
model = finetune_word2vec(train)
models_test.append(model)
print('halverwege')
train = "P F B. p f b." * 1000
model = finetune_word2vec(train)
models_test.append(model)
print('klaar')


In [None]:
print(models_test[0].most_similar('P'))
print(models_test[1].most_similar('P'))

In [None]:
models_test[0].save("word2vec_test.model")

In [None]:
print(df.columns)

In [None]:
model_test_2 = Word2Vec.load("word2vec_test.model")

In [None]:
model_test_2.most_similar('P')

In [None]:
train = "P F B. p f b." * 1000
BERT_model = finetune_BERT(train)

Epoch 0:  33%|███▎      | 83/251 [50:01<1:42:47, 36.71s/it, loss=0.0515]