# Segmentation
Import the dependencies and define the data-file suffixes for each edition.

In [15]:
from sklearn.model_selection import train_test_split, KFold
from transformers import AutoTokenizer
from typing import List,Tuple
from tqdm import tqdm  
import regex as re
import random
import json
# File-name suffixes of the raw text files, keyed by edition. Each suffix is
# inserted into "./dataset/NF_<edition><suffix>.txt" when the files are read,
# so an empty suffix means the edition lives in a single file and E2 is split
# into an "a" and a "b" file.
datafiles= {
  "E1" : [''],
  "E2" : ['a', 'b'],
  "E3" : [''],
  "E4" : ['']
}

The function below extracts headwords from \<b\> tags to build the headword dataset, and adds plain paragraphs as negative examples labelled `<NO_HEADWORD>`.

In [16]:
def build_b_tag_dataset(datastring, next_chars = 500, verbose=False):
  """Build [feature, label] samples for headword detection from raw text.

  Positive samples come from bold markup: each regex match begins right after
  a ``<b>`` and ends at a ``</b>``. Inside the match, every ``</b>...<b>``
  stretch and every remaining ``</b>`` is replaced by a space to form the
  headword. The feature is the headword followed by the first run of
  non-``<`` characters found within the ``next_chars`` characters after the
  match (whitespace collapsed). Matches with no such text are skipped.

  Negative samples are stretches that start after a blank line with an
  uppercase letter followed by 10-500 non-``<`` characters; they are labelled
  ``"<NO_HEADWORD>"``.

  Args:
    datastring: The full text to scan.
    next_chars: Size of the window after a bold match that is searched for
      following text (also the maximum length of that text).
    verbose: When true, show tqdm progress bars for both passes.

  Returns:
    A list of two-element lists ``[feature, label]``; positives first, then
    negatives.
  """
  hide_progress = not verbose
  # The pattern only depends on next_chars, so assemble it once up front.
  following_text_pattern = r"([^<]{1,"+str(next_chars)+r"})(?=<|$)"
  samples = []

  # Positive samples: bold headword plus the text that follows it.
  bold_runs = re.finditer(r"((?<=<b>).+<\/b>)(.*(?<=<b>).+<\/b>)*", datastring)
  for bold_run in tqdm(bold_runs, disable=hide_progress):
    headword = re.sub(r"</b>.*<b>|</b>"," ",bold_run.group(0)).strip()

    window_start = bold_run.end()
    window = datastring[window_start:window_start+next_chars]
    found = re.search(following_text_pattern, window)
    if found is None:
      continue

    short_def = re.sub(r"\s+", " ", found.group(0)).strip()
    if short_def:
      samples.append([f"{headword} {short_def}", headword])

  # Negative samples: paragraphs that do not open with bold markup.
  plain_paragraphs = re.finditer(r"(\n\n\p{Upper}[^<]{10,500})(?=\n|$|<)", datastring)
  for paragraph in tqdm(plain_paragraphs, disable=hide_progress):
    flattened = re.sub(r"\s+", " ", paragraph.group(0)).strip()
    samples.append([flattened, "<NO_HEADWORD>"])

  return samples

Build the headword datasets for the first and second editions (E1 \& E2). Each entry is a pair:
  - Feature: A paragraph or piece of text. For a headword entry it starts with the headword, followed by up to <i>next_chars</i> characters (default 500) of the text after it.
  - Label: The headword at the beginning of the corresponding feature, or `<NO_HEADWORD>` if the feature is not a <i>"headword"</i> paragraph.

The results are saved to JSON files, one per edition:
```json
  ["Lund, uppstad i Malmöhus län...beskaffenhet. I all", "Lund,"]
  ["betjenade sig af rapporter från...till privatlifvet", "<NO_HEADWORD>"]
```

In [46]:
for edition in ['E1', 'E2']:

  # Concatenate all raw text files of this edition into one string.
  dataset = ""
  for file in datafiles.get(edition):
    # The `with` block closes the file; no explicit close() is needed.
    with open(f"./dataset/NF_{edition}{file}.txt", "r", encoding='utf-8') as fr:
      dataset += fr.read()

  b_tag_dict = build_b_tag_dataset(dataset, verbose=True)
  print(f"{edition} has {len(b_tag_dict):,} entries")

  # ensure_ascii=False writes non-ASCII characters as-is, so the encoding must
  # be explicit: the file is read back as UTF-8 later, and the platform default
  # encoding is not guaranteed to be UTF-8.
  with open(f"./dataset/NF_{edition}_B.json", "w", encoding='utf-8') as b_json:
    json.dump(b_tag_dict, b_json, indent=2, ensure_ascii=False)
# Drop the loop temporaries so they do not leak into later cells.
del edition, dataset, file, fr, b_tag_dict, b_json

114857it [00:18, 6239.93it/s] 
11920it [00:00, 18291.41it/s]


E1 has 126,690 entries


132408it [00:26, 4955.58it/s]
49493it [00:02, 18965.87it/s]


E2 has 181,758 entries


In [None]:
# Fixed seed so the shuffled order, and anything sliced from it, is reproducible.
SHUFFLE_SEED = 42

# There is exactly one NF_<edition>_B.json per edition, so load it once per
# edition. Looping over the per-edition raw-file suffixes here would load the
# E2 file twice and duplicate every E2 sample.
dataset = []
for edition in ['E1', 'E2']:
  with open(f"./dataset/NF_{edition}_B.json", "r", encoding='utf-8') as b_json:
    dataset += json.load(b_json)

# A dedicated Random instance keeps the global random state untouched.
random.Random(SHUFFLE_SEED).shuffle(dataset)

tokenizer = AutoTokenizer.from_pretrained("KB/bert-base-swedish-cased")
# Add the negative-label marker to the tokenizer; the return value is not needed.
_ = tokenizer.add_tokens(["<NO_HEADWORD>"])

def process_data(sentence, headword, max_sentence_length=50, max_headword_length=32):
    """Tokenize one (feature, label) pair into fixed-length token-id sequences.

    Both texts are encoded with the module-level ``tokenizer``, padded or
    truncated to their maximum length.

    Args:
        sentence: The feature text.
        headword: The label text.
        max_sentence_length: Length the encoded sentence is padded/truncated to.
        max_headword_length: Length the encoded headword is padded/truncated to.

    Returns:
        A tuple ``(sentence_ids, headword_ids)``: the first row of
        ``input_ids`` from each tokenizer call. Only the ids are returned;
        the other tokenizer outputs are dropped.
    """
    def encode(text, max_length):
        # One shared call so both texts are always encoded with the same options.
        encoded = tokenizer(
            text,
            add_special_tokens=True,  # Add [CLS] and [SEP] tokens
            padding='max_length',     # Pad to a maximum length
            max_length=max_length,
            truncation=True,          # Truncate if longer than max length
            return_tensors='pt'       # Return PyTorch tensors
        )
        # A single text yields a batch of one; take that row.
        return encoded['input_ids'][0]

    sentence_ids = encode(sentence, max_sentence_length)
    headword_ids = encode(headword, max_headword_length)
    return sentence_ids, headword_ids

def extract_features_labels(dataset) -> Tuple[List, List]:
    """Encode every sample of ``dataset`` and split the results into two lists.

    Each entry is indexed as ``entry[0]`` (feature text) and ``entry[1]``
    (label text) and passed to ``process_data``. A tqdm progress bar tracks
    the iteration.

    Returns:
        ``(x, y)``: the encoded features and the encoded labels, in the same
        order as ``dataset``.
    """
    encoded_pairs = [process_data(entry[0], entry[1]) for entry in tqdm(dataset)]
    x = [pair[0] for pair in encoded_pairs]
    y = [pair[1] for pair in encoded_pairs]
    return x, y

# Keep only the first 20% of the (already shuffled) samples before encoding.
# NOTE: this rebinds `dataset`, so re-running just this cell shrinks it again.
dataset = dataset[:int(0.2*len(dataset))]
# X holds the encoded features, y the encoded labels, in matching order.
X, y = extract_features_labels(dataset)

In [22]:
# Hold out 20% of the encoded samples for testing. A fixed random_state makes
# the split identical on every run, so results can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [23]:
# Peek at the first ten encoded training features.
X_train[:10]

[{'input_ids': tensor([[    2,   335,    76, 11691, 49796,  5548, 40176,  8847,   264, 33282,
          33628,    42,  6749,  2879,  2346,    19,   256,   102, 33013, 49795,
           1058, 26707,  6749,  2213,  2650,   252, 11050,    19,  2879,  2346,
             19,   146,    21, 10996,   308, 39949,  4815,  2348,  2112,   178,
          42812, 49795,  1376, 49808, 13957,  4261,   390,  1783,   290,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1]])},
 {'input_ids': tensor([[    2,  2876,   379,    19, 11729,     7,  1683,  2609,  3114,  8051,
              7,     3,     0,     0,     0,     0,     0,     0,     0,     0,
        