# Segmentation
Import the required libraries and define the data-file suffixes for each edition.

In [1]:
import regex as re
import json
from tqdm import tqdm 
# Part suffixes of the raw text files, keyed by edition. Each file is read
# from ./dataset/NF_<edition><suffix>.txt, so '' denotes one unsuffixed file.
datafiles = dict(
  E1=[''],
  E2=['a', 'b'],
  E3=[''],
  E4=[''],
)

Function that extracts headwords out of \<b\> tags to build a headword dataset.

In [None]:
def build_b_tag_dataset(datastring, next_chars = 500, verbose=False):
  """Build a list of [feature, label] pairs from text containing <b> tags.

  Positive entries: for every bold-tagged span found on a line, the label is
  the bold text (closing tags and any text between two bold runs replaced by
  a single space) and the feature is that label followed by the first run of
  non-'<' characters found within the next ``next_chars`` characters, with
  whitespace collapsed. Spans with no such following text are skipped.

  Negative entries: runs of 10-500 non-'<' characters that follow a blank
  line ("\\n\\n"), whitespace-collapsed, with an empty-string label.

  Args:
    datastring: The full text to scan.
    next_chars: Size of the character window inspected after each bold span.
      Values below 1 produce no positive entries.
    verbose: If True, show tqdm progress bars.

  Returns:
    A list of two-element lists ``[feature, label]``; positives come first,
    followed by negatives.
  """
  entries = []
  collapse_ws = re.compile(r"\s+")

  # BUILD POSITIVE
  # A window shorter than one character can never yield a non-empty
  # definition, and "{1,0}" is not a valid quantifier, so skip the pass.
  if next_chars >= 1:
    # Compile once instead of rebuilding the pattern string per match.
    headword_re = re.compile(r"((?<=<b>).+<\/b>)(.*(?<=<b>).+<\/b>)*")
    # NOTE(review): ".*" is greedy, so with three or more bold runs on one
    # line only the first and last survive -- confirm this is intended.
    tag_gap_re = re.compile(r"</b>.*<b>|</b>")
    following_re = re.compile(r"([^<]{1,"+str(next_chars)+r"})(?=<|$)")

    for match in tqdm(headword_re.finditer(datastring), disable=(not verbose)):
      headword = tag_gap_re.sub(" ", match.group(0)).strip()
      window_start = match.end()

      following = following_re.search(datastring[window_start:window_start+next_chars])
      surrounding_text = following.group(0) if following else ""

      short_def = collapse_ws.sub(" ", surrounding_text).strip()
      if short_def:
        entries.append([f"{headword} {short_def}", headword])

  # BUILD NEGATIVE
  negative_re = re.compile(r"(\n\n[^<]{10,500})(?=\n|$|<)")
  for match in tqdm(negative_re.finditer(datastring), disable=(not verbose)):
    entries.append([collapse_ws.sub(" ", match.group(0)).strip(), ""])

  return entries

Build the headword datasets for the first and second editions (E1 \& E2). Each entry consists of:
  - Feature: A piece of text. For headword entries it starts with the headword, followed by up to <i>next_chars</i> characters of the text after it (default 500); for non-headword entries it is a plain paragraph.
  - Label: The headword at the beginning of the corresponding feature, or an empty string if the feature is not a <i>"headword"</i> paragraph.

The results are saved to JSON files, one `[feature, label]` pair per entry:
```json
  ["Lund, uppstad i Malmöhus län...beskaffenhet. I all", "Lund,"]
  ["betjenade sig af rapporter från...till privatlifvet", ""]
```

In [None]:
# Build and save the headword dataset for each of the first two editions.
for edition in ['E1', 'E2']:

  # Concatenate every part file of this edition into one string.
  parts = []
  for file in datafiles[edition]:
    # The with-block closes the file; no explicit close() is needed.
    with open(f"./dataset/NF_{edition}{file}.txt", "r", encoding='utf-8') as fr:
      parts.append(fr.read())
  dataset = "".join(parts)

  b_tag_dict = build_b_tag_dataset(dataset, verbose=True)
  print(f"{edition} has {len(b_tag_dict):,} entries")

  # ensure_ascii=False writes non-ASCII characters verbatim, so the output
  # encoding must be explicit rather than the platform default.
  with open(f"./dataset/NF_{edition}_B.json", "w", encoding='utf-8') as b_json:
    json.dump(b_tag_dict, b_json, indent=2, ensure_ascii=False)

# Drop the loop temporaries so they do not leak into later cells.
del edition, parts, dataset, file, fr, b_tag_dict, b_json