<h1 style="color:red">WARNING!!</h1>

- I ran this notebook in a separate environment on purpose, because calamanCy's dependencies conflict with the rest of GLiNER's dependencies.

Additional Notes:
- The `json_records` folder created in `assets` is where the Argilla records are stored.
    - Simply store them in the parent folder i.e. `../assets/json_records/batch_{batches}/records.json`


### Flow
```python
-> Create Initial Train / Test Set (80/20) 
-> Convert to spacy 
-> Subdivide the train set into subsets, each with its own Dev Set # the largest subset gives roughly a 70/20/10 split
    -> the test split becomes proportionally more dominant as the train/dev subset gets smaller
```

In [None]:
# install if needed
# %pip install spacy
# %pip install calamancy

### Necessary Variables / Imports

In [1]:
import spacy, calamancy

# Show the library versions in the cell output for later reference.
print("spaCy:", spacy.__version__)
print("Calamancy:", calamancy.__version__)

spaCy: 3.8.7
Calamancy: 0.2.2


In [4]:
from spacy.language import Language
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex

# Base Tagalog pipeline; its tokenizer is replaced with the customised one below.
nlp = calamancy.load("tl_calamancy_md-0.2.0")

def custom_tokenizer(nlp : Language):
    """
    Build a tokenizer that reuses the pipeline's prefix, suffix and token-match
    settings but splits on a few additional infix patterns.

    Params:
        nlp: the loaded pipeline whose vocab and tokenizer settings are reused
    Returns:
        a new `Tokenizer` using the extended infix patterns
    """
    # extra infixes: runs of hyphens, closing/opening parentheses, and a literal 't
    extra_infixes = [r'-+', r'\)', r'\(', r'\'t']
    infix_pattern = compile_infix_regex(nlp.Defaults.infixes + extra_infixes)

    base_tokenizer = nlp.tokenizer
    # NOTE(review): `rules` (tokenizer exceptions) and `url_match` are not passed on to
    # the new Tokenizer -- confirm that dropping them is intentional.
    return Tokenizer(
        nlp.vocab,
        prefix_search=base_tokenizer.prefix_search,
        suffix_search=base_tokenizer.suffix_search,
        infix_finditer=infix_pattern.finditer,
        token_match=base_tokenizer.token_match
    )

nlp.tokenizer = custom_tokenizer(nlp)

In [5]:
import random
# Seed Python's global RNG once, using a named constant so the value is easy to find.
random_seed = 42
random.seed(random_seed)

In [9]:
import os

# Where the Argilla JSON records live and where the spaCy corpus is written.
dataset_name = "batch_1-2"
dataset_file_path = f"../assets/json_records/{dataset_name}"
spacy_file_path = f"../assets/corpus/{dataset_name}/spacy"

# make sure both folders exist before anything is read from or written to them
for directory in (dataset_file_path, spacy_file_path):
    os.makedirs(directory, exist_ok=True)

### Initial Split
- Creates the initial train/test split before the data is processed and divided into subsets.

In [10]:
import json, unicodedata, collections

# Load the full list of records from records.json
with open(f"{dataset_file_path}/records.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Module-level accumulators that analyze_text fills in:
#   char_stats    -> how often each non-ASCII character was seen
#   char_examples -> a few example records per character
char_stats = collections.Counter()
char_examples = {}

def analyze_text(text, rec_id, context=50):
    """
    Count every non-ASCII character in `text` and keep a few example snippets.

    Updates the module-level `char_stats` (occurrence counts) and `char_examples`
    (at most 2 examples per character) in place.

    Params:
        text: the record text to scan
        rec_id: identifier stored alongside each example
        context: number of characters kept on each side of the character in the
            example snippet. Defaults to `50`
    """
    seen_in_this_text = set()
    for pos, ch in enumerate(text):
        # zero-width space (U+200B) and NBSP (U+00A0) are both > 127,
        # so a single non-ASCII check covers them as well
        if ord(ch) <= 127:
            continue

        key = f"{ch} | U+{ord(ch):04X} | {unicodedata.name(ch, 'UNKNOWN')}"
        char_stats[key] += 1

        examples = char_examples.setdefault(key, [])
        # keep first 2 examples only, and at most one per record so that a
        # character repeated inside one text does not use up both slots
        if len(examples) >= 2 or key in seen_in_this_text:
            continue
        seen_in_this_text.add(key)

        # take the snippet around the character so the example actually shows it
        window_start = max(0, pos - context)
        snippet = text[window_start:pos + context].replace("\n", " ")
        examples.append({"record_id": rec_id, "sample": snippet})

# Run the character scan over every record's text
for rec in data:
    analyze_text(rec["fields"]["Text"], rec.get("id"))

# Build the report, most frequent character first
summary = [
    {
        "char": char_key,
        "count": count,
        "examples": char_examples.get(char_key, [])
    }
    for char_key, count in char_stats.most_common()
]

with open(f"{dataset_file_path}/weird_chars.json", "w", encoding="utf-8") as f:
    json.dump(summary, f, ensure_ascii=False, indent=2)

print("✅ Weird characters scanned. Results saved to weird_chars.json")

✅ Weird characters scanned. Results saved to weird_chars.json


In [11]:
# Sanity check: number of records loaded from records.json
print("Total records:", len(data))

Total records: 2700


In [12]:
from collections import defaultdict
import os

def stratified_split_train_test(data : list, test_ratio=0.2, strata="Publication", seed=42):
    """
    Stratified train/test split. Only the train/test separation is done here so
    that the test set stays consistent; a dev set can be carved out of the
    returned train portion later.

    Params:
        data: the full dataset (list of records with an optional `metadata` dict)
        test_ratio: the fraction of each stratum that goes to test. Defaults to `0.2`
        strata: which metadata key to stratify on. Defaults to `Publication`
        seed: the random seed. Defaults to `42`
    Returns:
        train: the train (and possibly later dev) records
        test: the test records
    """
    # A private generator gives the same sequence as `random.seed(seed)` followed
    # by the module-level functions, but does not reset the global RNG state
    # that other cells may rely on.
    rng = random.Random(seed)

    # group entries by the metadata value used as the stratification basis
    buckets = defaultdict(list)
    for rec in data:
        key = rec.get("metadata", {}).get(strata, "UNKNOWN")
        buckets[key].append(rec)

    train, test = [], []

    # allocate per stratum; with round(), very small strata may contribute 0 test items
    for items in buckets.values():
        rng.shuffle(items)
        n_test = int(round(test_ratio * len(items)))
        test.extend(items[:n_test])
        train.extend(items[n_test:])

    # shuffle final splits so records are not grouped by stratum
    rng.shuffle(train)
    rng.shuffle(test)

    return train, test

In [13]:
import json
import unicodedata
from spacy.util import filter_spans

# # Split ratios
from collections import Counter

# Earlier three-way variant, kept for reference:
# train_data, dev_data, test_data = stratified_split_by_publication(data)
# Current flow: split into train/test only, using the function's default arguments.
train_data, test_data = stratified_split_train_test(data)

print(f"Train: {len(train_data)}, Test: {len(test_data)}")

# ------------------------------------------------------------------------------------


Train: 2162, Test: 538


In [14]:
from typing import Literal

# split name -> records, in the order they are reported and dumped below
splits = dict(train=train_data, test=test_data)

def count_publications(data, name):
    """
    Print the size of a split followed by its per-publication record counts,
    most common publication first.

    Params:
        data: the list of records in the split
        name: label printed for the split
    """
    tally = Counter()
    for rec in data:
        publication = rec.get("metadata", {}).get("Publication", "UNKNOWN")
        tally[publication] += 1

    print(f"\n📊 {name}: {len(data)} entries")
    for pub, count in tally.most_common():
        print(f"  {pub}: {count}")

def dump_json_data(data:list, split: Literal["train", "dev", "test"], addtl_dir:str=""):
    """
    Write a list of records to `<dataset_file_path>[/<addtl_dir>]/<split>.json`.

    Params:
        data: the list of data to be dumped.
        split: the split name used as the file name ["train", "dev", "test"]
        addtl_dir: optional sub-directory (e.g. a few-shot count). Defaults to an empty string.
    """
    target_dir = dataset_file_path

    # only a sub-directory needs creating here; the base path is used as-is
    if addtl_dir != "":
        target_dir = target_dir + "/" + addtl_dir
        os.makedirs(target_dir, exist_ok=True)

    out_path = f"{target_dir}/{split}.json"
    with open(out_path, "w", encoding="utf-8") as out_file:
        json.dump(data, out_file, indent=2, ensure_ascii=False)

# Report and persist each split.
# The loop variables are deliberately NOT named `data`: that name holds the full
# record list loaded from records.json, and rebinding it here would make a re-run
# of the split cell operate on the last split instead of the whole dataset.
for split_name, split_records in splits.items():
    count_publications(split_records, split_name)
    dump_json_data(split_records, split=split_name)


📊 train: 2162 entries
  Ang Tinig ng Masa: 1213
  Taliba ng Bayan: 429
  Ang Bayan: 408
  Ugnayan: 38
  Sulong: 18
  Bukluran: 17
  Kalayaan: 7
  Datos: 6
  Balita ng Malayang Pilipinas: 6
  Tabak: 6
  Himagsik: 5
  Tinig ng Zoto: 4
  Proletaryo: 1
  Sambayanan: 1
  Kontres: 1
  Ang Kapatiran: 1
  Agham Bayan: 1

📊 test: 538 entries
  Ang Tinig ng Masa: 303
  Taliba ng Bayan: 107
  Ang Bayan: 102
  Ugnayan: 9
  Sulong: 5
  Bukluran: 4
  Balita ng Malayang Pilipinas: 2
  Kalayaan: 2
  Tabak: 1
  Tinig ng Zoto: 1
  Datos: 1
  Himagsik: 1


### Data Converters

In [15]:
# some helpers functions
# ---
def remove_white_space_from_span(text : str, start : int, end : int):
    """
    Trim leading/trailing whitespace from a span and shift its offsets to match.

    Params:
        text: the span text, i.e. the slice `[start:end]` of the full text
        start: start offset of the span in the full text
        end: end offset of the span in the full text
    Returns:
        a dict with the trimmed `text` and adjusted `start`/`end` when something
        was trimmed (it is also displayed), otherwise `None`
    """
    without_leading = text.lstrip()
    n_leading = len(text) - len(without_leading)

    trimmed = without_leading.rstrip()
    n_trailing = len(without_leading) - len(trimmed)

    # nothing to trim -> signal "unchanged" to the caller
    if n_leading == 0 and n_trailing == 0:
        return None

    out = {
        "text": trimmed,
        "start": start + n_leading,
        "end": end - n_trailing
    }
    display(f"Cleaned: {out}")
    return out

def find_overlaps(entities, text):
    """
    Detect every pair of overlapping entities and return their span text too.

    Params:
        entities: iterable of (start, end, label)
        text: the full text the offsets refer to
    Returns:
        a list of {"first": {...}, "second": {...}} dicts, one per overlapping
        pair, each side holding start, end, label and the covered text
    """
    overlaps = []
    # Sort by start index (then end) so the inner scan can stop early
    entities_sorted = sorted(entities, key=lambda x: (x[0], x[1]))
    for i, (start1, end1, label1) in enumerate(entities_sorted):
        # Compare against ALL later entities, not just the next one: a long span
        # may overlap several following spans.
        for start2, end2, label2 in entities_sorted[i + 1:]:
            # Sorted by start, so once one entity begins at/after end1 the rest do too
            if start2 >= end1:
                break
            overlaps.append({
                "first": {
                    "start": start1,
                    "end": end1,
                    "label": label1,
                    "text": text[start1:end1]
                },
                "second": {
                    "start": start2,
                    "end": end2,
                    "label": label2,
                    "text": text[start2:end2]
                }
            })
    return overlaps


In [16]:
# ---------- Helper: spaCy format ----------

def to_spacy_format(records):
    """
    Convert annotation records into spaCy-style dicts
    (`text`, `entities`, `Publication`, `Year`, `Row_Index`).

    Side effects: appends to the module-level lists `overlap_log`, `dropped_log`,
    `misaligned_log` and `neg_sample_log`, which must exist before this is called.

    Params:
        records: list of records with `fields.Text`, optional `metadata` and
            optional `responses.entity_type[*].value` span annotations
    Returns:
        a list with one dict per kept record; records containing overlapping
        entities are skipped entirely
    """
    spacy_data = []
    for idx, record in enumerate(records):
        # NOTE(review): offsets below are applied to the NFC-normalized text;
        # presumably the annotated text was already NFC -- confirm, otherwise
        # normalization could shift the offsets.
        text = unicodedata.normalize("NFC", record["fields"]["Text"])

        metadata = record.get("metadata", {})
        Publication = metadata.get("Publication")
        Year = metadata.get("Year")
        row_index = metadata.get("Row_Index")

        # flatten the spans of every response into (start, end, label) tuples
        entities = []
        for resp in record.get("responses", {}).get("entity_type", []):
            for ent in resp.get("value", []):
                entities.append((ent["start"], ent["end"], ent["label"]))

        # --- check overlaps here ---
        overlaps = find_overlaps(entities, text)
        if overlaps:
            # the record is logged twice: once in the overlap log and once as dropped
            overlap_log.append({
                "record_id": record.get("id"),
                "row_index": row_index,
                "text": text,
                "overlaps": overlaps
            })
            dropped_log.append({
                "record_id": record.get("id"),
                "row_index": row_index,
                "text": text,
                "reason": "overlap",
                "details": overlaps
            })
            continue  # still skip the record completely

        # --- continue with alignment check ---
        aligned_entities = []
        doc = nlp.make_doc(text)

        for start, end, label in entities:
            # span is None when the ORIGINAL offsets do not line up with token
            # boundaries; it is computed before any whitespace trimming
            span = doc.char_span(start, end, label=label)
            cleaned = remove_white_space_from_span(text[start:end], start, end)

            # log the span if it needed trimming or did not align with tokens
            if (cleaned is not None) or span is None:
                misaligned_log.append({
                    "record_id": record.get("id"),
                    "text": text,
                    "span_text": text[start:end],
                    "start": start,
                    "end": end,
                    "label": label
                })
            # use the trimmed offsets from here on
            if cleaned is not None:
                start = cleaned["start"]
                end = cleaned["end"]

            # empty span, this shouldn't really happen
            if (start == end):
                # only this span is left out; the record itself is still kept
                dropped_log.append({
                    "record_id": record.get("id"),
                    "row_index": row_index,
                    "text": text,
                    "reason": "misannotation",
                    "details": []
                })
            else:
                # kept even when `span` was None (token-misaligned offsets are not fixed here)
                aligned_entities.append((start, end, label))

        # --- final decision ---
        if not aligned_entities:
            #  keep as negative example, but log it

            spacy_out = {
                "text": text,
                "entities": [],
                "Publication": Publication,
                "Year": Year,
                "Row_Index": row_index
            }

            spacy_data.append(spacy_out)
            neg_sample_log.append({
                "record_id": record.get("id"),
                "row_index": row_index,
                "text": text,
                "reason": "no entities (kept as negative example)"
            }) # NO NEED TO LOG AS DROPPED SINCE THEY ARE NEEDED AS NEGATIVE SAMPLES
        else:
            spacy_out = {
                "text": text,
                "entities": aligned_entities,
                "Publication": Publication,
                "Year": Year,
                "Row_Index": row_index
            }
            spacy_data.append(spacy_out)
    return spacy_data

# ---------- Helper: BIO format ----------
def to_bio_format(records):
    """
    Convert annotation records into BIO-tagged token sequences.

    A token belongs to an entity when its start offset lies inside
    `[start, end)`. The first such token gets `B-<label>`, the rest `I-<label>`.

    Params:
        records: list of records with `fields.Text` and optional
            `responses.entity_type[*].value` span annotations
    Returns:
        one list of (token_text, tag) tuples per record
    """
    bio_sents = []
    for record in records:
        text = unicodedata.normalize("NFC", record["fields"]["Text"])
        entities = []
        for resp in record.get("responses", {}).get("entity_type", []):
            for ent in resp.get("value", []):
                entities.append((ent["start"], ent["end"], ent["label"]))

        # Tokenization is all that is needed here, so skip the rest of the
        # pipeline; this also matches how to_spacy_format/convert tokenize.
        doc = nlp.make_doc(text)
        tags = ["O"] * len(doc)

        for start, end, label in entities:
            first_token = True
            for i, token in enumerate(doc):
                if not (start <= token.idx < end):
                    continue
                # do not open an entity on a whitespace-only token
                if first_token and token.text.isspace():
                    continue
                # The first covered token always opens the entity. Requiring
                # token.idx == start produced entities that began with I-
                # whenever the span had leading whitespace or started mid-token.
                tags[i] = f"B-{label}" if first_token else f"I-{label}"
                first_token = False

        sent_tags = [(token.text, tags[i]) for i, token in enumerate(doc)]
        bio_sents.append(sent_tags)
    return bio_sents

In [17]:
import os

# Logs filled in as a side effect of to_spacy_format (it appends to these
# module-level lists), so they are reset before the conversion runs.
misaligned_log = []
overlap_log = []
dropped_log = []
neg_sample_log = []

os.makedirs(f"{dataset_file_path}/json_spacy", exist_ok=True)
os.makedirs(f"{dataset_file_path}/bio", exist_ok=True)
os.makedirs(f"{dataset_file_path}/other_info", exist_ok=True)

# ---------- Save outputs ----------

for split_name, split_data in [("train", train_data), ("test", test_data)]:
    # spaCy JSON → inside /json_spacy folder
    spacy_data = to_spacy_format(split_data)
    with open(f"{dataset_file_path}/json_spacy/{split_name}.json", "w", encoding="utf-8") as f:
        json.dump(spacy_data, f, ensure_ascii=False, indent=2)

    # BIO → inside /bio folder: one "token<TAB>tag" line per token,
    # with a blank line between records
    bio_data = to_bio_format(split_data)
    with open(f"{dataset_file_path}/bio/{split_name}.bio", "w", encoding="utf-8") as f:
        for sent in bio_data:
            for token, tag in sent:
                f.write(f"{token}\t{tag}\n")
            f.write("\n")


# Save misaligned spans log in /other_info
with open(f"{dataset_file_path}/other_info/misaligned.json", "w", encoding="utf-8") as f:
    json.dump(misaligned_log, f, ensure_ascii=False, indent=2)

# The message names what was actually written: only train/test exist at this
# point, and the JSON goes to /json_spacy (there is no dev split or /json folder).
print("✅ Saved: train/test in both spaCy JSON (/json_spacy) and BIO (/bio) formats")
print(f"⚠️ Misaligned spans logged: {len(misaligned_log)}")

with open(f"{dataset_file_path}/other_info/overlaps.json", "w", encoding="utf-8") as f:
    json.dump(overlap_log, f, ensure_ascii=False, indent=2)

print(f"⚠️ Overlapping spans logged: {len(overlap_log)}")

# ---------- Save dropped log ----------
with open(f"{dataset_file_path}/other_info/dropped_entities.json", "w", encoding="utf-8") as f:
    json.dump(dropped_log, f, ensure_ascii=False, indent=2)

print(f"❌ Dropped entity entries logged: {len(dropped_log)}")

"Cleaned: {'text': 'Sakbayan', 'start': 80, 'end': 88}"

"Cleaned: {'text': 'Mindanao', 'start': 66, 'end': 74}"

"Cleaned: {'text': 'Demokratikong Republikang Bayan ng Korea', 'start': 556, 'end': 596}"

"Cleaned: {'text': 'Dangadang', 'start': 8, 'end': 17}"

"Cleaned: {'text': 'HULYO 1985', 'start': 63, 'end': 73}"

"Cleaned: {'text': 'Masa', 'start': 97, 'end': 101}"

"Cleaned: {'text': '78%', 'start': 31, 'end': 34}"

"Cleaned: {'text': 'SAMBUTANI', 'start': 54, 'end': 63}"

"Cleaned: {'text': 'Assemblywoman Manotoc', 'start': 78, 'end': 99}"

"Cleaned: {'text': 'apartheid', 'start': 481, 'end': 490}"

"Cleaned: {'text': 'korporasyong transnasyonal', 'start': 36, 'end': 62}"

"Cleaned: {'text': 'Pilipino', 'start': 141, 'end': 149}"

"Cleaned: {'text': 'Enero', 'start': 193, 'end': 198}"

"Cleaned: {'text': 'US', 'start': 150, 'end': 152}"

"Cleaned: {'text': 'Administrasyon', 'start': 164, 'end': 178}"

"Cleaned: {'text': 'Okt. 20', 'start': 107, 'end': 114}"

"Cleaned: {'text': '6/26/78', 'start': 130, 'end': 137}"

✅ Saved: train/dev/test in both spaCy JSON (/json) and BIO (/bio) formats
⚠️ Misaligned spans logged: 32
⚠️ Overlapping spans logged: 0
❌ Dropped entity entries logged: 0


In [18]:
# Persist the records that were kept without any entity (negative examples).
neg_sample_path = f"{dataset_file_path}/other_info/neg_sample.json"
with open(neg_sample_path, "w", encoding="utf-8") as neg_file:
    json.dump(neg_sample_log, neg_file, ensure_ascii=False, indent=2)

print(f"⚠️ Negative example spans logged: {len(neg_sample_log)}")

⚠️ Negative example spans logged: 436


In [None]:
import json

def count_json_entries(json_file):
    """Print and return the number of top-level entries in a UTF-8 JSON file."""
    with open(json_file, "r", encoding="utf-8") as handle:
        n_entries = len(json.load(handle))
    print(f"{json_file}: {n_entries} entries")
    return n_entries

# Check how many entries each converted split ended up with.
# The last call is the cell's final expression, so its return value is also shown as the cell output.
count_json_entries(f"{dataset_file_path}/json_spacy/train.json")
count_json_entries(f"{dataset_file_path}/json_spacy/test.json")


../assets/json_records/batch_1-2/json_spacy/train.json: 2162 entries
../assets/json_records/batch_1-2/json_spacy/test.json: 538 entries


538

In [20]:
import json, unicodedata

# Load helper
def load_json(path):
    """Read a UTF-8 encoded JSON file and return the parsed content."""
    with open(path, "r", encoding="utf-8") as handle:
        content = json.load(handle)
    return content

def to_key(record):
    """
    Build a comparison key for a record, used to detect duplicates across splits.

    Lookup order for dict records:
        1. `id`, if present (as a string)
        2. `fields["Text"]` when `fields` is a dict, or the joined items when it is a list
        3. top-level `text` (the spaCy-style records written by to_spacy_format)
    Anything else falls back to the stringified record. Text keys are NFC-normalized.

    Params:
        record: an annotation record, a spaCy-style record, or any other value
    Returns:
        the key as a string
    """
    if isinstance(record, dict):
        # Prefer ID if available
        if "id" in record:
            return str(record["id"])

        fields = record.get("fields")
        if isinstance(fields, dict):
            return unicodedata.normalize("NFC", fields.get("Text", ""))
        elif isinstance(fields, list):
            # If fields is a list, join as string for uniqueness
            return unicodedata.normalize("NFC", " ".join(map(str, fields)))

        # spaCy-style records have no id/fields. Key them on the text alone:
        # stringifying the whole record would include Row_Index and the
        # entities, so two records with identical text would never collide.
        text = record.get("text")
        if isinstance(text, str):
            return unicodedata.normalize("NFC", text)

    # fallback: stringify entire record
    return unicodedata.normalize("NFC", str(record))

# Load the converted train/test splits and reduce each record to a comparison key
train = load_json(f"{dataset_file_path}/json_spacy/train.json")
test  = load_json(f"{dataset_file_path}/json_spacy/test.json")

train_keys = {to_key(r) for r in train}
test_keys  = {to_key(r) for r in test}

# Report sizes (number of distinct keys per split)
print("Train size:", len(train_keys))
# print("Dev size:", len(dev_keys))
print("Test size:", len(test_keys))

# Overlaps between splits (the dev variants are disabled: there is no dev split here)
# overlap_train_dev = train_keys & dev_keys
overlap_train_test = train_keys & test_keys
# overlap_dev_test = dev_keys & test_keys

print("Train ∩ Test:", len(overlap_train_test))

# Global uniqueness: total record count minus distinct keys = duplicates anywhere
all_keys = train_keys | test_keys
total = len(train) + len(test)
print("Total samples:", total)
print("Unique samples:", len(all_keys))
print("Duplicates overall:", total - len(all_keys))

# Optional: save actual duplicates for inspection
duplicates = {
    # "train_dev": list(overlap_train_dev),
    "train_test": list(overlap_train_test),
    # "dev_test": list(overlap_dev_test),
}
with open(f"{dataset_file_path}/other_info/split_duplicates.json", "w", encoding="utf-8") as f:
    json.dump(duplicates, f, ensure_ascii=False, indent=2)

print("✅ Duplicate report saved to split_duplicates.json")


Train size: 2162
Test size: 538
Train ∩ Test: 0
Total samples: 2700
Unique samples: 2700
Duplicates overall: 0
✅ Duplicate report saved to split_duplicates.json


In [51]:
import json
import spacy
from spacy.tokens import DocBin, Span
from spacy.util import compile_infix_regex, filter_spans
from spacy.language import Language

def convert(json_file, spacy_file):
    """
    Convert a spaCy-style JSON split into a binary `.spacy` DocBin.

    Each record's Publication, Row_Index and Year are stored in `doc.user_data`.

    Params:
        json_file: JSON file to read, relative to `dataset_file_path`
        spacy_file: file to write, relative to `spacy_file_path`
    """
    with open(f"{dataset_file_path}/{json_file}", "r", encoding="utf-8") as f:
        data = json.load(f)

    doc_bin = DocBin(store_user_data=True)
    for record in data:
        # normalize text to avoid encoding mismatches
        text = unicodedata.normalize("NFC", record["text"])
        doc = nlp.make_doc(text)

        # to_spacy_format reads Row_Index with .get(), so it can be None here;
        # int(None) would abort the whole conversion.
        row_index = record["Row_Index"]
        doc.user_data["Publication"] = record["Publication"]
        doc.user_data["Row_Index"] = int(row_index) if row_index is not None else None
        doc.user_data["Year"] = record["Year"]

        ents = []
        for start, end, label in record["entities"]:
            # "expand" snaps the span outwards to full tokens, so offsets that
            #   fall inside a token still produce a span
            span = doc.char_span(start, end, label=label, alignment_mode="expand")
            if span is None:
                print(f"⚠️ Misaligned span skipped: {text[start:end]}")
            else:
                ents.append(span)

        # filter overlaps and assign; report when spans are discarded, since
        # expanding to token boundaries can make neighbouring spans overlap
        kept = filter_spans(ents)
        if len(kept) < len(ents):
            print(f"⚠️ Overlapping span(s) dropped: {len(ents) - len(kept)} (Row_Index={row_index})")
        doc.ents = kept
        doc_bin.add(doc)

    # save binary .spacy file
    doc_bin.to_disk(f"{spacy_file_path}/{spacy_file}")
    print(f"Saved {spacy_file}")

In [52]:


# Convert the train/test JSON splits into binary .spacy files
convert("json_spacy/train.json", "train.spacy")
# no dev split exists at this stage, so its conversion stays disabled
# convert("json_spacy/dev.json", "spacy/dev.spacy")
convert("json_spacy/test.json", "test.spacy")


Saved train.spacy
Saved test.spacy


Splitting training data with decreasing size

In [48]:
from collections import defaultdict
import random, json

def stratified_fewshot_sample_publication(data, n, strata="Publication", seed=42):
    """
    Few-shot subset, stratified by a metadata field (publication by default).

    Each stratum receives a share of `n` proportional to its size (rounded
    down); the leftover slots go, one at a time, to the stratum with the most
    records still unselected.

    Params:
        data: the list of training records to sample from
        n: how many records to sample
        strata: which metadata key to stratify on. Defaults to `Publication`
        seed: random seed. Defaults to `42`
    Returns:
        the sampled records in shuffled order (at most `len(data)` of them)
    """
    # A private generator yields the same sample as seeding the global module,
    # without resetting the global RNG state for the rest of the notebook.
    rng = random.Random(seed)

    # --- bucket by stratum ---
    buckets = defaultdict(list)
    for rec in data:
        key = rec.get("metadata", {}).get(strata, "UNKNOWN")
        buckets[key].append(rec)

    # nothing to sample from (previously this ended in an opaque max() error)
    if not buckets:
        return []

    # --- proportional allocation (floored) ---
    total = sum(len(v) for v in buckets.values())
    take = {key: int(n * len(v) / total) for key, v in buckets.items()}

    # --- rounding adjust ---
    # Flooring can only under-allocate, so only "give extras" is needed; the
    # former "remove excess" loop could never decrement anything.
    allocated = sum(take.values())
    while allocated < n:
        key = max(buckets, key=lambda p: len(buckets[p]) - take[p])
        take[key] += 1
        allocated += 1

    # --- sample ---
    out = []
    for key, bucket in buckets.items():
        # never ask for more than the stratum holds (matters when n > len(data))
        k = min(take[key], len(bucket))
        out.extend(rng.sample(bucket, k))

    rng.shuffle(out)
    return out


In [None]:

# --- Convert record to spaCy-style example ---
# def record_to_spacy(rec):
#     text = unicodedata.normalize("NFC", rec["fields"]["Text"])
#     entities = []
#     for resp in rec.get("responses", {}).get("entity_type", []):
#         for ent in resp.get("value", []):
#             entities.append([ent["start"], ent["end"], ent["label"]])
#     return [text, {"entities": entities}]


# --- JSON to spaCy DocBin converter ---

# def json_to_docbin(infile, outfile):
#     with open(infile, "r", encoding="utf-8") as f:
#         data = json.load(f)

#     db = DocBin()
#     for record in data:  
#         text = unicodedata.normalize("NFC", record["text"])
#         doc = nlp.make_doc(text)
#         ents = []
#         for start, end, label in record["entities"]:
#             span = doc.char_span(start, end, label=label)
#             if span:
#                 ents.append(span)
#         doc.ents = spacy.util.filter_spans(ents)
#         db.add(doc)

#     db.to_disk(outfile)
#     print(f"Saved {outfile}")



In [54]:

# --- Main procedure ---
# The pool is the train.json written by the initial train/test split.
with open(f"{dataset_file_path}/train.json", "r", encoding="utf-8") as f:
    train_pool = json.load(f)

# Subset sizes: start at the full pool and keep halving while the size stays above 50.
sizes = []
max_size = len(train_pool)
current_size = max_size
while current_size > 50:
    sizes.append(current_size)
    # Python's round() sends exact .5 ties to the even number (540.5 -> 540, 67.5 -> 68)
    current_size = round(current_size / 2)
# seeds = [0, 1, 2]

# last expression: show the resulting list as the cell output
sizes

[2162, 1081, 540, 270, 135, 68]

In [55]:
os.makedirs(f"{dataset_file_path}/json_spacy/subset", exist_ok=True)
os.makedirs(f"{spacy_file_path}/subset", exist_ok=True)


for size in sizes:
    # to_spacy_format appends to these module-level logs, so each size starts
    # with fresh ones. NOTE(review): they are not written to disk in this loop.
    misaligned_log = []
    overlap_log = []
    dropped_log = []
    neg_sample_log = []

    # for seed in seeds:
    subset_records = stratified_fewshot_sample_publication(train_pool, size)
    # 10% of every subset is held out as its dev set
    train_set, dev_set = stratified_split_train_test(subset_records, test_ratio=0.1)

    sets = {
        "train": train_set,
        "dev": dev_set
    }

    print(f"Sample Record for size ({size}):", sets["train"][0], "\n")

    os.makedirs(f"{dataset_file_path}/json_spacy/subset/{size}", exist_ok=True)
    os.makedirs(f"{spacy_file_path}/subset/{size}", exist_ok=True)

    # The loop variable is not called `set`: that would shadow the builtin for
    # every cell that runs afterwards in this kernel.
    for split_name, split_records in sets.items():
        # Convert each record to spaCy format
        subset = to_spacy_format(split_records)

        json_file = f"json_spacy/subset/{size}/{split_name}.json"
        spacy_file = f"subset/{size}/{split_name}.spacy"

        with open(f"{dataset_file_path}/{json_file}", "w", encoding="utf-8") as f:
            json.dump(subset, f, indent=2, ensure_ascii=False)
        print(f"Saved {json_file} with {len(subset)} entries")

        convert(json_file, spacy_file)
    print("-------------------------------------------------------")

Sample Record for size (2162): {'id': '82fc5eda-e815-4e31-b5e5-486a40606b25', 'fields': {'Text': 'Nauna rito, isang yunit ng BHB na kumikilos sa gitnang Isabela ang nakapatay ng apat na opisyal at tatlong tauhan ng AFP sa isang ambus.'}, 'metadata': {'Source': 'eVols', 'Year': '1974', 'Publication': 'Taliba ng Bayan', 'Issue': '1974.Vol3.No4', 'Page Number': '1', 'Remarks': '', 'Row_Index': '7160'}, 'suggestions': {'entity_type': {'value': [{'label': 'Organization-Military', 'start': 27, 'end': 30}, {'label': 'Location', 'start': 55, 'end': 62}, {'label': 'Organization-Military', 'start': 117, 'end': 120}], 'score': None, 'agent': None}}, 'responses': {'entity_type': [{'value': [{'label': 'Organization-Military', 'start': 27, 'end': 30}, {'label': 'Location', 'start': 55, 'end': 62}, {'label': 'Organization-Military', 'start': 117, 'end': 120}], 'user_id': 'b366f9db-9620-43cb-8ae0-6ccc19a5bc3d', 'status': 'submitted'}]}, 'vectors': {}, 'status': 'completed', '_server_id': '3fdb957d-4cc

"Cleaned: {'text': 'apartheid', 'start': 481, 'end': 490}"

"Cleaned: {'text': 'Assemblywoman Manotoc', 'start': 78, 'end': 99}"

"Cleaned: {'text': '78%', 'start': 31, 'end': 34}"

"Cleaned: {'text': 'Dangadang', 'start': 8, 'end': 17}"

"Cleaned: {'text': 'Sakbayan', 'start': 80, 'end': 88}"

"Cleaned: {'text': 'Pilipino', 'start': 141, 'end': 149}"

"Cleaned: {'text': 'HULYO 1985', 'start': 63, 'end': 73}"

"Cleaned: {'text': 'Masa', 'start': 97, 'end': 101}"

"Cleaned: {'text': 'Demokratikong Republikang Bayan ng Korea', 'start': 556, 'end': 596}"

"Cleaned: {'text': 'Enero', 'start': 193, 'end': 198}"

"Cleaned: {'text': 'korporasyong transnasyonal', 'start': 36, 'end': 62}"

"Cleaned: {'text': 'SAMBUTANI', 'start': 54, 'end': 63}"

"Cleaned: {'text': 'Mindanao', 'start': 66, 'end': 74}"

Saved json_spacy/subset/2162/train.json with 1945 entries
Saved subset/2162/train.spacy
Saved json_spacy/subset/2162/dev.json with 217 entries
Saved subset/2162/dev.spacy
-------------------------------------------------------
Sample Record for size (1081): {'id': '786d8d1e-0173-4652-bb03-7b23c39a0c57', 'fields': {'Text': 'Sa lahat ng mga aksyong masa sa buong kapuluan, ang diktadura ang itinurong salarin sa asasinasyon kay Aquino at sa pagpatay sa iba pang mamamayang nakikibaka laban sa diktadura. Sinabi ng mga nagrali na mamamayan mismo ang kailangang magpawalang ng hustisya dahil walang maaasahang hustisya sa diktadura.'}, 'metadata': {'Source': 'eVols', 'Year': '1985', 'Publication': 'Ang Bayan', 'Issue': 'Agosto 1985', 'Page Number': '4', 'Remarks': '', 'Row_Index': '3942'}, 'suggestions': {'entity_type': {'value': [{'label': 'Person-Individual', 'start': 103, 'end': 109}, {'label': 'Organization-Political', 'start': 204, 'end': 213}, {'label': 'Organization-Political', 'start': 2

"Cleaned: {'text': 'apartheid', 'start': 481, 'end': 490}"

"Cleaned: {'text': '78%', 'start': 31, 'end': 34}"

"Cleaned: {'text': 'Dangadang', 'start': 8, 'end': 17}"

"Cleaned: {'text': 'Sakbayan', 'start': 80, 'end': 88}"

"Cleaned: {'text': 'Demokratikong Republikang Bayan ng Korea', 'start': 556, 'end': 596}"

Saved json_spacy/subset/1081/train.json with 975 entries
Saved subset/1081/train.spacy


"Cleaned: {'text': 'Mindanao', 'start': 66, 'end': 74}"

Saved json_spacy/subset/1081/dev.json with 106 entries
Saved subset/1081/dev.spacy
-------------------------------------------------------
Sample Record for size (540): {'id': '320410ce-b3e5-47f0-b074-ac16748216da', 'fields': {'Text': "Idinagdag pa ni Mitra na 'nagkamali si Pena ng pasok dahil sawa na ang tao sa pamamahala at namulat na ang mga tao sa kanyang lalawigan sa katotohanan kung kaya't alam na nila kung sino ang dapat ihalal."}, 'metadata': {'Source': 'Bantayog ng mga Bayani', 'Year': '1984', 'Publication': 'Ang Tinig ng Masa', 'Issue': 'Ang Tinig ng Masa Taon 1 Blg 43 1984 Mayo 22-28', 'Page Number': '2', 'Remarks': '', 'Row_Index': '2093'}, 'suggestions': {'entity_type': {'value': [{'label': 'Person-Individual', 'start': 16, 'end': 21}, {'label': 'Person-Individual', 'start': 39, 'end': 43}, {'label': 'Person-Individual', 'start': 71, 'end': 74}, {'label': 'Person-Individual', 'start': 111, 'end': 114}], 'score': None, 'agent': None}}, 'responses': {'entity_type': [{'value'

"Cleaned: {'text': 'apartheid', 'start': 481, 'end': 490}"

"Cleaned: {'text': 'Demokratikong Republikang Bayan ng Korea', 'start': 556, 'end': 596}"

"Cleaned: {'text': 'korporasyong transnasyonal', 'start': 36, 'end': 62}"

"Cleaned: {'text': 'Assemblywoman Manotoc', 'start': 78, 'end': 99}"

Saved json_spacy/subset/540/train.json with 487 entries
Saved subset/540/train.spacy
Saved json_spacy/subset/540/dev.json with 53 entries
Saved subset/540/dev.spacy
-------------------------------------------------------
Sample Record for size (270): {'id': '3dc3ce82-e4a0-4d24-b9d9-1e1490d2502c', 'fields': {'Text': 'Binibili ang mga ito ng mga negosyante, na ang ginagamit sa pamimili ay mga dolyar, mula sa mga tinatawag na post exchange (PX) shop na matatagpuan.sa_loob ng mga base.'}, 'metadata': {'Source': 'Bantayog ng mga Bayani', 'Year': '1984', 'Publication': 'Ang Tinig ng Masa', 'Issue': 'Taon 1 Blg 35 1984 Marso 27 - Abril 02', 'Page Number': '6', 'Remarks': '', 'Row_Index': '188'}, 'suggestions': {'entity_type': {'value': [{'label': 'Location', 'start': 163, 'end': 167}], 'score': None, 'agent': None}}, 'responses': {'entity_type': [{'value': [], 'user_id': 'b366f9db-9620-43cb-8ae0-6ccc19a5bc3d', 'status': 'submitted'}]}, 'vectors': {}, 'status': 'completed', '_server_id': '7925

In [34]:
import json

def count_json_entries(json_file):
    """Print and return the number of top-level entries in a UTF-8 JSON file."""
    # NOTE(review): a function with this name is also defined in an earlier cell;
    # this definition replaces it when the cell runs.
    with open(json_file, "r", encoding="utf-8") as handle:
        n_entries = len(json.load(handle))
    print(f"{json_file}: {n_entries} entries")
    return n_entries

# Report the entry count of every subset's train and dev file.
for size in sizes:
    for split_name in ("train", "dev"):
        count_json_entries(f"{dataset_file_path}/json_spacy/subset/{size}/{split_name}.json")

../assets/json_records/batch_1-2/json_spacy/subset/2162/train.json: 1945 entries
../assets/json_records/batch_1-2/json_spacy/subset/2162/dev.json: 217 entries
../assets/json_records/batch_1-2/json_spacy/subset/1081/train.json: 975 entries
../assets/json_records/batch_1-2/json_spacy/subset/1081/dev.json: 106 entries
../assets/json_records/batch_1-2/json_spacy/subset/540/train.json: 487 entries
../assets/json_records/batch_1-2/json_spacy/subset/540/dev.json: 53 entries
../assets/json_records/batch_1-2/json_spacy/subset/270/train.json: 244 entries
../assets/json_records/batch_1-2/json_spacy/subset/270/dev.json: 26 entries
../assets/json_records/batch_1-2/json_spacy/subset/135/train.json: 122 entries
../assets/json_records/batch_1-2/json_spacy/subset/135/dev.json: 13 entries
../assets/json_records/batch_1-2/json_spacy/subset/68/train.json: 62 entries
../assets/json_records/batch_1-2/json_spacy/subset/68/dev.json: 6 entries


## Try to augment train dataset (Untouched, modify if needed)

In [19]:
import spacy
from spacy.tokens import DocBin
from collections import Counter
import math

# Paths
# NOTE(review): these are relative to the working directory and do not use
# `spacy_file_path` like the conversion cells -- confirm they point at the intended files.
train_path = "spacy/train.spacy"
balanced_path = "spacy/train_balanced.spacy"

# Load train.spacy. Only a vocab is needed to deserialize the docs, so a blank
# pipeline is kept under its own name; the notebook-wide `nlp` (with the custom
# tokenizer) must not be replaced by it.
blank_tl = spacy.blank("tl")
docbin = DocBin().from_disk(train_path)
docs = list(docbin.get_docs(blank_tl.vocab))

# Count original label frequencies
counts = Counter(ent.label_ for doc in docs for ent in doc.ents)
print("Original counts:", counts)

# Target minimum per label
target = 100

# Compute oversample factors: labels below the target get ceil(target / freq) copies
factors = {label: math.ceil(target / freq) if freq < target else 1
           for label, freq in counts.items()}

print("Oversample factors:", factors)

# Apply oversampling
balanced_docs = []
for doc in docs:
    balanced_docs.append(doc)
    labels = {ent.label_ for ent in doc.ents}
    # Find the max factor among labels in this doc (1 for docs without entities)
    factor = max([factors.get(label, 1) for label in labels], default=1)
    if factor > 1:
        # one independent copy per repetition instead of the same object repeated
        balanced_docs.extend(doc.copy() for _ in range(factor - 1))

# Count new frequencies
new_counts = Counter(ent.label_ for doc in balanced_docs for ent in doc.ents)
print("Balanced counts:", new_counts)

# Save
balanced_db = DocBin(docs=balanced_docs)
balanced_db.to_disk(balanced_path)
print(f"✅ Saved balanced training set to {balanced_path}")


Original counts: Counter({'Person-Individual': 1368, 'Location': 1194, 'Numerical Statistics': 492, 'Time': 403, 'Organization-Other': 335, 'Organization-Government': 326, 'Organization-Military': 255, 'Person-Collective': 239, 'Organization-Political': 207, 'Production-Media': 176, 'Production-Doctrine': 99, 'Production-Government': 55, 'Event-Local': 52, 'Object': 46, 'Event-International': 31})
Oversample factors: {'Organization-Other': 1, 'Numerical Statistics': 1, 'Production-Media': 1, 'Person-Individual': 1, 'Organization-Political': 1, 'Organization-Government': 1, 'Event-International': 4, 'Location': 1, 'Production-Doctrine': 2, 'Person-Collective': 1, 'Time': 1, 'Event-Local': 2, 'Production-Government': 2, 'Organization-Military': 1, 'Object': 3}
Balanced counts: Counter({'Person-Individual': 1570, 'Location': 1453, 'Numerical Statistics': 638, 'Time': 510, 'Organization-Government': 408, 'Organization-Other': 358, 'Organization-Military': 311, 'Person-Collective': 292, 'Or

In [24]:
import spacy, calamancy
from spacy.tokens import DocBin

# Load spaCy model (the same one used to create the docs).
# It is bound to its own name: rebinding `nlp` here would replace the pipeline
# carrying the custom tokenizer that the conversion cells depend on.
viewer_nlp = calamancy.load("tl_calamancy_md-0.2.0")

# Load the DocBin file
# NOTE(review): relative path, not derived from `spacy_file_path` -- confirm it is current.
doc_bin = DocBin().from_disk("spacy/dev.spacy")

# Get list of Doc objects
docs = list(doc_bin.get_docs(viewer_nlp.vocab))

# Print text, tokens and entities of every document for manual inspection
for i, doc in enumerate(docs):
    print(f"\n--- Document {i+1} ---")
    print("Text:", doc.text)

    print("Tokens:")
    for token in doc:
        print(f"'{token.text}'", end=" | ")
    print()

    print("Entities:")
    for ent in doc.ents:
        print(f"{ent.text} ({ent.label_})")



--- Document 1 ---
Text: Ayon sa ilang opisyalng Export Processing Zone Authority (EPZA) at (BEPZ), ang bentahan ay malinis at umano’y walang anomalya pagkat ito ay aprobado ng isang EPZA resolution at ng Central Bank.
Tokens:
'Ayon' | 'sa' | 'ilang' | 'opisyalng' | 'Export' | 'Processing' | 'Zone' | 'Authority' | '(' | 'EPZA' | ')' | 'at' | '(' | 'BEPZ' | ')' | ',' | 'ang' | 'bentahan' | 'ay' | 'malinis' | 'at' | 'umano’y' | 'walang' | 'anomalya' | 'pagkat' | 'ito' | 'ay' | 'aprobado' | 'ng' | 'isang' | 'EPZA' | 'resolution' | 'at' | 'ng' | 'Central' | 'Bank' | '.' | 
Entities:
Export Processing Zone Authority (Organization-Government)
EPZA (Organization-Government)
BEPZ (Organization-Government)
EPZA resolution (Production-Government)
Central Bank (Organization-Government)

--- Document 2 ---
Text: Halimbawa, ang mga ipinangangakong kaginhawahan at kaayusan ay baka mauwi sa "kaginhawahan at kaayusan" ng mga bulsa ng mga kandidato. Bilang isang matalinong hakbang, tingnan ang kabuhay