In [1]:
from datasets import Dataset, DatasetDict, load_from_disk
import spacy
from spacy.tokens import DocBin
import srsly

In [2]:
# Load the entity mapping JSON. It holds a "labels" list (entity type names)
# and an "iob_mapping" dict from integer-id strings to IOB tag names.
entity_map = srsly.read_json("../assets/mapped_labels.json")

In [3]:
# Entity type names, in mapping order. convert_entity_to_iob reads this
# module-level list, so its order determines the integer tag ids.
entities = entity_map["labels"]
# Bare expression so the notebook displays the whole mapping.
entity_map

{'labels': ['Person-Individual',
  'Person-Collective',
  'Organization-Political',
  'Organization-Government',
  'Organization-Military',
  'Organization-Other',
  'Location',
  'Object',
  'Time',
  'Event-Local',
  'Event-International',
  'Production-Media',
  'Production-Government',
  'Production-Doctrine',
  'Numerical Statistics'],
 'iob_mapping': {'0': 'O',
  '1': 'B-Person-Individual',
  '2': 'I-Person-Individual',
  '3': 'B-Person-Collective',
  '4': 'I-Person-Collective',
  '5': 'B-Organization-Political',
  '6': 'I-Organization-Political',
  '7': 'B-Organization-Government',
  '8': 'I-Organization-Government',
  '9': 'B-Organization-Military',
  '10': 'I-Organization-Military',
  '11': 'B-Organization-Other',
  '12': 'I-Organization-Other',
  '13': 'B-Location',
  '14': 'I-Location',
  '15': 'B-Object',
  '16': 'I-Object',
  '17': 'B-Time',
  '18': 'I-Time',
  '19': 'B-Event-Local',
  '20': 'I-Event-Local',
  '21': 'B-Event-International',
  '22': 'I-Event-International',

In [4]:
def convert_entity_to_iob(entity_type : str, inside=False) -> int :
    """Return the integer IOB tag id for an entity type.

    The id is derived from the position of ``entity_type`` in the
    module-level ``entities`` list: position ``p`` maps to ``2 * p + 1`` for
    the "B-" (begin) tag and ``2 * p + 2`` for the "I-" (inside) tag. Id 0 is
    the "O" tag in the mapping file.

    Args:
        entity_type: Entity label without any "B-"/"I-" prefix.
        inside: When truthy, return the "I-" id instead of the "B-" id.

    Returns:
        The tag id, or 0 (after printing an error message) when
        ``entity_type`` is not in ``entities``.
    """
    try:
        position = entities.index(entity_type)
    except ValueError:
        # Unknown labels are reported and fall back to id 0.
        print("Error entity type not found")
        return 0

    # Position 0 in the list corresponds to id 1 in the map; each label owns
    # two consecutive ids (B- then I-).
    begin_id = 2 * position + 1
    return begin_id + 1 if inside else begin_id

In [5]:
from spacy.language import Language
from typing import Literal

def spacy_to_hf(raw_spacy_directory : str, set:Literal["train", "dev", "test"]):
    """Convert one serialized spaCy split into a Hugging Face ``Dataset``.

    Reads ``{raw_spacy_directory}/{set}.spacy`` as a ``DocBin`` and produces
    one row per doc, with token-level IOB labels built from ``doc.ents``.

    Args:
        raw_spacy_directory: Directory that contains the ``.spacy`` file.
        set: Split file to read ("train", "dev" or "test"). The name shadows
            the builtin ``set`` but is kept so existing callers still work.

    Returns:
        A ``Dataset`` with the columns ``id`` (running index of the doc in
        the file), ``tokens``, ``entities`` (string IOB labels), ``ner_tags``
        (integer ids from ``convert_entity_to_iob``), ``Row_Index``, ``Year``
        and ``Publication``.

    Note:
        Every doc's ``user_data`` must provide the "Publication", "Row_Index"
        and "Year" keys; they are read unconditionally.
    """
    doc_bin = DocBin().from_disk(f"{raw_spacy_directory}/{set}.spacy")

    # A blank pipeline is only needed for its vocab, which get_docs uses to
    # rebuild the Doc objects.
    nlp = spacy.blank("tl")  # or the language your corpus uses

    ids = []
    tokens = []
    # Named label_rows (not "entities") so it cannot be confused with the
    # module-level `entities` list that convert_entity_to_iob depends on.
    label_rows = []
    iob_tags = []

    row_indices = []
    publications = []
    years = []

    # Stream the docs instead of materializing them all in a list first.
    for doc_id, doc in enumerate(doc_bin.get_docs(nlp.vocab)):
        # We follow the I-O-B scheme: everything starts as "O" / 0 and entity
        # spans overwrite their own token positions.
        token_labels = ["O"] * len(doc)
        token_tags = [0] * len(doc)

        for ent in doc.ents:
            # First token of the span gets the "B-" tag ...
            token_labels[ent.start] = "B-" + ent.label_
            token_tags[ent.start] = convert_entity_to_iob(ent.label_)
            # ... and every remaining token of the span gets the "I-" tag.
            for i in range(ent.start + 1, ent.end):
                token_labels[i] = "I-" + ent.label_
                token_tags[i] = convert_entity_to_iob(ent.label_, inside=True)

        ids.append(doc_id)
        tokens.append([t.text for t in doc])
        label_rows.append(token_labels)
        iob_tags.append(token_tags)

        # Per-document metadata carried over from the spaCy docs.
        publications.append(doc.user_data["Publication"])
        row_indices.append(int(doc.user_data["Row_Index"]))
        years.append(doc.user_data["Year"])

    return Dataset.from_dict({
        "id": ids,
        "tokens": tokens,
        "entities": label_rows,
        "ner_tags": iob_tags,
        "Row_Index": row_indices,
        "Year": years,
        "Publication": publications
    })

In [7]:
# 1. Load all three sets

# No trailing slash here: every path below joins with "/" itself, so a
# trailing slash would double the separator ("corpus//batch_1-2").
corpus_directory = "../assets/corpus"
dataset_name = "batch_1-2"
# Name of the subset directory that holds the train/dev splits.
training_size = 2162

# The test split lives directly under the spacy directory; train and dev come
# from the size-specific subset directory.
spacy_directory = f"{corpus_directory}/{dataset_name}/spacy"
subset_directory = f"{spacy_directory}/subset/{training_size}"

test_dataset  = spacy_to_hf(spacy_directory, "test")
train_dataset = spacy_to_hf(subset_directory, "train")
dev_dataset   = spacy_to_hf(subset_directory, "dev")


# 2. Combine into a DatasetDict (the spaCy "dev" split becomes "validation")
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": dev_dataset,
    "test": test_dataset
})

print(dataset_dict)
print("Sample",dataset_dict["train"][0])  # first training sample

# 3. Save locally
dataset_dict.save_to_disk(f"{corpus_directory}/{dataset_name}/dataset_full")

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'entities', 'ner_tags', 'Row_Index', 'Year', 'Publication'],
        num_rows: 1891
    })
    validation: Dataset({
        features: ['id', 'tokens', 'entities', 'ner_tags', 'Row_Index', 'Year', 'Publication'],
        num_rows: 271
    })
    test: Dataset({
        features: ['id', 'tokens', 'entities', 'ner_tags', 'Row_Index', 'Year', 'Publication'],
        num_rows: 538
    })
})
Sample {'id': 0, 'tokens': ['Hiniling', 'noong', 'nakaraang', 'linggo', 'ng', 'United', 'Nationalist', 'Democratic', 'Organization', '(', 'UNIDO', ')', 'na', 'pawalang', '-', 'bisa', 'ang', 'halalan', 'sa', '16', 'na', 'lalawigan', 'at', 'kagyat', 'na', 'maglunsad', 'ng', 'bagong', 'eleksiyon', 'sa', 'mga', 'nasabing', 'lugar', '.'], 'entities': ['O', 'O', 'O', 'O', 'O', 'B-Organization-Political', 'I-Organization-Political', 'I-Organization-Political', 'I-Organization-Political', 'O', 'B-Organization-Political', 'O', 'O', 'O', 'O', '

Saving the dataset (0/1 shards):   0%|          | 0/1891 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/271 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/538 [00:00<?, ? examples/s]

In [8]:
# Reload the DatasetDict that was saved to disk; `ds` is the object that is
# pushed to the Hub further down.
ds = load_from_disk(f"{corpus_directory}/{dataset_name}/dataset_full")
# Bare expression so the notebook displays the split summary.
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'entities', 'ner_tags', 'Row_Index', 'Year', 'Publication'],
        num_rows: 1891
    })
    validation: Dataset({
        features: ['id', 'tokens', 'entities', 'ner_tags', 'Row_Index', 'Year', 'Publication'],
        num_rows: 271
    })
    test: Dataset({
        features: ['id', 'tokens', 'entities', 'ner_tags', 'Row_Index', 'Year', 'Publication'],
        num_rows: 538
    })
})

In [9]:
import os

# Switch to False to skip the upload when re-running the notebook.
push_to_hub = True

# Target repository on the Hugging Face Hub: "<owner>/<dataset name>".
owner = "etdvprg"
hf_dataset_name = "PHMartialLaw-NER_b12"

if push_to_hub:
    # The token is read from the environment so it never appears in the notebook.
    api_token = os.environ.get("HF_TOKEN")
    repo_id = f"{owner}/{hf_dataset_name}"
    ds.push_to_hub(repo_id, token=api_token)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            