In [24]:
from pathlib import Path

import spacy
import srsly
from datasets import Dataset, DatasetDict, load_from_disk
from spacy.tokens import DocBin

In [None]:
# Load the entity mapping JSON file; its "labels" and "iob_mapping" entries
# are read by the cells below.
entity_map = srsly.read_json("../assets/mapped_labels.json")

In [None]:
# List of all the entity labels, taken from the "labels" key of the mapping.
# convert_entity_to_iob() looks labels up in this module-level list, so this
# cell must run before any conversion.
entities = entity_map["labels"]
# Bare expression: displays the list as the cell output.
entities

['Person-Individual',
 'Person-Collective',
 'Organization-Political',
 'Organization-Government',
 'Organization-Military',
 'Organization-Other',
 'Location',
 'Object',
 'Time',
 'Event-Local',
 'Event-International',
 'Production-Media',
 'Production-Government',
 'Production-Doctrine',
 'Numerical Statistics']

In [5]:
# Display the tag-id -> I-O-B label table stored under "iob_mapping" in the mapping file.
entity_map["iob_mapping"]

{'0': 'O',
 '1': 'B-Person-Individual',
 '2': 'I-Person-Individual',
 '3': 'B-Person-Collective',
 '4': 'I-Person-Collective',
 '5': 'B-Organization-Political',
 '6': 'I-Organization-Political',
 '7': 'B-Organization-Government',
 '8': 'I-Organization-Government',
 '9': 'B-Organization-Military',
 '10': 'I-Organization-Military',
 '11': 'B-Organization-Other',
 '12': 'I-Organization-Other',
 '13': 'B-Location',
 '14': 'I-Location',
 '15': 'B-Object',
 '16': 'I-Object',
 '17': 'B-Time',
 '18': 'I-Time',
 '19': 'B-Event-Local',
 '20': 'I-Event-Local',
 '21': 'B-Event-International',
 '22': 'I-Event-International',
 '23': 'B-Production-Media',
 '24': 'I-Production-Media',
 '25': 'B-Production-Government',
 '26': 'I-Production-Government',
 '27': 'B-Production-Doctrine',
 '28': 'I-Production-Doctrine',
 '29': 'B-Numerical Statistics',
 '30': 'I-Numerical Statistics'}

In [None]:
def convert_entity_to_iob(entity_type : str, inside=False) -> int :
    """Translate an entity label into its numeric I-O-B tag id.

    The label is looked up in the module-level ``entities`` list. A label at
    position ``p`` in that list gets ``2 * p + 1`` as its begin ("B-") id and
    ``2 * p + 2`` as its inside ("I-") id.

    Args:
        entity_type: Entity label without the "B-"/"I-" prefix.
        inside: When true, return the inside id instead of the begin id.

    Returns:
        The numeric tag id, or 0 (after printing a message) when the label is
        not present in ``entities``.
    """
    # Guard clause: unknown labels are reported and fall back to tag 0.
    if entity_type not in entities:
        print("Error entity type not found")
        return 0

    # Position 0 in the list corresponds to id 1 in the map, hence the +1;
    # every label occupies two consecutive ids (begin, then inside).
    begin_tag = 2 * entities.index(entity_type) + 1
    return begin_tag + 1 if inside else begin_tag

In [21]:
def spacy_to_hf(corpus_folder, set=["train", "dev", "test"]):
    """Convert one split of a serialized spaCy corpus into a Hugging Face ``Dataset``.

    Reads ``<corpus_folder>/<set>.spacy`` and, for every document in it, emits
    the token texts together with two parallel per-token sequences that follow
    the I-O-B scheme: string labels and numeric tag ids.

    Args:
        corpus_folder: Directory that contains the ``.spacy`` files.
        set: Name of the single split to load, e.g. ``"train"``, ``"dev"`` or
            ``"test"``. The list default only enumerates the expected names;
            it is not a loadable value, so a name must be passed explicitly.

    Returns:
        A ``Dataset`` with the columns ``id`` (running index starting at 0),
        ``tokens``, ``entities`` (string labels) and ``ner_tags`` (numeric ids
        from ``convert_entity_to_iob``).

    Raises:
        TypeError: If ``set`` is not a single split name (a string).
    """
    # A non-string value (including the list default) would otherwise be
    # interpolated into the file name and fail with a confusing path error.
    if not isinstance(set, str):
        raise TypeError(
            f"`set` must be a single split name such as 'train', got {set!r}"
        )

    # Honour the caller's folder instead of a hard-coded location.
    doc_bin = DocBin().from_disk(Path(corpus_folder) / f"{set}.spacy")

    # Only the vocab of this blank pipeline is used, to rebuild the stored docs.
    nlp = spacy.blank("en")  # or the language your corpus uses

    tokens = []
    entity_labels = []  # not named `entities`: that is the module-level label list
    iob_tags = []

    for doc in doc_bin.get_docs(nlp.vocab):
        # We'll follow the I-O-B scheme; every token starts as outside ("O" / 0).
        token_labels = ["O"] * len(doc)
        token_tags = [0] * len(doc)

        for ent in doc.ents:
            # First token of the span gets the begin tag ...
            token_labels[ent.start] = "B-" + ent.label_
            token_tags[ent.start] = convert_entity_to_iob(ent.label_)
            # ... and every following token of the span gets the inside tag.
            for i in range(ent.start + 1, ent.end):
                token_labels[i] = "I-" + ent.label_
                token_tags[i] = convert_entity_to_iob(ent.label_, inside=True)

        tokens.append([t.text for t in doc])
        entity_labels.append(token_labels)
        iob_tags.append(token_tags)

    return Dataset.from_dict({
        "id": list(range(len(tokens))),
        "tokens": tokens,
        "entities": entity_labels,
        "ner_tags": iob_tags
    })

# 1. Load all three sets

corpus_directory = "../experiments/corpus/"
# Underscore, not a space: the cell that reloads the dataset reads "batch_1",
# so saving under "batch 1" would leave nothing at the path it loads from.
dataset_name = "batch_1"

train_dataset = spacy_to_hf(corpus_directory, "train")
dev_dataset   = spacy_to_hf(corpus_directory, "dev")
test_dataset  = spacy_to_hf(corpus_directory, "test")

# 2. Combine into a DatasetDict (the spaCy "dev" split is exposed as "validation")
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": dev_dataset,
    "test": test_dataset
})

print(dataset_dict)
print(dataset_dict["train"][0])  # first training sample

# 3. Save locally
dataset_dict.save_to_disk(f"{corpus_directory}/{dataset_name}")

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'entities', 'ner_tags'],
        num_rows: 1879
    })
    validation: Dataset({
        features: ['id', 'tokens', 'entities', 'ner_tags'],
        num_rows: 267
    })
    test: Dataset({
        features: ['id', 'tokens', 'entities', 'ner_tags'],
        num_rows: 533
    })
})
{'id': 0, 'tokens': ['Isang', 'demograpo', 'ng', 'UPPI', 'ang', 'nagsabing', 'mababa', 'ang', 'pagtaya', 'sa', 'rate', 'ng', 'namamatay', 'sa', 'dalawang', 'rehiyon', 'ayon', 'sa', 'ulat', 'ng', 'Area', 'Fertility', 'Survey', '.'], 'entities': ['O', 'O', 'O', 'B-Organization-Other', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Numerical Statistics', 'I-Numerical Statistics', 'O', 'O', 'O', 'O', 'B-Production-Media', 'I-Production-Media', 'I-Production-Media', 'O'], 'ner_tags': [0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 29, 30, 0, 0, 0, 0, 23, 24, 24, 0]}


Saving the dataset (0/1 shards):   0%|          | 0/1879 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/267 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/533 [00:00<?, ? examples/s]

In [None]:
# Location of the dataset on disk; redefined here so this cell does not
# depend on variables from the cell that wrote it.
corpus_directory = "../experiments/corpus/"
dataset_name = "batch_1"

# Read the saved dataset back from disk.
ds = load_from_disk(f"{corpus_directory}/{dataset_name}")
# Bare expression: displays the loaded splits as the cell output.
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'entities', 'ner_tags'],
        num_rows: 1879
    })
    validation: Dataset({
        features: ['id', 'tokens', 'entities', 'ner_tags'],
        num_rows: 267
    })
    test: Dataset({
        features: ['id', 'tokens', 'entities', 'ner_tags'],
        num_rows: 533
    })
})

In [23]:
import os

# Switch for the upload step below; nothing is pushed when this is False.
push_to_hub = True

if push_to_hub:
    # The access token is read from the HF_TOKEN environment variable rather
    # than being written into the notebook.
    api_token = os.getenv("HF_TOKEN")
    ds.push_to_hub("etdvprg/gold-ml-batch1", token=api_token)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md:   0%|          | 0.00/555 [00:00<?, ?B/s]