In [1]:
from spacy.tokens import DocBin
import spacy 
import json
from tqdm import tqdm
import random

In [2]:
def load_data(file):
    """Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        file: Path of the JSON file to open.

    Returns:
        The object produced by ``json.load`` for the file's content.
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

In [4]:
# Raw training examples parsed from JSON. A later cell passes them to
# create_training, which unpacks each item as a (text, annotations) pair and
# reads annotations["entities"].
train = load_data("data/train.json")

In [5]:
# Raw validation examples parsed from JSON. A later cell passes them to
# create_training, which unpacks each item as a (text, annotations) pair and
# reads annotations["entities"].
valid = load_data("data/valid.json")

In [7]:
# A second train/validation pair, converted with create_training in later cells.
# NOTE(review): how these files differ from train.json / valid.json is not
# visible in this notebook. Their conversion prints no "Skipping entity"
# messages, so presumably their entity offsets already line up with token
# boundaries -- confirm against whatever produced them.
train_spacy = load_data("data/train_spacy.json")
valid_spacy = load_data("data/valid_spacy.json")

In [8]:
# Create a blank English spaCy pipeline. create_training reads this
# module-level `nlp` (via nlp.make_doc) to turn each raw text into a Doc, so
# this cell must run before any conversion cell.
nlp = spacy.blank("en")

# Function to create training data
def create_training(TRAIN_DATA):
    """Convert character-offset entity annotations into a spaCy ``DocBin``.

    Each text is turned into a Doc with the module-level ``nlp``. Every
    ``(start, end, label)`` entry is mapped to a token span with
    ``Doc.char_span`` in ``"contract"`` alignment mode. Entries for which
    ``char_span`` returns ``None`` are reported on stdout and left out. The
    remaining spans are passed through ``filter_overlapping_spans`` before
    being assigned to ``doc.ents``.

    Args:
        TRAIN_DATA: Iterable of ``(text, annot)`` pairs, where
            ``annot["entities"]`` is an iterable of ``(start, end, label)``
            triples; ``start`` and ``end`` are character offsets into ``text``.

    Returns:
        A ``DocBin`` holding one Doc per input pair.
    """
    doc_bin = DocBin()
    for text, annotations in tqdm(TRAIN_DATA):
        doc = nlp.make_doc(text)
        aligned_spans = []
        for start, end, label in annotations["entities"]:
            candidate = doc.char_span(
                start, end, label=label, alignment_mode="contract"
            )
            if candidate is not None:
                aligned_spans.append(candidate)
                continue
            # No token span could be built from these offsets; report and skip.
            print(f"Skipping entity in text: {text[start:end]} with label: {label}")
        # doc.ents must not contain overlapping spans, so filter first.
        doc.ents = filter_overlapping_spans(aligned_spans)
        doc_bin.add(doc)
    return doc_bin

# Function to filter out overlapping spans
def filter_overlapping_spans(spans):
    """Greedily drop spans that overlap an already accepted span.

    Spans are ordered by ``(start, end)`` and walked left to right. A span is
    kept when it begins at or after the end of the most recently kept span,
    so adjacent spans (``start == previous end``) both survive.

    Args:
        spans: Sequence of objects exposing ``start`` and ``end`` attributes.

    Returns:
        The input itself when it is empty; otherwise a new list of the
        non-overlapping spans in ascending ``(start, end)`` order.
    """
    if not spans:
        return spans
    ordered = list(spans)
    ordered.sort(key=lambda item: (item.start, item.end))
    kept = []
    boundary = None  # end of the last accepted span; None until one is kept
    for candidate in ordered:
        if boundary is None or candidate.start >= boundary:
            kept.append(candidate)
            boundary = candidate.end
    return kept

In [9]:
# Convert the raw training examples and write the result to disk.
# NOTE(review): this rebinds `train` from the loaded JSON examples to the
# object returned by create_training, so re-running this cell on its own feeds
# that object back into create_training. Re-run the cell that loads
# data/train.json first.
train = create_training(train)
train.to_disk("./data/train.spacy")

100%|██████████| 119/119 [00:00<00:00, 1999.34it/s]

Skipping entity in text: Black with label: COLOR
Skipping entity in text: Black with label: COLOR
Skipping entity in text: Grey with label: COLOR
Skipping entity in text: 2.5 Kg with label: WEIGHT
Skipping entity in text: Silver with label: COLOR
Skipping entity in text: Black with label: COLOR
Skipping entity in text: Silver with label: COLOR
Skipping entity in text: Black with label: COLOR
Skipping entity in text: Silver with label: COLOR
Skipping entity in text: Silver with label: COLOR
Skipping entity in text: Silver with label: COLOR
Skipping entity in text: Black with label: COLOR
Skipping entity in text: Laptop with label: PRODUCT_TYPE
Skipping entity in text: 1.74 Kg with label: WEIGHT
Skipping entity in text: Black with label: COLOR
Skipping entity in text: Black with label: COLOR
Skipping entity in text: Black with label: COLOR
Skipping entity in text: Black with label: COLOR
Skipping entity in text: Black with label: COLOR
Skipping entity in text: Black with label: COLOR
Ski




In [10]:
# Convert the raw validation examples and write the result to disk.
# NOTE(review): this rebinds `valid` from the loaded JSON examples to the
# object returned by create_training, so re-running this cell on its own feeds
# that object back into create_training. Re-run the cell that loads
# data/valid.json first.
valid = create_training(valid)
valid.to_disk("./data/valid.spacy")

100%|██████████| 51/51 [00:00<00:00, 2039.86it/s]

Skipping entity in text: Grey with label: COLOR
Skipping entity in text: Grey with label: COLOR
Skipping entity in text: Black with label: COLOR
Skipping entity in text: Grey with label: COLOR
Skipping entity in text: Black with label: COLOR
Skipping entity in text: Grey with label: COLOR
Skipping entity in text: Black with label: COLOR
Skipping entity in text: Laptop with label: PRODUCT_TYPE
Skipping entity in text: Silver with label: COLOR
Skipping entity in text: Grey with label: COLOR
Skipping entity in text: Black with label: COLOR
Skipping entity in text: Black with label: COLOR
Skipping entity in text: Blue with label: COLOR





In [11]:
# Convert the second training set and write the result to disk.
# NOTE(review): this rebinds `train_spacy` from the loaded JSON examples to the
# object returned by create_training, so re-running this cell on its own feeds
# that object back into create_training. Re-run the loading cell first.
train_spacy = create_training(train_spacy)
train_spacy.to_disk("./data/train_spacy.spacy")

100%|██████████| 119/119 [00:00<00:00, 4406.99it/s]


In [12]:
# Convert the second validation set and write the result to disk.
# NOTE(review): this rebinds `valid_spacy` from the loaded JSON examples to the
# object returned by create_training, so re-running this cell on its own feeds
# that object back into create_training. Re-run the loading cell first.
valid_spacy = create_training(valid_spacy)
valid_spacy.to_disk("./data/valid_spacy.spacy")

100%|██████████| 51/51 [00:00<00:00, 3644.05it/s]
