In [1]:
import json

def convert_labelstudio_to_spacy(label_studio_file, spacy_output_file):
    """Convert a Label Studio JSON export into spaCy-style NER records.

    The input file must hold a JSON array of tasks. For every task the text is
    read from ``task["data"]["text"]`` and every labelled span found in
    ``task["annotations"][*]["result"][*]["value"]`` is collected as
    ``(start, end, label)``, where ``label`` is the first entry of ``labels``.

    Results that do not describe a labelled span (no ``value``, no
    ``start``/``end``, or a missing/empty ``labels`` list) are skipped instead
    of raising, so exports that also contain other result kinds can be
    converted. Tasks without an ``annotations`` key yield an empty entity list.

    Args:
        label_studio_file: Path of the Label Studio JSON export to read.
        spacy_output_file: Path of the JSON file to write. It receives a list
            of ``{"text": str, "entities": [[start, end, label], ...]}``
            objects (tuples are serialised as JSON arrays).

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If the input file is not valid JSON.
        KeyError: If a task has no ``data``/``text`` entry.
    """
    with open(label_studio_file, "r", encoding="utf-8") as file:
        data = json.load(file)

    spacy_data = []

    for entry in data:
        text = entry["data"]["text"]
        entities = []

        # `or []` also covers keys that are present but set to null.
        for annotation in entry.get("annotations") or []:
            for result in annotation.get("result") or []:
                value = result.get("value") or {}
                labels = value.get("labels") or []
                # Only span results carry start, end and at least one label;
                # anything else cannot be expressed as an entity tuple.
                if "start" not in value or "end" not in value or not labels:
                    continue
                entities.append((value["start"], value["end"], labels[0]))

        spacy_data.append({
            "text": text,
            "entities": entities
        })

    with open(spacy_output_file, "w", encoding="utf-8") as file:
        json.dump(spacy_data, file, indent=4)

    print(f"✅ Converted '{label_studio_file}' to SpaCy format at '{spacy_output_file}'.")

# Convert the training and testing exports: each pair is
# (Label Studio export to read, spaCy-style JSON file to write).
for source_path, target_path in (
    ("annotated70data.json", "train_spacy.json"),
    ("annotated30data.json", "test_spacy.json"),
):
    convert_labelstudio_to_spacy(source_path, target_path)



✅ Converted 'annotated70data.json' to SpaCy format at 'train_spacy.json'.
✅ Converted 'annotated30data.json' to SpaCy format at 'test_spacy.json'.


In [2]:
# import spacy
# from spacy.tokens import DocBin
# from spacy.training.example import Example
# from spacy.training import Example

# # Load blank English pipeline
# nlp = spacy.blank("en")

# # Add RoBERTa transformer and NER
# nlp.add_pipe("transformer", config={"model": {"name": "roberta-base"}})
# ner = nlp.add_pipe("ner")

# # Add your 14 custom entities
# labels = [
#     "INVOICE_NUM", "INVOICE_DATE", "DUE_DATE", "SENDER", "EMAIL", "PHONE_NUM",
#     "ITEM_NAME", "PRICE", "QUANTITY", "TOTAL_PRICE", "TOTAL_INVOICE_PRICE",
#     "BANK_NAME", "ACCOUNT_NUM", "WEBSITE"
# ]
# for label in labels:
#     ner.add_label(label)


import spacy
from spacy.tokens import DocBin
from spacy.training.example import Example

# Load a blank English pipeline (tokenizer only, no trained components yet)
nlp = spacy.blank("en")

# Add a tok2vec component followed by the NER component, both with their
# default configuration.
# NOTE(review): with its default config the `ner` component appears to carry
# its own embedding layer rather than listening to this standalone `tok2vec`;
# confirm whether a shared-listener configuration was intended here.
tok2vec = nlp.add_pipe("tok2vec")  # CNN-based context encoder
ner = nlp.add_pipe("ner")

# Register the 14 custom entity labels with the NER component
labels = [
    "INVOICE_NUM", "INVOICE_DATE", "DUE_DATE", "SENDER", "EMAIL", "PHONE_NUM",
    "ITEM_NAME", "PRICE", "QUANTITY", "TOTAL_PRICE", "TOTAL_INVOICE_PRICE",
    "BANK_NAME", "ACCOUNT_NUM", "WEBSITE"
]
for label in labels:
    ner.add_label(label)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from spacy.training.example import Example
import random
import json

# Load converted training data
def load_spacy_data(filename):
    """Return the parsed contents of the UTF-8 encoded JSON file `filename`."""
    with open(filename, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

# Parse the two converted splits from disk.
train_data_raw = load_spacy_data("train_spacy.json")
# NOTE(review): test_data_raw is loaded but not used in the cells shown here.
test_data_raw = load_spacy_data("test_spacy.json")

# Convert to SpaCy Example objects
train_examples = []
for entry in train_data_raw:
    # Build the Doc for this record's text with the pipeline's make_doc.
    doc = nlp.make_doc(entry["text"])
    # Rebuild each stored span as a (start, end, label) tuple under "entities".
    entities = {"entities": [(start, end, label) for start, end, label in entry["entities"]]}
    example = Example.from_dict(doc, entities)
    train_examples.append(example)

In [4]:
from spacy.util import compounding, minibatch


# Fix the random seeds before anything stochastic happens so that a
# Restart & Run All reproduces the same initialisation and shuffling order.
SEED = 0
spacy.util.fix_random_seed(SEED)

# Initialize components with training examples
optimizer = nlp.initialize(get_examples=lambda: train_examples)

# Early stopping config
best_loss = float("inf")
patience = 5        # epochs without sufficient improvement before stopping
min_delta = 0.001   # minimum loss decrease that counts as an improvement
no_improvement = 0
max_epochs = 30

for epoch in range(max_epochs):
    random.shuffle(train_examples)
    losses = {}

    # Small batches whose size starts at 2 and compounds slowly
    # (factor 1.001 per batch) towards a ceiling of 16.
    batches = minibatch(train_examples, size=compounding(2.0, 16.0, 1.001))

    for batch in batches:
        # Pass the optimizer returned by initialize() explicitly.
        nlp.update(batch, drop=0.3, sgd=optimizer, losses=losses)

    current_loss = losses.get("ner", 0.0)
    print(f"Epoch {epoch + 1}, Loss: {current_loss:.4f}")

    # Early stopping
    # NOTE(review): the monitored value is the accumulated *training* loss of
    # the "ner" component, not a score on held-out data.
    if best_loss - current_loss > min_delta:
        best_loss = current_loss
        no_improvement = 0
    else:
        no_improvement += 1
        if no_improvement >= patience:
            print(f"⏹️ Early stopping triggered at epoch {epoch + 1}")
            break


Epoch 1, Loss: 3624.0894
Epoch 2, Loss: 1517.2960
Epoch 3, Loss: 1354.9921
Epoch 4, Loss: 1122.9745
Epoch 5, Loss: 1185.2572
Epoch 6, Loss: 890.2052
Epoch 7, Loss: 836.7438
Epoch 8, Loss: 779.1541
Epoch 9, Loss: 807.8812
Epoch 10, Loss: 583.0440
Epoch 11, Loss: 576.5822
Epoch 12, Loss: 542.2830
Epoch 13, Loss: 469.3046
Epoch 14, Loss: 386.7404
Epoch 15, Loss: 377.9493
Epoch 16, Loss: 435.4324
Epoch 17, Loss: 412.9754
Epoch 18, Loss: 325.0068
Epoch 19, Loss: 307.3699
Epoch 20, Loss: 287.0491
Epoch 21, Loss: 282.0897
Epoch 22, Loss: 217.6070
Epoch 23, Loss: 292.1536
Epoch 24, Loss: 221.1254
Epoch 25, Loss: 211.3847
Epoch 26, Loss: 255.4597
Epoch 27, Loss: 228.6534
Epoch 28, Loss: 180.8164
Epoch 29, Loss: 165.0786
Epoch 30, Loss: 188.2148


In [5]:
# Save the pipeline in its current (post-training-loop) state to the
# "ner_cnn_30e" directory, relative to the working directory.
nlp.to_disk("ner_cnn_30e")
