In [23]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json
import random
from faker import Faker
import re
import spacy
from spacy.tokens import DocBin
from spacy.scorer import Scorer
from spacy.training import Example
import time

In [None]:
#Initialize faker
fake = Faker(['de_DE', 'sl_SI', 'en_US', 'fr_FR', 'it_IT', 'nl_NL', 'es_ES'])

#Contry codes for ibans that we are going to use
IBAN_COUNTRIES = {
    "DE": 22, "CH": 21, "SI": 19, "FR": 27, "IT": 27,
    "NL": 18, "ES": 24, "AT": 20, "BE": 16, "FI": 18,
    "LU": 20, "MT": 31, "PT": 25, "SK": 24, "CZ": 24, "PL": 28
}

def generate_iban(country_code):
    if country_code in IBAN_COUNTRIES:
        random_digits = ''.join(str(random.randint(0, 9)) for _ in range(IBAN_COUNTRIES[country_code] - len(country_code) - 2))
        return f"{country_code}{random.randint(10, 99)}{random_digits}"
    return fake.iban()

def generate_entity(entity_type):
    if entity_type == "INVOICE_NUMBER":
        return str(fake.random_int(10000000, 99999999))
    elif entity_type == "REFERENCE_NUMBER":
        return f"REF-{fake.random_int(1000, 9999)}"
    elif entity_type == "IBAN":
        return generate_iban(random.choice(list(IBAN_COUNTRIES.keys())))
    elif entity_type == "CONTRACT_NUMBER":
        return f"CN-{fake.random_int(100000, 999999)}"
    elif entity_type == "NAME":
        return fake.first_name()
    elif entity_type == "SURNAME":
        return fake.last_name()
    return None

def generate_descriptions(num_samples=100000):
    descriptions = []
    # Random sentences where we enter our entities
    sentences = [
        "I am paying for my invoice {}. Have a great day!",
        "Please refer to the reference number {} for further details.",
        "Here is my IBAN {}. Let me know if you need anything else.",
        "Contract number {} is being finalized today.",
        "{} {} will handle the next steps of the project.",
        "The payment for invoice {} was already made.",
        "Funds transferred.",
        "Transaction processed."
    ]

    for _ in range(num_samples):
        text = random.choice(sentences)
        num_entities = random.randint(0, 3)  # Randomly choose 0–3 entities
        entities = []
        occupied_indices = []

        for _ in range(num_entities):
            entity_type = random.choice(["INVOICE_NUMBER", "REFERENCE_NUMBER", "IBAN", "CONTRACT_NUMBER", "NAME", "SURNAME"])
            entity_value = generate_entity(entity_type)

            placeholder_index = -1
            for i, char in enumerate(text):
                if char == "{" and i+1 < len(text) and text[i+1] == "}":
                    placeholder_index = i
                    break

            if placeholder_index != -1 and entity_value:
                text = text[:placeholder_index] + entity_value + text[placeholder_index + 2:]
                entities.append({
                    "start": placeholder_index,
                    "end": placeholder_index + len(entity_value),
                    "label": entity_type  # Use "label" for spaCy compatibility
                })

        # Replace any remaining placeholders with filler text
        text = text.replace("{}", fake.word())
        descriptions.append({"text": text, "entities": entities})

    return descriptions

data = generate_descriptions(10000)

output_file = "synthetic_payment_descriptions.json"
with open(output_file, "w") as f:
    json.dump(data, f, indent=2)

print(f"Generated data saved to {output_file}")

Generated data saved to synthetic_payment_descriptions.json


In [18]:
# 1. Data Loading, Splitting, and DocBin Creation (No changes needed here from the working version)
with open("synthetic_payment_descriptions.json", "r", encoding="utf-8") as f:
    full_data = json.load(f)

random.shuffle(full_data)

train_size = int(len(full_data) * 0.7)
val_size = int(len(full_data) * 0.2)

train_data = full_data[:train_size]
valid_data = full_data[train_size:train_size + val_size]
test_data = full_data[train_size + val_size:]

def create_training_data(data, nlp):
    doc_bin = DocBin()
    for item in data:
        text = item["text"]
        entities = item["entities"]
        doc = nlp.make_doc(text)
        ents = []
        for ent in entities:
            start = ent["start"]
            end = ent["end"]
            label = ent["label"]
            span = doc.char_span(start, end, label=label)
            if span is not None:
                ents.append(span)
        doc.ents = ents
        doc_bin.add(doc)
    return doc_bin

nlp = spacy.blank("en")

train_db = create_training_data(train_data, nlp)
valid_db = create_training_data(valid_data, nlp)
test_db = create_training_data(test_data, nlp)

train_db.to_disk("train.spacy")
valid_db.to_disk("valid.spacy")
test_db.to_disk("test.spacy")

print("Datasets prepared and saved!")

Datasets prepared and saved!


In [20]:
# Training the NER model
nlp = spacy.blank("en")
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner", last=True)
else:
    ner = nlp.get_pipe("ner")

labels = ["INVOICE_NUMBER", "REFERENCE_NUMBER", "IBAN", "CONTRACT_NUMBER", "NAME", "SURNAME"]
for label in labels:
    ner.add_label(label)

train_db = DocBin().from_disk("train.spacy")
valid_db = DocBin().from_disk("valid.spacy")

train_examples = []
for doc in train_db.get_docs(nlp.vocab):
    if doc.ents:
        train_examples.append(Example.from_dict(doc, {"entities": doc.ents}))

valid_examples = []
for doc in valid_db.get_docs(nlp.vocab):
    if doc.ents:
        valid_examples.append(Example.from_dict(doc, {"entities": doc.ents}))

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.initialize()
    n_iter = 30 #Number of iterations
    for i in range(n_iter):
        start_time = time.time()
        losses = {}
        batches = spacy.util.minibatch(train_examples, size=2)
        for batch in batches:
            try:
                nlp.update(batch, sgd=optimizer, drop=0.35, losses=losses)
            except Exception as e:
                print(f"Error during training: {e}")
                for example in batch:
                    print(example.text)
        end_time = time.time()
        epoch_time = end_time - start_time

        with nlp.disable_pipes(*other_pipes):
            scores = nlp.evaluate(valid_examples)

        print(f"Epoch {i + 1}/{n_iter} - Time: {epoch_time:.2f}s - Losses: {losses} - F Score: {scores['ents_f']}")

nlp.to_disk("./model")
print("Model trained and saved to ./model")

Epoch 1/30 - Time: 95.72s - Losses: {'ner': np.float32(1795.9426)} - F Score: 0.9979975971165399
Epoch 2/30 - Time: 87.16s - Losses: {'ner': np.float32(873.7676)} - F Score: 0.9979975971165399
Epoch 3/30 - Time: 113.76s - Losses: {'ner': np.float32(530.6558)} - F Score: 0.998397435897436
Epoch 4/30 - Time: 98.30s - Losses: {'ner': np.float32(358.90723)} - F Score: 0.9979975971165399
Epoch 5/30 - Time: 104.73s - Losses: {'ner': np.float32(254.59645)} - F Score: 0.9979975971165399
Epoch 6/30 - Time: 105.18s - Losses: {'ner': np.float32(191.53621)} - F Score: 0.9975980784627703
Epoch 7/30 - Time: 108.80s - Losses: {'ner': np.float32(158.37415)} - F Score: 0.9979975971165399
Epoch 8/30 - Time: 101.23s - Losses: {'ner': np.float32(135.8668)} - F Score: 0.9979975971165399
Epoch 9/30 - Time: 114.51s - Losses: {'ner': np.float32(146.70149)} - F Score: 0.998397435897436
Epoch 10/30 - Time: 90.48s - Losses: {'ner': np.float32(124.594086)} - F Score: 0.998397435897436
Epoch 11/30 - Time: 95.04s -

In [None]:
# Load the trained model
nlp_trained = spacy.load("./model")

# Load the test data
test_db = DocBin().from_disk("test.spacy")
examples = []
for doc in test_db.get_docs(nlp_trained.vocab):
    if doc.ents:
        examples.append(Example.from_dict(doc, {"entities": doc.ents}))

# Evaluate the model
scorer = Scorer()
scores = scorer.score(examples)

# Print detailed metrics
print("Evaluation results:")
print(f"Overall F-score: {scores['ents_f']:.4f}")
print(f"Precision: {scores['ents_p']:.4f}")
print(f"Recall: {scores['ents_r']:.4f}")

# Print per-entity metrics
print("\nPer-entity metrics:")
for metric in ["ents_per_type"]:
    for entity_type, entity_scores in scores[metric].items():
        print(f"\n{entity_type}:")
        print(f"  F-score: {entity_scores['f']:.4f}")
        print(f"  Precision: {entity_scores['p']:.4f}")
        print(f"  Recall: {entity_scores['r']:.4f}")

# Example Usage
text = "Payment for invoice 12345678 to John Doe with reference REF-9876. IBAN: DE12345678901234567890"
doc = nlp_trained(text)

for ent in doc.ents:
    print(ent.text, ent.label_)

text = "Happy new year! I am paying rent."
doc = nlp_trained(text)

for ent in doc.ents:
    print(ent.text, ent.label_)

Evaluation results:
Overall F-score: 1.0000
Precision: 1.0000
Recall: 1.0000

Per-entity metrics:

CONTRACT_NUMBER:
  F-score: 1.0000
  Precision: 1.0000
  Recall: 1.0000

NAME:
  F-score: 1.0000
  Precision: 1.0000
  Recall: 1.0000

REFERENCE_NUMBER:
  F-score: 1.0000
  Precision: 1.0000
  Recall: 1.0000

INVOICE_NUMBER:
  F-score: 1.0000
  Precision: 1.0000
  Recall: 1.0000

IBAN:
  F-score: 1.0000
  Precision: 1.0000
  Recall: 1.0000

SURNAME:
  F-score: 1.0000
  Precision: 1.0000
  Recall: 1.0000
Payment SURNAME
12345678 INVOICE_NUMBER
John Doe NAME
REF-9876 REFERENCE_NUMBER
: NAME
DE12345678901234567890 IBAN
Happy NAME
