In [6]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json
import random
from faker import Faker
import re
import spacy
from spacy.tokens import DocBin
from spacy.scorer import Scorer
from spacy.training import Example
import time
import openai
import os
from typing import List, Dict, Optional
from pydantic import BaseModel
from openai import OpenAI

In [None]:
#Initialize faker
fake = Faker(['de_DE', 'sl_SI', 'en_US', 'fr_FR', 'it_IT', 'nl_NL', 'es_ES'])

#Contry codes for ibans that we are going to use
IBAN_COUNTRIES = {
    "DE": 22, "CH": 21, "SI": 19, "FR": 27, "IT": 27,
    "NL": 18, "ES": 24, "AT": 20, "BE": 16, "FI": 18,
    "LU": 20, "MT": 31, "PT": 25, "SK": 24, "CZ": 24, "PL": 28
}

def generate_iban(country_code):
    if country_code in IBAN_COUNTRIES:
        random_digits = ''.join(str(random.randint(0, 9)) for _ in range(IBAN_COUNTRIES[country_code] - len(country_code) - 2))
        return f"{country_code}{random.randint(10, 99)}{random_digits}"
    return fake.iban()

def generate_entity(entity_type):
    if entity_type == "INVOICE_NUMBER":
        return str(fake.random_int(10000000, 99999999))
    elif entity_type == "REFERENCE_NUMBER":
        return f"REF-{fake.random_int(1000, 9999)}"
    elif entity_type == "IBAN":
        return generate_iban(random.choice(list(IBAN_COUNTRIES.keys())))
    elif entity_type == "CONTRACT_NUMBER":
        return f"CN-{fake.random_int(100000, 999999)}"
    elif entity_type == "NAME":
        return fake.first_name()
    elif entity_type == "SURNAME":
        return fake.last_name()
    return None

def generate_descriptions(num_samples=100000):
    descriptions = []
    # Random sentences where we enter our entities
    sentences = [
        "I am paying for my invoice {}. Have a great day!",
        "Please refer to the reference number {} for further details.",
        "Here is my IBAN {}. Let me know if you need anything else.",
        "Contract number {} is being finalized today.",
        "{} {} will handle the next steps of the project.",
        "The payment for invoice {} was already made.",
        "Funds transferred.",
        "Transaction processed."
    ]

    for _ in range(num_samples):
        text = random.choice(sentences)
        num_entities = random.randint(0, 3)  # Randomly choose 0–3 entities
        entities = []
        occupied_indices = []

        for _ in range(num_entities):
            entity_type = random.choice(["INVOICE_NUMBER", "REFERENCE_NUMBER", "IBAN", "CONTRACT_NUMBER", "NAME", "SURNAME"])
            entity_value = generate_entity(entity_type)

            placeholder_index = -1
            for i, char in enumerate(text):
                if char == "{" and i+1 < len(text) and text[i+1] == "}":
                    placeholder_index = i
                    break

            if placeholder_index != -1 and entity_value:
                text = text[:placeholder_index] + entity_value + text[placeholder_index + 2:]
                entities.append({
                    "start": placeholder_index,
                    "end": placeholder_index + len(entity_value),
                    "label": entity_type  # Use "label" for spaCy compatibility
                })

        # Replace any remaining placeholders with filler text
        text = text.replace("{}", fake.word())
        descriptions.append({"text": text, "entities": entities})

    return descriptions

data = generate_descriptions(10000)

output_file = "synthetic_payment_descriptions.json"
with open(output_file, "w") as f:
    json.dump(data, f, indent=2)

print(f"Generated data saved to {output_file}")

Generated data saved to synthetic_payment_descriptions.json


In [187]:
from spacy.tokens import DocBin

# Load and shuffle data
with open("payment_descriptions.json", "r", encoding="utf-8") as f:
    full_data = json.load(f)

# Optional: Set seed for reproducibility
random.seed(42)

# Shuffle the entire dataset
random.shuffle(full_data)

# Split the data into train, validation, and test sets
train_size = int(len(full_data) * 0.7)
val_size = int(len(full_data) * 0.2)

train_data = full_data[:train_size]
valid_data = full_data[train_size:train_size + val_size]
test_data = full_data[train_size + val_size:]

# Shuffle each split separately to avoid any order effects
random.shuffle(train_data)
random.shuffle(valid_data)
random.shuffle(test_data)

# Function to create training data
def create_training_data(data, nlp):
    doc_bin = DocBin()
    for item in data:
        text = item["text"]
        entities = item["entities"]
        doc = nlp.make_doc(text)
        ents = []
        for ent in entities:
            start = ent["start"]
            end = ent["end"]
            label = ent["label"]
            span = doc.char_span(start, end, label=label)
            if span is not None:
                ents.append(span)
        doc.ents = ents
        doc_bin.add(doc)
    return doc_bin

# Initialize a blank spaCy model
nlp = spacy.blank("en")

# Create and save DocBins for train, validation, and test sets
train_db = create_training_data(train_data, nlp)
valid_db = create_training_data(valid_data, nlp)
test_db = create_training_data(test_data, nlp)

train_db.to_disk("train.spacy")
valid_db.to_disk("valid.spacy")
test_db.to_disk("test.spacy")

print("Datasets prepared, shuffled, and saved!")


Datasets prepared, shuffled, and saved!


In [None]:
import spacy
from spacy.training import Example
from spacy.tokens import DocBin


# Training the NER model
nlp = spacy.blank("en")
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner", last=True)
else:
    ner = nlp.get_pipe("ner")

# Add labels to the NER pipeline
labels = ["INVOICE_NUMBER", "REFERENCE_NUMBER", "IBAN", "CONTRACT_NUMBER", "NAME", "SURNAME"]
for label in labels:
    ner.add_label(label)

# Load training and validation data
train_db = DocBin().from_disk("train.spacy")
valid_db = DocBin().from_disk("valid.spacy")

# Convert DocBin to Examples
train_examples = [
    Example.from_dict(doc, {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]})
    for doc in train_db.get_docs(nlp.vocab) if doc.ents
]

valid_examples = [
    Example.from_dict(doc, {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]})
    for doc in valid_db.get_docs(nlp.vocab) if doc.ents
]

# Data augmentation (optional)
def augment_example(example):
    """Randomly shuffle or modify entities to add variety."""
    doc = example.reference
    entities = list(doc.ents)
    random.shuffle(entities)
    new_example = Example.from_dict(doc, {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in entities]})
    return new_example

augmented_train_examples = train_examples + [augment_example(ex) for ex in train_examples]

# Train the model
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.initialize()

    n_iter = 20
    patience = 50  # Early stopping patience
    best_f_score = 0
    epochs_without_improvement = 0

    for i in range(n_iter):
        start_time = time.time()
        losses = {}
        
        # Shuffle examples to prevent overfitting patterns
        random.shuffle(augmented_train_examples)
        
        # Use batch size of 2–4
        batches = spacy.util.minibatch(augmented_train_examples, size=2)
        for batch in batches:
            nlp.update(batch, sgd=optimizer, drop=0.15, losses=losses)  # Reintroduce dropout

        end_time = time.time()
        epoch_time = end_time - start_time

        # Validate the model every epoch
        with nlp.disable_pipes(*other_pipes):
            scores = nlp.evaluate(valid_examples)
        
        f_score = scores["ents_f"]
        print(
            f"Epoch {i + 1}/{n_iter} - Time: {epoch_time:.2f}s - "
            f"Losses: {losses} - F Score: {f_score:.4f} - "
            f"Precision: {scores['ents_p']:.4f} - Recall: {scores['ents_r']:.4f}"
        )

        # Early stopping logic
        if f_score > best_f_score:
            best_f_score = f_score
            epochs_without_improvement = 0
            nlp.to_disk("./model")  # Save the best model
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                print("Early stopping triggered.")
                break

# Load the best model after training
nlp.from_disk("./model")
print("Training completed. Best model saved in ./model")


Epoch 1/20 - Time: 2.05s - Losses: {'ner': np.float32(590.36096)} - F Score: 0.9677 - Precision: 0.9375 - Recall: 1.0000
Epoch 2/20 - Time: 2.62s - Losses: {'ner': np.float32(163.57323)} - F Score: 0.8824 - Precision: 0.7895 - Recall: 1.0000
Epoch 3/20 - Time: 2.26s - Losses: {'ner': np.float32(62.67759)} - F Score: 0.8571 - Precision: 0.7500 - Recall: 1.0000
Epoch 4/20 - Time: 2.15s - Losses: {'ner': np.float32(32.358273)} - F Score: 0.8955 - Precision: 0.8108 - Recall: 1.0000
Epoch 5/20 - Time: 2.47s - Losses: {'ner': np.float32(12.389014)} - F Score: 0.8955 - Precision: 0.8108 - Recall: 1.0000
Epoch 6/20 - Time: 2.14s - Losses: {'ner': np.float32(15.369544)} - F Score: 0.8571 - Precision: 0.7500 - Recall: 1.0000
Epoch 7/20 - Time: 1.97s - Losses: {'ner': np.float32(14.321794)} - F Score: 0.9091 - Precision: 0.8333 - Recall: 1.0000
Epoch 8/20 - Time: 2.20s - Losses: {'ner': np.float32(6.730218)} - F Score: 0.8955 - Precision: 0.8108 - Recall: 1.0000
Epoch 9/20 - Time: 1.97s - Losses:

In [189]:
# Load the trained model
nlp_trained = spacy.load("./model")

# Load the test data
test_db = DocBin().from_disk("test.spacy")
examples = []
for doc in test_db.get_docs(nlp_trained.vocab):
    if doc.ents:
        examples.append(Example.from_dict(doc, {"entities": doc.ents}))

# Evaluate the model
scorer = Scorer()
scores = scorer.score(examples)

# Print detailed metrics
print("Evaluation results:")
print(f"Overall F-score: {scores['ents_f']:.4f}")
print(f"Precision: {scores['ents_p']:.4f}")
print(f"Recall: {scores['ents_r']:.4f}")

# Print per-entity metrics
print("\nPer-entity metrics:")
for metric in ["ents_per_type"]:
    for entity_type, entity_scores in scores[metric].items():
        print(f"\n{entity_type}:")
        print(f"  F-score: {entity_scores['f']:.4f}")
        print(f"  Precision: {entity_scores['p']:.4f}")
        print(f"  Recall: {entity_scores['r']:.4f}")



Evaluation results:
Overall F-score: 1.0000
Precision: 1.0000
Recall: 1.0000

Per-entity metrics:

NAME:
  F-score: 1.0000
  Precision: 1.0000
  Recall: 1.0000

SURNAME:
  F-score: 1.0000
  Precision: 1.0000
  Recall: 1.0000

REFERENCE_NUMBER:
  F-score: 1.0000
  Precision: 1.0000
  Recall: 1.0000

CONTRACT_NUMBER:
  F-score: 1.0000
  Precision: 1.0000
  Recall: 1.0000

INVOICE_NUMBER:
  F-score: 1.0000
  Precision: 1.0000
  Recall: 1.0000

IBAN:
  F-score: 1.0000
  Precision: 1.0000
  Recall: 1.0000


In [194]:
text = "Payment for contract CT-7788 from Zoe Turner"
doc = nlp_trained(text)

for ent in doc.ents:
    print(ent.text, ent.label_)

In [11]:
from typing import List
from pydantic import BaseModel

class Entity(BaseModel):
    start: int
    end: int
    label: str

class PaymentDescription(BaseModel):
    text: str
    entities: List[Entity]


In [2]:
os.environ['OPENAI_API_KEY']=""

In [None]:
print(os.environ['OPENAI_API_KEY'])

In [7]:
client = OpenAI() # It will automatically look for OPENAI_API_KEY env variable

In [183]:
def generate_payment_description_o1(country : str):
    try:
        # Request OpenAI to extract the required data in the given format
        completion = client.beta.chat.completions.parse(
            model="o1-mini-2024-09-12",
            messages=[
                {"role": "user", "content": f"""Can you please generate realistic and different payment descriptions. I need 50 different descriptions in {country} natural language that are accurate, varied, and contextually appropriate. Each description may contain zero, one, or multiple entities, based on the scenario. You must be accurate with marking the entities and where they start and end since they will be used as spacy data. Start from position 0 and count every character one by one.
                    This response should be raw json format without any additional text or start/end signalizations, also word IBAN is not counted in the enitity but just the value of the IBAN. The structure is defined with the following properties class Entity(BaseModel):
    start: int
    end: int
    label: str

class PaymentDescription(BaseModel):
    text: str
    entities: List[Entity]
                    The entities to identify and label are as follows:
                    1. **INVOICE_NUMBER**: A numerical invoice identifier.
                    2. **REFERENCE_NUMBER**: A reference number, often alphanumeric.
                    3. **IBAN**: An International Bank Account Number.
                    4. **CONTRACT_NUMBER**: A contract identifier.
                    5. **NAME**: A person's first name.
                    6. **SURNAME**: A person's last name."""},
            ],
        )

        # The parsed response will automatically map to the Pydantic class
        return completion.choices[0].message.content

    except openai.APIError as e:
        print(f"OpenAI API Error: {e}")
        return None
    except Exception as e:
        print(f"General Error: {e}")
        return None

In [184]:
result_o1 = generate_payment_description_o1("England")

if result_o1:
    print("\nRaw data:")
    print(result_o1) 
else:
    print("Failed to generate a payment description.")



Raw data:
```json
[
    {
        "text": "Payment received for invoice 67890 from John Doe.",
        "entities": [
            {
                "start": 26,
                "end": 31,
                "label": "INVOICE_NUMBER"
            },
            {
                "start": 37,
                "end": 41,
                "label": "NAME"
            },
            {
                "start": 42,
                "end": 45,
                "label": "SURNAME"
            }
        ]
    },
    {
        "text": "Transferred £250 to account IBAN GB82WEST12345698765432 for reference REF123ABC.",
        "entities": [
            {
                "start": 27,
                "end": 49,
                "label": "IBAN"
            },
            {
                "start": 65,
                "end": 73,
                "label": "REFERENCE_NUMBER"
            }
        ]
    },
    {
        "text": "Monthly payment for contract number C-2023-456 has been processed.",
        "entities": 

In [185]:
result_o1 = result_o1[7:]
result_o1 = result_o1[:-3]
print(result_o1)


[
    {
        "text": "Payment received for invoice 67890 from John Doe.",
        "entities": [
            {
                "start": 26,
                "end": 31,
                "label": "INVOICE_NUMBER"
            },
            {
                "start": 37,
                "end": 41,
                "label": "NAME"
            },
            {
                "start": 42,
                "end": 45,
                "label": "SURNAME"
            }
        ]
    },
    {
        "text": "Transferred £250 to account IBAN GB82WEST12345698765432 for reference REF123ABC.",
        "entities": [
            {
                "start": 27,
                "end": 49,
                "label": "IBAN"
            },
            {
                "start": 65,
                "end": 73,
                "label": "REFERENCE_NUMBER"
            }
        ]
    },
    {
        "text": "Monthly payment for contract number C-2023-456 has been processed.",
        "entities": [
            {
  

In [186]:
import json

def parse_and_append_payments(result: str, filename: str):
    """
    Parse and append multiple payments from the result (string format) to a JSON file.

    Args:
        result (str): A JSON-formatted string containing payment descriptions.
        filename (str): Name of the JSON file to save the descriptions.
    """
    try:
        # Parse the string into a Python list
        payments = json.loads(result)
    except json.JSONDecodeError as e:
        print(f"Error parsing result string as JSON: {e}")
        return

    if not isinstance(payments, list):
        print("Invalid result format: expected a list of payments.")
        return

    # Parse each payment in the result
    parsed_descriptions = []
    for payment in payments:
        if "text" in payment and "entities" in payment:
            parsed_descriptions.append({
                "text": payment["text"],
                "entities": [
                    {
                        "start": entity["start"],
                        "end": entity["end"],
                        "label": entity["label"]
                    }
                    for entity in payment["entities"]
                ]
            })
        else:
            print("Skipping payment with invalid structure.")

    # Load existing data from the JSON file, if it exists
    try:
        with open(filename, "r", encoding="utf-8") as f:
            existing_data = json.load(f)
    except FileNotFoundError:
        existing_data = []  # If the file doesn't exist, initialize as empty list
    except json.JSONDecodeError:
        print("Warning: File exists but contains invalid JSON. Starting fresh.")
        existing_data = []  # Start fresh if JSON is invalid

    # Append all parsed payments to the existing data
    existing_data.extend(parsed_descriptions)

    # Save updated data back to the file
    try:
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(existing_data, f, indent=2, ensure_ascii=False)
        print(f"Payments appended to {filename}")
    except Exception as e:
        print(f"Error saving to file: {e}")

# Example usage
parse_and_append_payments(result_o1, "payment_descriptions.json")


Payments appended to payment_descriptions.json


In [None]:
def generate_payment_description(number_of_samples : int, country : str):
    try:
        # Request OpenAI to extract the required data in the given format
        completion = client.beta.chat.completions.parse(
            model="gpt-4o-2024-08-06",
            messages=[
                {"role": "system", "content": """You are a payment description generator specializing in creating realistic and different payment descriptions from various countries and contexts. Your task is to generate descriptions in natural language that are accurate, varied, and contextually appropriate. User will ask you for a number of descriptions and country where the payments should be originated. Each description may contain zero, one, or multiple entities, based on the scenario. You must be accurate with marking the entities and where they start and end, have to consider all caracter including spaces apostrophes and so on. The prefix is not counted as entity, for example CN-35232 is entity 35232, this applies to every type of enitity.   {
    "text": "Virement pour la facture numéro 34789 concernant le client Marc Dupont. IBAN fr3512345678901234567890126.",
    "entities": [
      {
        "start": 32,
        "end": 37,
        "label": "INVOICE_NUMBER"
      },
      {
        "start": 59,
        "end": 63,
        "label": "NAME"
      },
      {
        "start": 64,
        "end": 70,
        "label": "SURNAME"
      },
      {
        "start": 77,
        "end": 104,
        "label": "IBAN"
      }
    ]
  }
                    The entities to identify and label are as follows:
                    1. **INVOICE_NUMBER**: A numerical invoice identifier.
                    2. **REFERENCE_NUMBER**: A reference number, often alphanumeric.
                    3. **IBAN**: An International Bank Account Number.
                    4. **CONTRACT_NUMBER**: A contract identifier.
                    5. **NAME**: A person's first name.
                    6. **SURNAME**: A person's last name. """},
                {"role": "user", "content": f"""Can you please generate {number_of_samples}  different payments and originate them from {country}?"""},
            ],
            response_format=PaymentDescription,  # Use the Pydantic model as the response format
        )

        # The parsed response will automatically map to the Pydantic class
        return completion.choices[0].message

    except openai.APIError as e:
        print(f"OpenAI API Error: {e}")
        return None
    except Exception as e:
        print(f"General Error: {e}")
        return None