In [1]:
import json
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import spacy
from spacy.tokens import DocBin
from spacy.util import filter_spans
from tqdm import tqdm

In [3]:
# Define the path to the training data file
train_file = "../data/annotations.json"

# Load the training data from the JSON file.
# The encoding is stated explicitly: the entity annotations are character
# offsets into the text, so the file must decode the same way on every
# platform instead of following the locale's default encoding.
with open(train_file, encoding="utf-8") as f:
    data = json.load(f)

# Expand each entry of the 'annotations' column into positional columns
# (0 and 1), name them 'pattern' and 'entities', and drop the now
# unnecessary raw column. Built as one chain so the cell produces
# `intent_df` in a single step without in-place mutation.
annotations_raw = pd.DataFrame(data)
intent_df = (
    annotations_raw
    .join(annotations_raw['annotations'].apply(pd.Series))
    .rename(columns={0: 'pattern', 1: 'entities'})
    .drop(columns=['annotations'])
)


In [4]:
# Helper that turns one raw 'entities' cell into a list of entity tuples
def extract_entities(entities):
    """Return the entity records of one annotation as a list of tuples.

    Parameters
    ----------
    entities : object
        The raw cell value. Only a dict is processed; its 'entities' key is
        read and every item under it is converted with ``tuple``.

    Returns
    -------
    list of tuple, or float
        One tuple per item of ``entities['entities']`` when the input is a
        dict; ``np.nan`` for any other input so the row can be dropped later.
    """
    # Anything that is not a dict (e.g. a missing annotation) is marked as NaN.
    if not isinstance(entities, dict):
        return np.nan
    return list(map(tuple, entities['entities']))

# Replace the raw 'entities' cells with lists of tuples (NaN where the cell
# was not a dict), then discard every row that contains a missing value.
intent_df = intent_df.assign(
    entities=intent_df['entities'].apply(extract_entities)
).dropna()


In [5]:
# `training_data` is a second name bound to the same DataFrame object as
# `intent_df` (no copy is made); the bare expression below displays it.
training_data=intent_df
training_data

Unnamed: 0,pattern,entities
0,Can you provide details for InvoiceNo 536365?,"[(28, 37, VARIABLE), (38, 44, VALUE)]"
1,What items were purchased with InvoiceNo 536365?,"[(31, 40, VARIABLE), (41, 47, VALUE)]"
2,I need information about transaction InvoiceNo...,"[(37, 46, VARIABLE), (47, 53, VALUE)]"
3,"Details for InvoiceNo 536365, please?","[(12, 21, VARIABLE), (22, 28, VALUE)]"
4,What can you tell me about InvoiceNo 536365?,"[(27, 36, VARIABLE), (37, 43, VALUE)]"
...,...,...
233,Show distribution of product prices using a hi...,"[(21, 35, VARIABLE), (44, 53, VISUALIZATION)]"
234,Create a histogram illustrating distribution o...,"[(9, 18, VISUALIZATION), (48, 59, VARIABLE)]"
235,Generate a histogram showing distribution of w...,"[(11, 20, VISUALIZATION), (45, 60, VARIABLE)]"
236,Generate a heatmap depicting regional sales pe...,"[(11, 18, VISUALIZATION), (29, 37, VARIABLE), ..."


In [6]:
# Create a blank spaCy model
# (the conversion loop further down calls nlp.make_doc on each pattern text)
nlp = spacy.blank("en")

# Create a DocBin object to store the training data
doc_bin = DocBin()

In [17]:
# Ensure the output directory exists before anything is written into it:
# the serialized training data, the generated config and the trained model
# all go under ../data/ner_model. exist_ok=True makes the cell safe to re-run.
os.makedirs("../data/ner_model", exist_ok=True)



None


In [18]:
# Convert every annotated example into a spaCy Doc and serialize the corpus.
#
# A fresh DocBin is created here, in the same cell that fills it, so that
# re-running this cell never appends a second copy of the corpus to a
# container left over from an earlier execution.
doc_bin = DocBin()

# Iterate over the two columns directly; the row index is not needed.
examples = zip(intent_df['pattern'], intent_df['entities'])
for text, labels in tqdm(examples, total=len(intent_df)):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in labels:
        # char_span returns None when the character offsets cannot be mapped
        # to a token span under the "contract" alignment mode.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # Say which annotation was dropped so it can be fixed in the data.
            print(f"Skipping entity {label!r} at [{start}:{end}] in {text!r}")
        else:
            ents.append(span)
    # filter_spans resolves overlapping spans before they are set as entities.
    doc.set_ents(filter_spans(ents))
    doc_bin.add(doc)

# Save the training data to a file
doc_bin.to_disk("../data/ner_model/train.spacy")

100%|██████████| 167/167 [00:00<00:00, 3308.92it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity





In [19]:
# Initialize a spaCy config file
# (English, NER-only pipeline, optimized for efficiency; written to
# ../data/ner_model/config.cfg)
# NOTE(review): `python` here is resolved by the shell, which may not be the
# same interpreter/environment as the notebook kernel - confirm.
! python -m spacy init config ../data/ner_model/config.cfg --lang en --pipeline ner --optimize efficiency

# Train the spaCy model
# NOTE(review): --paths.dev points at the same train.spacy file as
# --paths.train, so the ENTS_F / ENTS_P / ENTS_R scores printed during
# training are measured on the training examples, not on held-out data.
# A separate dev.spacy split is needed for a meaningful evaluation.
! python -m spacy train ../data/ner_model/config.cfg --output ../data/ner_model/ --paths.train ../data/ner_model/train.spacy --paths.dev ../data/ner_model/train.spacy


[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
../data/ner_model/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory: ../data/ner_model[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     64.77    0.00  

In [21]:
# Load the trained spaCy model
# ("model-best" sits under the --output directory passed to `spacy train`;
# presumably it is written by that training run - run the training cell first)
nlp_trained_model = spacy.load("../data/ner_model/model-best")

In [22]:
# Run the trained pipeline on a two-line sample query to create a spaCy document.
# The triple-quoted literal starts and ends with a newline, so those newlines
# are part of the text handed to the model. This rebinds the name `doc`, which
# the conversion loop earlier in the notebook also used.
doc = nlp_trained_model('''
Please show me a scatter plot of country and product type.
Give me details of Invoice number  1234534
''')

In [None]:
# Visualize the entities in the document
# (style="ent" selects the entity view; `doc` is the sample parsed in the previous cell)
spacy.displacy.render(doc, style="ent", jupyter=True)