In [4]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json
from spacy.util import filter_spans

In [5]:
# Define the path to the training data file
train_file="../data/annotations (5).json"

# Load the training data from the JSON file
with open(train_file) as f:
    data = json.load(f)

# Extract the relevant columns from the DataFrame
intent_df = pd.DataFrame(data)
intent_df['annotations']
intent_df = intent_df.join(intent_df['annotations'].apply(pd.Series))


intent_df.rename(columns={1: 'entities'}, inplace=True)
intent_df.rename(columns={0: 'pattern'}, inplace=True)

# Drop the unneccessary column
intent_df.drop(columns=['annotations'], inplace=True)


In [6]:
# Define a function to extract entities from the 'entities' column
def extract_entities(entities):
    if isinstance(entities, dict) :
        return [tuple(entity) for entity in entities['entities']]
    return np.nan

# Apply the function to extract entities
intent_df['entities'] = intent_df['entities'].apply(extract_entities)

# Drop rows with missing entities
intent_df=intent_df.dropna()


In [7]:
training_data=intent_df
training_data

Unnamed: 0,pattern,entities
0,Can you generate a bar chart showing sales per...,"[(19, 28, VISUALIZATION), (37, 42, SALES), (58..."
1,Can you generate a bar chart showing sales per...,"[(19, 28, VISUALIZATION), (37, 42, SALES), (58..."
2,Show me a bar chart of sales performance by re...,"[(10, 19, VISUALIZATION), (23, 28, SALES), (44..."
3,I'd like to see a bar chart of sales performan...,"[(18, 27, VISUALIZATION), (31, 36, SALES), (52..."
4,I'd like to see a bar chart of sales performan...,"[(18, 27, VISUALIZATION), (31, 36, SALES), (52..."
...,...,...
98,Show me a histogram of order values for the pa...,"[(10, 19, VISUALIZATION), (23, 35, SALES), (44..."
99,Can you provide a histogram of order values?,"[(18, 27, VISUALIZATION), (31, 43, SALES)]"
100,Create a histogram showing frequency of purcha...,"[(9, 18, VISUALIZATION), (40, 49, SALES), (53,..."
102,Generate a histogram showing distribution of c...,"[(11, 20, VISUALIZATION), (45, 57, AGE)]"


In [8]:
# Create a blank spaCy model
nlp = spacy.blank("en")

# Create a DocBin object to store the training data
doc_bin = DocBin()

In [9]:
import os
# Ensure the directory exists
os.makedirs("../data/ner_model", exist_ok=True)



In [10]:
# Iterate over the training data and create spaCy documents
for index, training_example in tqdm(intent_df.iterrows(), total=intent_df.shape[0]):
    text = training_example['pattern']
    labels = training_example['entities']
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in labels:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    filtered_ents = filter_spans(ents)
    doc.set_ents(filtered_ents)
    doc_bin.add(doc)
#  Save the training data to a file
doc_bin.to_disk("../data/ner_model/train.spacy")

  0%|          | 0/76 [00:00<?, ?it/s]

100%|██████████| 76/76 [00:00<00:00, 1033.44it/s]


In [11]:
# Initialize a spaCy config file
! python -m spacy init config ../data/ner_model/config.cfg --lang en --pipeline ner --optimize efficiency

# Train the spaCy model
! python -m spacy train ../data/ner_model/config.cfg --output ../data/ner_model/ --paths.train ../data/ner_model/train.spacy --paths.dev ../data/ner_model/train.spacy



[38;5;1m✘ The provided output file already exists. To force overwriting the
config file, set the --force or -F flag.[0m

[38;5;4mℹ Saving to output directory: ../data/ner_model[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     66.11    0.00    0.00    0.00    0.00
 23     200         89.61   2490.60   99.77   99.55  100.00    1.00
 52     400          8.42     13.03  100.00  100.00  100.00    1.00
 88     600          0.00      0.01  100.00  100.00  100.00    1.00
131     800          0.00      0.00  100.00  100.00  100.00    1.00
181    1000          0.28      0.30  100.00  100.00  100.00    1.00
247    1200         12.11      3.84  100.00  100.00  100.00    1.00
319    1400         32.35     15.25  100.00  10

In [12]:
# Load the trained spaCy model
nlp_trained_model = spacy.load("../data/ner_model/model-best")

In [16]:
# Create a spaCy document from the input text
doc = nlp_trained_model('''
Show me a scatter plot of country and product type.
''')

In [17]:
# Visualize the entities in the document
spacy.displacy.render(doc, style="ent", jupyter=True)