# The approach

The focus is to train a spacy model to recognize an entity which has size, item and quantity in it. This way we can extract multiple entities if the user request many items at once. 

Once that is done, we will be using spacy patterns to extract the size, item and quanitity seperately from each entity. 

# 1 . Loading Dataset

In [31]:
with open("new_entity_format.txt") as file:
    # if not line.isspace():
      lines = [line.lstrip().rstrip() for line in file if not line.isspace()]

In [32]:
lines[:6]

["I'd like a small coffee and a medium latte, please.",
 '[(a small coffee), (a medium latte)]',
 'Can I get a large smoothie and two small coffees to go?',
 '[(a large smoothie), (two small coffees)]',
 "I'll have a medium latte and a small smoothie.",
 '[(a medium latte), (a small smoothie)]']

In [33]:
len(lines)

524

# 2 . Preprocessing dataset
Note : Get the above list to a Spacy friendly training dataset format

In [34]:
entities = []
sentences = []

for i,line in enumerate(lines):
  if i%2 == 0:
    sentences.append(line)
  else:
    entities.append(line)

In [35]:
for i,entity in enumerate(entities):
  entities[i] = entities[i].strip("[]").split(",")
  entities[i] = [item.strip("() ").lower() for item in entities[i]]

In [36]:
def generate_entity_output(sentence, index, entity_name):
    entity_list = []
    try:
      for j,entity in enumerate(entities[index]):
        start_index = sentence.index(entity)
        end_index = start_index + len(entity) - 1
        entity_list.append((start_index, end_index, entity_name))

      output = {
          "entities": entity_list
      }
      return (sentence, output)

    except ValueError as e:
      print (e, " at index: ", i," Keyword: ",entity)
      return False

In [37]:
train_set = []

for i,sentence in enumerate(sentences):
  train_set.append(generate_entity_output(sentence,i,"Entity_Item"))

In [38]:
train_set[0]

("I'd like a small coffee and a medium latte, please.",
 {'entities': [(9, 22, 'Entity_Item'), (28, 41, 'Entity_Item')]})

In [39]:
train_set[0][0][9:22]

'a small coffe'

# 3 . Training Spacy

In [10]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [11]:
label = "Entity_Item"

In [12]:
if "ner" not in nlp.pipe_names:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe("ner")

In [13]:
prev_ents = ner.move_names

In [14]:
ner.add_label(label)

1

In [15]:
new_ents = ner.move_names

In [16]:
list(set(new_ents) - set(prev_ents))

['B-Entity_Item', 'U-Entity_Item', 'L-Entity_Item', 'I-Entity_Item']

In [17]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
nlp=spacy.load('en_core_web_sm')

db = DocBin() # create a DocBin object
for text, annot in tqdm(train_set): # data in previous format
    doc = nlp.make_doc(text) # create doc object from text
    ents = []
    for start, end, label in annot["entities"]: # add character indexes
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents # label the text with the ents
    db.add(doc)
db.to_disk("./train.spacy") # save the docbin object

100%|██████████| 262/262 [00:00<00:00, 5687.02it/s]


In [18]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [19]:
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [20]:
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m99.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [21]:
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./train.spacy

[38;5;2m✔ Created output directory: output[0m
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[2023-06-10 15:44:38,662] [INFO] Set up nlp object from config
[2023-06-10 15:44:38,688] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-06-10 15:44:38,693] [INFO] Created vocabulary
[2023-06-10 15:44:39,516] [INFO] Added vectors: en_core_web_sm
[2023-06-10 15:44:39,518] [INFO] Finished initializing nlp object
[2023-06-10 15:44:39,911] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     48.00    6.00    5.72    6.30    0.06
  5     200          1.10    532.16  100.00  100.00  100.00    1.00
 12     400          

# 4 . Load Trained model


In [22]:
nlp1 = spacy.load(r"./output/model-best") #load the best model
doc = nlp1("I need two large lattes and a medium coffee and large three smothies") # input sample text
doc.ents

(two large, a medium)

# Training Method 2

In [55]:
import spacy
nlp = spacy.load("en_core_web_sm")

# Load the blank English model
# nlp = spacy.blank("en")

# Define your custom entity label
label = "Entity_Item"

nlp = spacy.load("en_core_web_sm")
# nlp.remove_pipe("ner")
ner = nlp.create_pipe("ner")

ner.add_label(label)
# nlp.add_pipe(ner)

# # Add the entity recognizer to the pipeline
# ner = nlp.create_pipe("ner")
# nlp.add_pipe(ner)

1

In [56]:
# Disable pipeline components you dont need to change
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [57]:
# Import requirements
import random
from spacy.util import minibatch, compounding
from pathlib import Path
from spacy.training import Example

# TRAINING THE MODEL
with nlp.disable_pipes(*unaffected_pipes):

  # Training for 30 iterations
  for iteration in range(100):

    # shuufling examples  before every iteration
    random.shuffle(train_set)
    losses = {}
    # batch up the examples using spaCy's minibatch
    batches = minibatch(train_set, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        example = []
        # Update the model with iterating each text
        for i in range(len(texts)):
            doc = nlp.make_doc(texts[i])
            example.append(Example.from_dict(doc, annotations[i]))
        
        # Update the model
        nlp.update(example, drop=0.2, losses=losses)

        # print("Losses", losses)

In [58]:
# Testing before
sentence = "I'd like a medium coffee with extra foam and two small Lattes."
doc = nlp(sentence.lower())
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities [('two', 'CARDINAL')]


# Training Method 3

In [59]:
import random
from spacy.util import minibatch, compounding
from spacy.training.example import Example
from pathlib import Path

# Disable pipeline components you don't need to change
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# Load the training data in the correct format
# train_data = [
#     ("I'd like a small coffee and a medium latte, please.", {'entities': [(9, 22, 'Entity_Item'), (28, 41, 'Entity_Item')]}),
#     # Add more training examples as needed
# ]

train_data = train_set

# Update the model with your training data
with nlp.disable_pipes(*unaffected_pipes):
    # Training for a fixed number of iterations
    for iteration in range(100):
        random.shuffle(train_data)
        losses = {}

        # Create batches of examples
        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
        
        for batch in batches:
            examples = []
            texts, annotations = zip(*batch)
            
            # Convert the annotations into spaCy's Example format
            for text, entity_offsets in zip(texts, annotations):
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, entity_offsets)
                examples.append(example)
            
            # Update the model
            nlp.update(examples, drop=0.2, losses=losses)
        
        print("Losses:", losses)

# # Save the trained model to disk
# output_dir = Path("/content/training_output")
# nlp.to_disk(output_dir)

# # Load the trained model
# nlp = spacy.load(output_dir)

# Testing
sentence = "I'd like a medium coffee with extra foam and two small Lattes."
doc = nlp(sentence.lower())
print("Entities:", [(ent.text, ent.label_) for ent in doc.ents])


Losses: {'ner': 2.5400926892913553e-14}
Losses: {'ner': 1.0662297352851587e-13}
Losses: {'ner': 1.8799681937336086e-14}
Losses: {'ner': 4.892265908248231e-15}
Losses: {'ner': 1.4213835495593146e-14}
Losses: {'ner': 6.609374288340019e-15}
Losses: {'ner': 1.1349102684873505e-14}
Losses: {'ner': 3.334861680817981e-14}
Losses: {'ner': 2.1721313970807823e-13}
Losses: {'ner': 3.950875262025883e-15}
Losses: {'ner': 4.395039993199404e-15}
Losses: {'ner': 1.5876451057200714e-14}
Losses: {'ner': 2.2931681483870993e-12}
Losses: {'ner': 1.3248991142798601e-14}
Losses: {'ner': 4.031047004655434e-15}
Losses: {'ner': 3.0261870951642157e-15}
Losses: {'ner': 3.680339926247546e-15}
Losses: {'ner': 2.4391037564805596e-15}
Losses: {'ner': 2.7608764775394504e-15}
Losses: {'ner': 1.3641825226018788e-14}
Losses: {'ner': 5.027288816414311e-13}
Losses: {'ner': 1.684155479408759e-13}
Losses: {'ner': 2.44079251296409e-15}
Losses: {'ner': 1.1788750268778085e-15}
Losses: {'ner': 1.1689045201314238e-14}
Losses: {'n

ValueError: ignored

In [60]:
# # Save the trained model to disk
# output_dir = Path("/content/training_output")
# nlp.to_disk(output_dir)

# # Load the trained model
# nlp = spacy.load(output_dir)

# Testing
sentence = "I'd like a medium coffee with extra foam and two small Lattes."
doc = nlp(sentence.lower())
print("Entities:", [(ent.text, ent.label_) for ent in doc.ents])


Entities: [('two', 'CARDINAL')]
