In [1]:
import spacy
from spacy.tokens import DocBin  # Import DocBin
import json
from tqdm import tqdm  # Import tqdm

## Using annotator

In [2]:
nlp = spacy.blank("en")
db = DocBin()

In [3]:
with open('annotations_training.json', encoding='utf-8') as f:
    TRAIN_DATA = json.load(f)

In [4]:
with open('annotations_validation.json', encoding='utf-8') as f:
    VALID_DATA = json.load(f)

In [5]:
TRAIN_DATA

{'classes': ['PRODUCT_NAME',
  'BRAND_NAME',
  'WEIGHT',
  'INGREDIENTS',
  'NUTRITIONAL_INFO',
  'FOOD_TYPE',
  'PRICE',
  'EXPIRY_DATE',
  'MANUFACTURING_DATE',
  'LIC_NUMBER',
  'HELPLINE_NUMBER',
  'OTHER_INFO'],
 'annotations': [['protein more 15X than milk\r',
   {'entities': [[0, 26, 'NUTRITIONAL_INFO']]}],
  ['\r', {'entities': []}],
  ['fortune\r', {'entities': [[0, 7, 'BRAND_NAME']]}],
  ['\r', {'entities': []}],
  ['soya chunks\r', {'entities': [[0, 11, 'PRODUCT_NAME']]}],
  ['\r', {'entities': []}],
  ['100% vegetarian\r', {'entities': [[0, 15, 'FOOD_TYPE']]}],
  ['\r', {'entities': []}],
  ['low fat\r', {'entities': [[0, 7, 'NUTRITIONAL_INFO']]}],
  ['\r', {'entities': []}],
  None,
  ['\r', {'entities': []}],
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  ['fortune soya\r', {'entities': [[0, 12, 'PRODUCT_NAME']]}],
  ['\r', {'entities': []}],
  None,
  None,
  ['steps to use nutritious fortune soya chunks\r',
   {'entities': [[13, 23, 'NUTRITION

In [6]:
VALID_DATA

{'classes': ['PRODUCT_NAME',
  'BRAND_NAME',
  'WEIGHT',
  'INGREDIENTS',
  'NUTRITIONAL_INFO',
  'FOOD_TYPE',
  'PRICE',
  'EXPIRY_DATE',
  'MANUFACTURING_DATE',
  'LIC_NUMBER',
  'HELPLINE_NUMBER',
  'OTHER_INFO'],
 'annotations': [None,
  ['\r', {'entities': []}],
  ['BALAJI\r', {'entities': [[0, 6, 'BRAND_NAME']]}],
  ['\r', {'entities': []}],
  ['BALAJI\r', {'entities': [[0, 6, 'BRAND_NAME']]}],
  ['\r', {'entities': []}],
  ['WAFERS\r', {'entities': [[0, 6, 'PRODUCT_NAME']]}],
  ['\r', {'entities': []}],
  ['Khatta Mitha Mix\r', {'entities': [[0, 16, 'PRODUCT_NAME']]}],
  ['BALAR\r', {'entities': []}],
  ['\r', {'entities': []}],
  ['EMEGHT\r', {'entities': []}],
  ['\r', {'entities': []}],
  ['INS PEANUTS MILK POWDER\r', {'entities': [[4, 23, 'PRODUCT_NAME']]}],
  ['\r', {'entities': []}],
  ['PER 100g\r', {'entities': []}],
  ['\r', {'entities': []}],
  ['568 Kral\r', {'entities': []}],
  ['\r', {'entities': []}],
  ['DAILY VALUE OF RDA PER 300\r', {'entities': []}],
  ['\r', {

## Generating Training Data in spacy format

In [7]:
for entry in tqdm(TRAIN_DATA['annotations']):
    if entry is None:
        print("Skipping NoneType entry")
        continue
    try:
        text, annot = entry  # Unpack the entry
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot['entities']:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
        doc.ents = ents
        db.add(doc)
    except ValueError as e:
        print(f"Skipping malformed entry: {entry}, error: {e}")
        
db.to_disk("training_data.spacy")  # Save the DocBin object

100%|███████████████████████████████████████████████████████████████████████████| 3390/3390 [00:00<00:00, 13243.27it/s]

Skipping NoneType entry
Skipping NoneType entry
Skipping NoneType entry
Skipping NoneType entry
Skipping NoneType entry
Skipping NoneType entry
Skipping NoneType entry
Skipping NoneType entry
Skipping NoneType entry
Skipping NoneType entry
Skipping NoneType entry
Skipping NoneType entry
Skipping entity





## Generating Validation Data in spacy format

In [8]:
for entry in tqdm(VALID_DATA['annotations']):
    if entry is None:
        print("Skipping NoneType entry")
        continue
    try:
        text, annot = entry  # Unpack the entry
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot['entities']:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
        doc.ents = ents
        db.add(doc)
    except ValueError as e:
        print(f"Skipping malformed entry: {entry}, error: {e}")
        
db.to_disk("validation_data.spacy")  # Save the DocBin object

100%|█████████████████████████████████████████████████████████████████████████████| 603/603 [00:00<00:00, 15544.96it/s]

Skipping NoneType entry





## Custom NER Model Training

In [9]:
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

[38;5;3m[!] To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4m[i] Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m[+] Auto-filled config with all values[0m
[38;5;2m[+] Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [10]:
! python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./validation_data.spacy --verbose

[38;5;4m[i] Saving to output directory: .[0m
[38;5;4m[i] Using CPU[0m
[1m
[38;5;2m[+] Initialized pipeline[0m
[1m
[38;5;4m[i] Pipeline: ['tok2vec', 'ner'][0m
[38;5;4m[i] Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00      1.96    0.00    0.00    0.00    0.00
  1     200        382.88   3109.80   27.72   32.03   24.43    0.28
  3     400        438.75   2138.82   53.12   59.24   48.15    0.53
  6     600       1265.57   2127.82   74.97   77.24   72.82    0.75
  9     800       1024.77   1496.93   83.89   86.66   81.29    0.84
 12    1000        676.26   1038.08   88.78   91.41   86.29    0.89
 17    1200        560.95    771.24   87.90   89.70   86.17    0.88
 23    1400        671.04    682.65   91.04   91.87   90.23    0.91
 29    1600        403.73    556.72   91.40   92.87   89.99    0.91
 38    1800        924.44    592.78   90.

[2024-10-19 19:39:55,121] [DEBUG] Config overrides from CLI: ['paths.train', 'paths.dev']
[2024-10-19 19:39:55,298] [INFO] Set up nlp object from config
[2024-10-19 19:39:55,299] [DEBUG] Loading corpus from path: validation_data.spacy
[2024-10-19 19:39:55,299] [DEBUG] Loading corpus from path: training_data.spacy
[2024-10-19 19:39:55,299] [INFO] Pipeline: ['tok2vec', 'ner']
[2024-10-19 19:39:55,309] [INFO] Created vocabulary
[2024-10-19 19:39:55,309] [INFO] Finished initializing nlp object

Load the table in your config with:

[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]

[2024-10-19 19:39:55,972] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[2024-10-19 19:39:55,981] [DEBUG] Loading corpus from path: validation_data.spacy
[2024-10-19 19:39:55,981] [DEBUG] Loading corpus from path: training_data.spacy
[2024-10-19 19:39:55,987] [DEBUG] Removed existing output directory: model-best
[2024-10-19 19:39:55,988] [DEBUG] Rem