# Named Entity Recognition using spacy's en_core_web_sm model

#### Defining paths and importing packages

In [1]:
import pandas as pd
# Absolute paths of the training and test annotation files. Both are read
# further down by read_lines_from_data() / preprocessing_data(), which expect
# one tab-separated "<tag>\t<word>" pair per line.
data_directory_train = "/content/restauranttrain.txt"
data_directory_test = "/content/restauranttest.txt"

#### Reading data from files

In [2]:
def read_lines_from_data(file):
  """Return all lines of the text file at ``file`` as a list of strings.

  The file is opened in text read mode with the platform default encoding.
  Line terminators are kept on each returned line.
  """
  with open(file, "r") as handle:
    # Materialising the file iterator yields the same list as readlines().
    return list(handle)

In [3]:
# Raw line lists for both splits.
# NOTE(review): no later cell visible in this notebook reads these two
# variables; preprocessing_data() takes a path and re-reads the file itself.
train_data_raw = read_lines_from_data(data_directory_train)
test_data_raw = read_lines_from_data(data_directory_test)

#### Preprocessing data to structure into Pandas DataFrame

In [4]:
def preprocessing_data(filename):
  """Parse a tab-separated tag/word file into one record per token.

  Every token line is expected to hold ``<tag><TAB><word>``. Blank lines
  separate sentences: each blank line increments the running sentence
  counter, which starts at 1.

  Args:
    filename: Path passed on to ``read_lines_from_data``.

  Returns:
    A list of dicts in file order, each with the keys ``"Sentence #"``
    (formatted as ``"Sentence <n>"``), ``"Word"`` and ``"Tag"``.

  Raises:
    IndexError: If a token line contains no tab character.
  """
  list_of_dict = []
  i = 1
  for line in read_lines_from_data(filename):
    # A separator line may carry stray spaces or a carriage return; comparing
    # against "\n" alone would send such a line to the tab split below and
    # fail with IndexError. Lines that contain a tab are never matched here.
    if not line.strip(" \r\n"):
      i += 1
      continue
    split_line = line.split("\t")
    list_of_dict.append({
      "Sentence #": f"Sentence {i}",
      # The word is the last field used, so it still carries the newline.
      "Word": split_line[1].strip("\n"),
      "Tag": split_line[0],
    })
  return list_of_dict

In [5]:
# Parse both files into lists of per-token dicts
# (keys: "Sentence #", "Word", "Tag").
list_dic_train = preprocessing_data(data_directory_train)
list_dic_test = preprocessing_data(data_directory_test)

In [6]:
# One DataFrame row per token; the columns come from the dict keys.
df_train = pd.DataFrame(list_dic_train)
df_test = pd.DataFrame(list_dic_test)

In [7]:
# Preview the first 15 token rows of the training frame.
df_train.head(15)

Unnamed: 0,Sentence #,Word,Tag
0,Sentence 1,2,B-Rating
1,Sentence 1,star,I-Rating
2,Sentence 1,restaurants,O
3,Sentence 1,with,O
4,Sentence 1,inside,B-Amenity
5,Sentence 1,dining,I-Amenity
6,Sentence 2,34,O
7,Sentence 3,5,B-Rating
8,Sentence 3,star,I-Rating
9,Sentence 3,resturants,O


#### Converting data into Spacy's input format

In [8]:
def data_to_spacy_input_format(data_input):
  """Group token rows into ``(text, {"entities": [...]})`` training tuples.

  Consecutive rows that share the same value in the first column form one
  sentence. The sentence text is the words joined by single spaces, and every
  row whose ``Tag`` is not ``'O'`` contributes one ``(start, end, tag)`` entry
  holding the character offsets of that word inside the joined text.

  Args:
    data_input: DataFrame whose first column identifies the sentence and
      which has ``'Word'`` and ``'Tag'`` columns.

  Returns:
    A list of ``(text, {"entities": [(start, end, tag), ...]})`` tuples in row
    order. Sentences without any non-``'O'`` tag are left out.
  """
  data = []
  # A frame built from an empty record list has no columns at all, so bail out
  # before touching 'Word' / 'Tag'.
  if data_input.empty:
    return data

  words = []
  entities = []
  offset = 0  # character position where the next word starts in the joined text
  current_sentence = None

  # Read the sentence id through an explicit positional column lookup: integer
  # keys on a label-indexed row (row[0]) are deprecated in pandas. Iterating
  # the columns directly also avoids building one Series per row.
  sentence_ids = data_input.iloc[:, 0]
  for sentence_id, word, tag in zip(sentence_ids, data_input['Word'], data_input['Tag']):
    if sentence_id != current_sentence:
      # Sentence boundary: emit the finished sentence if it had any entity.
      if entities:
        data.append((' '.join(words), { "entities": entities }))
      current_sentence = sentence_id
      words = []
      entities = []
      offset = 0
    if tag != 'O':
      entities.append((offset, offset + len(word), tag))
    words.append(word)
    offset += len(word) + 1  # +1 for the joining space

  # Emit the trailing sentence, which has no following boundary row.
  if entities:
    data.append((' '.join(words), { "entities": entities }))
  return data


In [9]:
# Convert the training frame into (text, {"entities": [...]}) tuples.
train_data = data_to_spacy_input_format(df_train)

In [10]:
# Same conversion for the test frame.
test_data = data_to_spacy_input_format(df_test)

In [24]:
# Inspect the first 15 converted training examples.
train_data[:15]

[('2 star restaurants with inside dining',
  {'entities': [(0, 1, 'B-Rating'),
    (2, 6, 'I-Rating'),
    (24, 30, 'B-Amenity'),
    (31, 37, 'I-Amenity')]}),
 ('5 star resturants in my town',
  {'entities': [(0, 1, 'B-Rating'),
    (2, 6, 'I-Rating'),
    (18, 20, 'B-Location'),
    (21, 23, 'I-Location'),
    (24, 28, 'I-Location')]}),
 ('98 hong kong restaurant reasonable prices',
  {'entities': [(3, 7, 'B-Restaurant_Name'),
    (8, 12, 'I-Restaurant_Name'),
    (24, 34, 'B-Price')]}),
 ('a great lunch spot but open till 2 a m passims kitchen',
  {'entities': [(23, 27, 'B-Hours'),
    (28, 32, 'I-Hours'),
    (33, 34, 'I-Hours'),
    (35, 36, 'I-Hours'),
    (37, 38, 'I-Hours'),
    (39, 46, 'B-Restaurant_Name'),
    (47, 54, 'I-Restaurant_Name')]}),
 ('a place that serves soft serve ice cream',
  {'entities': [(20, 24, 'B-Dish'),
    (25, 30, 'I-Dish'),
    (31, 34, 'I-Dish'),
    (35, 40, 'I-Dish')]}),
 ('a restaurant that is good for groups',
  {'entities': [(21, 25, 'B-Rating')

#### This code is borrowed from spaCy's documentation. It converts the data above into the .spacy format that the training pipeline accepts

In [11]:
import pandas as pd
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin

nlp = spacy.load("en_core_web_sm") # load a pretrained spacy model


def build_doc_bin(examples, output_path):
    """Serialise ``(text, {"entities": [...]})`` examples to a DocBin file.

    Each text is turned into a Doc with ``nlp.make_doc``. Every
    ``(start, end, label)`` entry is mapped onto the Doc with ``char_span``;
    entries for which ``char_span`` returns ``None`` are reported with
    "Skipping entity" and left out.

    Args:
        examples: Iterable of ``(text, annot)`` pairs where
            ``annot["entities"]`` is a list of ``(start, end, label)`` tuples.
        output_path: Path the DocBin is written to.

    Returns:
        The populated ``DocBin``.
    """
    doc_bin = DocBin() # create a DocBin object
    for text, annot in tqdm(examples): # data in previous format
        doc = nlp.make_doc(text) # create doc object from text
        ents = []
        for start, end, label in annot["entities"]: # add character indexes
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
        doc.ents = ents # label the text with the ents
        doc_bin.add(doc)
    doc_bin.to_disk(output_path) # save the docbin object
    return doc_bin


# The train and validation splits go through the same conversion; only the
# input list and the output file differ.
build_doc_bin(train_data, "./train.spacy")
# Keep `db` bound to the last DocBin built, as it was before this refactor.
db = build_doc_bin(test_data, "./valid.spacy")

100%|██████████| 7573/7573 [00:01<00:00, 5859.04it/s]
100%|██████████| 1517/1517 [00:00<00:00, 6243.39it/s]


#### Installing spacy transformers

In [None]:
!pip install spacy[transformers]

#### Generating Configuration file for training

In [16]:
# Expand base_config.cfg into a complete config.cfg.
# NOTE(review): base_config.cfg must already exist in the working directory;
# no cell visible in this notebook creates it.
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


#### Model Training and evaluation

In [17]:
# Train with config.cfg, overriding the corpus paths with the train.spacy and
# valid.spacy files written by the DocBin conversion cell. Results are saved
# under ./ner_demo/training/.
!python -m spacy train config.cfg --verbose --output ./ner_demo/training/ --paths.train train.spacy --paths.dev valid.spacy

[2022-08-28 10:43:29,947] [DEBUG] Config overrides from CLI: ['paths.train', 'paths.dev']
[38;5;2m✔ Created output directory: ner_demo/training[0m
[38;5;4mℹ Saving to output directory: ner_demo/training[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[2022-08-28 10:43:30,467] [INFO] Set up nlp object from config
[2022-08-28 10:43:30,476] [DEBUG] Loading corpus from path: valid.spacy
[2022-08-28 10:43:30,477] [DEBUG] Loading corpus from path: train.spacy
[2022-08-28 10:43:30,477] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-08-28 10:43:30,481] [INFO] Created vocabulary
[2022-08-28 10:43:30,482] [INFO] Finished initializing nlp object

Load the table in your config with:

[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]

[2022-08-28 10:43:32,454] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[2022-08-28 10:43:32,464] [DEBUG] Loading co

#### Prediction Function

In [19]:
import spacy.displacy
def pred_fun(sentence=None, model_path="/content/ner_demo/training/model-best"):
  """Run a trained pipeline on one sentence and render its entities inline.

  Args:
    sentence: Text to analyse. When ``None`` (the default) the text is read
      interactively with ``input``.
    model_path: Directory of the pipeline to load with ``spacy.load``.
      Defaults to the best model under ``/content/ner_demo/training``.

  Returns:
    None. The entities are rendered with displaCy (``style="ent"``).
  """
  pred = spacy.load(model_path)
  # Only prompt when no text was supplied, so the function can also be used
  # non-interactively (e.g. during Restart & Run All).
  if sentence is None:
    sentence = input("Please enter the sentence here: " )
  result = pred(sentence)
  spacy.displacy.render(result, jupyter=True, style="ent")


## Sample Prediction
#### Unmarked words are of category "O"

In [21]:
# Prompts for a sentence, then renders its predicted entities below the cell.
pred_fun()

Please enter the sentence here: lets go to burger king and eat their spicy beef burger


In [None]:
# Archive the whole /content/ner_demo directory (recursively) into one zip file.
!zip -r /content/ner_demo.zip /content/ner_demo

In [23]:
# Hand the archive to the google.colab download helper; this import is only
# available when the notebook runs inside Google Colab.
from google.colab import files
files.download("/content/ner_demo.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>