<a href="https://colab.research.google.com/github/SHEHAN-120/fine-tuned-ner-for-restaurant-queries/blob/main/Restaurant_Search_NER_Fine_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Libiraries

In [1]:
import warnings
warnings.filterwarnings( 'ignore' )

In [2]:
!pip install -U transformers
!pip install -U accelerate
!pip install -U datasets

Collecting transformers
  Downloading transformers-4.57.0-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m750.2 kB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.57.0-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.56.2
    Uninstalling transformers-4.56.2:
      Successfully uninstalled transformers-4.56.2
Successfully installed transformers-4.57.0
Collecting datasets
  Downloading datasets-4.1.1-py3-none-any.whl.metadata (18 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading datasets-4.1.1-py3-none-any.whl (503 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m503.6/503.6

In [3]:
import pandas as pd
import json
import requests

In [4]:
train=pd.read_csv("https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/refs/heads/master/mit_restaurant_search_ner/train.bio",sep="\t",header=None)

In [5]:
train.head()

Unnamed: 0,0,1
0,B-Rating,2
1,I-Rating,start
2,O,restaurants
3,O,with
4,B-Amenity,inside


In [6]:
response=requests.get("https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/refs/heads/master/mit_restaurant_search_ner/train.bio")
response=response.text

In [8]:
response=response.splitlines()

In [9]:
train_tokens=[]
train_tags=[]

temp_tokens=[]
temp_tags=[]

for line in response:
  if line !="":
    tag,token=line.strip().split("\t")
    temp_tags.append(tag)
    temp_tokens.append(token)
  else:
    train_tokens.append(temp_tokens)
    train_tags.append(temp_tags)

    temp_tokens,temp_tags=[],[]

In [10]:
len(train_tokens),len(train_tags)

(7659, 7659)

In [11]:
response=requests.get("https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/refs/heads/master/mit_restaurant_search_ner/test.bio")
response=response.text

In [12]:
response=response.splitlines()

In [15]:
test_tokens=[]
test_tags=[]

temp_tokens=[]
temp_tags=[]

for line in response:
  if line !="":
    tag,token=line.strip().split("\t")
    temp_tags.append(tag)
    temp_tokens.append(token)
  else:
    test_tokens.append(temp_tokens)
    test_tags.append(temp_tags)

    temp_tokens,temp_tags=[],[]

In [16]:
len(test_tokens),len(test_tags)

(1520, 1520)

## HuggingFace Dataset

In [17]:
from datasets import Dataset,DatasetDict

df=pd.DataFrame({'tokens':train_tokens,'ner_tags_str':train_tags})
train=Dataset.from_pandas(df)

df=pd.DataFrame({'tokens':test_tokens,'ner_tags_str':test_tags})
test=Dataset.from_pandas(df)


dataset=DatasetDict({'train':train,'test':test,'validation':test})
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags_str'],
        num_rows: 7659
    })
    test: Dataset({
        features: ['tokens', 'ner_tags_str'],
        num_rows: 1520
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags_str'],
        num_rows: 1520
    })
})

In [18]:
dataset['train'][0]

{'tokens': ['2', 'start', 'restaurants', 'with', 'inside', 'dining'],
 'ner_tags_str': ['B-Rating', 'I-Rating', 'O', 'O', 'B-Amenity', 'I-Amenity']}

In [19]:
unique_tags=set()
for tag in dataset['train']['ner_tags_str']:
  unique_tags.update(tag)

unique_tags=list(set([x[2:] for x in list(unique_tags) if x!='O']))

tag2index={"O":0}
for i,tag in enumerate(unique_tags):
  tag2index[f'B-{tag}']=len(tag2index)
  tag2index[f'I-{tag}']=len(tag2index)

index2tag={v:k for k,v in tag2index.items()}

In [20]:
dataset=dataset.map(lambda example: {"ner_tags":[tag2index[tag] for tag in example['ner_tags_str']]})

Map:   0%|          | 0/7659 [00:00<?, ? examples/s]

Map:   0%|          | 0/1520 [00:00<?, ? examples/s]

Map:   0%|          | 0/1520 [00:00<?, ? examples/s]

In [21]:
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags_str', 'ner_tags'],
        num_rows: 7659
    })
    test: Dataset({
        features: ['tokens', 'ner_tags_str', 'ner_tags'],
        num_rows: 1520
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags_str', 'ner_tags'],
        num_rows: 1520
    })
})

## Model Building

In [22]:
from transformers import AutoTokenizer

In [23]:
model_ckpt="distilbert-base-uncased"
tokenizer=AutoTokenizer.from_pretrained(model_ckpt)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [24]:
dataset['train'][2]

{'tokens': ['5', 'star', 'resturants', 'in', 'my', 'town'],
 'ner_tags_str': ['B-Rating',
  'I-Rating',
  'O',
  'B-Location',
  'I-Location',
  'I-Location'],
 'ner_tags': [13, 14, 0, 11, 12, 12]}

In [27]:
input=dataset['train'][2]['tokens']
output=tokenizer(input,is_split_into_words=True)
tokenizer.convert_ids_to_tokens(output.input_ids)


['[CLS]', '5', 'star', 'rest', '##ura', '##nts', 'in', 'my', 'town', '[SEP]']

In [28]:
def tokenize_and_align_labels(examples):
  tokenized_inputs=tokenizer(examples['tokens'],truncation=True,is_split_into_words=True)

  labels=[]
  for i,label in enumerate(examples['ner_tags']):
    word_ids=tokenized_inputs.word_ids(batch_index=i)

    previous_word_idx=None
    label_ids=[]

    for word_idx in word_ids:
      if word_idx is None:
        label_ids.append(-100)
      elif word_idx!=previous_word_idx:
        label_ids.append(label[word_idx])

      else:
        label_ids.append(-100)

      previous_word_idx=word_idx

    labels.append(label_ids)

  tokenized_inputs['labels']=labels
  return tokenized_inputs

In [29]:
tokenized_dataset=dataset.map(tokenize_and_align_labels,batched=True)

Map:   0%|          | 0/7659 [00:00<?, ? examples/s]

Map:   0%|          | 0/1520 [00:00<?, ? examples/s]

Map:   0%|          | 0/1520 [00:00<?, ? examples/s]

In [30]:
tokenized_dataset['train'][2]

{'tokens': ['5', 'star', 'resturants', 'in', 'my', 'town'],
 'ner_tags_str': ['B-Rating',
  'I-Rating',
  'O',
  'B-Location',
  'I-Location',
  'I-Location'],
 'ner_tags': [13, 14, 0, 11, 12, 12],
 'input_ids': [101, 1019, 2732, 2717, 4648, 7666, 1999, 2026, 2237, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 13, 14, 0, -100, -100, 11, 12, 12, -100]}

In [31]:
dataset['train'][2]

{'tokens': ['5', 'star', 'resturants', 'in', 'my', 'town'],
 'ner_tags_str': ['B-Rating',
  'I-Rating',
  'O',
  'B-Location',
  'I-Location',
  'I-Location'],
 'ner_tags': [13, 14, 0, 11, 12, 12]}

## Data Collation and Metrics

In [32]:
!pip install seqeval
!pip install evaluate

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=395388e1b5eed2a7fc6a3097300e52767fa586e799e787e8e68596e50a5e6f96
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [34]:
from transformers import DataCollatorForTokenClassification
data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer)


In [35]:
import evaluate
import numpy as np

In [36]:
metric=evaluate.load('seqeval')
label_names=list(tag2index)

def compute_metrics(eval_preds):
  logits,labels=eval_preds

  predictions=np.argmax(logits,axis=-1)
  true_labels=[[label_names[1] for l in label if l!=-100] for label in labels]

  true_predictions=[[label_names[p] for p, l in zip(prediction,label) if l!=-100]
                    for prediction,label in zip(predictions,labels)]

  all_metrics=metric.compute(predictions=true_predictions,references=true_labels)

  return {
      'precision':all_metrics['overall_precision'],
      'recall':all_metrics['overall_recall'],
      'f1':all_metrics['overall_f1'],
      'accuracy':all_metrics['overall_accuracy']
  }

Downloading builder script: 0.00B [00:00, ?B/s]

## Model Training

In [37]:
from transformers import AutoModelForTokenClassification

model=AutoModelForTokenClassification.from_pretrained(model_ckpt,id2label=index2tag,label2id=tag2index)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
from transformers import TrainingArguments,Trainer

In [48]:
args=TrainingArguments("finetuned-ner",
                       eval_strategy='epoch',
                       save_strategy='epoch',
                       learning_rate=2e-5,
                       num_train_epochs=3,
                       report_to="none",
                       weight_decay=0.01)

In [49]:
trainer=Trainer(model=model,args=args,
                train_dataset=tokenized_dataset['train'],
                eval_dataset=tokenized_dataset['validation'],
                data_collator=data_collator,
                compute_metrics=compute_metrics,
                tokenizer=tokenizer)

In [50]:
!pip uninstall -y wandb

[0m

In [51]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2473,0.291439,0.070833,0.016704,0.027033,0.039444
2,0.2146,0.282105,0.067112,0.015511,0.025198,0.038672
3,0.173,0.287685,0.064919,0.014949,0.024303,0.03797


TrainOutput(global_step=2874, training_loss=0.22318680616576553, metrics={'train_runtime': 3078.454, 'train_samples_per_second': 7.464, 'train_steps_per_second': 0.934, 'total_flos': 105239751014754.0, 'train_loss': 0.22318680616576553, 'epoch': 3.0})

## Save Model and Get Predictions

In [52]:
trainer.save_model("ner_distilbert")

In [53]:
from transformers import pipeline

checkpoint="./ner_distilbert"
pipe=pipeline("token-classification",model=checkpoint,aggregation_strategy='simple')

Device set to use cpu


In [65]:
pipe("Which restaurant serves the best rice in Delhi")

[{'entity_group': 'Rating',
  'score': np.float32(0.9737631),
  'word': 'best',
  'start': 28,
  'end': 32},
 {'entity_group': 'Dish',
  'score': np.float32(0.96121955),
  'word': 'rice',
  'start': 33,
  'end': 37},
 {'entity_group': 'Location',
  'score': np.float32(0.9652745),
  'word': 'delhi',
  'start': 41,
  'end': 46}]