## Restaurant Search NER Recognition By Fine Tuning DistilBERT

## Coding


In [1]:
import warnings
warnings.filterwarnings('ignore')


In [2]:
import pandas as pd
import json
import requests

In [3]:
train = pd.read_csv("https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/mit_restaurant_search_ner/train.bio", sep="\t", header=None)
train.head()


Unnamed: 0,0,1
0,B-Rating,2
1,I-Rating,start
2,O,restaurants
3,O,with
4,B-Amenity,inside


In [4]:
response = requests.get("https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/mit_restaurant_search_ner/train.bio")
response = response.text

In [5]:
response = response.splitlines()

In [6]:
train_tokens = []
train_tags = []

temp_tokens = []
temp_tags = []
for line in response:
    if line != "":
        tag, token = line.strip().split("\t")
        temp_tags.append(tag)
        temp_tokens.append(token)
    else:
        train_tokens.append(temp_tokens)
        train_tags.append(temp_tags)

        temp_tokens, temp_tags = [], []

In [7]:
len(train_tokens), len(train_tags)

(7659, 7659)

In [8]:
response = requests.get("https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/mit_restaurant_search_ner/test.bio")
response = response.text
response = response.splitlines()

test_tokens = []
test_tags = []

temp_tokens = []
temp_tags = []
for line in response:
    if line != "":
        tag, token = line.strip().split("\t")
        temp_tags.append(tag)
        temp_tokens.append(token)
    else:
        test_tokens.append(temp_tokens)
        test_tags.append(temp_tags)

        temp_tokens, temp_tags = [], []

len(test_tokens), len(test_tags)


(1520, 1520)

## HuggingFace Dataset Prep

In [9]:
from datasets import Dataset, DatasetDict

df = pd.DataFrame({'tokens': train_tokens, 'ner_tags_str': train_tags})
train = Dataset.from_pandas(df)

df = pd.DataFrame({'tokens': test_tokens, 'ner_tags_str': test_tags})
test = Dataset.from_pandas(df)

dataset = DatasetDict({'train': train, 'test': test, 'validation': test})

dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags_str'],
        num_rows: 7659
    })
    test: Dataset({
        features: ['tokens', 'ner_tags_str'],
        num_rows: 1520
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags_str'],
        num_rows: 1520
    })
})

In [10]:
dataset['train'][0]

{'tokens': ['2', 'start', 'restaurants', 'with', 'inside', 'dining'],
 'ner_tags_str': ['B-Rating', 'I-Rating', 'O', 'O', 'B-Amenity', 'I-Amenity']}

In [11]:
unique_tags = set()
for tag in dataset['train']['ner_tags_str']:
    unique_tags.update(tag)

unique_tags = list(set([x[2:] for x in list(unique_tags) if x!='O']))

tag2index = {"O": 0}
for i, tag in enumerate(unique_tags):
    tag2index[f'B-{tag}'] = len(tag2index)
    tag2index[f'I-{tag}'] = len(tag2index)

index2tag = {v:k for k,v in tag2index.items()}

In [12]:
dataset = dataset.map(lambda example: {"ner_tags": [tag2index[tag] for tag in example['ner_tags_str']]})

Map: 100%|██████████| 7659/7659 [00:00<00:00, 33740.58 examples/s]
Map: 100%|██████████| 1520/1520 [00:00<00:00, 33042.96 examples/s]
Map: 100%|██████████| 1520/1520 [00:00<00:00, 32340.80 examples/s]


In [13]:
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags_str', 'ner_tags'],
        num_rows: 7659
    })
    test: Dataset({
        features: ['tokens', 'ner_tags_str', 'ner_tags'],
        num_rows: 1520
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags_str', 'ner_tags'],
        num_rows: 1520
    })
})

## Model Building


In [14]:
from transformers import AutoTokenizer

In [15]:
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [16]:
dataset['train'][2]

{'tokens': ['5', 'star', 'resturants', 'in', 'my', 'town'],
 'ner_tags_str': ['B-Rating',
  'I-Rating',
  'O',
  'B-Location',
  'I-Location',
  'I-Location'],
 'ner_tags': [1, 2, 0, 9, 10, 10]}

In [17]:
input = dataset['train'][2]['tokens']
output = tokenizer(input, is_split_into_words=True)
tokenizer.convert_ids_to_tokens(output.input_ids)

['[CLS]', '5', 'star', 'rest', '##ura', '##nts', 'in', 'my', 'town', '[SEP]']

In [18]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)

        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            # if id=-100 then loss is not calculated
            if word_idx is None:
                label_ids.append(-100)

            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            
            else:
                label_ids.append(-100)

            previous_word_idx = word_idx
        
        labels.append(label_ids)

    tokenized_inputs['labels'] = labels

    return tokenized_inputs
    


In [19]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 7659/7659 [00:00<00:00, 33739.88 examples/s]
Map: 100%|██████████| 1520/1520 [00:00<00:00, 31667.23 examples/s]
Map: 100%|██████████| 1520/1520 [00:00<00:00, 31666.45 examples/s]


In [20]:
tokenized_dataset['train'][2]

{'tokens': ['5', 'star', 'resturants', 'in', 'my', 'town'],
 'ner_tags_str': ['B-Rating',
  'I-Rating',
  'O',
  'B-Location',
  'I-Location',
  'I-Location'],
 'ner_tags': [1, 2, 0, 9, 10, 10],
 'input_ids': [101, 1019, 2732, 2717, 4648, 7666, 1999, 2026, 2237, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 1, 2, 0, -100, -100, 9, 10, 10, -100]}

In [21]:
dataset['train'][2]

{'tokens': ['5', 'star', 'resturants', 'in', 'my', 'town'],
 'ner_tags_str': ['B-Rating',
  'I-Rating',
  'O',
  'B-Location',
  'I-Location',
  'I-Location'],
 'ner_tags': [1, 2, 0, 9, 10, 10]}

## Data Collation and Metrics


In [22]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [23]:
import numpy as np
import evaluate

metric = evaluate.load("seqeval")
label_names = list(tag2index)

def compute_metrics(eval_preds):
    logits, labels = eval_preds

    predictions = np.argmax(logits, axis=-1)
    true_labels = [[label_names[l] for l in label if l!=-100] for label in labels]

    true_predictions = [[label_names[p] for p, l in zip(prediction, label) if l != -100] 
                        for prediction, label in zip(predictions, labels)]
    
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": all_metrics['overall_precision'],
        'recall': all_metrics['overall_recall'],
        'f1': all_metrics['overall_f1'],
        'accuracy': all_metrics['overall_accuracy'],
    }

Downloading builder script: 6.34kB [00:00, 6.33MB/s]


## Model Training

In [24]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(model_ckpt, id2label=index2tag, label2id=tag2index)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
from transformers import TrainingArguments, Trainer


In [26]:
args = TrainingArguments("finetuned-ner", evaluation_strategy='epoch',
                         save_strategy='epoch',
                         learning_rate=2e-5,
                         num_train_epochs=3,
                         weight_decay=0.01)

In [27]:
trainer = Trainer(model=model, args=args,
                  train_dataset=tokenized_dataset['train'],
                  eval_dataset=tokenized_dataset['validation'],
                  data_collator=data_collator,
                  compute_metrics=compute_metrics,
                  tokenizer=tokenizer)

In [28]:
trainer.train()

 17%|█▋        | 500/2874 [03:01<15:21,  2.58it/s]

{'loss': 0.6236, 'grad_norm': 4.230839252471924, 'learning_rate': 1.65205288796103e-05, 'epoch': 0.52}


                                                  
 33%|███▎      | 958/2874 [06:01<10:55,  2.92it/s]

{'eval_loss': 0.3017440140247345, 'eval_precision': 0.7439096850861556, 'eval_recall': 0.7949206349206349, 'eval_f1': 0.7685696746470226, 'eval_accuracy': 0.9102330151600224, 'eval_runtime': 11.2659, 'eval_samples_per_second': 134.92, 'eval_steps_per_second': 16.865, 'epoch': 1.0}


 35%|███▍      | 1000/2874 [06:28<09:47,  3.19it/s] 

{'loss': 0.3213, 'grad_norm': 0.7570993304252625, 'learning_rate': 1.30410577592206e-05, 'epoch': 1.04}


 52%|█████▏    | 1500/2874 [09:13<08:21,  2.74it/s]

{'loss': 0.2467, 'grad_norm': 5.85361385345459, 'learning_rate': 9.561586638830899e-06, 'epoch': 1.57}


                                                   
 67%|██████▋   | 1916/2874 [11:52<04:49,  3.31it/s]

{'eval_loss': 0.2799017131328583, 'eval_precision': 0.7769652650822669, 'eval_recall': 0.8095238095238095, 'eval_f1': 0.7929104477611941, 'eval_accuracy': 0.9171813587871982, 'eval_runtime': 9.811, 'eval_samples_per_second': 154.928, 'eval_steps_per_second': 19.366, 'epoch': 2.0}


 70%|██████▉   | 2000/2874 [12:35<04:24,  3.31it/s]  

{'loss': 0.2466, 'grad_norm': 1.5406414270401, 'learning_rate': 6.082115518441197e-06, 'epoch': 2.09}


 87%|████████▋ | 2500/2874 [15:18<02:07,  2.92it/s]

{'loss': 0.2016, 'grad_norm': 2.6998701095581055, 'learning_rate': 2.6026443980514964e-06, 'epoch': 2.61}


                                                   
100%|██████████| 2874/2874 [17:32<00:00,  3.32it/s]

{'eval_loss': 0.28050994873046875, 'eval_precision': 0.775615688659167, 'eval_recall': 0.8098412698412698, 'eval_f1': 0.7923590619661438, 'eval_accuracy': 0.9194272880404267, 'eval_runtime': 9.4285, 'eval_samples_per_second': 161.213, 'eval_steps_per_second': 20.152, 'epoch': 3.0}


100%|██████████| 2874/2874 [17:53<00:00,  2.68it/s]

{'train_runtime': 1073.8992, 'train_samples_per_second': 21.396, 'train_steps_per_second': 2.676, 'train_loss': 0.31142478098700094, 'epoch': 3.0}





TrainOutput(global_step=2874, training_loss=0.31142478098700094, metrics={'train_runtime': 1073.8992, 'train_samples_per_second': 21.396, 'train_steps_per_second': 2.676, 'total_flos': 105239751014754.0, 'train_loss': 0.31142478098700094, 'epoch': 3.0})

In [29]:
trainer.save_model("ner_distilbert")

## Prediction and Load & Save Model

In [30]:
from transformers import pipeline

checkpoint = "ner_distilbert"
pipe = pipeline('token-classification', model=checkpoint, aggregation_strategy='simple')

In [31]:
pipe("which restaurant serves the best shushi in new york?")

[{'entity_group': 'Rating',
  'score': 0.9677843,
  'word': 'best',
  'start': 28,
  'end': 32},
 {'entity_group': 'Dish',
  'score': 0.9091525,
  'word': 'shushi',
  'start': 33,
  'end': 39},
 {'entity_group': 'Location',
  'score': 0.88065165,
  'word': 'new york',
  'start': 43,
  'end': 51}]