# Name Entity Recognition and Entity Similarity

## Prepare Dataset and Libraries

In [1]:
!pip install transformers==4.28.0
!pip install accelerate
!pip install datasets
!pip install -q emoji pythainlp==2.2.4 sefr_cut tinydb seqeval sentencepiece pydantic jsonlines
!pip install --no-deps thai2transformers==0.1.2
!pip install huggingface_hub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m102.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m88.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transfor

In [2]:
import numpy as np
from datasets import load_dataset, load_metric, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification, pipeline
)

In [3]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
dataset = load_dataset("/content/drive/MyDrive/NLP-Dataset", data_files="dataset.csv")

Downloading and preparing dataset csv/NLP-Dataset to /root/.cache/huggingface/datasets/csv/NLP-Dataset-6bb4e81ef475dfee/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/NLP-Dataset-6bb4e81ef475dfee/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
test_dataset = load_dataset("/content/drive/MyDrive/NLP-Dataset", data_files="test_dataset.csv")

Downloading and preparing dataset csv/NLP-Dataset to /root/.cache/huggingface/datasets/csv/NLP-Dataset-9680d2fc25d0de66/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/NLP-Dataset-9680d2fc25d0de66/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
train_dataset = dataset["train"].train_test_split(shuffle = True, seed = 200, test_size = 0.2)

datasets = DatasetDict({
    'train': train_dataset['train'],
    'test': test_dataset['train'],
    'validate': train_dataset['test']})

In [8]:
datasets["train"][0]

{'tokens': "['Intel', 'Core i7', 'ราคา', 'ไม่เกิน', '181035', 'บาท', 'ได้', 'แบรนด์', 'ไหน', 'บ้าง', 'ครับ']",
 'ner_tags': "['B-processor_brand', 'B-processor_name', 'B-price', 'I-price', 'I-price', 'I-price', 'O', 'O', 'O', 'O', 'O']"}

In [9]:
import ast

def format_type(example):
  example["tokens"] = ast.literal_eval(example["tokens"])
  example["ner_tags"] = ast.literal_eval(example["ner_tags"])
  return example

datasets["train"] = datasets["train"].map(format_type)
datasets["test"] = datasets["test"].map(format_type)
datasets["validate"] = datasets["validate"].map(format_type)

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [10]:
tokens_to_ids = {
    'O': 0,
    'B-brand': 1,
    'I-brand': 2,
    'B-model': 3,
    'I-model': 4,
    'B-processor_brand': 5,
    'I-processor_brand': 6,
    'B-processor_name': 7,
    'I-processor_name': 8,
    'B-ram': 9,
    'I-ram': 10,
    'B-memory': 11,
    'I-memory': 12,
    'B-price': 13,
    'I-price': 14,
}

In [11]:
ids_to_tokens = {tokens_to_ids[token]: token for token in tokens_to_ids}
ids_to_tokens

{0: 'O',
 1: 'B-brand',
 2: 'I-brand',
 3: 'B-model',
 4: 'I-model',
 5: 'B-processor_brand',
 6: 'I-processor_brand',
 7: 'B-processor_name',
 8: 'I-processor_name',
 9: 'B-ram',
 10: 'I-ram',
 11: 'B-memory',
 12: 'I-memory',
 13: 'B-price',
 14: 'I-price'}

In [12]:
def prepare_tags(example):
  example["ner_tags"] = [tokens_to_ids[tag] for tag in example["ner_tags"]]
  return example

datasets["train"] = datasets["train"].map(prepare_tags)
datasets["test"] = datasets["test"].map(prepare_tags)
datasets["validate"] = datasets["validate"].map(prepare_tags)

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [13]:
datasets["train"][0]

{'tokens': ['Intel',
  'Core i7',
  'ราคา',
  'ไม่เกิน',
  '181035',
  'บาท',
  'ได้',
  'แบรนด์',
  'ไหน',
  'บ้าง',
  'ครับ'],
 'ner_tags': [5, 7, 13, 14, 14, 14, 0, 0, 0, 0, 0]}

In [14]:
task = "ner"
batch_size = 16
metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [ids_to_tokens[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [ids_to_tokens[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    flattened_results = {
        "overall_precision": results["overall_precision"],
        "overall_recall": results["overall_recall"],
        "overall_f1": results["overall_f1"],
        "overall_accuracy": results["overall_accuracy"],
    }
    for k in results.keys():
      if(k not in flattened_results.keys()):
        flattened_results[k+"_f1"]=results[k]["f1"]

    return flattened_results

  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

## Bert Base Multilingual

In [15]:
model_bert_checkpoint = "bert-base-multilingual-cased"

In [16]:
model_bert = AutoModelForTokenClassification.from_pretrained(model_bert_checkpoint, num_labels=len(tokens_to_ids))

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

In [17]:
tokenizer_bert = AutoTokenizer.from_pretrained(model_bert_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [18]:
def tokenize_bert_function(examples):
    tokenized_inputs = tokenizer_bert(examples["tokens"], truncation=True, is_split_into_words=True)
    label_all_tokens = True
    labels = []

    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [19]:
tokenized_bert_datasets = datasets.map(tokenize_bert_function, batched=True)

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [20]:
data_collator_bert = DataCollatorForTokenClassification(tokenizer_bert)

In [21]:
args = TrainingArguments(
    "bert",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=20,
    weight_decay=0.01,
)

In [22]:
trainer_bert = Trainer(
    model_bert,
    args = args,
    train_dataset=tokenized_bert_datasets["train"],
    eval_dataset=tokenized_bert_datasets["validate"],
    data_collator=data_collator_bert,
    tokenizer=tokenizer_bert,
    compute_metrics=compute_metrics
)

In [23]:
trainer_bert.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Brand F1,Memory F1,Model F1,Price F1,Processor Brand F1,Processor Name F1,Ram F1
1,No log,0.931604,0.491379,0.188119,0.272076,0.722361,0.0,0.0,0.0,0.133333,0.0,0.738462,0.0
2,No log,0.456376,0.608108,0.594059,0.601002,0.879468,0.653465,0.25,0.058824,0.696774,0.0,0.739884,0.657143
3,No log,0.26929,0.753378,0.735974,0.744574,0.922693,0.795699,0.619048,0.615385,0.825,0.064516,0.831169,0.776119
4,No log,0.148125,0.839117,0.877888,0.858065,0.96675,0.939759,0.8,0.758621,0.99435,0.238095,0.882759,0.857143
5,No log,0.086178,0.889241,0.927393,0.907916,0.980881,0.928571,0.869565,0.758621,0.99435,0.678571,0.984615,0.852941
6,No log,0.058196,0.919872,0.947195,0.933333,0.9867,0.975,0.913043,0.8,0.99435,0.677966,1.0,0.953846
7,No log,0.039692,0.932692,0.960396,0.946341,0.990025,0.975,0.933333,0.857143,0.99435,0.689655,1.0,1.0
8,No log,0.037867,0.941748,0.960396,0.95098,0.990025,0.975,0.933333,0.915254,0.99435,0.689655,1.0,0.984615
9,No log,0.028998,0.945161,0.966997,0.955954,0.991687,0.975,0.913043,0.95082,1.0,0.689655,1.0,1.0
10,No log,0.026583,0.964169,0.976898,0.970492,0.994181,0.975,1.0,0.95082,0.99435,0.785714,1.0,1.0


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=200, training_loss=0.19586856842041014, metrics={'train_runtime': 41.1357, 'train_samples_per_second': 77.791, 'train_steps_per_second': 4.862, 'total_flos': 79451716609440.0, 'train_loss': 0.19586856842041014, 'epoch': 20.0})

In [24]:
trainer_bert.evaluate()

{'eval_loss': 0.01702955551445484,
 'eval_overall_precision': 0.980327868852459,
 'eval_overall_recall': 0.9867986798679867,
 'eval_overall_f1': 0.9835526315789472,
 'eval_overall_accuracy': 0.9941812136325852,
 'eval_brand_f1': 0.975,
 'eval_memory_f1': 1.0,
 'eval_model_f1': 0.967741935483871,
 'eval_price_f1': 1.0,
 'eval_processor_brand_f1': 0.888888888888889,
 'eval_processor_name_f1': 1.0,
 'eval_ram_f1': 1.0,
 'eval_runtime': 0.1483,
 'eval_samples_per_second': 269.8,
 'eval_steps_per_second': 20.235,
 'epoch': 20.0}

In [25]:
predictions, labels, _ = trainer_bert.predict(tokenized_bert_datasets["train"])
predictions = np.argmax(predictions, axis=-1)

predictions
true_predictions = [
    [ids_to_tokens[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [ids_to_tokens[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)

for i in results:
  print(i, ':', results[i])

brand : {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 227}
memory : {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 74}
model : {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 163}
price : {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 312}
processor_brand : {'precision': 0.978021978021978, 'recall': 0.9888888888888889, 'f1': 0.9834254143646408, 'number': 90}
processor_name : {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 191}
ram : {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 128}
overall_precision : 0.9983136593591906
overall_recall : 0.99915611814346
overall_f1 : 0.9987347110923661
overall_accuracy : 0.9977173687487031


In [26]:
predictions, labels, _ = trainer_bert.predict(tokenized_bert_datasets["test"])
predictions = np.argmax(predictions, axis=-1)

predictions
true_predictions = [
    [ids_to_tokens[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [ids_to_tokens[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)

for i in results:
  print(i, ':', results[i])

brand : {'precision': 0.7727272727272727, 'recall': 0.6538461538461539, 'f1': 0.7083333333333333, 'number': 26}
memory : {'precision': 0.6363636363636364, 'recall': 0.5833333333333334, 'f1': 0.6086956521739131, 'number': 12}
model : {'precision': 0.875, 'recall': 0.9333333333333333, 'f1': 0.9032258064516129, 'number': 15}
price : {'precision': 0.8490566037735849, 'recall': 0.9782608695652174, 'f1': 0.9090909090909092, 'number': 46}
processor_brand : {'precision': 0.45454545454545453, 'recall': 0.7142857142857143, 'f1': 0.5555555555555556, 'number': 7}
processor_name : {'precision': 0.8333333333333334, 'recall': 0.9375, 'f1': 0.8823529411764706, 'number': 16}
ram : {'precision': 0.7741935483870968, 'recall': 0.9230769230769231, 'f1': 0.8421052631578947, 'number': 26}
overall_precision : 0.7839506172839507
overall_recall : 0.8581081081081081
overall_f1 : 0.8193548387096774
overall_accuracy : 0.9424157303370787


In [27]:
model_bert.push_to_hub("Ponlawat1645/SaleAI-token-classification")

pytorch_model.bin:   0%|          | 0.00/709M [00:00<?, ?B/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/Ponlawat1645/SaleAI-token-classification/commit/4799533ccbc54249fbb74ff7f156294f6b325ef7', commit_message='Upload BertForTokenClassification', commit_description='', oid='4799533ccbc54249fbb74ff7f156294f6b325ef7', pr_url=None, pr_revision=None, pr_num=None)

In [28]:
tokenizer_bert.push_to_hub("Ponlawat1645/SaleAI-token-classification")

CommitInfo(commit_url='https://huggingface.co/Ponlawat1645/SaleAI-token-classification/commit/2f5ee249f00a8e0df4e64cc6b409839cd16a6238', commit_message='Upload tokenizer', commit_description='', oid='2f5ee249f00a8e0df4e64cc6b409839cd16a6238', pr_url=None, pr_revision=None, pr_num=None)

## WangChangBERTa

In [29]:
model_wangchanberta_checkpoint = "airesearch/wangchanberta-base-att-spm-uncased"

In [30]:
model_wangchanberta = AutoModelForTokenClassification.from_pretrained(model_wangchanberta_checkpoint, num_labels=len(tokens_to_ids))

Downloading (…)lve/main/config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/423M [00:00<?, ?B/s]

Some weights of the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased were not used when initializing CamembertForTokenClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing CamembertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased and are newly initialized: ['classifier

In [31]:
tokenizer_wangchanberta = AutoTokenizer.from_pretrained(model_wangchanberta_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/282 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/905k [00:00<?, ?B/s]

In [32]:
def tokenize_wangchanberta_function(examples):
    tokenized_inputs = tokenizer_wangchanberta(examples["tokens"], truncation=True, is_split_into_words=True)
    label_all_tokens = True
    labels = []

    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [33]:
tokenized_wangchanberta_datasets = datasets.map(tokenize_wangchanberta_function, batched=True)

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [34]:
data_collator_wangchanberta = DataCollatorForTokenClassification(tokenizer_wangchanberta)

In [35]:
args = TrainingArguments(
    "wangchanberta",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=20,
    weight_decay=0.01,
)

In [36]:
trainer_wangchanberta = Trainer(
    model_wangchanberta,
    args = args,
    train_dataset=tokenized_wangchanberta_datasets["train"],
    eval_dataset=tokenized_wangchanberta_datasets["validate"],
    data_collator=data_collator_wangchanberta,
    tokenizer=tokenizer_wangchanberta,
    compute_metrics=compute_metrics
)

In [37]:
trainer_wangchanberta.train()

You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Brand F1,Memory F1,Model F1,Price F1,Processor Brand F1,Processor Name F1,Ram F1
1,No log,1.758528,0.326733,0.15103,0.206573,0.433536,0.035088,0.0,0.0,0.0,0.139535,0.489627,0.0
2,No log,1.266101,0.388889,0.400458,0.394589,0.685491,0.474227,0.022989,0.408163,0.101266,0.36036,0.622754,0.074074
3,No log,0.935727,0.565217,0.565217,0.565217,0.781929,0.672727,0.056338,0.568966,0.176471,0.614286,0.787004,0.369565
4,No log,0.67579,0.633406,0.668192,0.650334,0.83927,0.724638,0.338028,0.645669,0.268657,0.653595,0.874016,0.431818
5,No log,0.510593,0.727273,0.750572,0.738739,0.87576,0.80315,0.725,0.686567,0.369231,0.72,0.89243,0.592593
6,No log,0.359828,0.810155,0.839817,0.824719,0.921807,0.871795,0.823529,0.788732,0.542857,0.792208,0.95,0.756098
7,No log,0.255988,0.844298,0.881007,0.862262,0.942659,0.890756,0.883721,0.802817,0.586667,0.837838,0.975207,0.864198
8,No log,0.194148,0.877193,0.915332,0.895857,0.957428,0.912281,0.954545,0.859259,0.615385,0.844156,0.983607,0.975
9,No log,0.14989,0.888889,0.933638,0.910714,0.964379,0.923077,1.0,0.890511,0.607595,0.857143,0.987654,1.0
10,No log,0.13264,0.905702,0.94508,0.924972,0.966985,0.915254,1.0,0.887218,0.815789,0.846154,0.983607,1.0


TrainOutput(global_step=200, training_loss=0.509396629333496, metrics={'train_runtime': 31.1896, 'train_samples_per_second': 102.598, 'train_steps_per_second': 6.412, 'total_flos': 72624536520480.0, 'train_loss': 0.509396629333496, 'epoch': 20.0})

In [38]:
trainer_wangchanberta.evaluate()

{'eval_loss': 0.06780602037906647,
 'eval_overall_precision': 0.9507829977628636,
 'eval_overall_recall': 0.9725400457665904,
 'eval_overall_f1': 0.9615384615384616,
 'eval_overall_accuracy': 0.9834926151172894,
 'eval_brand_f1': 0.9734513274336283,
 'eval_memory_f1': 1.0,
 'eval_model_f1': 0.9275362318840579,
 'eval_price_f1': 0.9142857142857143,
 'eval_processor_brand_f1': 0.9161290322580645,
 'eval_processor_name_f1': 0.9917355371900827,
 'eval_ram_f1': 1.0,
 'eval_runtime': 0.1301,
 'eval_samples_per_second': 307.405,
 'eval_steps_per_second': 23.055,
 'epoch': 20.0}

In [39]:
predictions, labels, _ = trainer_wangchanberta.predict(tokenized_wangchanberta_datasets["train"])
predictions = np.argmax(predictions, axis=-1)

predictions
true_predictions = [
    [ids_to_tokens[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [ids_to_tokens[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)

for i in results:
  print(i, ':', results[i])

brand : {'precision': 0.935374149659864, 'recall': 0.985663082437276, 'f1': 0.9598603839441536, 'number': 279}
memory : {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 140}
model : {'precision': 0.9936305732484076, 'recall': 0.9936305732484076, 'f1': 0.9936305732484076, 'number': 314}
price : {'precision': 0.8666666666666667, 'recall': 0.9719626168224299, 'f1': 0.9162995594713657, 'number': 107}
processor_brand : {'precision': 0.9612068965517241, 'recall': 0.9529914529914529, 'f1': 0.9570815450643777, 'number': 234}
processor_name : {'precision': 0.9918478260869565, 'recall': 1.0, 'f1': 0.9959072305593452, 'number': 365}
ram : {'precision': 0.9863945578231292, 'recall': 1.0, 'f1': 0.9931506849315068, 'number': 145}
overall_precision : 0.968421052631579
overall_recall : 0.9873737373737373
overall_f1 : 0.9778055642388246
overall_accuracy : 0.9903609056265411


In [40]:
predictions, labels, _ = trainer_wangchanberta.predict(tokenized_wangchanberta_datasets["test"])
predictions = np.argmax(predictions, axis=-1)

predictions
true_predictions = [
    [ids_to_tokens[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [ids_to_tokens[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)

for i in results:
  print(i, ':', results[i])

brand : {'precision': 0.7931034482758621, 'recall': 0.8518518518518519, 'f1': 0.8214285714285715, 'number': 27}
memory : {'precision': 0.782608695652174, 'recall': 0.8181818181818182, 'f1': 0.8, 'number': 22}
model : {'precision': 0.9090909090909091, 'recall': 0.9375, 'f1': 0.923076923076923, 'number': 32}
price : {'precision': 0.7368421052631579, 'recall': 0.9333333333333333, 'f1': 0.8235294117647058, 'number': 15}
processor_brand : {'precision': 0.42105263157894735, 'recall': 0.4, 'f1': 0.41025641025641024, 'number': 20}
processor_name : {'precision': 0.8108108108108109, 'recall': 0.9375, 'f1': 0.8695652173913043, 'number': 32}
ram : {'precision': 0.9230769230769231, 'recall': 0.9230769230769231, 'f1': 0.9230769230769231, 'number': 26}
overall_precision : 0.7903225806451613
overall_recall : 0.8448275862068966
overall_f1 : 0.8166666666666667
overall_accuracy : 0.9177852348993288


## Implementation

### Name Entity Recognition

In [41]:
model_test = AutoModelForTokenClassification.from_pretrained("Ponlawat1645/SaleAI-token-classification")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/709M [00:00<?, ?B/s]

In [42]:
tokenizer_test = AutoTokenizer.from_pretrained("Ponlawat1645/SaleAI-token-classification")

Downloading (…)okenizer_config.json:   0%|          | 0.00/315 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [43]:
classify_tokens = pipeline(task='ner',
         tokenizer = tokenizer_test,
         model = model_test,
         ignore_labels = [], 
         grouped_entities = True)



In [44]:
classified_tokens = classify_tokens("Asus รุ่น vivobook ที่คีบอร์ดมีไฟ ราคาไม่เกิน 90000")

In [45]:
def slot_filling(classified_tokens, ids_to_tokens):
  entities = ["brand", "model", "processor_brand", "processor_name", "ram", "memory", "price"]
  entity_slot = {entity: str() for entity in entities}

  for token in classified_tokens:
    entity = ids_to_tokens[int(token["entity_group"].split("_")[1])]

    if entity != "O":
      if entity.split("-")[0] == "B":
        entity_slot[entity.split("-")[1]] = token["word"] + entity_slot[entity.split("-")[1]]
      else:
        entity_slot[entity.split("-")[1]] = entity_slot[entity.split("-")[1]] + token["word"]

  return entity_slot

In [46]:
entity_slot = slot_filling(classified_tokens, ids_to_tokens)
entity_slot

{'brand': 'Asus',
 'model': 'รุ่นvivobook',
 'processor_brand': '',
 'processor_name': '',
 'ram': '',
 'memory': '',
 'price': 'ราคา##ไม่เกิน 90000'}

### Entity Similarity

In [47]:
import pandas as pd

laptop_df = pd.read_csv('/content/drive/MyDrive/NLP-Dataset/laptop_database.csv')

In [48]:
model_mnli_checkpoint = 'facebook/bart-large-mnli'

In [49]:
model_mnli = AutoModelForSequenceClassification.from_pretrained(model_mnli_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

In [50]:
tokenizer_mnli = AutoTokenizer.from_pretrained(model_mnli_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [51]:
classifier = pipeline("zero-shot-classification", model = model_mnli, tokenizer = tokenizer_mnli, revision='finetuned@xnli_th')

In [52]:
def find_unique(entity):
  return laptop_df[entity].unique()

In [53]:
def entity_similarity(entity_slot):
  entity_similarity_slot = entity_slot

  for entity in entity_similarity_slot:
    if entity_similarity_slot[entity] != "":
      if entity == "price":
        entity_similarity_slot[entity] = int("".join([char for char in entity_similarity_slot[entity] if char.isdigit()]))
      else:
        candidate_labels = find_unique(entity)
        entity_similarity_slot[entity] = classifier(entity_similarity_slot[entity], candidate_labels)["labels"][0]

  return entity_similarity_slot

In [54]:
entity_similarity_slot = entity_similarity(entity_slot)
entity_similarity_slot

{'brand': 'ASUS',
 'model': 'Vivobook',
 'processor_brand': '',
 'processor_name': '',
 'ram': '',
 'memory': '',
 'price': 90000}

### Database Querying

In [55]:
def query_dataframe(df, params, sort_by=None):
    query = pd.Series(True, index=df.index)

    for key, value in params.items():
      if value != '':
        if key == 'price':
            query &= (pd.to_numeric(df[key]) < value)
        else:
            query &= (df[key] == value)

    return df[query].sort_values(by=sort_by, ascending=False)[:3]

In [56]:
params = entity_slot
sort_by = 'star'

result = query_dataframe(laptop_df, params, sort_by)
print(result)

    brand     model processor_brand processor_name processor_gnrtn ram  \
61   ASUS  Vivobook           Intel        Core i3            11th   8   
35   ASUS  Vivobook           Intel        Core i7            10th   8   
742  ASUS  Vivobook           Intel        Core i3            11th   4   

    ram_type  ssd  hdd       os  os_bit display_size  price  star  
61      DDR4  256    0  Windows      64           14  37890   4.6  
35      DDR4  512    0  Windows      64         15.6  61990   4.5  
742     DDR4  256    0  Windows      64           14  36990   4.5  
