<a href="https://colab.research.google.com/github/PigStep/NER_based-ML-furniture-store-extraction/blob/main/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

#For parsing
from urllib.parse import quote
from bs4 import BeautifulSoup
import requests
import re
import json

#For modeling
from transformers import Trainer, TrainingArguments, AutoModelForTokenClassification, AutoTokenizer
from datasets import Dataset

In [None]:
# Load the raw list of candidate shop URLs and preview the first rows
urls_df = pd.read_csv(filepath_or_buffer="URL_list.csv")
urls_df.head(n=5)

In [None]:
# Keep only the URL column ("max(page)") as a Series so later cells can iterate it directly
url_series = pd.Series(data=urls_df["max(page)"])
url_series.head(n=5)

# Data preprocessing
Let's keep only the sites that actually open, so that later processing is faster.

In [None]:
def check_is_url_parsing(url):
    """Return True when `url` can be downloaded and parsed, False otherwise.

    The page is requested with a browser-like User-Agent and a 10 second
    timeout. Any `requests` error (connection problem, timeout, HTTP error
    status) is printed and reported as False.
    """
    request_headers = {"User-Agent": "Mozilla/5.0"}
    try:
        page = requests.get(url, headers=request_headers, timeout=10)
        # Turns 4xx/5xx answers into an exception handled below
        page.raise_for_status()
        # Parse once to make sure the HTML can be processed; the tree itself is not needed
        BeautifulSoup(page.text, "html.parser")
    except requests.exceptions.RequestException as e:
        print(f"Attemp to URL falied: {e}")
        return False
    else:
        return True

In [None]:
def create_good_urls_csv():
    """Probe every URL in `url_series` and save the ones that open.

    Each URL is checked with `check_is_url_parsing`; the URLs that pass are
    written as a column named "url" to "ParsingURL_list.csv".
    """
    reachable = [link for link in url_series if check_is_url_parsing(link)]

    pd.Series(reachable, name="url").to_csv("ParsingURL_list.csv")
    print("Dataset have been created and saved")

In [None]:
# create_good_urls_csv()

In [None]:
# Rebuild the series from the saved list of URLs that opened successfully
url_series = pd.Series(pd.read_csv("ParsingURL_list.csv")["url"])

# Parsing data from sites

In [None]:
def extract_top_product_names(url, max_length=80, min_length=0, top_n=5, timeout=10):
    """Download `url` and return the longest text snippets that may be product names.

    Candidates are gathered in three passes: the first <h1>, every <h2>/<h3>,
    and every tag with a class containing "product__", "product-" or "title".
    In the second and third passes, tags nested inside a blacklisted container
    (footer, nav, script, style, noscript, form, aside, ul, li) are skipped.
    Texts containing "$" are dropped, as are texts whose stripped length is
    not in the range (min_length, max_length].

    Args:
        url: address of the page to download.
        max_length: maximum allowed candidate length (inclusive).
        min_length: candidates must be strictly longer than this.
        top_n: maximum number of candidates to return.
        timeout: seconds to wait for the server, passed to `requests.get`.

    Returns:
        A list of at most `top_n` `(text, source)` tuples, longest text first.
        `source` is "h1", "h2", "h3" or "class=<comma-separated classes>".

    Raises:
        requests.exceptions.RequestException: if the download fails or times out.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    # Bounded wait (same default as check_is_url_parsing): without a timeout
    # a stalled server would block the whole scraping loop indefinitely.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    blacklist_tags = {"footer", "nav", "script", "style", "noscript", "form", "aside", "ul", "li"}
    # text -> source; a text found again by a later pass keeps the later source
    candidates = {}

    def add_candidate(text, source):
        text = text.strip()
        # Snippets with a dollar sign are treated as prices, not names
        if "$" in text:
            return
        if min_length < len(text) <= max_length:
            candidates[text] = source

    # 1. h1
    h1 = soup.find("h1")
    if h1:
        add_candidate(h1.get_text(), "h1")

    # 2. h2 and h3
    for tag in soup.find_all(["h2", "h3"]):
        if tag.find_parent(blacklist_tags):
            continue
        add_candidate(tag.get_text(), tag.name)

    # 3. By keywords in class
    class_keywords = ["product__", "product-", "title"]
    for tag in soup.find_all(True):
        classes = tag.get("class")
        if not classes:
            continue
        if any(any(k in cls.lower() for k in class_keywords) for cls in classes):
            if tag.find_parent(blacklist_tags):
                continue
            add_candidate(tag.get_text(), f"class={','.join(classes)}")

    # Longer texts can provide better information
    sorted_candidates = sorted(candidates.items(), key=lambda x: len(x[0]), reverse=True)

    return sorted_candidates[:top_n]


In [None]:
def create_list_for_annotations(n_urls=100):
    """Write product-name candidates of the first `n_urls` URLs to "annote.txt".

    For each URL up to 10 candidates (each at most 100 characters) are
    requested from `extract_top_product_names` and written as
    "[source]: text" with newlines removed from the text. A blank line
    follows the candidates of each URL.

    Args:
        n_urls: how many URLs from the start of `url_series` to process. If the
            series is shorter, all of it is processed instead of raising.
    """
    with open("annote.txt", "w", encoding="utf-8") as file:
        # Positional slice: stops at the end of the series when it is shorter than n_urls
        for url in url_series.iloc[:n_urls]:
            for text, tag in extract_top_product_names(url, max_length=100, top_n=10):
                text = text.replace("\n", "")
                # NOTE(review): candidates of one page are written back to back
                # without a separator - confirm this is the intended layout.
                file.write(f'[{tag}]: {text}')
            file.write('\n\n')

    print("saving complete in 'annote.txt'")

# create_list_for_annotations()

The resulting `annote.txt` file can be labeled manually to create `annotations.json`, the training file used in the next section.

# Model fine tuning
In my case I will use `bert-base-multilingual-cased` as a strong pretrained model. Let's fine-tune it on my manually created dataset.

In [None]:
# Checkpoint that is fine-tuned below for PRODUCT token classification
model_name = "bert-base-multilingual-cased"

# Classification head with three classes: O, B-PRODUCT, I-PRODUCT
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=3,
    id2label=dict(enumerate(("O", "B-PRODUCT", "I-PRODUCT"))),
    label2id={"O": 0, "B-PRODUCT": 1, "I-PRODUCT": 2},
)

In [None]:
def load_data(file_path):
    """Read the UTF-8 encoded JSON file at `file_path` and return its parsed content."""
    with open(file_path, "r", encoding="utf-8") as source:
        return json.load(source)

# Tokenizer that belongs to the checkpoint named in `model_name`
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Read the training JSON and keep its list of annotated examples
data = load_data("annotations.json")
annotations = data["annotations"]

def convert_to_iob(text, entities):
    """Split `text` into words and give every word an IOB tag.

    The text is split on whitespace. The output is word-level on purpose:
    `tokenize_and_align_labels` passes "tokens" to the tokenizer with
    `is_split_into_words=True`, so the list must hold plain words. Returning
    sub-word pieces or special tokens here would make the tokenizer split
    them a second time and train the model on artifacts.

    Args:
        text: the raw string that was annotated.
        entities: iterable of `(start_char, end_char, label)` character spans,
            with `end_char` exclusive.

    Returns:
        A dict with two lists of equal length: "tokens" (the words) and
        "ner_tags" ("O", "B-<label>" or "I-<label>" for each word).
    """
    # (word, start, end) with character offsets into `text`
    words = [(m.group(), m.start(), m.end()) for m in re.finditer(r"\S+", text)]
    ner_tags = ["O"] * len(words)

    for start_char, end_char, label in entities:
        inside = False
        for i, (_, word_start, word_end) in enumerate(words):
            # A word belongs to the entity when the two character ranges overlap
            if word_start < end_char and word_end > start_char:
                # The first overlapping word opens the entity even when the
                # span does not begin exactly on the word boundary.
                ner_tags[i] = f"I-{label}" if inside else f"B-{label}"
                inside = True

    return {"tokens": [word for word, _, _ in words], "ner_tags": ner_tags}


In [None]:
# IOB label inventory; the position of a label in the list is its integer id
label_list = ["O", "B-PRODUCT", "I-PRODUCT"]
label2id = dict(zip(label_list, range(len(label_list))))

def tokenize_and_align_labels(example):
    """Encode one example and spread its per-word tags over the produced tokens.

    `example["tokens"]` is encoded as pre-split words, padded/truncated to 128
    positions. Positions without a word id get the label -100 (the value that
    `compute_metrics` filters out). The first token of a word takes the word's
    tag; further tokens of the same word take it too, with a "B-" tag turned
    into the matching "I-" tag.

    Returns:
        A dict with "input_ids", "attention_mask" and "labels".
    """
    encoding = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
        max_length=128
    )
    word_tags = example["ner_tags"]

    aligned = []
    last_word = None
    for current_word in encoding.word_ids():
        if current_word is None:
            aligned.append(-100)
        else:
            tag = word_tags[current_word]
            # Only the first token of a word may carry the "B-" prefix
            if current_word == last_word and tag.startswith("B-"):
                tag = tag.replace("B-", "I-")
            aligned.append(label2id[tag])
        last_word = current_word

    encoding["labels"] = aligned

    # offset_mapping is not a model input; drop it if present
    encoding.pop("offset_mapping", None)

    # Return only the fields used for training
    return {
        "input_ids": encoding["input_ids"],
        "attention_mask": encoding["attention_mask"],
        "labels": encoding["labels"]
    }


# Convert every annotated example that has at least one entity to IOB form
processed_data = [
    convert_to_iob(sample[0], sample[1]["entities"])
    for sample in annotations
    if sample[1].get("entities", [])
]

# Build the dataset, encode it, and drop the columns that are not model inputs
dataset = Dataset.from_list(processed_data)
tokenized_dataset = (
    dataset
    .map(tokenize_and_align_labels)
    .remove_columns(['tokens', 'ner_tags'])
)

# Get model metrics

In [None]:
pip install seqeval evaluate

In [None]:
import evaluate

# Metric object used by compute_metrics below
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into seqeval precision, recall, F1 and accuracy.

    Args:
        p: a `(predictions, labels)` pair. The class index is taken with an
            argmax over axis 2 of `predictions`; positions whose label is
            -100 are left out of both sequences.

    Returns:
        A dict with the keys "precision", "recall", "f1" and "accuracy".
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    # Map ids back to tag strings, skipping the positions labelled -100
    true_labels = [
        [label_list[gold] for gold in gold_row if gold != -100]
        for gold_row in gold_ids
    ]
    true_predictions = [
        [label_list[guess] for guess, gold in zip(pred_row, gold_row) if gold != -100]
        for pred_row, gold_row in zip(pred_ids, gold_ids)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    precision = results["overall_precision"]
    recall = results["overall_recall"]
    f1 = results["overall_f1"]
    accuracy = results["overall_accuracy"]
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy
    }


In [None]:
# Hold out 20% of the examples for evaluation; the fixed seed keeps the split repeatable
dataset_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42) # split 80/20

In [None]:
# Hyper-parameters of the fine-tuning run; output goes to ./results
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    remove_unused_columns=False,
)

# Trainer wired to the 80/20 split and the seqeval-based metric function
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset_split["train"],
    eval_dataset=dataset_split["test"],
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model, then evaluate it and show the resulting metrics
trainer.train()
metrics = trainer.evaluate()
print(metrics)

The result of this notebook is a NER model that tags PRODUCT entities on e-commerce web pages, with the following evaluation metrics:

- `'eval_accuracy': 0.839`
- `'eval_precision': 0.0`
- `'eval_recall': 0.0`
- `'eval_f1': 0.0`

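Despite the high accuracy, the other metrics are very poor for NER. Accuracy is likely dominated by the majority `O` tag, so a model that finds no PRODUCT entities can still score well on it. This may be the result of poor labeling of the training dataset, or of a parsing pattern that yields text with little value for NER entity classification.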

# Model deploying

The model deployment script is implemented in `main.py`.

In [None]:
# Save the trained model under "product_ner_model"
trainer.save_model("product_ner_model")