<a href="https://colab.research.google.com/github/PigStep/furniture-store-extraction/blob/main/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np

#For parsing
from urllib.parse import quote
from bs4 import BeautifulSoup
import requests
import re
import json

from transformers import Trainer, TrainingArguments, AutoModelForTokenClassification, AutoTokenizer
from datasets import Dataset

In [None]:
urls_df = pd.read_csv("URL_list.csv")
urls_df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'URL_list.csv'

In [None]:
url_series = pd.Series(urls_df["max(page)"]) #To make code easier
url_series.head()

# Data preprocessing
Lets save only opening sites for better processing performace

In [None]:
def check_is_url_parsing(url):
  headers = {"User-Agent": "Mozilla/5.0"}
  try:
      response = requests.get(url, headers=headers, timeout=10)
      response.raise_for_status() #check for html error
      soup = BeautifulSoup(response.text, "html.parser")

      return True

  except requests.exceptions.RequestException as e:
    print(f"Attemp to URL falied: {e}")
    return False

In [None]:
def create_good_urls_csv():
  """Создает файл 'ParsingURL_list.csv' с посещаемыми ссылками"""
  good_urls =[]
  for url in url_series:
    if(check_is_url_parsing(url)):
      good_urls.append(url)

  urls = pd.Series(good_urls, name="url")
  urls.to_csv("ParsingURL_list.csv")
  print("Dataset have been created and saved")

In [None]:
# create_good_urls_csv()

In [3]:
# Recreate seies with good URLs
url_series = pd.read_csv("ParsingURL_list.csv")
url_series = pd.Series(url_series["url"])

#Parsing data from sites

In [17]:
def extract_top_product_names(url, max_length=80, min_length=0, top_n=5):
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    blacklist_tags = {"footer", "nav", "script", "style", "noscript", "form", "aside", "ul", "li"}
    candidates = {}

    def add_candidate(text, source):
        text = text.strip()
        if "$" in text:
          return
        if min_length < len(text) <= max_length:
            candidates[text] = source

    # 1. h1
    h1 = soup.find("h1")
    if h1:
        add_candidate(h1.get_text(), "h1")

    # 2. h2 и h3
    for tag in soup.find_all(["h2", "h3"]):
        if tag.find_parent(blacklist_tags): continue
        add_candidate(tag.get_text(), tag.name)

    # 3. By keywords in class
    class_keywords = ["product__", "product-","title"]
    for tag in soup.find_all(True):
        classes = tag.get("class")
        if not classes:
            continue
        if any(any(k in cls.lower() for k in class_keywords) for cls in classes):
            if tag.find_parent(blacklist_tags): continue
            add_candidate(tag.get_text(), f"class={','.join(classes)}")

    # Longer textes can provide better information
    sorted_candidates = sorted(candidates.items(), key=lambda x: len(x[0]), reverse=True)

    return sorted_candidates[:top_n]


In [18]:
def create_list_for_annotations():
  with open("annote.txt", "w", encoding="utf-8") as file:
    for indx in range(0,100):
      url = url_series[indx]

      for i, (text, tag) in enumerate(extract_top_product_names(url,max_length=100,top_n=10)):
        text = text.replace("\n","")
        file.write(f'[{tag}]: {text}')
      file.write('\n\n')

    print("saving complete in 'annote.txt'")

# create_list_for_annotations()

This `annote.txt` file can be used for manual labeling to create `annotations.json` as training file that we will use in future

# Model fine tuning
In my case I will use `bert-base-multilingual-cased` as strong pretrained model. Lets fine tune it by mine manual created dataset

In [9]:
model_name = "bert-base-multilingual-cased"
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=3,  # only O, B-PRODUCT, I-PRODUCT
    id2label={0: "O", 1: "B-PRODUCT", 2: "I-PRODUCT"},
    label2id={"O": 0, "B-PRODUCT": 1, "I-PRODUCT": 2}
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def load_data(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data

#Load train JSON
data = load_data("annotations.json")

annotations = data["annotations"]  # Get annotaions list
tokenizer = AutoTokenizer.from_pretrained(model_name)

def convert_to_iob(text, entities):
    tokenized = tokenizer(text, return_offsets_mapping=True, truncation=False)
    input_ids = tokenized["input_ids"]
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    offsets = tokenized["offset_mapping"]

    ner_tags = ["O"] * len(tokens)

    for start_char, end_char, label in entities:
        for i, (start_offset, end_offset) in enumerate(offsets):
            if start_offset is None or end_offset is None:
                continue
            #
            if start_offset < end_char and end_offset > start_char:
                if start_offset == start_char:
                    ner_tags[i] = f"B-{label}"
                else:
                    ner_tags[i] = f"I-{label}"

    return {"tokens": tokens, "ner_tags": ner_tags}


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [11]:
label_list = ["O", "B-PRODUCT", "I-PRODUCT"]
label2id = {label: i for i, label in enumerate(label_list)}

def tokenize_and_align_labels(example):
    tokenized_input = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
        max_length=128
    )

    labels = []
    word_ids = tokenized_input.word_ids()
    previous_word_idx = None

    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            labels.append(label2id[example["ner_tags"][word_idx]])
        else:
            tag = example["ner_tags"][word_idx]
            if tag.startswith("B-"):
                tag = tag.replace("B-", "I-")
            labels.append(label2id[tag])
        previous_word_idx = word_idx

    tokenized_input["labels"] = labels

    # Deleting offset_mapping for model training
    tokenized_input.pop("offset_mapping", None)

    # returning only needed params
    return {
        "input_ids": tokenized_input["input_ids"],
        "attention_mask": tokenized_input["attention_mask"],
        "labels": tokenized_input["labels"]
    }


processed_data = []
for item in annotations:
    text = item[0]
    entities = item[1].get("entities", [])
    if entities:
        processed_data.append(convert_to_iob(text, entities))

dataset = Dataset.from_list(processed_data)
tokenized_dataset = dataset.map(tokenize_and_align_labels).remove_columns(['tokens', 'ner_tags'])

Token indices sequence length is longer than the specified maximum sequence length for this model (1009 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/76 [00:00<?, ? examples/s]

# Get model metrics

In [12]:
pip install seqeval evaluate

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.4-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=5d4507744c42c1983df9fee42ca6a8da7344f95858c217b8e01210a40d727ee4
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully buil

In [13]:
import evaluate

seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [label_list[l] for l in label_row if l != -100]
        for label_row in labels
    ]
    true_predictions = [
        [label_list[p] for p, l in zip(pred_row, label_row) if l != -100]
        for pred_row, label_row in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"]
    }


Downloading builder script: 0.00B [00:00, ?B/s]

In [14]:
dataset_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42) # split 80/20

In [15]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    remove_unused_columns=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_split["train"],
    eval_dataset=dataset_split["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [16]:
trainer.train()
metrics = trainer.evaluate()
print(metrics)



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mvladsmertev24[0m ([33mvladsmertev24-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


Step,Training Loss


{'eval_loss': 0.3540322482585907, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8396275219865494, 'eval_runtime': 6.3191, 'eval_samples_per_second': 2.532, 'eval_steps_per_second': 0.316, 'epoch': 3.0}


Result of this notebook is a NER model for e-commerce web page PRODUCT classification with metrics:

- `'eval_accuracy': 0.839`
- `'eval_precision': 0.0`
- `'eval_recall': 0.0`
- `'eval_f1': 0.0`

Despite high accuracy level other metrics are significatly bad for NER classification. This can be a result of bad training dataset labeling, bad parcing pattern providing low value potential for NER entity classification