# Загрузка библиотек

In [8]:
import os
import json

import numpy as np

from glob import glob
from transformers import MarkupLMFeatureExtractor, MarkupLMProcessor, MarkupLMForTokenClassification
from bs4 import BeautifulSoup
from torch.utils.data import Dataset, random_split, DataLoader
from tqdm import tqdm

# Configuration

In [9]:
batch_size = 50

allowed_labels = ["title", "short_text", "date", "time", "tag", "short_title", "author"]

label2id = {label: idx+1 for idx, label in enumerate(allowed_labels)}
label2id["OTHER"] = 0

id2label = {idx+1: label for idx, label in enumerate(allowed_labels)}
id2label[0] = "OTHER"

# Загрузка данных

In [10]:
def load_from_folder(folder_path : str):
    '''
        This function loading all json files from folder.
        Each file contains dict with labels and its values.
        Each file must contains "html" label with its html code. 
        Each file must contains "xpaths" label with its labeled xpaths list. 
        
    '''
    extractor = MarkupLMFeatureExtractor()
    
    folder_path = os.path.abspath(folder_path)
    files_path = glob(os.path.join(folder_path, "*.json"))
    
    data = []
    
    for file_path in tqdm(files_path):
        # print(file_path)
        with open(file_path) as file:
            info = json.load(file)
            
        html = info["html"]
        labeled_xpaths = info["labeled_xpaths"]

        encoding = extractor(html)
            
        
        labels = []
        for xpath in encoding["xpaths"][0]:
            if xpath in labeled_xpaths:
                labels.append(label2id[labeled_xpaths[xpath]])
            else:
                labels.append(0)


        # print(len(labels))
        # print([_ for _ in labels if _ != 0])
        
        if (len([_ for _ in labels if _ != 0]) == 0):
            with open("bad_file.txt", "a") as file:
                print(file_path, file=file)

        
        labels = [labels]
        # print(len(encoding['nodes'][0]), len(encoding['xpaths'][0]), len(labels[0]))
        data.append({'nodes': encoding['nodes'],
                     'xpaths': encoding['xpaths'],
                     'node_labels': labels,
                     'html': html})
        
    return data
    

In [11]:
train_data = load_from_folder("test_dataset/train_part")
valid_data = load_from_folder("test_dataset/test_part")

100%|██████████| 1980/1980 [04:51<00:00,  6.80it/s]
100%|██████████| 850/850 [01:52<00:00,  7.57it/s]


In [13]:
print("Train size : ", len(train_data))
print("Test size : ", len(valid_data))
print("Train proportion : ", len(train_data) / (len(valid_data) + len(train_data)))

Train size :  1980
Test size :  850
Train proportion :  0.6996466431095406


In [14]:
idx = 0
for node, label in zip(valid_data[idx]['nodes'][0], valid_data[idx]['node_labels'][0]):
  if id2label[label] != 'OTHER':
    print(node, id2label[label])
     

В Башкирии презентовали гастрономический гид title
16.02.2024 08:23 date
В республике издадут туристический путеводитель по лучшим гастрономическим местам республики. short_title
В Уфе разрабатывается кофейный напиток «уфачино» title
28.12.2023 10:59 date
Его будут готовить из кофейных зерен, которые растут на деревьях в уфимском лимонарии. short_title
Фестиваль «Есть» в Уфе стал победителем премии в сфере событийного туризма title
25.11.2023 17:30 date
Фестиваль объединил 140 участников и более 150 тысяч гостей. short_title
Повара из Уфы участвуют в кастинге шоу «Битва шефов» Ивлева и Агзамова title
13.11.2023 10:39 date
Мастера из столицы Башкирии предложили татарский пирог «Губадия» и кролика в апельсиновой глазури. short_title
Башкирия начнет поставлять в Китай куриные лапки в обмен на утиные грудки title
09.11.2023 14:11 date
Республика планирует отгружать в Китай по 700 тонн товара в месяц. short_title
Башкирия будет поставлять в Китай мороженое title
07.11.2023 20:33 date
Всего 

# Инициалиация датасета

In [15]:
class MarkupLMDataset(Dataset):
    """Dataset for token classification with MarkupLM."""

    def __init__(self, data, processor=None):
        self.data = data
        self.processor = processor

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # first, get nodes, xpaths and node labels
        item = self.data[idx]
        nodes, xpaths, node_labels = item['nodes'], item['xpaths'], item['node_labels']

        # provide to processor
        encoding = self.processor(nodes=nodes, xpaths=xpaths, node_labels=node_labels, padding="max_length", truncation=True, return_tensors="pt")

        # remove batch dimension
        encoding = {k: v.squeeze() for k, v in encoding.items()}

        return encoding

In [16]:
processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base", truncation = True)
processor.parse_html = False

train_set = MarkupLMDataset(data=train_data, processor=processor)
valid_set = MarkupLMDataset(data=valid_data, processor=processor)

In [17]:
example = valid_set[0]
for k,v in example.items():
  print(k,v.shape)

input_ids torch.Size([512])
token_type_ids torch.Size([512])
attention_mask torch.Size([512])
xpath_tags_seq torch.Size([512, 50])
xpath_subs_seq torch.Size([512, 50])
labels torch.Size([512])


In [18]:
processor.decode(example['input_ids'])

'<s>created_at 25-02-2024 19:51:43window.isIndexPage = 0;\n        window.isMobileBrowser = 0;\n        window.disableSidebarCut = 1;\n        window.bannerDebugMode = 0;Еда и продукты на | АиФ-Уфа[if IE 8]><link href="https://ufa.aif.ru/css/ie8.css?44f" media="all" rel="stylesheet" type="text/css" ><![endif][if lt IE 10]><link href="https://ufa.aif.ru/css/ie9.css?44f" media="all" rel="stylesheet" type="text/css" ><![endif]//<!--\n    var isRedesignPage = false;    //-->[if lt IE 9]><script type="text/javascript" src="https://html5shiv.googlecode.com/svn/trunk/html5.js?44f"></script><![endif][if lt IE 10]><script type="text/javascript" src="https://ufa.aif.ru/resources/front/js/hybrid/css3-multi-column.js?44f"></script><![endif]//<!--\n    var _sf_startpt=(new Date()).getTime()    //-->//<!--\n    function AdFox_getWindowSize() {\n    var winWidth,winHeight;\n\tif( typeof( window.innerWidth ) == \'number\' ) {\n\t\t//Non-IE\n\t\twinWidth = window.innerWidth;\n\t\twinHeight = window.inn

In [19]:
for id, label in zip(example['input_ids'].tolist(), example['labels'].tolist()):
    # if label != -100:
    #     print(processor.decode([id]), label)
    if label == 1:
        print(processor.decode([id]), label)

In [20]:
train_dataloader = DataLoader(train_set, batch_size=3, shuffle=True)
valid_dataloader = DataLoader(valid_set, batch_size=3, shuffle=True)

In [21]:
model = MarkupLMForTokenClassification.from_pretrained("microsoft/markuplm-base", id2label=id2label, label2id=label2id)

Some weights of MarkupLMForTokenClassification were not initialized from the model checkpoint at microsoft/markuplm-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
label_list = ["B-" + x for x in list(id2label.values())]

In [25]:
import evaluate

# Metric
metric = evaluate.load("seqeval")

def get_labels(predictions, references):
    # Transform predictions and references tensos to numpy arrays
    if device.type == "cpu":
        y_pred = predictions.detach().clone().numpy()
        y_true = references.detach().clone().numpy()
    else:
        y_pred = predictions.detach().cpu().clone().numpy()
        y_true = references.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(pred, gold_label) if l != -100]
        for pred, gold_label in zip(y_pred, y_true)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(pred, gold_label) if l != -100]
        for pred, gold_label in zip(y_pred, y_true)
    ]
    return true_predictions, true_labels

def compute_metrics(metric, return_entity_level_metrics=True):
    results = metric.compute()
    if return_entity_level_metrics:
        # Unpack nested dictionaries
        final_results = {}
        for key, value in results.items():
            if isinstance(value, dict):
                for n, v in value.items():
                    final_results[f"{key}_{n}"] = v
            else:
                final_results[key] = value
        return final_results
    else:
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# TRAIN

In [26]:
import torch
from torch.optim import AdamW
from tqdm.auto import tqdm
from sklearn.metrics import classification_report, f1_score

optimizer = AdamW(model.parameters(), lr=2e-5)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

model.train()
print(device)
for epoch in range(10):
    for batch in tqdm(train_dataloader):
        # get the inputs;
        inputs = {k:v.to(device) for k,v in batch.items()}

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(**inputs)

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # print("Loss:", loss.item())

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]
        preds, refs = get_labels(predictions, labels)
        metric.add_batch(
            predictions=preds,
            references=refs,
        )

    train_metric = compute_metrics(metric)
    print(f"Epoch {epoch}:", train_metric)
      


cuda


100%|██████████| 660/660 [07:35<00:00,  1.45it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 0: {'author_precision': 0.0, 'author_recall': 0.0, 'author_f1': 0.0, 'author_number': 0, 'date_precision': 0.0, 'date_recall': 0.0, 'date_f1': 0.0, 'date_number': 0, 'short_text_precision': 0.0, 'short_text_recall': 0.0, 'short_text_f1': 0.0, 'short_text_number': 40, 'short_title_precision': 0.06666666666666667, 'short_title_recall': 0.05714285714285714, 'short_title_f1': 0.061538461538461535, 'short_title_number': 70, 'time_precision': 0.0, 'time_recall': 0.0, 'time_f1': 0.0, 'time_number': 16, 'title_precision': 0.9945244956772334, 'title_recall': 0.9941425004801229, 'title_f1': 0.994333461390703, 'title_number': 20828, 'overall_precision': 0.988355445261048, 'overall_recall': 0.988355445261048, 'overall_f1': 0.988355445261048, 'overall_accuracy': 0.988355445261048}


100%|██████████| 660/660 [07:56<00:00,  1.39it/s]


Epoch 1: {'short_text_precision': 0.0, 'short_text_recall': 0.0, 'short_text_f1': 0.0, 'short_text_number': 40, 'short_title_precision': 0.0, 'short_title_recall': 0.0, 'short_title_f1': 0.0, 'short_title_number': 70, 'time_precision': 0.0, 'time_recall': 0.0, 'time_f1': 0.0, 'time_number': 16, 'title_precision': 0.9939868282905412, 'title_recall': 1.0, 'title_f1': 0.9969843473266, 'title_number': 20828, 'overall_precision': 0.9939868282905412, 'overall_recall': 0.9939868282905412, 'overall_f1': 0.9939868282905412, 'overall_accuracy': 0.9939868282905412}


 22%|██▏       | 143/660 [01:43<07:49,  1.10it/s]

# TEST

In [None]:
model.eval()
print(device)

test_metric = evaluate.load("seqeval")

for batch in tqdm(valid_dataloader):
    # get the inputs;
    inputs = {k:v.to(device) for k,v in batch.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    predictions = outputs.logits.argmax(dim=-1)
    labels = batch["labels"]
    preds, refs = get_labels(predictions, labels)
    test_metric.add_batch(
        predictions=preds,
        references=refs,
    )

eval_metric = compute_metrics(test_metric)
print("TESTING RESULT :", eval_metric)