# Загрузка библиотек

In [1]:
import os
import json

import numpy as np

from tqdm import tqdm
from glob import glob
from transformers import MarkupLMFeatureExtractor, MarkupLMProcessor, MarkupLMForTokenClassification
from bs4 import BeautifulSoup
from torch.utils.data import Dataset, random_split, DataLoader


  from .autonotebook import tqdm as notebook_tqdm


# Configuration

In [2]:
batch_size = 4


id2label = {1: "BEGIN", 0: "OTHER"}
label2id = {"BEGIN": 1, "OTHER": 0}

# Загрузка данных

In [3]:
def load_from_folder(folder_path : str):
    '''
        This function loading all json files from folder.
        Each file contains dict with labels and its values.
        Each file must contains "html" label with its html code. 
        Each file must contains "xpaths" label with its labeled xpaths list. 
        
    '''
    extractor = MarkupLMFeatureExtractor()
    
    folder_path = os.path.abspath(folder_path)
    files_path = glob(os.path.join(folder_path, "*.json"))
    
    data = []
    
    for file_path in tqdm(files_path):
        # print(file_path)
        with open(file_path) as file:
            info = json.load(file)
            
        html = info["html"]
        labeled_xpaths = info["xpaths"]

        encoding = extractor(html)
            
        
        labels = []
        
        for xpath in encoding["xpaths"][0]:
            if xpath in labeled_xpaths:
                labels.append(1)
            else:
                labels.append(0)


        finded_segments = [_ for _ in labels if _ != 0]
        if len(finded_segments) == 0:
            print(file_path)
            with open("labeled_xpaths", "w") as f:
                print(*labeled_xpaths, sep='\n', file=f)
            with open("xpaths", "w") as f:
                print(*encoding["xpaths"][0], sep='\n', file=f)
                
            raise Exception("No blocks found")
        
        
        # print(len(labels))
        # print([_ for _ in labels if _ != 0])
        
        labels = [labels]
        # print(len(encoding['nodes'][0]), len(encoding['xpaths'][0]), len(labels[0]))
        data.append({'nodes': encoding['nodes'],
                     'xpaths': encoding['xpaths'],
                     'node_labels': labels,
                     'html': html})
        
    return data

In [4]:
train_data = load_from_folder("test_dataset/train_part")
valid_data = load_from_folder("test_dataset/test_part")

100%|██████████| 2599/2599 [01:46<00:00, 24.51it/s]
100%|██████████| 867/867 [00:33<00:00, 25.75it/s]


In [5]:
print("Train part size : ", len(train_data))
print("Train part size : ", len(valid_data))
print("Train part proportion : ", len(train_data) / (len(train_data) + len(valid_data)))

Train part size :  2599
Train part size :  867
Train part proportion :  0.7498557414887478


In [6]:
idx = 0
for node, label in zip(valid_data[idx]['nodes'][0], valid_data[idx]['node_labels'][0]):
  if id2label[label] != 'OTHER':
    print(node, id2label[label])
     

Putin thanked Alexander Lukashenko for coming to St. Petersburg. BEGIN
CEC will consider registering Putin in presidential elections on 29 January BEGIN
Putin promised to inform Lukashenko of what was happening in the SVO area. BEGIN
Putin stated that the RF and Beloroussi relations were developing very vigorously. BEGIN
Zaharova called a monstrous execution of a prisoner in the U.S. pure nitrogen BEGIN
The army of the Russian Federation told me how the VSA left the wounded when the field hospital was taken to the rear. BEGIN
Putin noted the contribution of the Navy ' s military personnel to the preservation of Russia ' s fleet history. BEGIN
Rusophobia in the West will last decades, but common sense prevails - Peskov. BEGIN
Putin started the winter station at the Antarctic. BEGIN
The memory of the lening of the Leningrads gives true values to young people - Patirusev BEGIN
Peskov told me how he managed to combine his father and work. BEGIN
MFA warned the West of the effects of the con

# Инициалиация датасета

In [7]:
class MarkupLMDataset(Dataset):
    """Dataset for token classification with MarkupLM."""

    def __init__(self, data, processor=None):
        self.data = data
        self.processor = processor

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # first, get nodes, xpaths and node labels
        item = self.data[idx]
        nodes, xpaths, node_labels = item['nodes'], item['xpaths'], item['node_labels']

        # provide to processor
        encoding = self.processor(nodes=nodes, xpaths=xpaths, node_labels=node_labels, padding="max_length", truncation=True, return_tensors="pt")

        # remove batch dimension
        encoding = {k: v.squeeze() for k, v in encoding.items()}

        return encoding
    

In [8]:
processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base", truncation = True)
processor.parse_html = False

train_set = MarkupLMDataset(data=train_data, processor=processor)
valid_set = MarkupLMDataset(data=valid_data, processor=processor)

In [9]:
example = valid_set[0]
for k,v in example.items():
  print(k,v.shape)

input_ids torch.Size([512])
token_type_ids torch.Size([512])
attention_mask torch.Size([512])
xpath_tags_seq torch.Size([512, 50])
xpath_subs_seq torch.Size([512, 50])
labels torch.Size([512])


In [10]:
processor.decode(example['input_ids'])

'<s>Recent news in Moscow - Moscow 24 - M24.RUMoscow 24TVRadioSpecial projectsNewsHistoryPhoto galleryVideoInfographyAudioProgrammesReverse communicationContactsAdvertisementPolicySocietyEconomyWorldSportCasesCultureShaw businessTechnologyScienceTransportCitySecurityEnvironmentHistoryExclusionsDevelopmentsCoronaurus COVID-19TourismRegionsMayor of MoscowWeb search formAbroadMetrosecurityMoscow 24TVRadioExclusionsSpecial projectsWeb search formSpecial operationsMoscow onlineNewsHistoryPhoto galleryVideoInfographyAudioProgrammesPolicySocietyEconomyWorldSportCasesCultureReverse communicationContactsAdvertisementTelegramVkontakteGradesYoutubeRutubeICQViberTiktokNewsNewsMayor of MoscowSobyanin: 49 new Moscow Longevity Centres opened last year19:51Putin thanked Alexander Lukashenko for coming to St. Petersburg.PowerPolicy19:43Sobyanin: 49 new Moscow Longevity Centres opened last yearMayor of MoscowSociety19:24CEC will consider registering Putin in presidential elections on 29 JanuaryPolicy19:

In [11]:
for id, label in zip(example['input_ids'].tolist(), example['labels'].tolist()):
    # if label != -100:
    #     print(processor.decode([id]), label)
    if label == 1:
        print(processor.decode([id]), label)

In [12]:
train_dataloader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_set, batch_size=batch_size, shuffle=True)

In [13]:
import torch
model = MarkupLMForTokenClassification.from_pretrained("microsoft/markuplm-base", id2label=id2label, label2id=label2id)

if os.path.exists("segmentation_model.pth"):
    model.load_state_dict(torch.load("segmentation_model.pth"))
    print("Model Loaded")

Some weights of MarkupLMForTokenClassification were not initialized from the model checkpoint at microsoft/markuplm-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded


In [14]:
# label_list = ["B-" + x for x in list(id2label.values())]

In [15]:
# import evaluate

# metric = evaluate.load("seqeval")

# def get_labels(predictions, references):
#     # Transform predictions and references tensos to numpy arrays
#     if device.type == "cpu":
#         y_pred = predictions.detach().clone().numpy()
#         y_true = references.detach().clone().numpy()
#     else:
#         y_pred = predictions.detach().cpu().clone().numpy()
#         y_true = references.detach().cpu().clone().numpy()

#     # Remove ignored index (special tokens)
#     true_predictions = [
#         [label_list[p] for (p, l) in zip(pred, gold_label) if l != -100]
#         for pred, gold_label in zip(y_pred, y_true)
#     ]
#     true_labels = [
#         [label_list[l] for (p, l) in zip(pred, gold_label) if l != -100]
#         for pred, gold_label in zip(y_pred, y_true)
#     ]
#     return true_predictions, true_labels

# def compute_metrics(return_entity_level_metrics=True):
#     results = metric.compute()
#     if return_entity_level_metrics:
#         # Unpack nested dictionaries
#         final_results = {}
#         for key, value in results.items():
#             if isinstance(value, dict):
#                 for n, v in value.items():
#                     final_results[f"{key}_{n}"] = v
#             else:
#                 final_results[key] = value
#         return final_results
#     else:
#         return {
#             "precision": results["overall_precision"],
#             "recall": results["overall_recall"],
#             "f1": results["overall_f1"],
#             "accuracy": results["overall_accuracy"],
#         }

# TRAIN

In [16]:
import datetime

best_metric = 0

train_history = []
test_history = []

def train_model():
    labels_true = []
    labels_predicted = []

    for batch in tqdm(train_dataloader):
        # get the inputs;
        inputs = {k:v.to(device) for k,v in batch.items()}

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(**inputs)

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # print("Loss:", loss.item())

        predictions = outputs.logits.argmax(dim=-1)

        labels_predicted += predictions[0].tolist()
        labels_true += inputs["labels"][0].tolist()

    score = classification_report(labels_true, labels_predicted, output_dict=True)['1']['f1-score']
    with open("out_log.txt", "a") as logfile:
        print(datetime.datetime.now())
        print("Train : \n", score, file=logfile)

    train_history.append(score)
    with open("train_history.json", "w") as f:
        json.dump(train_history, f)

    print(f"Train : {score}")
        

def test_model():
    global best_metric
    labels_true = []
    labels_predicted = []

    for batch in tqdm(valid_dataloader):
        # get the inputs;
        inputs = {k:v.to(device) for k,v in batch.items()}


        # forward + backward + optimize
        with torch.no_grad():
            outputs = model(**inputs)

        predictions = outputs.logits.argmax(dim=-1)

        labels_predicted += predictions[0].tolist()
        labels_true += inputs["labels"][0].tolist()

    score = classification_report(labels_true, labels_predicted, output_dict=True)['1']['f1-score']
    with open("out_log.txt", "a") as logfile:
        print(datetime.datetime.now())
        print("Test : \n", score, file=logfile)
      
    if score > best_metric:
        best_metric = score     
        torch.save(model.state_dict(), f"segmentation_model.pth")

    test_history.append(score)
    with open("test_history.json", "w") as f:
        json.dump(test_history, f)
    print(f"Test : {score}")

In [17]:
import torch
from torch.optim import AdamW
from tqdm.auto import tqdm
from sklearn.metrics import classification_report, f1_score

optimizer = AdamW(model.parameters(), lr=2e-5)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)
print(device)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


cuda


In [18]:
model.train()
for epoch in range(0):
    print(f"Epoch : {epoch}")
    train_model()
    test_model()

# TEST

In [19]:
from metrics import segmentation_metric
from sklearn.metrics import classification_report

In [20]:
test_processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base", truncation = True)
test_processor.parse_html = False

model.to(torch.device("cpu"))
model.eval()
print(device)

valid_metric = segmentation_metric()

true_labels = []
predicted_labels = []

valid_processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base")
valid_processor.parse_html = False

for record in tqdm(valid_data):

    item = record
    nodes, xpaths, node_labels = item['nodes'], item['xpaths'], item['node_labels']

    encoding = valid_processor(nodes=nodes, xpaths=xpaths, node_labels=node_labels, padding=True, truncation=True, return_tensors="pt", return_offsets_mapping=True)
    
    offset_mapping = encoding.pop("offset_mapping")
    labels = encoding.pop("labels")
    
    with torch.no_grad():
        outputs = model(**encoding)

    predictions = outputs.logits.argmax(dim=-1)

    pred_xpaths = []
    true_xpaths = [xpath for idx, xpath in enumerate(xpaths[0]) if node_labels[0][idx] == 1]    
    all_xpaths = []

    for pred_id, word_id, offset, label_id in zip(predictions[0].tolist(), encoding.word_ids(0), offset_mapping[0].tolist(), labels[0].tolist()):
        if word_id is not None and offset[0] == 0:
            true_labels += [label_id]
            predicted_labels += [pred_id]
            if (pred_id == 1):
                pred_xpaths += [xpaths[0][word_id]]
            all_xpaths += [xpaths[0][word_id]]


    valid_metric.add_result({"true_xpaths" : true_xpaths,
                             "pred_xpaths" : pred_xpaths,
                            #  "html": item["html"],
                             "all_xpaths" : all_xpaths})
    

cuda


100%|██████████| 867/867 [04:54<00:00,  2.95it/s]


In [21]:
print("Validation score : ")
print(*valid_metric.get_metric().items(), sep='\n')

Validation score : 
('avg_precision', 0.41467722024882775)
('avg_recall', 0.4325053684956899)
('avg_f1', 0.42340370619815393)
('avg_NMI', 0.7721857394471541)
('avg_ARI', 0.7421571993488417)


In [22]:
print("Results without 'bad' htmls : ")
ari = [score["ARI"] for score in valid_metric.ARI_NMI if score["ARI"] > 0.01]
nmi = [score["NMI"] for score in valid_metric.ARI_NMI if score["ARI"] > 0.01]
print("ARI : ", sum(ari) / len(ari))
print("NMI : ", sum(nmi) / len(nmi))

Results without 'bad' htmls : 
ARI :  0.90275919127062
NMI :  0.9388096988601047


In [23]:
print("Label-marking scores :")
print(classification_report(true_labels, predicted_labels))

Label-marking scores :
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     79981
           1       0.81      0.74      0.78      7443

    accuracy                           0.96     87424
   macro avg       0.89      0.86      0.88     87424
weighted avg       0.96      0.96      0.96     87424

