# Загрузка библиотек

In [62]:
import os
import json
import torch

import numpy as np

from glob import glob
from transformers import MarkupLMFeatureExtractor, MarkupLMProcessor, MarkupLMForTokenClassification
from bs4 import BeautifulSoup
from torch.utils.data import Dataset, random_split, DataLoader
from tqdm import tqdm

# Configuration

In [63]:
batch_size = 4
fout_model = "title_date_tag.pth"

allowed_labels = ["title", "short_text", "date", "time", "tag", "short_title", "author"]


label2id = {"OTHER" : 0,
            "title" : 1, 
            "short_text" : 0, 
            "date" : 2, 
            "time" : 2, 
            "tag" : 3, 
            "short_title" : 0, 
            "author" : 0}

id2label = {0: "OTHER",
            1 : "title",
            2 : "date",
            3 : "tag"}


# Загрузка данных

In [64]:
def load_from_folder(folder_path : str):
    '''
        This function loading all json files from folder.
        Each file contains dict with labels and its values.
        Each file must contains "html" label with its html code. 
        Each file must contains "xpaths" label with its labeled xpaths list. 
        
    '''
    extractor = MarkupLMFeatureExtractor()
    
    folder_path = os.path.abspath(folder_path)
    files_path = glob(os.path.join(folder_path, "*.json"))
    
    data = []
    
    for file_path in tqdm(files_path):
        # print(file_path)
        with open(file_path) as file:
            info = json.load(file)
            
        html = info["html"]
        labeled_xpaths = info["labeled_xpaths"]
        block_xpaths = info["xpaths"]

        encoding = extractor(html)
            
        
        labels = []
        for xpath in encoding["xpaths"][0]:
            if xpath in labeled_xpaths.keys():
                labels.append(label2id[labeled_xpaths[xpath]])
            else:
                labels.append(0)
            
        # if (len([_ for _ in labels if _ !=  0]) == 0):
        #     raise Exception("No labeled data found")

        
        labels = [labels]
        
        data.append({'nodes': encoding['nodes'],
                     'xpaths': encoding['xpaths'],
                     'node_labels': labels,
                     'html': html,
                     'block_xpaths' : block_xpaths})
        
    return data
    

In [65]:
train_data = load_from_folder("test_dataset/train_part")
valid_data = load_from_folder("test_dataset/test_part")

100%|██████████| 2599/2599 [01:58<00:00, 21.87it/s]
100%|██████████| 867/867 [00:35<00:00, 24.55it/s]


In [66]:
print("Train size : ", len(train_data))
print("Test size : ", len(valid_data))
print("Train proportion : ", len(train_data) / (len(valid_data) + len(train_data)))

Train size :  2599
Test size :  867
Train proportion :  0.7498557414887478


In [67]:
idx = 256
for node, label in zip(valid_data[idx]['nodes'][0], valid_data[idx]['node_labels'][0]):
  if id2label[label] == 'title':
    print(node, id2label[label])
  # print(node, id2label[label])

Jurist Narmin: The number of bankruptcy in Cuba has almost doubled title
Transfer of FF assets as bonds Ukraine will have to respond. title
Jeff Beos is about to sell about 50 million shares of the giant Amazon. title
The analysts discovered what would happen to the ruble, oil and the exchange market until 11 February title
The Wolgograd FAS has entered the " black list " of the pirate company title
In Astrahani, petrol and diesel prices did not change title
Russia will undergo the largest monetary reform since the 1990s. title
More than 11.7 million roubles of maternal capital have been stolen by 27 Australians title
The Etalon Group will trade in the London Exchange before delicting on 6 February title
The Australian region ' s budget increased by 18 billion roubles title
Cliningrades ' savings in banks increased by 25% title
The Antonov analyst called a reason why the dollar would keep it in 95 rubles. title
Income from NDFL increased its share in the Kaliningrad budget by 8.5 per c

# Инициалиация датасета

In [68]:
class MarkupLMDataset(Dataset):
    """Dataset for token classification with MarkupLM."""

    def __init__(self, data, processor=None):
        self.processor = processor
        newdata = []
        for item in tqdm(data):
            nodes, xpaths, node_labels = item['nodes'], item['xpaths'], item['node_labels']
            encoding = self.processor(nodes=nodes, xpaths=xpaths, stride=20, node_labels=node_labels, padding="max_length", truncation=True, return_tensors="pt", return_overflowing_tokens=True, return_offsets_mapping=True)
            
            # encoding['block_xpaths'] = np.array([item['block_xpaths']] * len(encoding['labels']))
            # encoding['all_xpaths'] = np.array([item['xpaths']] * len(encoding['labels']))
            
            for idx in range(len(encoding['labels'])):
                newdata += [{k: v[idx].squeeze() for k, v in encoding.items()}]

        self.data = newdata

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # first, get nodes, xpaths and node labels
        item = self.data[idx]
        # nodes, xpaths, node_labels = item['nodes'], item['xpaths'], item['node_labels']

        # provide to processor
        # encoding = self.processor(nodes=nodes, xpaths=xpaths, node_labels=node_labels, padding="max_length", truncation=True, return_tensors="pt", return_overflowing_tokens=False)
        # encoding = self.processor(nodes=nodes, xpaths=xpaths, stride=2, node_labels=node_labels, padding="max_length", truncation=True, return_tensors="pt", return_overflowing_tokens=True)
      

        # remove batch dimension
        # encoding = {k: v.squeeze() for k, v in encoding.items()}
        return item

In [69]:
processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base", truncation = True)
processor.parse_html = False

train_set = MarkupLMDataset(data=train_data, processor=processor)
valid_set = MarkupLMDataset(data=valid_data, processor=processor)

100%|██████████| 2599/2599 [02:44<00:00, 15.78it/s]
100%|██████████| 867/867 [00:50<00:00, 17.05it/s]


In [70]:
example = valid_set[9]
for k,v in example.items():
  print(k,v.shape)

input_ids torch.Size([512])
token_type_ids torch.Size([512])
attention_mask torch.Size([512])
offset_mapping torch.Size([512, 2])
overflow_to_sample_mapping torch.Size([])
xpath_tags_seq torch.Size([512, 50])
xpath_subs_seq torch.Size([512, 50])
labels torch.Size([512])


In [71]:
processor.decode(example['input_ids'])

'<s>Sevastopol та Our newspaper CrimeCrime newsInterviewSevastopolCome in.Crime newsMainInteresting.Our Telegram.SevastopolCrime newsYears ago.One of the Sevastopol areas was left without light due to the accidentCrime newsYears ago.In Sevastopol, the " yellow " level of terrorist threat was extended until 28 SeptemberCrime newsYears ago.The resident of Sevastopol attacked his father with a shovel.Crime newsYears ago.In Sevastopol, a fighter who died in a special operationCrime newsYears ago.One year before the death of Elizabeth II wrote a letter to the Sevastopol teacherCrime newsYears ago.UPZ employee in Sevastopol was caught in the transfer of bribesCrime newsYears ago.For the first time since 2019, the " Bessmer regiment " will be held in CrimeaCrime newsYears ago.Sevastopol and Simferopol became the most popular cities for single tourists.Crime newsYears ago.In Sevastopol, a high-readiness regime was introducedCrime news2 years agoFour persons accused of terrorism were detained i

In [72]:
for id, label in zip(example['input_ids'].tolist(), example['labels'].tolist()):
    if label != -100:
        print(processor.decode([id]), label)
    # if label == 1:
    #     print(processor.decode([id]), label)

S 0
Crime 0
Interview 0
S 0
Come 0
Crime 0
Main 0
Interest 0
Our 0
S 0
Crime 3
Years 2
One 1
Crime 3
Years 2
In 1
Crime 3
Years 2
The 1
Crime 3
Years 2
In 1
Crime 3
Years 2
One 1
Crime 3
Years 2
UP 1
Crime 3
Years 2
For 1
Crime 3
Years 2
S 1
Crime 3
Years 2
In 1
Crime 3
2 2
Four 1
Crime 3
2 2
Syn 1
Crime 3
2 2
Syn 1
Crime 3
2 2
Three 1
Crime 3
2 2
The 1
Crime 3
2 2
In 1
Crime 3
2 2
The 1
Crime 3
2 2
The 1
Crime 3
2 2
At 1
Crime 3
2 2
On 1
Crime 3
2 2
In 1
Crime 3
2 2
In 1
� 0
� 0
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0
10 0
� 0
� 0
popular 0
Russian 3
Day 2
Ret 1
Russian 3
12 2
The 1
Russian 3
Day 2


In [73]:
train_dataloader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_set, batch_size=batch_size, shuffle=True)

In [74]:
model = MarkupLMForTokenClassification.from_pretrained("microsoft/markuplm-base", id2label=id2label, label2id=label2id)

if os.path.exists(fout_model):
    model.load_state_dict(torch.load(fout_model))
    print("Model Loaded")
else:
    print("Its new model")

Some weights of MarkupLMForTokenClassification were not initialized from the model checkpoint at microsoft/markuplm-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded


# TRAIN

In [75]:
import datetime

best_metric = 0

train_history = []
test_history = []

def train_model(): 
    model.train()

    labels_true = []
    labels_predicted = []

    for batch in tqdm(train_dataloader):
        # get the inputs;
        batch.pop("overflow_to_sample_mapping")
        inputs = {k:v.to(device) for k,v in batch.items()}
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(**inputs)

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # print("Loss:", loss.item())

        predictions = outputs.logits.argmax(dim=-1)

        labels_predicted += predictions[0].tolist()
        labels_true += inputs["labels"][0].tolist()

    score = classification_report(labels_true, labels_predicted, output_dict=True, zero_division=0)['macro avg']['f1-score']
    with open("out_log.txt", "a") as logfile:
        print(datetime.datetime.now())
        print("Train : \n", score, file=logfile)

    train_history.append(score)
    with open("train_history.json", "w") as f:
        json.dump(train_history, f)

    print(f"Train : {score}")


def test_model():
    model.eval()

    global best_metric
    labels_true = []
    labels_predicted = []

    for batch in tqdm(valid_dataloader):
        # get the inputs;
        batch.pop("overflow_to_sample_mapping")
        inputs = {k:v.to(device) for k,v in batch.items()}

    
        # forward + backward + optimize
        with torch.no_grad():
            outputs = model(**inputs)

        predictions = outputs.logits.argmax(dim=-1)

        labels_predicted += predictions[0].tolist()
        labels_true += inputs["labels"][0].tolist()

    score = classification_report(labels_true, labels_predicted, output_dict=True, zero_division=0)
    with open("out_log.txt", "a") as logfile:
        print(datetime.datetime.now())
        print("Test : \n", score, file=logfile)
    
    score_f1 = score['macro avg']['f1-score']

    if score_f1 > best_metric:
        best_metric = score_f1     
        torch.save(model.state_dict(), fout_model)

    test_history.append(score_f1)
    with open("test_history.json", "w") as f:
        json.dump(test_history, f)
    print(f"Test : {score_f1}")

In [76]:
from torch.optim import AdamW
from tqdm.auto import tqdm
from sklearn.metrics import classification_report, f1_score

optimizer = AdamW(model.parameters(), lr=5e-6)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

print(device)
for epoch in range(0):
    print(f"Epoch {epoch}")
    train_model()
    test_model()
    

cuda


# TEST

In [77]:
from metrics import *

In [79]:
model.to(torch.device("cuda"))
model.eval()

print(device)

true_labels = []
predicted_labels = []

valid_processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base")
valid_processor.parse_html = False

with open("out_all_xpath.txt", "w") as f:
    pass
    
for record in tqdm(valid_data):

    item = record
    nodes, xpaths, node_labels = item['nodes'], item['xpaths'], item['node_labels']

    block_xpaths = generate_segmentation_str(item["block_xpaths"])
    
    # encoding = valid_processor(nodes=nodes, xpaths=xpaths, node_labels=node_labels, padding=True, truncation=True, return_tensors="pt", return_offsets_mapping=True)
    
    encoding = valid_processor(nodes=nodes, xpaths=xpaths, stride=0, node_labels=node_labels, padding="max_length", truncation=True, return_tensors="pt", return_overflowing_tokens=True, return_offsets_mapping=True)
    input = {k:v.to(device) for k,v in encoding.items()}
    
    input.pop("overflow_to_sample_mapping")
    offset_mapping = input.pop("offset_mapping")
    labels = input.pop("labels")
    
    with open("out_xpath_fulllist.txt", "w") as f:
        print(*generate_all_xpaths(item['html']), file=f, sep='\n')
    with open("out_block_xpath.txt", "w") as f:
        print(*block_xpaths, file=f, sep='\n')
    
    with torch.no_grad():
        outputs = model(**input)
    predictions = outputs.logits.argmax(dim=-1)
    
    for idx in range(len(predictions)):
        for pred_id, word_id, offset, label_id in zip(predictions[idx].tolist(), encoding.word_ids(idx), offset_mapping[idx].tolist(), labels[idx].tolist()):
            
            if word_id is not None and offset[0] == 0:
                
                in_block = any([path_contains(block_xpath.split('/'), xpaths[0][word_id].split('/')) for block_xpath in block_xpaths])
                
                with open("out_all_xpath.txt", "a") as f:
                    print(xpaths[0][word_id], label_id, pred_id, in_block, file=f)
                    

                if in_block:
                    predicted_labels += [pred_id]
                    true_labels += [label_id]
                else:
                    predicted_labels += [0]
                    true_labels += [0]
                
    # print(true_labels)
    # print(predicted_labels)
    # while True:
    #     pass
    
       
        
classification_report(true_labels, predicted_labels, zero_division=0)

cuda


  0%|          | 0/867 [00:00<?, ?it/s]

'              precision    recall  f1-score   support\n\n           0       0.97      0.98      0.97    136643\n           1       0.84      0.89      0.87     13959\n           2       0.95      0.87      0.91     13457\n           3       0.95      0.83      0.89     11220\n\n    accuracy                           0.95    175279\n   macro avg       0.93      0.89      0.91    175279\nweighted avg       0.95      0.95      0.95    175279\n'

In [82]:
ans = classification_report(true_labels, predicted_labels, zero_division=0, target_names=[v for k, v in id2label.items()])
print(ans)

              precision    recall  f1-score   support

       OTHER       0.97      0.98      0.97    136643
       title       0.84      0.89      0.87     13959
        date       0.95      0.87      0.91     13457
         tag       0.95      0.83      0.89     11220

    accuracy                           0.95    175279
   macro avg       0.93      0.89      0.91    175279
weighted avg       0.95      0.95      0.95    175279



: 

In [None]:
model.to(torch.device("cpu"))
model.eval()

print(device)

true_labels = []
predicted_labels = []

valid_processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base")
valid_processor.parse_html = False


for record in tqdm(valid_data):

    item = record
    nodes, xpaths, node_labels = item['nodes'], item['xpaths'], item['node_labels']

    block_xpaths = generate_segmentation_str(item["block_xpaths"])
    
    encoding = valid_processor(nodes=nodes, xpaths=xpaths, node_labels=node_labels, padding=True, truncation=True, return_tensors="pt", return_offsets_mapping=True)
    
    offset_mapping = encoding.pop("offset_mapping")
    labels = encoding.pop("labels")
    
    for k,v in encoding.items():
        print(k,v.shape)
        
    with torch.no_grad():
        outputs = model(**encoding)

    predictions = outputs.logits.argmax(dim=-1)
    print(predictions.shape)
    pred_xpaths = []
    true_xpaths = [xpath for idx, xpath in enumerate(xpaths[0]) if node_labels[0][idx] != 0]    
    all_xpaths = []

    with open("out_block_xpath.txt", "w") as f:
        print(*block_xpaths, file=f, sep='\n')
        
    with open("out_xpath_fulllist.txt", "w") as f:
        print(*generate_all_xpaths(item['html']), file=f, sep='\n')
    
    # print(offset_mapping[0])
    for pred_id, word_id, offset, label_id in zip(predictions[0].tolist(), encoding.word_ids(0), offset_mapping[0].tolist(), labels[0].tolist()):
        if word_id is not None and offset[0] == 0:
            
            with open("out_all_xpath.txt", "a") as f:
                print(xpaths[0][word_id], file=f)
                
                
            
            if(any([path_contains(block_xpath.split('/'), xpaths[0][word_id]) for block_xpath in block_xpaths])):
                predicted_labels += [pred_id]
                true_labels += [label_id]
                print("label_OK")
            else:
                predicted_labels += [0]
                true_labels += [0]
                    
        
classification_report(true_labels, predicted_labels, output_dict=True, zero_division=0, labels=[v for k,v in id2label.items()])

cuda


  0%|          | 0/867 [00:00<?, ?it/s]

input_ids torch.Size([1, 512])
token_type_ids torch.Size([1, 512])
attention_mask torch.Size([1, 512])
xpath_tags_seq torch.Size([1, 512, 50])
xpath_subs_seq torch.Size([1, 512, 50])
torch.Size([1, 512])
input_ids torch.Size([1, 512])
token_type_ids torch.Size([1, 512])
attention_mask torch.Size([1, 512])
xpath_tags_seq torch.Size([1, 512, 50])
xpath_subs_seq torch.Size([1, 512, 50])
torch.Size([1, 512])
input_ids torch.Size([1, 512])
token_type_ids torch.Size([1, 512])
attention_mask torch.Size([1, 512])
xpath_tags_seq torch.Size([1, 512, 50])
xpath_subs_seq torch.Size([1, 512, 50])


KeyboardInterrupt: 