# Загрузка библиотек

In [19]:
import os
import json
import torch

import numpy as np

from glob import glob
from transformers import MarkupLMFeatureExtractor, MarkupLMProcessor, MarkupLMForTokenClassification
from bs4 import BeautifulSoup
from torch.utils.data import Dataset, random_split, DataLoader
from tqdm import tqdm

# Configuration

In [20]:
batch_size = 4

allowed_labels = ["title", "short_text", "date", "time", "tag", "short_title", "author"]

label2id = {label: idx+1 for idx, label in enumerate(allowed_labels)}
label2id["OTHER"] = 0

id2label = {idx+1: label for idx, label in enumerate(allowed_labels)}
id2label[0] = "OTHER"

# Загрузка данных

In [21]:
def load_from_folder(folder_path : str):
    '''
        This function loading all json files from folder.
        Each file contains dict with labels and its values.
        Each file must contains "html" label with its html code. 
        Each file must contains "xpaths" label with its labeled xpaths list. 
        
    '''
    extractor = MarkupLMFeatureExtractor()
    
    folder_path = os.path.abspath(folder_path)
    files_path = glob(os.path.join(folder_path, "*.json"))
    
    data = []
    
    for file_path in tqdm(files_path):
        # print(file_path)
        with open(file_path) as file:
            info = json.load(file)
            
        html = info["html"]
        labeled_xpaths = info["labeled_xpaths"]

        encoding = extractor(html)
            
        
        labels = []
        for xpath in encoding["xpaths"][0]:
            if xpath in labeled_xpaths:
                labels.append(label2id[labeled_xpaths[xpath]])
            else:
                labels.append(0)
        
        if (len([_ for _ in labels if _ !=  0]) == 0):
            raise Exception("No labeled data found")

        
        labels = [labels]
        # print(len(encoding['nodes'][0]), len(encoding['xpaths'][0]), len(labels[0]))
        data.append({'nodes': encoding['nodes'],
                     'xpaths': encoding['xpaths'],
                     'node_labels': labels,
                     'html': html})
        
    return data
    

In [22]:
train_data = load_from_folder("test_dataset/train_part")
valid_data = load_from_folder("test_dataset/test_part")

  0%|          | 0/2599 [00:00<?, ?it/s]

100%|██████████| 2599/2599 [01:57<00:00, 22.19it/s]
100%|██████████| 867/867 [00:36<00:00, 23.58it/s]


In [23]:
print("Train size : ", len(train_data))
print("Test size : ", len(valid_data))
print("Train proportion : ", len(train_data) / (len(valid_data) + len(train_data)))

Train size :  2599
Test size :  867
Train proportion :  0.7498557414887478


In [24]:
idx = 0
for node, label in zip(valid_data[idx]['nodes'][0], valid_data[idx]['node_labels'][0]):
  if id2label[label] != 'OTHER':
    print(node, id2label[label])
  # print(node, id2label[label])

Radio title
Special projects title
News title
Power tag
Policy tag
Mayor of Moscow tag
Society tag
Policy tag
Accidents tag
regions tag
Accidents tag
Mayor of Moscow tag
City tag
Power tag
Policy tag
Power tag
Policy tag
Economics tag
Policy tag
Accidents tag
regions tag
Accidents tag
fire tag
Mayor of Moscow tag
transport tag
Policy tag
Accidents tag
Power tag
sports tag
Accidents tag
Abroad tag
transport tag
Accidents tag
Power tag
Policy tag
Accidents tag
regions tag
Culture tag
Accidents tag
Policy tag
Environment tag
Culture tag
Power tag
Policy tag
Accidents tag
Power tag
Policy tag
Science tag
Putin thanked Alexander Lukashenko for coming to St. Petersburg. title
19:51 date
Power tag
Policy tag
CEC will consider registering Putin in presidential elections on 29 January title
19:24 date
Policy tag
Putin promised to inform Lukashenko of what was happening in the SVO area. title
18:48 date
Power tag
Policy tag
Putin stated that the RF and Beloroussi relations were developing very v

# Инициалиация датасета

In [25]:
class MarkupLMDataset(Dataset):
    """Dataset for token classification with MarkupLM."""

    def __init__(self, data, processor=None):
        self.data = data
        self.processor = processor

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # first, get nodes, xpaths and node labels
        item = self.data[idx]
        nodes, xpaths, node_labels = item['nodes'], item['xpaths'], item['node_labels']

        # provide to processor
        encoding = self.processor(nodes=nodes, xpaths=xpaths, node_labels=node_labels, padding="max_length", truncation=True, return_tensors="pt")

        # remove batch dimension
        encoding = {k: v.squeeze() for k, v in encoding.items()}

        return encoding

In [26]:
processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base", truncation = True)
processor.parse_html = False

train_set = MarkupLMDataset(data=train_data, processor=processor)
valid_set = MarkupLMDataset(data=valid_data, processor=processor)

In [27]:
example = valid_set[0]
for k,v in example.items():
  print(k,v.shape)

input_ids torch.Size([512])
token_type_ids torch.Size([512])
attention_mask torch.Size([512])
xpath_tags_seq torch.Size([512, 50])
xpath_subs_seq torch.Size([512, 50])
labels torch.Size([512])


In [28]:
processor.decode(example['input_ids'])

'<s>Recent news in Moscow - Moscow 24 - M24.RUMoscow 24TVRadioSpecial projectsNewsHistoryPhoto galleryVideoInfographyAudioProgrammesReverse communicationContactsAdvertisementPolicySocietyEconomyWorldSportCasesCultureShaw businessTechnologyScienceTransportCitySecurityEnvironmentHistoryExclusionsDevelopmentsCoronaurus COVID-19TourismRegionsMayor of MoscowWeb search formAbroadMetrosecurityMoscow 24TVRadioExclusionsSpecial projectsWeb search formSpecial operationsMoscow onlineNewsHistoryPhoto galleryVideoInfographyAudioProgrammesPolicySocietyEconomyWorldSportCasesCultureReverse communicationContactsAdvertisementTelegramVkontakteGradesYoutubeRutubeICQViberTiktokNewsNewsMayor of MoscowSobyanin: 49 new Moscow Longevity Centres opened last year19:51Putin thanked Alexander Lukashenko for coming to St. Petersburg.PowerPolicy19:43Sobyanin: 49 new Moscow Longevity Centres opened last yearMayor of MoscowSociety19:24CEC will consider registering Putin in presidential elections on 29 JanuaryPolicy19:

In [29]:
for id, label in zip(example['input_ids'].tolist(), example['labels'].tolist()):
    if label != -100:
        print(processor.decode([id]), label)
    # if label == 1:
    #     print(processor.decode([id]), label)

Recent 0
Moscow 0
TV 0
Radio 1
Special 1
News 0
History 0
Photo 0
Video 0
Inf 0
Audio 0
Program 0
R 0
Cont 0
Advertisement 0
Policy 0
Soc 0
Econom 0
World 0
Sport 0
C 0
C 0
Sh 0
Technology 0
Science 0
Trans 0
City 0
Security 0
Environment 0
History 0
Ex 0
Develop 0
Cor 0
Tour 0
Reg 0
Mayor 0
Web 0
Ab 0
Metro 0
security 0
Moscow 0
TV 0
Radio 0
Ex 0
Special 0
Web 0
Special 0
Moscow 0
News 0
History 0
Photo 0
Video 0
Inf 0
Audio 0
Program 0
Policy 0
Soc 0
Econom 0
World 0
Sport 0
C 0
C 0
R 0
Cont 0
Advertisement 0
Te 0
V 0
Gr 0
Y 0
R 0
IC 0
V 0
T 0
News 1
News 0
Mayor 0
S 0
19 0
Putin 0
Power 5
Policy 5
19 0
S 0
Mayor 5
Soc 5
19 0
C 0
Policy 5
19 0
The 0
Acc 5
reg 5
19 0
The 0
Acc 5
19 0
S 0
Mayor 5
City 5
18 0
Putin 0
Power 5
Policy 5
18 0
Putin 0
Power 5
Policy 5
Econom 5
18 0
Z 0
Policy 5
18 0
A 0
Acc 5
reg 5
18 0
The 0
Acc 5
fire 5
17 0
The 0
Mayor 5
trans 5
17 0
The 0
Policy 5
17 0
Russian 0
Acc 5
17 0


In [30]:
train_dataloader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_set, batch_size=batch_size, shuffle=True)

In [31]:
model = MarkupLMForTokenClassification.from_pretrained("microsoft/markuplm-base", id2label=id2label, label2id=label2id)

if os.path.exists("extracting_model.pth"):
    model.load_state_dict(torch.load("extracting_model.pth"))
    print("Model Loaded")
else:
    print("Its new model")

Some weights of MarkupLMForTokenClassification were not initialized from the model checkpoint at microsoft/markuplm-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded


# TRAIN

In [34]:
import datetime

best_metric = 0

train_history = []
test_history = []

def train_model(): 
    model.train()

    labels_true = []
    labels_predicted = []

    for batch in tqdm(train_dataloader):
        # get the inputs;
        inputs = {k:v.to(device) for k,v in batch.items()}

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(**inputs)

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # print("Loss:", loss.item())

        predictions = outputs.logits.argmax(dim=-1)

        labels_predicted += predictions[0].tolist()
        labels_true += inputs["labels"][0].tolist()

    score = classification_report(labels_true, labels_predicted, output_dict=True, zero_division=0)['macro avg']['f1-score']
    with open("out_log.txt", "a") as logfile:
        print(datetime.datetime.now())
        print("Train : \n", score, file=logfile)

    train_history.append(score)
    with open("train_history.json", "w") as f:
        json.dump(train_history, f)

    print(f"Train : {score}")


def test_model():
    model.eval()

    global best_metric
    labels_true = []
    labels_predicted = []

    for batch in tqdm(valid_dataloader):
        # get the inputs;
        inputs = {k:v.to(device) for k,v in batch.items()}


        # forward + backward + optimize
        with torch.no_grad():
            outputs = model(**inputs)

        predictions = outputs.logits.argmax(dim=-1)

        labels_predicted += predictions[0].tolist()
        labels_true += inputs["labels"][0].tolist()

    score = classification_report(labels_true, labels_predicted, output_dict=True, zero_division=0)
    with open("out_log.txt", "a") as logfile:
        print(datetime.datetime.now())
        print("Test : \n", score, file=logfile)
    
    score_f1 = score['macro avg']['f1-score']

    if score_f1 > best_metric:
        best_metric = score_f1     
        torch.save(model.state_dict(), f"extracting_model.pth")

    test_history.append(score_f1)
    with open("test_history.json", "w") as f:
        json.dump(test_history, f)
    print(f"Test : {score_f1}")

In [35]:
from torch.optim import AdamW
from tqdm.auto import tqdm
from sklearn.metrics import classification_report, f1_score

optimizer = AdamW(model.parameters(), lr=2e-5)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

print(device)
for epoch in range(0):
    print(f"Epoch {epoch}")
    train_model()
    test_model()
    

cuda


# TEST

In [36]:
import matplotlib.pyplot as plt

model.eval()

labels_true = []
labels_predicted = []

for batch in tqdm(valid_dataloader):
    # get the inputs;
    inputs = {k:v.to(device) for k,v in batch.items()}


    # forward + backward + optimize
    with torch.no_grad():
        outputs = model(**inputs)

    predictions = outputs.logits.argmax(dim=-1)

    labels_predicted += predictions[0].tolist()
    labels_true += inputs["labels"][0].tolist()

score = classification_report(labels_true, labels_predicted, output_dict=True, zero_division=0)

plt.scatter(allowed_labels, [score[str(label2id[label])]['f1-score'] for label in allowed_labels])
plt.show()

100%|██████████| 217/217 [00:46<00:00,  4.69it/s]


KeyError: 1