# Загрузка библиотек

In [None]:
import os
import json
import torch

import numpy as np

from glob import glob
from transformers import MarkupLMFeatureExtractor, MarkupLMProcessor, MarkupLMForTokenClassification
from bs4 import BeautifulSoup
from torch.utils.data import Dataset, random_split, DataLoader
from tqdm import tqdm

# Configuration

In [None]:
# Mini-batch size handed to both DataLoaders.
batch_size = 4
# File the model weights are saved to and, when present, restored from.
fout_model = "ext_segmentation.pth"

# Field names of interest; not referenced anywhere else in the visible code.
allowed_labels = [
    "title",
    "short_text",
    "date",
    "time",
    "tag",
    "short_title",
    "author",
]

# Binary tagging scheme: 1 marks a labelled node, 0 everything else.
id2label = {1: "BEGIN", 0: "OTHER"}
# Inverse mapping, derived so the two tables cannot drift apart.
label2id = {name: index for index, name in id2label.items()}

# Загрузка данных

In [None]:
def load_from_folder(folder_path : str):
    """Load every labelled ``*.json`` sample found directly inside a folder.

    Each JSON file must contain a dict with at least two keys: ``"html"``
    (the page markup) and ``"xpaths"`` (the xpaths that were labelled).
    The markup is passed through ``MarkupLMFeatureExtractor``; every extracted
    node gets label ``1`` when its xpath is among the labelled ones and ``0``
    otherwise.

    Args:
        folder_path: Folder to scan (non-recursively) for ``*.json`` files.

    Returns:
        A list with one dict per file holding ``nodes`` and ``xpaths`` as
        returned by the extractor, ``node_labels`` (a one-element list wrapping
        the per-node 0/1 labels) and the raw ``html`` string.

    Raises:
        ValueError: If none of the extracted nodes of a file matches a labelled
            xpath. Before raising, the labelled and the extracted xpaths are
            written to the files ``labeled_xpaths`` and ``xpaths`` in the
            current working directory to help debugging.
    """
    extractor = MarkupLMFeatureExtractor()

    pattern = os.path.join(os.path.abspath(folder_path), "*.json")
    data = []

    for file_path in tqdm(glob(pattern)):
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(file_path, encoding="utf-8") as file:
            info = json.load(file)

        html = info["html"]
        labeled_xpaths = info["xpaths"]

        # A set makes the per-node membership test O(1) instead of scanning the
        # labelled list for every node. Fall back to the raw container if its
        # elements turn out not to be hashable.
        try:
            labeled_lookup = set(labeled_xpaths)
        except TypeError:
            labeled_lookup = labeled_xpaths

        encoding = extractor(html)
        node_xpaths = encoding["xpaths"][0]

        labels = [1 if xpath in labeled_lookup else 0 for xpath in node_xpaths]

        if not any(labels):
            # Nothing matched: dump both xpath lists so the mismatch can be
            # inspected, then abort loading.
            print(file_path)
            with open("labeled_xpaths", "w", encoding="utf-8") as f:
                print(*labeled_xpaths, sep='\n', file=f)
            with open("xpaths", "w", encoding="utf-8") as f:
                print(*node_xpaths, sep='\n', file=f)

            raise ValueError("No blocks found")

        # node_labels is wrapped in a list to mirror the batch dimension of
        # encoding['nodes'] / encoding['xpaths'].
        data.append({'nodes': encoding['nodes'],
                     'xpaths': encoding['xpaths'],
                     'node_labels': [labels],
                     'html': html})

    return data

In [None]:
def load_bad_folder(folder_path : str):
    """Load every ``*.html`` file in a folder as an all-negative sample.

    Unlike ``load_from_folder`` the files are raw HTML, not JSON, and carry no
    labelled xpaths: every node extracted by ``MarkupLMFeatureExtractor`` is
    given label ``0``.

    Args:
        folder_path: Folder to scan (non-recursively) for ``*.html`` files.

    Returns:
        A list with one dict per file holding ``nodes`` and ``xpaths`` as
        returned by the extractor, ``node_labels`` (a one-element list wrapping
        the all-zero per-node labels) and the raw ``html`` string.
    """
    extractor = MarkupLMFeatureExtractor()

    pattern = os.path.join(os.path.abspath(folder_path), "*.html")
    data = []

    for file_path in tqdm(glob(pattern)):
        # NOTE(review): the file is read with the platform default encoding;
        # confirm which encoding the stored pages actually use.
        with open(file_path) as file:
            html = file.read()

        encoding = extractor(html)

        # One zero per extracted node, wrapped to mirror the batch dimension.
        labels = [[0] * len(encoding["xpaths"][0])]

        data.append({'nodes': encoding['nodes'],
                     'xpaths': encoding['xpaths'],
                     'node_labels': labels,
                     'html': html})

    return data

In [None]:
# Training data: labelled pages followed by the all-negative "bad" pages.
train_data = (
    load_from_folder("test_dataset/train_part")
    + load_bad_folder("test_dataset/bad")
)
# Validation data: labelled pages only.
valid_data = load_from_folder("test_dataset/test_part")

In [None]:
# Report how many documents ended up in each split and the train share.
n_train, n_valid = len(train_data), len(valid_data)
print("Train size : ", n_train)
print("Test size : ", n_valid)
print("Train proportion : ", n_train / (n_valid + n_train))

In [None]:
# Print the nodes of one validation document that are labelled as positives.
# id2label only knows "BEGIN" and "OTHER", so the filter has to compare against
# "BEGIN"; comparing against a field name such as "title" never matches.
idx = 256
sample = valid_data[idx]
for node, label in zip(sample['nodes'][0], sample['node_labels'][0]):
    if id2label[label] == 'BEGIN':
        print(node, id2label[label])

# Инициализация датасета

In [None]:
class MarkupLMDataset(Dataset):
    """Token-classification dataset of pre-encoded MarkupLM inputs.

    All documents are run through the processor once, at construction time.
    Each row of the encoding returned for a document becomes one dataset item,
    so a single document may contribute several items.
    """

    def __init__(self, data, processor=None):
        """Encode every document in ``data`` with ``processor``.

        ``data`` is an iterable of dicts providing ``nodes``, ``xpaths`` and
        ``node_labels``; ``processor`` is called with those as keyword
        arguments.
        """
        self.processor = processor
        rows = []
        for document in tqdm(data):
            nodes = document['nodes']
            xpaths = document['xpaths']
            node_labels = document['node_labels']
            encoding = self.processor(
                nodes=nodes,
                xpaths=xpaths,
                stride=200,
                node_labels=node_labels,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
                return_overflowing_tokens=True,
                return_offsets_mapping=True,
            )

            # Split the batched encoding into one dict of tensors per row.
            row_count = len(encoding['labels'])
            rows.extend(
                {key: tensor[row].squeeze() for key, tensor in encoding.items()}
                for row in range(row_count)
            )

        self.data = rows

    def __len__(self):
        """Return the number of encoded rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the pre-encoded row stored at position ``idx``."""
        return self.data[idx]

In [None]:
processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base", truncation = True)
# Nodes and xpaths are already extracted, so the processor is given them
# directly instead of raw HTML.
processor.parse_html = False

# Encode both splits with the same processor (train first, then validation).
train_set, valid_set = (
    MarkupLMDataset(data=split, processor=processor)
    for split in (train_data, valid_data)
)

In [None]:
# Look at one encoded validation item: print every field with its shape.
example = valid_set[9]
for field in example:
    print(field, example[field].shape)

In [None]:
# Decode the example's token ids back to text to inspect what it contains.
processor.decode(example['input_ids'])

In [None]:
# Print every token of the example that carries a real label; -100 marks
# positions that have none. The loop variable is not called `id`, so the
# builtin of that name stays usable in later cells.
for token_id, label in zip(example['input_ids'].tolist(), example['labels'].tolist()):
    if label != -100:
        print(processor.decode([token_id]), label)

In [None]:
# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
# Validation keeps a fixed order so that evaluation, and the best-checkpoint
# decision based on it, is reproducible from epoch to epoch.
valid_dataloader = DataLoader(valid_set, batch_size=batch_size, shuffle=False)

In [None]:
# Token-classification head on top of the pretrained MarkupLM base model.
model = MarkupLMForTokenClassification.from_pretrained("microsoft/markuplm-base", id2label=id2label, label2id=label2id)

if os.path.exists(fout_model):
    # Resume from the saved weights. map_location="cpu" lets a checkpoint
    # written on a GPU machine load on a CPU-only one as well; the model has
    # not been moved to any device yet at this point.
    model.load_state_dict(torch.load(fout_model, map_location="cpu"))
    print("Model Loaded")
else:
    print("Its new model")

# TRAIN

In [None]:
import datetime

# Best validation F1 seen so far; test_model() raises it and saves a checkpoint
# whenever an epoch beats it.
best_metric = 0

# Per-epoch F1 of the positive class on the training and validation data.
train_history, test_history = [], []

def train_model():
    """Run one training epoch and record the F1 of the positive class.

    Uses the module-level ``model``, ``optimizer``, ``device``,
    ``train_dataloader`` and ``train_history``. The precision/recall/F1 entry
    for class ``1`` is appended to ``out_log.txt``, the F1 value is appended to
    ``train_history`` and the whole history is rewritten to
    ``train_history.json``.
    """
    model.train()

    labels_true = []
    labels_predicted = []

    for batch in tqdm(train_dataloader):
        # These two fields come from the processor but are not model inputs.
        batch.pop("overflow_to_sample_mapping")
        batch.pop("offset_mapping")
        inputs = {k: v.to(device) for k, v in batch.items()}

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(**inputs)

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        predictions = outputs.logits.argmax(dim=-1)

        # Score every sample of the batch, not just the first one, and skip
        # positions labelled -100: they carry no node label and would otherwise
        # distort the statistics of the positive class.
        labels = inputs["labels"]
        scored = labels != -100
        labels_predicted += predictions[scored].tolist()
        labels_true += labels[scored].tolist()

    # labels=[1] guarantees the '1' entry exists even when class 1 is absent
    # from both the targets and the predictions of this epoch.
    report = classification_report(
        labels_true,
        labels_predicted,
        labels=[1],
        output_dict=True,
        zero_division=0,
    )['1']
    with open("out_log.txt", "a") as logfile:
        print(datetime.datetime.now(), file=logfile)
        print("Train : \n", report, file=logfile)

    score = report['f1-score']
    train_history.append(score)
    with open("train_history.json", "w") as f:
        json.dump(train_history, f)

    print(f"Train : {score}")


def test_model():
    """Evaluate on the validation loader and checkpoint the best model.

    Uses the module-level ``model``, ``device``, ``valid_dataloader``,
    ``test_history``, ``fout_model`` and ``best_metric``. The
    precision/recall/F1 entry for class ``1`` is appended to ``out_log.txt``.
    When the F1 value strictly exceeds ``best_metric`` the model weights are
    saved to ``fout_model`` and ``best_metric`` is updated. The F1 value is
    appended to ``test_history`` and the whole history is rewritten to
    ``test_history.json``.
    """
    model.eval()

    global best_metric
    labels_true = []
    labels_predicted = []

    for batch in tqdm(valid_dataloader):
        # These two fields come from the processor but are not model inputs.
        batch.pop("overflow_to_sample_mapping")
        batch.pop("offset_mapping")
        inputs = {k: v.to(device) for k, v in batch.items()}

        # Forward pass only; no gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(**inputs)

        predictions = outputs.logits.argmax(dim=-1)

        # Score every sample of the batch, not just the first one, and skip
        # positions labelled -100: they carry no node label and would otherwise
        # distort the statistics of the positive class.
        labels = inputs["labels"]
        scored = labels != -100
        labels_predicted += predictions[scored].tolist()
        labels_true += labels[scored].tolist()

    # labels=[1] guarantees the '1' entry exists even when class 1 is absent
    # from both the targets and the predictions.
    report = classification_report(
        labels_true,
        labels_predicted,
        labels=[1],
        output_dict=True,
        zero_division=0,
    )['1']
    with open("out_log.txt", "a") as logfile:
        print(datetime.datetime.now(), file=logfile)
        print("Test : \n", report, file=logfile)

    score = report['f1-score']
    if score > best_metric:
        best_metric = score
        torch.save(model.state_dict(), fout_model)

    test_history.append(score)
    with open("test_history.json", "w") as f:
        json.dump(test_history, f)
    print(f"Test : {score}")

In [None]:
from torch.optim import AdamW
from tqdm.auto import tqdm
from sklearn.metrics import classification_report, f1_score

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# AdamW over all model parameters with a fixed learning rate.
optimizer = AdamW(model.parameters(), lr=5e-5)

print(device)

In [None]:
# Upper bound on the number of epochs; the loop is expected to be interrupted
# by hand. Each epoch trains once and then evaluates, and test_model() saves
# the weights whenever the validation F1 improves.
num_epochs = 10000
for epoch in range(num_epochs):
    print(f"Epoch {epoch}")
    train_model()
    test_model()

# TEST

In [None]:
from metrics import *
from sklearn.metrics import classification_report

In [None]:
test_processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base", truncation = True)
test_processor.parse_html = False

# Keep the model on the same device the inputs are sent to below; forcing
# "cuda" here fails on CPU-only machines and can disagree with `device`.
model.to(device)
model.eval()
print(device)

valid_metric = segmentation_metric()

# Node-level targets and predictions accumulated over all validation documents.
true_labels = []
predicted_labels = []

valid_processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base")
valid_processor.parse_html = False


for record in tqdm(valid_data):
    nodes, xpaths, node_labels = record['nodes'], record['xpaths'], record['node_labels']

    encoding = valid_processor(nodes=nodes, xpaths=xpaths, stride=200, node_labels=node_labels, padding="max_length", truncation=True, return_tensors="pt", return_overflowing_tokens=True, return_offsets_mapping=True)
    # Named model_inputs rather than `input` so the builtin is not shadowed.
    model_inputs = {k: v.to(device) for k, v in encoding.items()}

    # Strip the fields that are not model inputs; offsets and labels are kept
    # aside because they are needed to map tokens back to nodes.
    model_inputs.pop("overflow_to_sample_mapping")
    offset_mapping = model_inputs.pop("offset_mapping")
    labels = model_inputs.pop("labels")

    with torch.no_grad():
        outputs = model(**model_inputs)

    predictions = outputs.logits.argmax(dim=-1)
    pred_xpaths = []
    true_xpaths = [xpath for xpath, node_label in zip(xpaths[0], node_labels[0]) if node_label == 1]
    all_xpaths = []

    # NOTE(review): the encoding uses stride=200 with overflowing tokens, so a
    # node presumably can show up in more than one row and then be counted more
    # than once below - confirm whether the metric expects that.
    for idx in range(len(predictions)):
        for pred_id, word_id, offset, label_id in zip(predictions[idx].tolist(), encoding.word_ids(idx), offset_mapping[idx].tolist(), labels[idx].tolist()):
            # Only tokens that belong to a node and whose offset starts at 0
            # are scored.
            if word_id is not None and offset[0] == 0:
                true_labels += [label_id]
                predicted_labels += [pred_id]
                if pred_id == 1:
                    pred_xpaths += [xpaths[0][word_id]]
                all_xpaths += [xpaths[0][word_id]]

    valid_metric.add_result({"true_xpaths" : true_xpaths,
                             "pred_xpaths" : pred_xpaths,
                             "all_xpaths" : all_xpaths})
        

In [None]:
# Show every (name, value) pair of the aggregated metric on its own line.
print("Validation score : ")
metric_lines = [str(pair) for pair in valid_metric.get_metric().items()]
print("\n".join(metric_lines))

In [None]:
# Average ARI / NMI over the documents whose ARI is above 0.01; both averages
# use the same ARI-based filter.
print("Results without 'bad' htmls : ")
good_scores = [score for score in valid_metric.ARI_NMI if score["ARI"] > 0.01]
ari = [score["ARI"] for score in good_scores]
nmi = [score["NMI"] for score in good_scores]
if good_scores:
    print("ARI : ", sum(ari) / len(ari))
    print("NMI : ", sum(nmi) / len(nmi))
else:
    # Avoid a ZeroDivisionError when no document passes the filter.
    print("No htmls with ARI > 0.01")

In [None]:
# Count the documents whose ARI is at most 0.01.
ari_values = (entry["ARI"] for entry in valid_metric.ARI_NMI)
bad_ari = [value for value in ari_values if value <= 0.01]
print(len(bad_ari))

In [None]:
# Node-level classification report over all validation documents.
print("Label-marking scores :")
label_report = classification_report(true_labels, predicted_labels)
print(label_report)