# Загрузка библиотек

In [None]:
import os
import json

import numpy as np

from glob import glob
from transformers import MarkupLMFeatureExtractor, MarkupLMProcessor, MarkupLMForTokenClassification
from bs4 import BeautifulSoup
from torch.utils.data import Dataset, random_split, DataLoader
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


# Configuration

In [None]:
# Batch size setting.
batch_size = 50

# Entity classes the model is asked to recognise.
allowed_labels = ["title", "short_text", "date", "time", "tag", "short_title", "author"]

# Class name -> integer id. Ids 1..N follow the order of `allowed_labels`;
# id 0 is reserved for the catch-all "OTHER" class.
label2id = dict(zip(allowed_labels, range(1, len(allowed_labels) + 1)))
label2id["OTHER"] = 0

# Inverse mapping (integer id -> class name), obtained by flipping `label2id`.
id2label = {class_id: name for name, class_id in label2id.items()}

# Загрузка данных

In [None]:
def load_from_folder(folder_path : str):
    '''
    Load every ``*.json`` sample from a folder and attach per-node labels.

    Each file must hold a JSON object with at least two keys:

    * ``"html"`` - the HTML source that is passed to the feature extractor;
    * ``"labeled_xpaths"`` - a mapping from xpath to label name, where every
      label name must be a key of the module-level ``label2id``.

    The HTML is run through ``MarkupLMFeatureExtractor``. Every extracted
    xpath that appears in ``"labeled_xpaths"`` receives the matching id from
    ``label2id``; every other xpath receives ``0``.

    Parameters:
        folder_path: directory that contains the ``*.json`` files
            (only the directory itself is scanned, not its sub-folders).

    Returns:
        A list with one dict per file. Each dict has the keys ``'nodes'`` and
        ``'xpaths'`` (as returned by the extractor), ``'node_labels'`` (the
        label ids, wrapped in an outer list) and ``'html'``.

    Raises:
        KeyError: if a file lacks a required key or uses a label name that is
            missing from ``label2id``.
    '''
    extractor = MarkupLMFeatureExtractor()

    folder_path = os.path.abspath(folder_path)
    files_path = glob(os.path.join(folder_path, "*.json"))

    data = []

    for file_path in tqdm(files_path):
        print(file_path)
        # JSON is UTF-8 encoded; do not depend on the platform's default locale.
        with open(file_path, encoding="utf-8") as file:
            info = json.load(file)

        html = info["html"]
        labeled_xpaths = info["labeled_xpaths"]

        encoding = extractor(html)

        # Only one document was passed in, so its xpaths sit at index 0.
        labels = [
            label2id[labeled_xpaths[xpath]] if xpath in labeled_xpaths else 0
            for xpath in encoding["xpaths"][0]
        ]

        # Flag samples without a single labelled node. The message is distinct
        # from the per-file progress print above so the two cannot be confused.
        if not any(labels):
            print(f"No labeled nodes found in: {file_path}")

        # 'node_labels' gets the same outer list wrapper as 'nodes' / 'xpaths'.
        data.append({'nodes': encoding['nodes'],
                     'xpaths': encoding['xpaths'],
                     'node_labels': [labels],
                     'html': html})

    return data
    

In [None]:
# Read the training split first, then the held-out split, from their folders.
train_data, valid_data = map(
    load_from_folder,
    ("test_dataset/train_part", "test_dataset/test_part"),
)

  0%|          | 0/1980 [00:00<?, ?it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/24tnews.ru_80.json


  0%|          | 1/1980 [00:14<7:56:38, 14.45s/it]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/piter98.ru_24.json


  0%|          | 2/1980 [00:14<3:19:15,  6.04s/it]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/kubnews.ru_213.json


  0%|          | 3/1980 [00:14<1:50:53,  3.37s/it]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/freedom-news.ru_105.json


  0%|          | 4/1980 [00:15<1:10:00,  2.13s/it]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/ug.ru_40.json


  0%|          | 5/1980 [00:15<49:00,  1.49s/it]  

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/rk.karelia.ru_76.json


  0%|          | 6/1980 [00:15<34:58,  1.06s/it]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/v102.ru_238.json


  0%|          | 7/1980 [00:16<31:09,  1.06it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/politcentr.ru_0.json


  0%|          | 9/1980 [00:16<18:28,  1.78it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/ncrim.ru_109.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.penzainform.ru_120.json


  1%|          | 10/1980 [00:17<16:44,  1.96it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/freedom-news.ru_23.json


  1%|          | 11/1980 [00:17<14:12,  2.31it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.gtrk-vyatka.ru_298.json


  1%|          | 12/1980 [00:17<14:23,  2.28it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/gazeta-vp.ru_28.json


  1%|          | 14/1980 [00:18<10:56,  2.99it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/leninogorsk-rt.ru_10.json


  1%|          | 15/1980 [00:18<08:53,  3.68it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/buzdyaknews.ru_34.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/pnz.ru_21.json


  1%|          | 16/1980 [00:18<07:28,  4.38it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/gorodkovrov.ru_48.json


  1%|          | 17/1980 [00:18<08:03,  4.06it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.infpol.ru_38.json


  1%|          | 19/1980 [00:19<06:36,  4.95it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/svetput.ru_67.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/rabochy-put.ru_61.json


  1%|          | 21/1980 [00:19<05:42,  5.72it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/cheb-live.ru_1.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/citysakh.ru_41.json


  1%|          | 22/1980 [00:19<06:28,  5.05it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.amic.ru_24.json


  1%|          | 23/1980 [00:20<07:33,  4.32it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/gazeta-vp.ru_73.json


  1%|          | 24/1980 [00:20<08:01,  4.06it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/piter98.ru_201.json


  1%|▏         | 25/1980 [00:20<07:55,  4.11it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/bloknot-krasnodar.ru_29.json


  1%|▏         | 26/1980 [00:21<11:14,  2.90it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/tvtver.ru_31.json


  1%|▏         | 28/1980 [00:22<14:18,  2.27it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/rabochy-put.ru_57.json


  1%|▏         | 29/1980 [00:22<11:00,  2.95it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/bst.bratsk.ru_38.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/sarnovosti.ru_29.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/rk.karelia.ru_163.json


  2%|▏         | 32/1980 [00:22<06:56,  4.67it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/vz.ru_21.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/bloknot-krasnodar.ru_61.json


  2%|▏         | 34/1980 [00:23<07:09,  4.54it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/gazeta-vp.ru_123.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.smolensk2.ru_55.json


  2%|▏         | 36/1980 [00:23<06:22,  5.08it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/rk.karelia.ru_136.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/askino.info_16.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.kommersant.ru_24.json


  2%|▏         | 38/1980 [00:24<05:51,  5.53it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.e1.ru_382.json


  2%|▏         | 40/1980 [00:24<08:27,  3.82it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/tkgorod.ru_288.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/ntr-24.ru_158.json


  2%|▏         | 41/1980 [00:25<09:54,  3.26it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/perm.aif.ru_135.json


  2%|▏         | 42/1980 [00:25<10:16,  3.14it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/freedom-news.ru_182.json


  2%|▏         | 43/1980 [00:25<09:28,  3.40it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/a24.press_85.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.e1.ru_64.json


  2%|▏         | 46/1980 [00:26<07:16,  4.43it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/tv-gubernia.ru_9.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/nokstv.ru_47.json


  2%|▏         | 47/1980 [00:27<11:25,  2.82it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/v102.ru_216.json


  3%|▎         | 50/1980 [00:27<06:51,  4.69it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/ufa-town.ru_102.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/a24.press_154.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/ufatime.ru_94.json


  3%|▎         | 51/1980 [00:27<06:15,  5.14it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/pnz.ru_124.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.mordovmedia.ru_15.json


  3%|▎         | 54/1980 [00:28<06:12,  5.17it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/gorodkirov.ru_36.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/nokstv.ru_60.json


  3%|▎         | 56/1980 [00:29<12:33,  2.55it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/tkgorod.ru_34.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/62info.ru_43.json


  3%|▎         | 57/1980 [00:29<10:50,  2.96it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/ya62.ru_342.json


  3%|▎         | 58/1980 [00:31<19:48,  1.62it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/ptzgovorit.ru_56.json


  3%|▎         | 59/1980 [00:31<17:31,  1.83it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/vz.ru_35.json


  3%|▎         | 60/1980 [00:32<16:32,  1.93it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/moe-online.ru_47.json


  3%|▎         | 62/1980 [00:32<13:04,  2.44it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/kaltasy-zarya.ru_55.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/chistopol-rt.ru_25.json


  3%|▎         | 63/1980 [00:33<11:12,  2.85it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/primamedia.ru_152.json


  3%|▎         | 65/1980 [00:33<08:32,  3.74it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/a24.press_141.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/tvernews.ru_55.json


  3%|▎         | 66/1980 [00:33<07:52,  4.05it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/kineshemec.ru_0.json


  3%|▎         | 67/1980 [00:34<10:19,  3.09it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/rusvesna.su_4.json


  3%|▎         | 69/1980 [00:34<09:37,  3.31it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/1tulatv.ru_22.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/primamedia.ru_61.json


  4%|▎         | 70/1980 [00:35<09:16,  3.43it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/yanzori.com_21.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/rabochy-put.ru_23.json


  4%|▎         | 72/1980 [00:35<07:17,  4.37it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/politcentr.ru_55.json


  4%|▎         | 73/1980 [00:35<08:17,  3.83it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/tverigrad.ru_156.json


  4%|▎         | 74/1980 [00:36<10:00,  3.17it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/v102.ru_298.json


  4%|▍         | 76/1980 [00:37<11:03,  2.87it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/tkgorod.ru_306.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/svpressa.ru_24.json


  4%|▍         | 78/1980 [00:37<08:22,  3.78it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/kaltasy-zarya.ru_18.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/tkgorod.ru_254.json


  4%|▍         | 79/1980 [00:37<08:03,  3.94it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/vrn.aif.ru_101.json


  4%|▍         | 80/1980 [00:37<09:02,  3.50it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/1prime.ru_60.json


  4%|▍         | 81/1980 [00:38<10:10,  3.11it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/ulpressa.ru_3.json


  4%|▍         | 83/1980 [00:38<08:23,  3.76it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/freedom-news.ru_112.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/ndn.info_112.json


  4%|▍         | 85/1980 [00:39<08:04,  3.91it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/bst.bratsk.ru_26.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/rk.karelia.ru_33.json


  4%|▍         | 86/1980 [00:39<07:30,  4.21it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/ura.news_25.json


  4%|▍         | 87/1980 [00:40<09:45,  3.23it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/tverigrad.ru_78.json


  4%|▍         | 88/1980 [00:40<11:34,  2.73it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/v102.ru_133.json


  4%|▍         | 89/1980 [00:41<13:17,  2.37it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/ntr-24.ru_45.json


  5%|▍         | 90/1980 [00:41<13:12,  2.38it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/ulpressa.ru_131.json


  5%|▍         | 91/1980 [00:41<11:39,  2.70it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.smolensk2.ru_102.json


  5%|▍         | 93/1980 [00:42<09:37,  3.27it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.bashinform.ru_5.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/piter98.ru_123.json


  5%|▍         | 95/1980 [00:42<07:12,  4.36it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/kaltasy-zarya.ru_112.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/gorobzor.ru_157.json


  5%|▍         | 97/1980 [00:43<06:54,  4.54it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/okt-neft.ru_163.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.gtrk-vyatka.ru_288.json


  5%|▍         | 98/1980 [00:43<08:50,  3.55it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/rk.karelia.ru_77.json


  5%|▌         | 100/1980 [00:43<07:28,  4.19it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/pnz.ru_128.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/tverigrad.ru_162.json


  5%|▌         | 101/1980 [00:44<09:41,  3.23it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/politcentr.ru_28.json


  5%|▌         | 102/1980 [00:44<09:20,  3.35it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.dp.ru_34.json


  5%|▌         | 103/1980 [00:44<09:08,  3.42it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/freedom-news.ru_191.json


  5%|▌         | 104/1980 [00:45<08:56,  3.50it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/7info.ru_81.json


  5%|▌         | 106/1980 [00:45<07:53,  3.96it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/tvknews.ru_35.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/tvtver.ru_201.json


  5%|▌         | 107/1980 [00:46<15:16,  2.04it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/gorodkovrov.ru_56.json


  6%|▌         | 109/1980 [00:47<10:42,  2.91it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/qostanay.tv_4.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/76.ru_7.json


  6%|▌         | 111/1980 [00:48<12:20,  2.53it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/kubnews.ru_88.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/aif.ru_52.json


  6%|▌         | 113/1980 [00:48<10:38,  2.92it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.penzainform.ru_9.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.e1.ru_249.json


  6%|▌         | 114/1980 [00:49<13:48,  2.25it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/ku66.ru_10.json


  6%|▌         | 116/1980 [00:50<10:23,  2.99it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/svetput.ru_155.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.vesti.ru_68.json


  6%|▌         | 117/1980 [00:50<08:53,  3.49it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/elabuga-rt.ru_21.json


  6%|▌         | 118/1980 [00:50<10:03,  3.09it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/gorobzor.ru_74.json


  6%|▌         | 120/1980 [00:51<08:35,  3.61it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/stolica-s.su_51.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/citysakh.ru_26.json


  6%|▌         | 121/1980 [00:51<08:15,  3.75it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.amic.ru_14.json


  6%|▌         | 123/1980 [00:51<06:52,  4.50it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/ufa-town.ru_107.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/gorobzor.ru_140.json


  6%|▋         | 124/1980 [00:52<07:52,  3.93it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/v102.ru_43.json


  6%|▋         | 125/1980 [00:52<10:44,  2.88it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.amic.ru_16.json


  6%|▋         | 127/1980 [00:53<09:03,  3.41it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/tver24.com_35.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/primamedia.ru_195.json


  6%|▋         | 128/1980 [00:53<08:49,  3.49it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/tv-gubernia.ru_182.json


  7%|▋         | 130/1980 [00:53<07:33,  4.08it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/battime.ru_34.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/svetput.ru_69.json


  7%|▋         | 131/1980 [00:53<06:26,  4.78it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/tkgorod.ru_73.json


  7%|▋         | 133/1980 [00:54<05:54,  5.21it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/strana-live.ru_72.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/ncrim.ru_54.json


  7%|▋         | 135/1980 [00:54<05:00,  6.14it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/askino.info_70.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/vrn.aif.ru_4.json


  7%|▋         | 136/1980 [00:54<06:42,  4.58it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/mybashkortostan.ru_9.json


  7%|▋         | 137/1980 [00:55<08:37,  3.56it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/kaibicy.ru_27.json


  7%|▋         | 138/1980 [00:55<07:56,  3.86it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.kommersant.ru_26.json


  7%|▋         | 141/1980 [00:56<05:53,  5.21it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/kaluganews.com_22.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.e1.ru_197.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/stolica58.ru_19.json


  7%|▋         | 143/1980 [00:56<04:40,  6.55it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/aksubayevo.ru_13.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/okt-neft.ru_124.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.mk.ru_142.json


  7%|▋         | 146/1980 [00:56<05:52,  5.20it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/gazeta.a42.ru_2.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.mordovmedia.ru_7.json


  7%|▋         | 147/1980 [00:57<05:32,  5.52it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/ufa-town.ru_16.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/tv-gubernia.ru_46.json


  8%|▊         | 149/1980 [00:57<04:42,  6.48it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/livebir.ru_10.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/politcentr.ru_11.json


  8%|▊         | 151/1980 [00:57<04:42,  6.47it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/1tulatv.ru_4.json


  8%|▊         | 153/1980 [00:58<05:03,  6.02it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/pronedra.ru_33.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/freedom-news.ru_144.json


  8%|▊         | 155/1980 [00:58<04:57,  6.14it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/qostanay.tv_30.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/news-bash.ru_17.json


  8%|▊         | 156/1980 [00:58<04:30,  6.75it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/gazeta.a42.ru_16.json


  8%|▊         | 158/1980 [00:58<05:50,  5.20it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.osnmedia.ru_18.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/agryz-rt.ru_9.json


  8%|▊         | 159/1980 [00:59<05:48,  5.23it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/gazeta.a42.ru_40.json


  8%|▊         | 160/1980 [00:59<07:21,  4.12it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/ya62.ru_103.json


  8%|▊         | 162/1980 [01:01<13:54,  2.18it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.bashinform.ru_1.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/leninogorsk-rt.ru_29.json


  8%|▊         | 163/1980 [01:01<13:56,  2.17it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/gorodkovrov.ru_4.json


  8%|▊         | 164/1980 [01:01<12:20,  2.45it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/citysakh.ru_7.json


  8%|▊         | 165/1980 [01:02<10:42,  2.82it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/gazeta-vp.ru_30.json


  8%|▊         | 166/1980 [01:02<09:51,  3.07it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/sdelanounas.ru_29.json


  8%|▊         | 168/1980 [01:03<10:13,  2.95it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/ufa-town.ru_41.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/ctnews.ru_15.json


  9%|▊         | 169/1980 [01:03<08:16,  3.65it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.smolensk2.ru_16.json


  9%|▊         | 171/1980 [01:03<08:25,  3.58it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/vz.ru_98.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/rk.karelia.ru_183.json


  9%|▊         | 173/1980 [01:04<06:54,  4.36it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/tkgorod.ru_224.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/vrn.aif.ru_134.json


  9%|▉         | 175/1980 [01:04<06:55,  4.35it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/progorodsamara.ru_36.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/altai.aif.ru_38.json


  9%|▉         | 176/1980 [01:05<07:30,  4.00it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/sdelanounas.ru_15.json


  9%|▉         | 177/1980 [01:05<10:29,  2.86it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/moe-online.ru_10.json


  9%|▉         | 178/1980 [01:06<11:54,  2.52it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/togliatti24.ru_14.json


  9%|▉         | 181/1980 [01:06<07:17,  4.12it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/svetput.ru_15.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/svetput.ru_169.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/piter98.ru_52.json


  9%|▉         | 183/1980 [01:06<06:03,  4.95it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/strana-live.ru_1.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/politcentr.ru_29.json


  9%|▉         | 184/1980 [01:07<07:17,  4.11it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/russian.rt.com_14.json


  9%|▉         | 185/1980 [01:07<07:00,  4.26it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/russian.rt.com_14.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.province.ru_24.json


  9%|▉         | 186/1980 [01:07<09:04,  3.29it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.gtrk-vyatka.ru_320.json


  9%|▉         | 187/1980 [01:08<10:35,  2.82it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.smolensk2.ru_143.json


  9%|▉         | 188/1980 [01:08<11:03,  2.70it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/161.ru_33.json


 10%|▉         | 189/1980 [01:09<13:27,  2.22it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/gorodkovrov.ru_66.json


 10%|▉         | 190/1980 [01:09<11:21,  2.63it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/tv-gubernia.ru_150.json


 10%|▉         | 192/1980 [01:10<08:40,  3.43it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/pronedra.ru_15.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/svetput.ru_65.json


 10%|▉         | 193/1980 [01:10<07:00,  4.24it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/primamedia.ru_176.json


 10%|▉         | 194/1980 [01:10<09:20,  3.19it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/tvernews.ru_26.json


 10%|▉         | 195/1980 [01:11<09:03,  3.28it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.amic.ru_38.json


 10%|▉         | 197/1980 [01:11<07:40,  3.87it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/mybashkortostan.ru_0.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/gazeta.a42.ru_7.json


 10%|█         | 198/1980 [01:11<08:36,  3.45it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/elabuga-rt.ru_6.json


 10%|█         | 199/1980 [01:12<08:03,  3.68it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/vz.ru_96.json


 10%|█         | 200/1980 [01:12<08:00,  3.71it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/tmn.aif.ru_185.json


 10%|█         | 201/1980 [01:12<07:49,  3.79it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/rzn.mk.ru_28.json


 10%|█         | 202/1980 [01:13<14:07,  2.10it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/politcentr.ru_135.json


 10%|█         | 204/1980 [01:14<10:08,  2.92it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/voshodnews.ru_34.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.gorno-altaisk.info_18.json


 10%|█         | 205/1980 [01:14<09:10,  3.22it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/mybashkortostan.ru_12.json


 10%|█         | 206/1980 [01:14<09:38,  3.07it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/nash-krai.ru_190.json


 10%|█         | 207/1980 [01:15<10:31,  2.81it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/chuprale-online.ru_30.json


 11%|█         | 208/1980 [01:15<09:13,  3.20it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/askino.info_48.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/rusvesna.su_42.json


 11%|█         | 210/1980 [01:15<08:59,  3.28it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/ku66.ru_12.json


 11%|█         | 212/1980 [01:16<08:45,  3.37it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/gorodkirov.ru_18.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/gazeta.a42.ru_17.json


 11%|█         | 213/1980 [01:16<09:10,  3.21it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/freedom-news.ru_183.json


 11%|█         | 214/1980 [01:17<08:34,  3.43it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/tmn.aif.ru_0.json


 11%|█         | 216/1980 [01:17<07:00,  4.20it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/kaltasy-zarya.ru_17.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/gorodglazov.com_9.json


 11%|█         | 217/1980 [01:17<07:34,  3.88it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/v102.ru_66.json


 11%|█         | 218/1980 [01:18<10:29,  2.80it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.gtrk-vyatka.ru_44.json


 11%|█         | 220/1980 [01:18<09:16,  3.17it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.gorno-altaisk.info_24.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/rusvesna.su_0.json


 11%|█         | 222/1980 [01:19<08:28,  3.46it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/chuprale-online.ru_21.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/perm.aif.ru_67.json


 11%|█▏        | 223/1980 [01:19<08:04,  3.63it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.rbc.ru_20.json


 11%|█▏        | 224/1980 [01:20<11:08,  2.63it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/svetput.ru_207.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/kem-live.ru_38.json


 11%|█▏        | 227/1980 [01:20<07:03,  4.14it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/bavly-tat.ru_0.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/7info.ru_103.json


 12%|█▏        | 229/1980 [01:21<07:01,  4.15it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/tkgorod.ru_244.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/perm.aif.ru_207.json


 12%|█▏        | 231/1980 [01:21<06:18,  4.62it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/togliatti24.ru_31.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/rk.karelia.ru_182.json


 12%|█▏        | 232/1980 [01:21<06:24,  4.55it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.vesti.ru_47.json


 12%|█▏        | 233/1980 [01:22<06:34,  4.43it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.e1.ru_297.json


 12%|█▏        | 235/1980 [01:23<08:48,  3.30it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/strana-live.ru_131.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/orenday.ru_2.json


 12%|█▏        | 236/1980 [01:23<09:22,  3.10it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/a24.press_178.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/perm.aif.ru_167.json


 12%|█▏        | 238/1980 [01:23<08:43,  3.33it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/primamedia.ru_11.json


 12%|█▏        | 239/1980 [01:24<08:43,  3.32it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/perm.aif.ru_114.json


 12%|█▏        | 240/1980 [01:24<08:43,  3.33it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/ulan.mk.ru_31.json


 12%|█▏        | 241/1980 [01:25<13:29,  2.15it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.m24.ru_243.json


 12%|█▏        | 242/1980 [01:25<13:01,  2.22it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/gorodkovrov.ru_69.json


 12%|█▏        | 244/1980 [01:26<09:18,  3.11it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/kaluganews.com_38.json
/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/www.newsler.ru_54.json


 12%|█▏        | 245/1980 [01:26<09:20,  3.10it/s]

/home/ubuntu/Documents/MarkupClassification/test_dataset/train_part/gazeta.a42.ru_6.json


In [None]:
# Report the size of each split and the share of samples used for training.
print("Train size : ", len(train_data))
print("Test size : ", len(valid_data))
# Fraction of all samples that belong to the training split (a value in [0, 1]).
# max(..., 1) keeps the cell from dividing by zero when both splits are empty.
print("Train proportion : ", len(train_data) / max(len(train_data) + len(valid_data), 1))

In [None]:
# Show every node of one validation sample whose label is not "OTHER".
idx = 0
sample = valid_data[idx]
for node, label in zip(sample['nodes'][0], sample['node_labels'][0]):
    label_name = id2label[label]
    if label_name == 'OTHER':
        continue
    print(node, label_name)
     

# Инициализация датасета

In [None]:
class MarkupLMDataset(Dataset):
    """Dataset for token classification with MarkupLM.

    Wraps a sequence of samples and encodes each one lazily with a processor.

    Args:
        data: Indexable collection of dicts, each providing the keys
            ``'nodes'``, ``'xpaths'`` and ``'node_labels'``.
        processor: Callable that accepts ``nodes``, ``xpaths`` and
            ``node_labels`` keyword arguments and returns a mapping of
            tensors. It must be set before items are requested.
    """

    def __init__(self, data, processor=None):
        self.data = data
        self.processor = processor

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode sample ``idx`` and return a dict of tensors without the batch axis."""
        # first, get nodes, xpaths and node labels
        item = self.data[idx]
        nodes, xpaths, node_labels = item['nodes'], item['xpaths'], item['node_labels']

        # provide to processor; every sample is padded/truncated to the model's max length
        encoding = self.processor(
            nodes=nodes,
            xpaths=xpaths,
            node_labels=node_labels,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # Remove only the leading batch dimension. A bare squeeze() would also
        # collapse any other dimension that happens to have size 1.
        return {k: v.squeeze(0) for k, v in encoding.items()}

In [None]:
# Load the pretrained MarkupLM processor.
processor = MarkupLMProcessor.from_pretrained(
    "microsoft/markuplm-base",
    truncation=True,
)
# The datasets pass pre-extracted nodes and xpaths, so the processor must not parse HTML itself.
processor.parse_html = False

# Wrap both splits with the same processor.
train_set, valid_set = (
    MarkupLMDataset(data=split, processor=processor)
    for split in (train_data, valid_data)
)

In [None]:
# Encode the first validation sample and list the shape of each tensor it contains.
example = valid_set[0]
for k in example:
    v = example[k]
    print(k, v.shape)

In [None]:
# Turn the token ids of the encoded sample back into text for visual inspection
# (as the last expression of the cell, its value is shown as the cell output).
processor.decode(example['input_ids'])

In [None]:
# Print every token of the sample whose label id is 1, i.e. "title" according to label2id.
# The loop variable is named token_id so that the builtin `id` is not overwritten
# for the rest of the notebook.
for token_id, label in zip(example['input_ids'].tolist(), example['labels'].tolist()):
    if label == 1:
        print(processor.decode([token_id]), label)

In [None]:
# Batch both splits identically: 3 samples per batch, reshuffled on every pass.
# NOTE(review): the configuration cell defines batch_size = 50, but 3 is hard-coded here - confirm which is intended.
train_dataloader, valid_dataloader = (
    DataLoader(split, batch_size=3, shuffle=True)
    for split in (train_set, valid_set)
)

In [None]:
# Pretrained MarkupLM backbone with a token-classification head that uses the project's label maps.
model = MarkupLMForTokenClassification.from_pretrained(
    "microsoft/markuplm-base",
    label2id=label2id,
    id2label=id2label,
)

In [None]:
# Tag names for seqeval, positioned so that label_list[class_id] is the tag of that class.
# id2label was filled with ids 1..N first and id 0 ("OTHER") last, so taking
# list(id2label.values()) would shift every name by one position; build the
# list in ascending id order instead.
label_list = ["B-" + id2label[class_id] for class_id in sorted(id2label)]

In [None]:
import evaluate

# seqeval metric object, loaded through the `evaluate` library; the training
# loop adds batches to it and compute_metrics() reads the aggregated result.
metric = evaluate.load("seqeval")

def get_labels(predictions, references, label_names=None):
    """Convert predicted and gold class-id tensors into lists of tag strings.

    Positions whose gold label is -100 (ignored index) are dropped from both
    outputs.

    Args:
        predictions: Tensor of predicted class ids, one row per sequence.
        references: Tensor of gold class ids with the same layout as
            ``predictions``.
        label_names: Optional sequence mapping a class id to its tag string.
            Defaults to the module-level ``label_list``, which must be
            indexable by class id.

    Returns:
        A tuple ``(true_predictions, true_labels)``; each is a list with one
        list of tag strings per sequence.
    """
    if label_names is None:
        label_names = label_list

    # .cpu() is a no-op for tensors already on the CPU, so there is no need to
    # branch on a global device; the arrays are only read, so no copy is made.
    y_pred = predictions.detach().cpu().numpy()
    y_true = references.detach().cpu().numpy()

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(y_pred, y_true):
        # Remove ignored index (special tokens)
        kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
        true_predictions.append([label_names[p] for p, _ in kept])
        true_labels.append([label_names[g] for _, g in kept])
    return true_predictions, true_labels

def compute_metrics(metric, return_entity_level_metrics=True):
    """Compute the accumulated metric and return it as a flat dictionary.

    Args:
        metric: Object whose ``compute()`` method returns a dict of results.
        return_entity_level_metrics: When true, every nested dict in the
            result is flattened into ``"<key>_<subkey>"`` entries and all
            other entries are kept as they are. When false, only the four
            ``overall_*`` values are returned under short names.

    Returns:
        A flat dict of metric values.
    """
    results = metric.compute()

    if not return_entity_level_metrics:
        return {
            short_name: results[f"overall_{short_name}"]
            for short_name in ("precision", "recall", "f1", "accuracy")
        }

    # Unpack nested dictionaries
    flattened = {}
    for key, value in results.items():
        if isinstance(value, dict):
            flattened.update({f"{key}_{n}": v for n, v in value.items()})
        else:
            flattened[key] = value
    return flattened

# TRAIN

In [None]:
import torch
from torch.optim import AdamW
from tqdm.auto import tqdm
from sklearn.metrics import classification_report, f1_score

# Fine-tune the model: AdamW over all model parameters with a fixed learning rate of 2e-5.
optimizer = AdamW(model.parameters(), lr=2e-5)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

# Training mode is enabled once here; nothing inside the loop switches it back.
model.train()
print(device)
# 10 passes over the training data.
for epoch in range(10):
    for batch in tqdm(train_dataloader):
        # get the inputs; every tensor of the batch is moved to the selected device
        inputs = {k:v.to(device) for k,v in batch.items()}

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(**inputs)

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # print("Loss:", loss.item())

        # Class id with the highest logit per token, taken from the same forward
        # pass that produced the loss (i.e. before this step's weight update).
        predictions = outputs.logits.argmax(dim=-1)
        # Gold labels are read from the original batch, not from the device copy.
        labels = batch["labels"]
        preds, refs = get_labels(predictions, labels)
        metric.add_batch(
            predictions=preds,
            references=refs,
        )

    # Report the training metrics gathered over this epoch.
    # NOTE(review): assumes metric.compute() clears the accumulated batches so
    # that each epoch is reported on its own - confirm.
    train_metric = compute_metrics(metric)
    print(f"Epoch {epoch}:", train_metric)
      


# TEST

In [None]:
# Evaluate the model on the validation split with a fresh seqeval metric.
model.eval()
print(device)

test_metric = evaluate.load("seqeval")

# No gradients are needed for evaluation, so the whole pass runs under no_grad.
with torch.no_grad():
    for batch in tqdm(valid_dataloader):
        # move every tensor of the batch to the selected device
        inputs = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**inputs)

        predictions = torch.argmax(outputs.logits, dim=-1)
        labels = batch["labels"]
        preds, refs = get_labels(predictions, labels)
        test_metric.add_batch(predictions=preds, references=refs)

eval_metric = compute_metrics(test_metric)
print("TESTING RESULT :", eval_metric)