# Ноутбук для третьего задания по курсу "Обработка текстов" 2022

## Работа с датасетом

### Работа с файлами

In [1]:
# Сортировка для отрезков [start, end] (глупая за O(n^2))
def bubble_sort(arr):
    for i in range(len(arr) - 1):
        for j in range(len(arr) - i - 1):
            if arr[j][0] > arr[j + 1][0] or arr[j][0] == arr[j + 1][0] and arr[j][1] < arr[j + 1][1]:
                arr[j], arr[j + 1] = arr[j + 1], arr[j]
    return arr


# Проверка на пересечение двух отсортированных отрезков
def check_intersection(l1, r1, l2, r2):
    return l2 <= r1
                

# Работа с файлами .ann формата
def read_ann_file(path):
    spans = []
    
    file = open(path, 'r')
    for line in file:
        if len(line) == 0:
            continue
        if line[0] != 'T':
            continue
        
        columns = line.split('\t')
        if ';' not in columns[1]:
            category, start, end = columns[1].split(' ')
            spans.append([int(start), int(end), category])
    file.close()
    
    spans = bubble_sort(spans)
    last_not_intersect = [-1, -1]
    filtered_spans = []
    for elem in spans:
        if not check_intersection(last_not_intersect[0], last_not_intersect[1], elem[0], elem[1]):
            last_not_intersect = [elem[0], elem[1]]
            filtered_spans.append(elem)
    
    return filtered_spans
    

# Работа с файлами .txt формата
def read_text_file(path):
    with open(path) as f:
        return f.read()

### BIO разметка

In [2]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForTokenClassification, BertForTokenClassification
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")

def bio(path):
    ann_file = read_ann_file(f'{path}.ann')
    txt_file = read_text_file(f'{path}.txt')
    spans = tokenizer(txt_file, return_offsets_mapping=True, add_special_tokens=False, truncation=True).offset_mapping
    tokens = []
    
    for span in spans:
        tokens.append(txt_file[span[0]:span[1]])
    labels = ["O"] * len(tokens)
    idx = 0
    cnt_spans = len(spans)
    for ann in ann_file:
        while idx < cnt_spans and ann[0] > spans[idx][0]:
            idx += 1
        if idx < cnt_spans:
            labels[idx] = f'B-{ann[2]}'
        idx += 1
        while idx < cnt_spans and spans[idx][1] <= ann[1]:
            labels[idx] = f"I-{ann[2]}"
            idx += 1
    
    return tokens, labels, spans
    

  from .autonotebook import tqdm as notebook_tqdm


### Функция для чтения датасета

In [3]:
import os


def read_data(path):
    token_seq, label_seq, spans_seq = [], [], []
    files = set()
    for file in os.listdir(path):
        if file[0] != '.':
            files.add(file[:-4])
    
    for file in files:
        tokens, labels, spans = bio(f'{path}/{file}')
        token_seq.append(tokens)
        label_seq.append(labels)
        spans_seq.append(spans)
    
    return token_seq, label_seq, spans_seq

In [4]:
TRAIN_PATH = 'data/train'
VALID_PATH = 'data/dev'
TEST_PATH = 'data/test'

train_token_seq, train_label_seq, train_spans_seq = read_data(TRAIN_PATH)

test_token_seq, test_label_seq, test_spans_seq = read_data(TEST_PATH)

valid_token_seq, valid_label_seq, valid_spans_seq = read_data(VALID_PATH)

print(train_token_seq[:10])

[['Египет', 'ского', 'студента', 'могут', 'выслать', 'из', 'страны', 'за', 'высказывания', 'о', 'Трамп', 'е', 'Дональд', 'Трамп', 'Египет', 'ский', 'студент', ',', 'обуч', 'ающийся', 'в', 'американской', 'лет', 'ной', 'школе', ',', 'в', 'пятницу', ',', '4', 'марта', '2016', 'года', ',', 'вынужден', 'был', 'явиться', 'в', 'иммигра', 'ционный', 'суд', ',', 'где', 'будет', 'приниматься', 'решение', 'о', 'его', 'депортации', 'в', 'связи', 'с', 'опубликованным', 'и', 'им', 'в', 'социальных', 'сетях', 'комментариями', ',', 'содержа', 'щими', 'угрозы', 'в', 'адрес', 'потенциального', 'кандидата', 'в', 'президенты', 'от', 'Республиканской', 'партии', 'Дональда', 'Трампа', '.', 'Эм', 'аде', 'льд', 'ин', 'Эль', 'са', 'ед', ',', '23', '-', 'летний', 'студент', 'из', 'Каи', 'ра', ',', 'предстал', 'перед', 'иммигра', 'ционным', 'судом', 'в', 'Лос', '-', 'Анджелесе', ',', 'после', 'того', 'как', 'он', 'написал', 'на', 'своей', 'странице', 'в', 'Facebook', ',', 'что', 'готов', 'отсид', 'еть', 'пожиз'

In [5]:
print(train_label_seq[:10])

[['B-NATIONALITY', 'I-NATIONALITY', 'B-PROFESSION', 'O', 'B-PENALTY', 'I-PENALTY', 'I-PENALTY', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'B-PERSON', 'I-PERSON', 'B-NATIONALITY', 'I-NATIONALITY', 'B-PROFESSION', 'O', 'O', 'O', 'O', 'B-COUNTRY', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'I-DATE', 'I-DATE', 'I-DATE', 'I-DATE', 'O', 'O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PENALTY', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PROFESSION', 'I-PROFESSION', 'I-PROFESSION', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'B-PERSON', 'I-PERSON', 'O', 'B-PERSON', 'I-PERSON', 'I-PERSON', 'I-PERSON', 'I-PERSON', 'I-PERSON', 'I-PERSON', 'O', 'B-AGE', 'I-AGE', 'I-AGE', 'B-PROFESSION', 'O', 'B-CITY', 'I-CITY', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'B-CITY', 'I-CITY', 'I-CITY', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PRODUCT', 'O', 'O', 'O',

In [6]:
print(train_spans_seq[:10])

[[(0, 6), (6, 11), (12, 20), (21, 26), (27, 34), (35, 37), (38, 44), (45, 47), (48, 60), (61, 62), (63, 68), (68, 69), (70, 77), (78, 83), (84, 90), (90, 94), (95, 102), (102, 103), (104, 108), (108, 115), (116, 117), (118, 130), (131, 134), (134, 137), (138, 143), (143, 144), (145, 146), (147, 154), (154, 155), (156, 157), (158, 163), (164, 168), (169, 173), (173, 174), (175, 183), (184, 187), (188, 195), (196, 197), (198, 205), (205, 212), (213, 216), (216, 217), (218, 221), (222, 227), (228, 239), (240, 247), (248, 249), (250, 253), (254, 264), (265, 266), (267, 272), (273, 274), (275, 289), (289, 290), (291, 293), (294, 295), (296, 306), (307, 312), (313, 326), (326, 327), (328, 335), (335, 339), (340, 346), (347, 348), (349, 354), (355, 369), (370, 379), (380, 381), (382, 392), (393, 395), (396, 411), (412, 418), (419, 427), (428, 434), (434, 435), (437, 439), (439, 442), (442, 445), (445, 447), (448, 451), (451, 453), (453, 455), (455, 456), (457, 459), (459, 460), (460, 466), (4

## Создание Dataset и Collator (ЧАСТЬ ВЗЯТА ИЗ НОУТБУКА ПО DL "Домашнее задание 3 Named Entity Recognition")

In [7]:
from collections import Counter
from typing import Tuple, List, Dict, Any

token2cnt = Counter([token for sentence in train_token_seq for token in sentence])

In [8]:
# используйте параметр min_count для того, чтобы отсекать слова частотой cnt < min_count

def get_token2idx(
    token2cnt: Dict[str, int],
    min_count: int,
) -> Dict[str, int]:
    """
    Get mapping from tokens to indices to use with Embedding layer.
    """

    token2idx: Dict[str, int] = {}
    token2idx["<PAD>"], token2idx["<UNK>"] = 0, 1
    idx = 0
    for token, cnt in token2cnt.items():
        if cnt < min_count:
            continue
        idx += 1
        token2idx[token] = idx + 1 # потому что 0 1 индексы уже используем

    return token2idx

In [9]:
token2idx = get_token2idx(token2cnt, min_count=1)

In [10]:
# Функция для сортировки тегов, чтобы сначала был тег O, потом теги B- и только после теги I- (можно задать вручную)

def sort_labels_func(x: str) -> int:
    if x == "O":
        return 0
    elif x.startswith("B-"):
        return 1
    else:
        return 2

label_set = sorted(
    set(label for sentence in train_label_seq for label in sentence),
    key=lambda x: (sort_labels_func(x), x),
)

In [11]:
label_set

['O',
 'B-AGE',
 'B-AWARD',
 'B-CITY',
 'B-COUNTRY',
 'B-CRIME',
 'B-DATE',
 'B-DISEASE',
 'B-DISTRICT',
 'B-EVENT',
 'B-FACILITY',
 'B-FAMILY',
 'B-IDEOLOGY',
 'B-LANGUAGE',
 'B-LAW',
 'B-LOCATION',
 'B-MONEY',
 'B-NATIONALITY',
 'B-NUMBER',
 'B-ORDINAL',
 'B-ORGANIZATION',
 'B-PENALTY',
 'B-PERCENT',
 'B-PERSON',
 'B-PRODUCT',
 'B-PROFESSION',
 'B-RELIGION',
 'B-STATE_OR_PROVINCE',
 'B-TIME',
 'B-WORK_OF_ART',
 'I-AGE',
 'I-AWARD',
 'I-CITY',
 'I-COUNTRY',
 'I-CRIME',
 'I-DATE',
 'I-DISEASE',
 'I-DISTRICT',
 'I-EVENT',
 'I-FACILITY',
 'I-FAMILY',
 'I-IDEOLOGY',
 'I-LANGUAGE',
 'I-LAW',
 'I-LOCATION',
 'I-MONEY',
 'I-NATIONALITY',
 'I-NUMBER',
 'I-ORDINAL',
 'I-ORGANIZATION',
 'I-PENALTY',
 'I-PERCENT',
 'I-PERSON',
 'I-PRODUCT',
 'I-PROFESSION',
 'I-RELIGION',
 'I-STATE_OR_PROVINCE',
 'I-TIME',
 'I-WORK_OF_ART']

In [12]:
def get_label2idx(label_set: List[str]) -> Dict[str, int]:
    """
    Get mapping from labels to indices.
    """

    label2idx: Dict[str, int] = {}
    for idx, label in enumerate(label_set):
        label2idx[label] = idx

    return label2idx

In [13]:
label2idx = get_label2idx(label_set)

In [14]:
for token, idx in list(token2idx.items())[:10]:
    print(f"{token}\t{idx}")

<PAD>	0
<UNK>	1
Египет	2
ского	3
студента	4
могут	5
выслать	6
из	7
страны	8
за	9


In [15]:
for label, idx in list(label2idx.items())[:10]:
    print(f"{label}\t{idx}")

O	0
B-AGE	1
B-AWARD	2
B-CITY	3
B-COUNTRY	4
B-CRIME	5
B-DATE	6
B-DISEASE	7
B-DISTRICT	8
B-EVENT	9


In [16]:
# класс датасета NERDataset

class NERDataset(torch.utils.data.Dataset):
    """
    PyTorch Dataset for NER.
    """

    def __init__(
        self,
        token_seq: List[List[str]],
        label_seq: List[List[str]],
        token2idx: Dict[str, int],
        label2idx: Dict[str, int],
    ):
        self.token2idx = token2idx
        self.label2idx = label2idx

        self.token_seq = [self.process_tokens(tokens, token2idx) for tokens in token_seq]
        self.label_seq = [self.process_labels(labels, label2idx) for labels in label_seq]

    def __len__(self):
        return len(self.token_seq)

    def __getitem__(
        self,
        idx: int,
    ) -> Tuple[torch.LongTensor, torch.LongTensor]:
        first_item = torch.LongTensor(self.token_seq[idx])
        second_item = torch.LongTensor(self.label_seq[idx])
        return first_item, second_item
        
    @staticmethod
    def process_tokens(
        tokens: List[str],
        token2idx: Dict[str, int],
        unk: str = "<UNK>",
    ) -> List[int]:
        """
        Transform list of tokens into list of tokens' indices.
        """
        idxs = []
        for token in tokens:
            key = unk
            if token in token2idx:
                key = token
            idx = token2idx[key]
            idxs.append(idx)
        return idxs

    @staticmethod
    def process_labels(
        labels: List[str],
        label2idx: Dict[str, int],
    ) -> List[int]:
        """
        Transform list of labels into list of labels' indices.
        """
        idxs = []
        for label in labels:
            idx = label2idx[label]
            idxs.append(idx)
        
        return idxs

In [17]:
train_dataset = NERDataset(
    token_seq=train_token_seq,
    label_seq=train_label_seq,
    token2idx=token2idx,
    label2idx=label2idx,
)
valid_dataset = NERDataset(
    token_seq=valid_token_seq,
    label_seq=valid_label_seq,
    token2idx=token2idx,
    label2idx=label2idx,
)
test_dataset = NERDataset(
    token_seq=test_token_seq,
    label_seq=test_label_seq,
    token2idx=token2idx,
    label2idx=label2idx,
)

In [18]:
train_dataset[0]

(tensor([  2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  12,
           2,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  17,  20,  25,
          17,  26,  27,  28,  29,  17,  30,  31,  32,  20,  33,  34,  35,  17,
          36,  37,  38,  39,  11,  40,  41,  20,  42,  43,  44,  45,  46,  20,
          47,  48,  49,  17,  50,  51,  52,  20,  53,  54,  55,  20,  56,  57,
          58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  17,  70,
          71,  72,  16,   7,  73,  74,  17,  75,  76,  33,  77,  78,  20,  79,
          71,  80,  17,  81,  82,  83,  84,  85,  86,  87,  88,  20,  89,  17,
          90,  91,  92,  93,  94,  95,  96,   9,  97,  61,  45,  98,  17,  90,
          99,  37, 100, 101,   9, 102,  62, 103,  33, 104, 105, 106,  67,  68,
         107,  20, 108, 109, 110, 111, 112, 113, 114,  17,  36,  84, 115,  43,
         116, 117,  62, 118, 119, 120, 121, 122,  67,  68, 107,  17, 123, 124,
          17,  90,  40, 125, 126, 127, 128, 129, 130

In [19]:
class NERCollator:
    """
    Collator that handles variable-size sentences.
    """

    def __init__(
        self,
        token_padding_value: int,
        label_padding_value: int,
    ):
        self.token_padding_value = token_padding_value
        self.label_padding_value = label_padding_value

    def __call__(
        self,
        batch: List[Tuple[torch.LongTensor, torch.LongTensor]],
    ) -> Dict[str, torch.LongTensor]:

        tokens, labels = zip(*batch)
        tokens = torch.nn.utils.rnn.pad_sequence(tokens, batch_first=True, padding_value=self.token_padding_value).long()
        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=self.label_padding_value).long()
        attention_mask = torch.nn.utils.rnn.pad_sequence(list(map(partial(torch.ones_like, dtype=torch.bool), tokens)), batch_first=True, padding_value=False).long()
        return {
            "input_ids": tokens,
            "labels": labels,
            "attention_mask": attention_mask
        }

In [20]:
collator = NERCollator(
    token_padding_value=tokenizer.pad_token_id,
    label_padding_value=-100,
)

## Обучаемся

In [21]:
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [22]:
from transformers import EvalPrediction
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import logging
import numpy as np


logger = logging.getLogger(__name__)

label_list = []
for label in label_set:
    if label != 'O':
        label_list.append(label2idx[label])

def compute_metrics(
    evaluation_results: EvalPrediction,
    category_id_mapping: Dict[int, str],
    no_entity_category_id: int,
    short_output = False,
    BIO_NOTATION = False
    ) -> Dict[str, float]:

    predictions = np.argmax(evaluation_results.predictions, axis=-1)
    padding_mask = label_mask = (evaluation_results.label_ids != -100)

    #label_mask = np.triu(padding_mask)
    label_ids = evaluation_results.label_ids[label_mask]
    predictions = predictions[label_mask]

    unique_label_ids = set(np.unique(label_ids[label_ids != no_entity_category_id]))

    labels = sorted(category_id_mapping.keys())
    f1_category_scores = f1_score(label_ids, predictions, average=None, labels=labels, zero_division=0)
    recall_category_scores = recall_score(label_ids, predictions, average=None, labels=label_list, zero_division=0)
    precision_category_scores = precision_score(label_ids, predictions, average=None, labels=label_list, zero_division=0)

    results: Dict[str, float] = {}
    sum_f1 = 0
    sum_recall = 0
    sum_precision = 0
    for category_id, (f1, recall, precision) in enumerate(zip(f1_category_scores, recall_category_scores, precision_category_scores)):
        if category_id == no_entity_category_id:
            logger.info(f'O: {f1}, {recall}, {precision}')
            continue

        if category_id not in unique_label_ids:
            logger.info(f'Skipping {category_id_mapping[category_id]}: {f1}, {recall}, {precision}')

        category = category_id_mapping[category_id]

        sum_f1 += f1
        sum_recall += recall
        sum_precision += precision

    num_categories = len(category_id_mapping) - 1

    results['F1_macro'] = sum_f1 / num_categories
    results['Recall_macro'] = sum_recall / num_categories
    results['Precision_macro'] = sum_precision / num_categories
    return results


In [23]:
from transformers import Trainer, EvalPrediction

In [24]:
device = "cuda" if torch.cuda.is_available() else "cpu"

from functools import partial
from transformers import TrainingArguments

model = BertForTokenClassification.from_pretrained("cointegrated/rubert-tiny2", num_labels=len(label_set)).to(device)

args = TrainingArguments(
    output_dir="model_out",
    learning_rate=3e-4,
    weight_decay=1e-4,
    lr_scheduler_type="cosine",
    full_determinism=False,
    seed=42,
    per_device_train_batch_size=2,
    num_train_epochs=50,
    evaluation_strategy="steps",
    eval_steps=500,
)

label_from_idx: Dict[int, str] = {}

for idx, label in enumerate(label_set):
    label_from_idx[idx] = label

trainer = Trainer(
    model=model,
    args=args,
    data_collator=collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=partial(
        compute_metrics,
        category_id_mapping=label_from_idx,
        no_entity_category_id=-1,
        short_output=True,
        BIO_NOTATION=True
    )
)
trainer.train()

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized 

Step,Training Loss,Validation Loss,F1 Macro,Recall Macro,Precision Macro
500,1.444,1.282482,0.185399,0.169085,0.315646
1000,0.7557,1.144688,0.324784,0.29287,0.415076
1500,0.4273,1.216442,0.4155,0.366108,0.528281
2000,0.2311,1.317605,0.440378,0.394883,0.513976
2500,0.1519,1.412639,0.463209,0.41821,0.536814
3000,0.1082,1.46174,0.491598,0.428788,0.612122
3500,0.0766,1.545399,0.496341,0.439796,0.606107
4000,0.061,1.584387,0.498134,0.447723,0.58591
4500,0.0512,1.615715,0.498469,0.455271,0.578472
5000,0.0409,1.692062,0.497571,0.45234,0.567101


***** Running Evaluation *****
  Num examples = 94
  Batch size = 8
Saving model checkpoint to model_out/checkpoint-500
Configuration saved in model_out/checkpoint-500/config.json
Model weights saved in model_out/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 94
  Batch size = 8
Saving model checkpoint to model_out/checkpoint-1000
Configuration saved in model_out/checkpoint-1000/config.json
Model weights saved in model_out/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 94
  Batch size = 8
Saving model checkpoint to model_out/checkpoint-1500
Configuration saved in model_out/checkpoint-1500/config.json
Model weights saved in model_out/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 94
  Batch size = 8
Saving model checkpoint to model_out/checkpoint-2000
Configuration saved in model_out/checkpoint-2000/config.json
Model weights saved in model_out/checkpoint-2000/pytorch_model.bin
***** R

TrainOutput(global_step=18650, training_loss=0.0953935909834528, metrics={'train_runtime': 226.2191, 'train_samples_per_second': 164.884, 'train_steps_per_second': 82.442, 'total_flos': 200561311362984.0, 'train_loss': 0.0953935909834528, 'epoch': 50.0})

In [25]:
import pickle

from transformers.modeling_utils import unwrap_model

trained_model = unwrap_model(trainer.model_wrapped)
trained_model = trained_model.to("cpu")

with open('rubert_tiny.pkl', 'wb') as handle:
    pickle.dump(trained_model, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [26]:
with open('label2idx.pkl', 'wb') as handle:
    pickle.dump(label2idx, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('token2idx.pkl', 'wb') as handle:
    pickle.dump(token2idx, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [27]:
import lzma
import pickle
 
def compress_data(data):
    return lzma.compress(pickle.dumps(data), format=lzma.FORMAT_RAW, filters=[{"id":lzma.FILTER_LZMA2,"dict_size":268435456, "preset":9, "mf":lzma.MF_HC3, "depth":0, "lc":3}])

In [28]:
compressed_state = compress_data(trained_model.state_dict())
compressed_config = compress_data(trained_model.config)
f = open("state.xz", "wb")
f.write(compressed_state)
f.close()
f = open("config.xz", "wb")
f.write(compressed_config)
f.close()

In [29]:
! ls -lah

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
total 211M
drwxrwxr-x 11 xellar xellar 4.0K Nov 28 10:29 .
drwxrwxr-x  4 xellar xellar 4.0K Nov 26 16:29 ..
-rw-rw-r--  1 xellar xellar 1.4K Nov 28 10:30 config.xz
drwxrwxr-x  5 xellar xellar 4.0K Nov 26 17:48 data
-rw-rw-r--  1 xellar xellar 1.9K Nov 27 19:25 dataset.py
drwxrwxr-x  2 xellar xellar 4.0K Nov 27 18:44 .ipynb_checkpoints
-rw-rw-r--  1 xellar xellar  842 Nov 28 10:27 label2idx.pkl
drwxrwxr-x 39 xellar xellar 4.0K Nov 28 10:27 model_out
-rw-rw-r--  1 xellar xellar 143K Nov 28 10:29 NER_notebook.ipynb
drwxrwxr-x  2 xellar xellar 4.0K Nov 27 10:43 out
drwxrwxr-x  2 xellar xellar 4.0K Nov 27 19:33 __pycache__
drwxrwxr-x  3 xellar xellar 4.0K Nov 27 12:20 res
-rw-rw-r--  1 xellar xellar 8.8K Nov 27 1