In [1]:
from datasets import load_dataset
import numpy as np
from collections import defaultdict
import re
import random

## Polyglot

In [72]:
dataset = load_dataset(path = "polyglot_ner", 
                       name = "en",
                       split = "train[:2000]")

Found cached dataset polyglot_ner (/home/sush/.cache/huggingface/datasets/polyglot_ner/en/1.0.0/bb2e45c90cd345c87dfd757c8e2b808b78b0094543b511ac49bc0129699609c1)


In [73]:
dataset

Dataset({
    features: ['id', 'lang', 'words', 'ner'],
    num_rows: 2000
})

In [74]:
instruction_list = ['Your objective is to extract the entities by utilizing the accompanying definitions.',
 'You are required to identify and isolate the entities described in the provided definitions.',
 'Your task is to separate and categorize the entities based on their corresponding definitions.',
 'Extract the entities given the provided definitions by employing your understanding.',
 'Identify and extract the entities based on the given definitions.',
 'Your responsibility is to isolate and identify the entities according to the definitions provided.',
 'Separate and distinguish the entities by matching them with their respective definitions.',
 'Extract the entities by comprehending the definitions given for each entity.',
 'Your duty is to identify and separate the entities, utilizing the accompanying definitions.',
 'Given the definitions, your job is to extract the entities by applying your knowledge.',
 'Your task is to categorize and extract the entities based on the provided definitions.',
 'Separate and differentiate the entities by matching them with their corresponding definitions.',
 'Extract the entities by interpreting the definitions given for each entity.',
 'Your objective is to identify and isolate the entities according to the provided definitions.',
 'Separate and classify the entities based on the accompanying definitions.',
 'Extract the entities by understanding the given definitions.',
 'Your responsibility is to identify and separate the entities provided their respective definitions.',
 'Separate and identify the entities by matching them with their corresponding definitions.',
 'Extract the entities by comprehending the definitions provided for each entity.',
 'Your duty is to identify and separate the entities based on the given definitions.']

entity_def_dict = {'ORG': 'Represents a formal group or entity such as a company or organization.',
                   'PER': 'Refers to an individual person or a group of individuals.',
                   'LOC': 'Represents a specific place or geographical location.'
                  }
entity_def_str = '\n'.join([f"{ent}: {deftn}" for ent, deftn  in entity_def_dict.items()])

In [75]:
def generate_prompt(entity_def_str, input, label):
    # returns the full prompt from instruction, input, label
    # only if all the data is provided label is provided
    
    label_str = '\n'.join([f'''{ent}: {', '.join([f'"{val}"' for val in values])}''' for ent, values  in label.items() if values is not None])
    
    full_text = f'''{random.choice(instruction_list)}
    
{entity_def_str}

QUERY: "{input}"

ENTITIES:
{label_str}'''
    
    return full_text

In [76]:
def process_example(example):
    
    query_text = ' '.join(example['words'])
    pattern = r'\s([!\"#$%&\'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~])'
    replacement = r'\1'
    query_text = re.sub(pattern, replacement, query_text)
    
    entity_dict = defaultdict(list)
    for word, ner in list(zip(example['words'], example['ner'])):
        if ner != 'O': entity_dict[ner].append(word)
    entity_dict = dict(entity_dict)
    
    full_text = generate_prompt(entity_def_str, query_text, entity_dict)
    
    return {'query_text': query_text,
            'entity_dict': entity_dict,
            'full_text': full_text}

dataset = dataset.map(process_example, batched=False)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [77]:
dataset = dataset.remove_columns(['lang', 'words', 'ner'])

In [78]:
print(dataset[0]["full_text"])

Extract the entities by interpreting the definitions given for each entity.
    
ORG: Represents a formal group or entity such as a company or organization.
PER: Refers to an individual person or a group of individuals.
LOC: Represents a specific place or geographical location.

QUERY: "The basilikon(," imperial[ coin]"), commonly also referred to as the doukaton( Greek: δουκάτον), was a widely circulated Byzantine silver coin of the first half of the 14th century."

ENTITIES:
LOC: "Byzantine"


In [79]:
def preprocess_batch(batch, max_length=2048):
        return tokenizer(
        batch["full_text"],
        max_length=max_length,
        truncation=True,
            # padding=True,
            # return_tensors='pt'
    )

In [80]:
 dataset = dataset.map(
        preprocess_batch,
        batched=True,
        remove_columns=['id', 'query_text', 'entity_dict', 'full_text'],
    )

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [81]:
dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 2000
})

In [84]:
dataset.save_to_disk("data/polyglot_processed_2000")

Saving the dataset (0/1 shards):   0%|          | 0/2000 [00:00<?, ? examples/s]

In [66]:
from transformers import DataCollatorForLanguageModeling

from transformers import GPTNeoXForCausalLM, GPTNeoXTokenizerFast

base_model = "databricks/dolly-v2-3b"
tokenizer = GPTNeoXTokenizerFast.from_pretrained(base_model)


dataCollator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer,
    mlm = False,
    pad_to_multiple_of = 8,
    return_tensors = 'pt')

In [82]:
from torch.utils.data import DataLoader

In [83]:
for y in DataLoader(dataset, collate_fn=dataCollator, batch_size=2):
    print(y)
    break

{'input_ids': tensor([[ 7992,   974,   253, 14429,   407, 29375,   253, 14308,  1677,   323,
          1016, 10726,    15,   187, 50274,   187,  1372,    40,    27,  2719,
          5957,   247,  7473,  1387,   390, 10726,   824,   347,   247,  2567,
           390,  6003,    15,   187, 10810,    27,  7567,   398,   281,   271,
          2060,  1436,   390,   247,  1387,   273,  4292,    15,   187, 21766,
            27,  2719,  5957,   247,  2173,  1659,   390, 25231,  4328,    15,
           187,   187,  8846, 16759,    27,   346,   510, 40683,  1479,   251,
             9,   937, 21474,    60, 18011,    62,  8375,  7744,   671,  6289,
           281,   347,   253,  2443,    76, 13078,     9, 11308,    27, 11112,
          8520,  5676,  8348, 16003,  3147,   582,   369,   247,  7561, 41443,
         47154,  9711, 18011,   273,   253,   806,  2716,   273,   253,  1638,
           394,  5331,   449,   187,   187,  3489,  1433,  9785,    27,   187,
         21766,    27,   346,  3463,  

## tner/bionlp2004

In [152]:
dataset = load_dataset(path = "tner/bionlp2004", 
                       split = "train")

Found cached dataset bionlp2004 (/home/sush/.cache/huggingface/datasets/tner___bionlp2004/bionlp2004/1.0.0/9f41d3f0270b773c2762dee333ae36c29331e2216114a57081f77639fdb5e904)


In [156]:
dataset

Dataset({
    features: ['tokens', 'tags'],
    num_rows: 16619
})

In [162]:
i = 20
list(zip(dataset[i]['tokens'], dataset[i]['tags']))

[('In', 0),
 ('the', 0),
 ('presence', 0),
 ('of', 0),
 ('Epo', 3),
 (',', 0),
 ('c-myb', 9),
 ('mRNA', 10),
 ('declined', 0),
 ('and', 0),
 ('20', 0),
 ('%', 0),
 ('of', 0),
 ('K562', 7),
 ('cells', 8),
 ('synthesized', 0),
 ('Hb', 3),
 ('regardless', 0),
 ('of', 0),
 ('antisense', 9),
 ('myb', 10),
 ('RNA', 10),
 ('expression', 0),
 ('.', 0)]

In [None]:
{
    "O": 0,
    "B-DNA": 1,
    "I-DNA": 2,
    "B-protein": 3,
    "I-protein": 4,
    "B-cell_type": 5,
    "I-cell_type": 6,
    "B-cell_line": 7,
    "I-cell_line": 8,
    "B-RNA": 9,
    "I-RNA": 10
}

In [157]:
dataset['tags'][:1]

[[0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0]]