# Dataset

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m104.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
Co

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.14.6 dill-0.3.7 multiprocess-0.70.15


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
from datasets import load_dataset, Features, Value

dataset = load_dataset("csv", data_files="/content/drive/MyDrive/aicup/PublicDataset_phase3/opendid_set1.tsv", delimiter='\t',
                       features = Features({
                              'fid': Value('string'), 'idx': Value('int64'),
                              'content': Value('string'), 'label': Value('string')}),
                              column_names=['fid', 'idx', 'content', 'label'], keep_default_na=False)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['fid', 'idx', 'content', 'label'],
        num_rows: 85736
    })
})

In [6]:
dataset['train'][0]

{'fid': '10',
 'idx': 1,
 'content': 'Episode No:  09F016547J',
 'label': 'IDNUM: 09F016547J'}

In [7]:
dataset['train'][1]

{'fid': '10',
 'idx': 25,
 'content': '091016.NMT',
 'label': 'MEDICALRECORD: 091016.NMT'}

In [8]:
dataset['train'][7]

{'fid': '10',
 'idx': 114,
 'content': 'D.O.B:  24/8/1993',
 'label': 'DATE: 24/8/1993=>1993-08-24'}

For demonstration purpose, we only use the randomly sampled 20000 instances.

In [9]:
import torch
sub_datasets = torch.utils.data.random_split(dataset['train'], [20000, 65736])
print(len(sub_datasets[0]))
for i in range(4): print(sub_datasets[0][i])

20000
{'fid': 'file26009', 'idx': 3243, 'content': 'All pancreatic blocks have been examined at multiple levels.', 'label': 'PHI: NULL'}
{'fid': 'file12198', 'idx': 8769, 'content': '- Tumour tissue is retained if testing for other biomarkers is required.', 'label': 'PHI: NULL'}
{'fid': '31', 'idx': 334, 'content': 'Left breast sentinel nodebiopsy - in print cytology.', 'label': 'PHI: NULL'}
{'fid': '603', 'idx': 4082, 'content': 'A.', 'label': 'PHI: NULL'}


# Data loader

In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

plm = "EleutherAI/pythia-70m" #"EleutherAI/pythia-70m-deduped"

bos = '<|endoftext|>'
eos = '<|END|>'
pad = '<|pad|>'
sep ='\n\n####\n\n'

special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': pad, 'sep_token': sep}

tokenizer = AutoTokenizer.from_pretrained(plm, revision="step3000")
tokenizer.padding_side = 'left'
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print(f"{tokenizer.pad_token}: {tokenizer.pad_token_id}")

Downloading (…)okenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

Downloading (…)p3000/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

<|pad|>: 50278


In [11]:
!pip install islab-opendeid

Collecting islab-opendeid
  Downloading islab_opendeid-0.0.1.1-py3-none-any.whl (3.0 kB)
Installing collected packages: islab-opendeid
Successfully installed islab-opendeid-0.0.1.1


In [12]:
from torch.utils.data import DataLoader
from islab.aicup import collate_batch_with_prompt_template

train_data = list(sub_datasets[0])
train_dataloader = DataLoader(train_data, batch_size=3, shuffle=False, collate_fn=lambda batch: collate_batch_with_prompt_template(batch, tokenizer))
titer = iter(train_dataloader)
tks, labels, masks= next(titer)
print(tks.shape)
next(iter(titer))

torch.Size([3, 22])


(tensor([[50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278,
          50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278,
              0,   329,    15, 50279,  6663,    42,    27,  5812,   209, 50277],
         [    0,   313,  7058,    27, 25840,    16,   886,   608,    15,   805,
             15,  2082,    10, 50279, 33762,    27,   608,    15,   805,    15,
           2082, 14490,   938,  2082,    14,  1762,    14,   805,   209, 50277],
         [50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278,
          50278, 50278, 50278,     0, 19954,    14,  2251,  3605,    27,   608,
            428,   884,     6, 50279,  6663,    42,    27,  5812,   209, 50277]]),
 tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
           -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
              0,   329,    15, 50279,  6663,    42,    27,  5812,   209, 50277],
         [    0,   313,  7058,    

In [13]:
results = tokenizer(["Lab No: 14H02780", "“STOCKDALE” 653 MONAGHAN RD"], padding=True)
print(results['input_ids'])
print()
print(results['input_ids'][0])
print(tokenizer.decode(results['input_ids'][0]))
print(results['input_ids'][1])
print(tokenizer.decode(results['input_ids'][1]))

[[50278, 50278, 50278, 50278, 50278, 50278, 21663, 1621, 27, 1638, 41, 16604, 1438], [1628, 1267, 9466, 37, 23502, 668, 721, 3357, 33995, 2696, 41, 1539, 28613]]

[50278, 50278, 50278, 50278, 50278, 50278, 21663, 1621, 27, 1638, 41, 16604, 1438]
<|pad|><|pad|><|pad|><|pad|><|pad|><|pad|>Lab No: 14H02780
[1628, 1267, 9466, 37, 23502, 668, 721, 3357, 33995, 2696, 41, 1539, 28613]
“STOCKDALE” 653 MONAGHAN RD


In [14]:
results = tokenizer(
    [f"{bos} 9364819.RAN\\nMINTANIA, JEFFRY {sep} ID: 9364819.RAN\\nNAME: MINTANIA, JEFFRY {eos}",
     f"{bos} This is a sentence {sep} PHI: NULL {eos}"],
    padding=True
)
print(results['attention_mask'][0])
print(results['attention_mask'][1])
print(tokenizer.decode(results['input_ids'][0]))
print(tokenizer.decode(results['input_ids'][1]))

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
<|endoftext|> 9364819.RAN\nMINTANIA, JEFFRY 

####

 ID: 9364819.RAN\nNAME: MINTANIA, JEFFRY <|END|>
<|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|endoftext|> This is a sentence 

####

 PHI: NULL <|END|>


In [15]:
from islab.aicup import OpenDeidBatchSampler

BATCH_SIZE = 8
bucket_train_dataloader = DataLoader(train_data,
                                     batch_sampler=OpenDeidBatchSampler(train_data, BATCH_SIZE),
                                     collate_fn=lambda batch: collate_batch_with_prompt_template(batch, tokenizer),
                                     pin_memory=True)

# for idx, batch in enumerate(bucket_train_dataloader):
#     print(batch)
#     print(batch[0].shape)
#     print(batch[1].shape)
#     break

# Model

In [16]:
from transformers import AutoConfig
# the model config to which we add the special tokens
config = AutoConfig.from_pretrained(plm,
                                    bos_token_id=tokenizer.bos_token_id,
                                    eos_token_id=tokenizer.eos_token_id,
                                    pad_token_id=tokenizer.pad_token_id,
                                    sep_token_id=tokenizer.sep_token_id,
                                    output_hidden_states=False)

model = AutoModelForCausalLM.from_pretrained(plm, revision="step3000", config=config)
model

Downloading (…)lve/main/config.json:   0%|          | 0.00/567 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [18]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW

EPOCHS = 3 # CHANGE TO THE NUMBER OF EPOCHS YOU WANT
optimizer = AdamW(model.parameters(),lr=3e-5) # YOU CAN ADJUST LEARNING RATE

model.resize_token_embeddings(len(tokenizer))
model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50280, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [19]:
from tqdm import tqdm,trange

global_step = 0
total_loss = 0

model.train()
for _ in trange(EPOCHS, desc="Epoch"):
    model.train()
    total_loss = 0

    # Training loop
    predictions , true_labels = [], []

    for step, (seqs, labels, masks) in enumerate(bucket_train_dataloader):
        seqs = seqs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)
        model.zero_grad()
        outputs = model(seqs, labels=labels, attention_mask=masks)
        logits = outputs.logits
        loss = outputs.loss
        loss = loss.mean()

        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_train_loss = total_loss / len(bucket_train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

Epoch:  33%|███▎      | 1/3 [01:45<03:30, 105.16s/it]

Average train loss: 1.863233209180832


Epoch:  67%|██████▋   | 2/3 [03:28<01:44, 104.30s/it]

Average train loss: 1.4270867498278619


Epoch: 100%|██████████| 3/3 [05:12<00:00, 104.33s/it]

Average train loss: 1.2469054220557212





In [20]:
from datasets import load_dataset, Features, Value
valid_data = load_dataset("csv", data_files="/content/drive/MyDrive/aicup/PublicDataset_phase3/opendid_valid.tsv", delimiter='\t',
                          features = Features({
                              'fid': Value('string'), 'idx': Value('int64'),
                              'content': Value('string'), 'label': Value('string')}),
                              column_names=['fid', 'idx', 'content', 'label'])
valid_list= list(valid_data['train'])
valid_list

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

[{'fid': '1001',
  'idx': 0,
  'content': 'Episode No:  88Y206206L',
  'label': None},
 {'fid': '1001', 'idx': 24, 'content': '8892062.BPL', 'label': None},
 {'fid': '1001',
  'idx': 37,
  'content': 'Vatterott, Jerrie CLARENCE',
  'label': None},
 {'fid': '1001',
  'idx': 65,
  'content': 'Lab No:  88Y20620,88Y20620',
  'label': None},
 {'fid': '1001', 'idx': 92, 'content': 'Exeter', 'label': None},
 {'fid': '1001',
  'idx': 99,
  'content': 'DECEPTION BAY  Northern Territory  6845',
  'label': None},
 {'fid': '1001',
  'idx': 139,
  'content': 'Specimen: Fluid,Tissue',
  'label': None},
 {'fid': '1001', 'idx': 162, 'content': 'D.O.B:  15/11/2004', 'label': None},
 {'fid': '1001', 'idx': 181, 'content': 'Sex:  F', 'label': None},
 {'fid': '1001',
  'idx': 189,
  'content': 'Collected: 20/5/2064 at :',
  'label': None},
 {'fid': '1001',
  'idx': 215,
  'content': 'Location:  PARKES 8 - GUNNEDAH DISTRICT HOSPITAL',
  'label': None},
 {'fid': '1001',
  'idx': 264,
  'content': 'DR Edison

In [21]:
from tqdm.notebook import tqdm
from islab.aicup import aicup_predict
import io
BATCH_SIZE = 32

with open("./answer.txt",'w',encoding='utf8') as f:
#with io.open("answer.txt",'w',encoding='utf8') as f:
    for i in tqdm(range(0, len(valid_list), BATCH_SIZE)):
        with torch.no_grad():
            seeds = valid_list[i:i+BATCH_SIZE]
            outputs = aicup_predict(model, tokenizer, input=seeds)
            for o in outputs:
                f.write(o)
                f.write('\n')

  0%|          | 0/1387 [00:00<?, ?it/s]