In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

plm = "EleutherAI/pythia-160m-deduped"

bos = '<|endoftext|>'
eos = '<|END|>'
pad = '<|pad|>'
sep ='\n\n####\n\n'

special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': pad, 'sep_token': sep}

tokenizer = AutoTokenizer.from_pretrained(plm, revision="step3000")
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
tokenizer.padding_side = 'left'

In [7]:
from datasets import load_dataset, Features, Value, interleave_datasets
dataset1 = load_dataset("csv", data_files="opendid_set1.tsv", delimiter='\t',
                       features = Features({
                              'fid': Value('string'), 'idx': Value('int64'),
                              'content': Value('string'), 'label': Value('string')}),
                       column_names=['fid', 'idx', 'content', 'label'], keep_default_na=False)

dataset2 = load_dataset("csv", data_files="opendid_set2.tsv", delimiter='\t',
                       features = Features({
                              'fid': Value('string'), 'idx': Value('int64'),
                              'content': Value('string'), 'label': Value('string')}),
                       column_names=['fid', 'idx', 'content', 'label'], keep_default_na=False)

# 組合 dataset1 和 dataset2
dataset = interleave_datasets([dataset1["train"], dataset2["train"]])

In [8]:
train_dataset_length = len(dataset)
print(f'Train dataset length: {train_dataset_length}')

Train dataset length: 157518


In [9]:
import torch
sub_datasets = torch.utils.data.random_split(dataset, [157518])
print(len(sub_datasets[0]))
for i in range(4): print(sub_datasets[0][i])

157518
{'fid': '1549', 'idx': 3870, 'content': 'E.', 'label': 'PHI: NULL'}
{'fid': '1346', 'idx': 6428, 'content': 'Preserved nuclear staining of a carcinoma for MLH1, PMS2, MSH2 and MSH6 indicates a low likelihood of microsatellite instability phenotype.', 'label': 'PHI: NULL'}
{'fid': '266', 'idx': 2115, 'content': "It is 2 mm from the 9 o'clock margin and is 4 mm from the 6 o'clock margin.", 'label': 'PHI: NULL'}
{'fid': '1445', 'idx': 4839, 'content': 'Representative sections in 3 blocks.(TO: KT;AJ/ec 6.2.63)', 'label': 'DOCTOR: KT\\nDOCTOR: AJ\\nDATE: 6.2.63=>2063-06-02'}


In [2]:
PAD_IDX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
IGNORED_PAD_IDX = -100
PAD_IDX

50278

In [11]:
from torch.utils.data import DataLoader
import torch

train_data = list(sub_datasets[0])

def collate_batch(batch):
    texts = [f"{bos} {data['content']} {sep}"+ data['label'].replace('\\n','\n')+f" {eos}" for data in list(batch)] # 範例 prompt
    encoded_seq = tokenizer(texts, padding=True)

    indexed_tks = torch.tensor(encoded_seq['input_ids'])
    attention_mask = torch.tensor(encoded_seq['attention_mask'])
    encoded_label = torch.tensor(encoded_seq['input_ids'])
    encoded_label[encoded_label == tokenizer.pad_token_id] = IGNORED_PAD_IDX

    return indexed_tks, encoded_label, attention_mask

train_dataloader = DataLoader(train_data, batch_size=2, shuffle=False, collate_fn=collate_batch)
titer = iter(train_dataloader)
tks, labels, masks= next(titer)
print(tks.shape)
next(iter(titer))

torch.Size([2, 42])


(tensor([[50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278,
          50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278,     0,   733,
            310,   374,  5823,   432,   253,   898,   258,     8, 13273,  8459,
            285,   310,   577,  5823,   432,   253,   721,   258,     8, 13273,
           8459,    15,   209, 50279,  6663,    42,    27,  5812,   209, 50277],
         [    0, 19974,  7118,   275,   495,  8336, 14517,  7058,    27,   611,
             53,    28, 36947,    16,   886,   721,    15,    19,    15,  3571,
             10,   209, 50279, 13220, 39321,    27,   611,    53,   187, 13220,
          39321,    27, 26060,   187, 33762,    27,   721,    15,    19,    15,
           3571, 14490,   938,  3571,    14,  3071,    14,  2640,   209, 50277]]),
 tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
           -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,     0,   733,
            310,   374,  5823,   432

In [12]:
import random
BATCH_SIZE = 5 # 自行決定大小

class BatchSampler():
    def __init__(self, data, batch_size):
        self.pooled_indices = []
        self.data = data
        self.batch_size = batch_size
        self.len = len(list(data))
    def __iter__(self):
        self.pooled_indices = []
        indices = [(index, len(data["content"])) for index, data in enumerate(self.data)]
        random.shuffle(indices)
        for i in range(0, len(indices), BATCH_SIZE * 100):
            self.pooled_indices.extend(sorted(indices[i:i + BATCH_SIZE * 100], key=lambda x: x[1], reverse=True))
        self.pooled_indices = [x[0] for x in self.pooled_indices]

        for i in range(0, len(self.pooled_indices), BATCH_SIZE):
            yield self.pooled_indices[i:i + BATCH_SIZE]
    def __len__(self):
        return (self.len + self.batch_size - 1) // self.batch_size

bucket_train_dataloader = DataLoader(train_data, batch_sampler=BatchSampler(train_data, BATCH_SIZE),
                                     collate_fn=collate_batch, pin_memory=True)

In [3]:
from transformers import AutoConfig
config = AutoConfig.from_pretrained(plm,
                                    bos_token_id=tokenizer.bos_token_id,
                                    eos_token_id=tokenizer.eos_token_id,
                                    pad_token_id=tokenizer.pad_token_id,
                                    sep_token_id=tokenizer.sep_token_id,
                                    output_hidden_states=False)

model = AutoModelForCausalLM.from_pretrained(plm, revision="step3000", config=config)
model

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 768)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-11): 12 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=768, out_features=2304, bias=True)
          (dense): Linear(in_features=768, out_features=768, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=768, out_features=3072, bias=True)
          (dense_4h_to_h): Linear(in_features=3072, out_features=768, bias=True)
          

In [5]:
import torch
#from peft import get_peft_model, LoraConfig, TaskType
#peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)
#model = get_peft_model(model, peft_config)
#model.print_trainable_parameters()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 768)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-11): 12 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=768, out_features=2304, bias=True)
          (dense): Linear(in_features=768, out_features=768, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=768, out_features=3072, bias=True)
          (dense_4h_to_h): Linear(in_features=3072, out_features=768, bias=True)
          

In [12]:
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 1024)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
  

In [15]:
import torch
from tqdm import tqdm#, tqdm_notebook
from torch.nn import functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def sample_text(model, tokenizer, seed, n_words=20):
    model = model.to(device)
    model.eval()
    text = tokenizer.encode(seed)
    inputs, past_key_values = torch.tensor([text]), None
    with torch.no_grad():
        for _ in tqdm(range(n_words)):
            out = model(inputs.to(device), past_key_values=past_key_values)
            logits = out.logits
            past_key_values = out.past_key_values
            log_probs = F.softmax(logits[:, -1], dim=-1)
            inputs = torch.multinomial(log_probs, 1)
            text.append(inputs.item())
            if tokenizer.decode(inputs.item()) == eos:
                break


    return tokenizer.decode(text)

sample_text(model, tokenizer, seed=f"{bos} DR AADLAND ABRAHAM {sep}")

100%|██████████| 20/20 [00:00<00:00, 30.50it/s]


'<|endoftext|> DR AADLAND ABRAHAM \n\n####\n\n Presentation Filters that are Projected\n\nby GDC this is really it from fellow to'

In [6]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW

EPOCHS = 5 # 設定你的訓練次數
optimizer = AdamW(model.parameters(),lr=5e-5)

steps = len(bucket_train_dataloader)
total_steps = steps * EPOCHS
print(steps, total_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=total_steps*0.1,
    num_training_steps=total_steps
)

model.resize_token_embeddings(len(tokenizer))
model.to(device)
print(f'Total numbers of steps: {total_steps}')
model


NameError: name 'bucket_train_dataloader' is not defined

In [17]:
from tqdm import tqdm,trange
checkpoint = torch.load("model_checkpoint_epoch_{}.pt".format(9))
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
global_step = 0
total_loss = 0
train_loss_list = []
model.train()
for epoch in trange(EPOCHS, desc="Epoch"):
    model.train()
    total_loss = 0

    predictions , true_labels = [], []

    for step, (seqs, labels, masks) in enumerate(bucket_train_dataloader):
        seqs = seqs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)
        model.zero_grad()
        outputs = model(seqs, labels=labels)#, attention_mask=masks)
        logits = outputs.logits
        loss = outputs.loss
        loss = loss.mean()
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
    avg_train_loss = total_loss / len(bucket_train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))
    # 將每個epoch的平均訓練損失添加到list中
    train_loss_list.append(avg_train_loss)
    # 在每3個 epoch 檢查是否保存模型
    
    torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'train_loss_list': train_loss_list,
        }, "model_checkpoint_epoch_{}.pt".format(epoch + 9 +1))
    print("Model saved at epoch {}".format(epoch + 9 +1))

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Average train loss: 0.4689004434159838


Epoch:  50%|█████     | 1/2 [36:58<36:58, 2218.77s/it]

Model saved at epoch 21
Average train loss: 0.4688747150039725


Epoch: 100%|██████████| 2/2 [1:13:25<00:00, 2202.91s/it]

Model saved at epoch 22





In [2]:
import matplotlib.pyplot as plt
# 繪製訓練損失的折線圖
plt.plot(range(1, EPOCHS + 1), train_loss_list, label='Training Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training Loss Over Epochs')
plt.legend()
plt.show()

NameError: name 'EPOCHS' is not defined

In [7]:
from datasets import load_dataset, Features, Value
valid_data = load_dataset("csv", data_files="D:\AICUP\opendid_test.tsv", delimiter='\t',
                          features = Features({
                              'fid': Value('string'), 'idx': Value('int64'),
                              'content': Value('string'), 'label': Value('string')}),
                              column_names=['fid', 'idx', 'content', 'label'])
valid_list= list(valid_data['train'])
valid_list

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

  if _pandas_api.is_sparse(col):
  if _pandas_api.is_sparse(col):
  if _pandas_api.is_sparse(col):
  if _pandas_api.is_sparse(col):
  if _pandas_api.is_sparse(col):
  if _pandas_api.is_sparse(col):
  if _pandas_api.is_sparse(col):
  if _pandas_api.is_sparse(col):


[{'fid': '1097', 'idx': 1, 'content': '433475.RDC', 'label': None},
 {'fid': '1097', 'idx': 12, 'content': 'Timmins, ELDEN', 'label': None},
 {'fid': '1097', 'idx': 27, 'content': '43J47561,43J47561', 'label': None},
 {'fid': '1097',
  'idx': 46,
  'content': 'Last edited : 7/9/2063  Page: 2',
  'label': None},
 {'fid': '1097', 'idx': 78, 'content': 'CLINICAL:', 'label': None},
 {'fid': '1097',
  'idx': 88,
  'content': 'Metastatic cancer ?colorectal primary.',
  'label': None},
 {'fid': '1097', 'idx': 128, 'content': 'MACROSCOPIC:', 'label': None},
 {'fid': '1097',
  'idx': 141,
  'content': 'Specimen labelled "Omentum secondary", consists of a piece of omentum 120 x 100 x 30mm.',
  'label': None},
 {'fid': '1097',
  'idx': 230,
  'content': 'On sectioning there are multiple fibrotic white ill-defined nodules identified.',
  'label': None},
 {'fid': '1097',
  'idx': 312,
  'content': 'Blocks: 1 to 5 - representative sections from the nodules.',
  'label': None},
 {'fid': '1097',
  'id

In [8]:
from tqdm import tqdm#, tqdm_notebook

exclude_chars = ['PATIENT', 'DOCTOR','USERNAME','PROFESSION','ROOM','DEPARTMENT','HOSPITAL','ORGANIZATION','STREET','CITY','STATE','COUNTRY','ZIP','LOCATION-OTHER','AGE','DATE','TIME','DURATION','SET','PHONE','FAX','EMAIL','URL','IPADDR','SSN','MEDICALRECORD','HEALTHPLAN','ACCOUNT','LICENSE','VECHICLE','DEVICE','BIOID','IDNUM','OTHER']

for model_epoch in range(1,2,1):
    checkpoint = torch.load("final_model_checkpoint_epoch_{}.pt".format(model_epoch))
    model.resize_token_embeddings(len(tokenizer))
    model.load_state_dict(checkpoint['model_state_dict'])
    tokenizer.padding_side = 'left'
    def sample_batch(model, tokenizer, input):
        """Generate text from a trained model."""
        model.eval()
        seeds = [f"{bos} {text['content']} {sep}" for text in input]
        texts = tokenizer(seeds, return_tensors = 'pt', padding=True).to(device)
        outputs = []
        #return
        with torch.cuda.amp.autocast():
            output_tokens = model.generate(**texts, max_new_tokens=400, pad_token_id = PAD_IDX,
                                            eos_token_id=tokenizer.convert_tokens_to_ids(eos))
            preds = tokenizer.batch_decode(output_tokens)
            for idx , pred in enumerate(preds):
                pred = pred[pred.index(sep)+len(sep):].replace(pad, "").replace(eos, "").strip()
                if pred == "PHI: NULL":
                    continue
                if all(char not in pred for char in exclude_chars):
                    continue
                phis = pred.split('\n')
                lidxs = {}
                for p in phis:
                    tid = p.find(':')
                    if tid > 0:
                        text = p[tid+1:].strip()
                        nv = text.find('=>')
                        normalizedV = None
                        # 處理時間正規化
                        if nv>0:
                            normalizedV = text[nv+2:]
                            text = text[:nv]
                        lidx = 0
                        if text in lidxs:
                            lidx = lidxs[text]
                        lidx = input[idx]['content'].find(text, lidx)
                        eidx = lidx+len(text)
                        lidxs[text] = eidx
                        sidx=int(input[idx]['idx'])
                        if lidx+sidx < eidx+sidx :  #判斷起始位置是否小於結束位置
                            if normalizedV is None:
                                outputs.append(f'{input[idx]["fid"]}\t{p[:tid]}\t{lidx+sidx}\t{eidx+sidx}\t{text}')
                            else:
                                outputs.append(f'{input[idx]["fid"]}\t{p[:tid]}\t{lidx+sidx}\t{eidx+sidx}\t{text}\t{normalizedV}')
        return outputs

    f = open("test_answer{}.txt".format(model_epoch), "w")
    BATCH_SIZE = 50
    for i in tqdm(range(0, len(valid_list), BATCH_SIZE)):
        with torch.no_grad():
            seeds = valid_list[i:i+BATCH_SIZE]
            outputs = sample_batch(model, tokenizer, input=seeds)
            for o in outputs:
                f.write(o)
                f.write('\n')
    f.close()

100%|██████████| 1580/1580 [12:40<00:00,  2.08it/s] 
