In [1]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

# os.system('pip install iterative-stratification==0.1.7')
# from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset, Features, Value


os.system('pip install -q transformers')
os.system('pip install -q tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizers.__version__: 0.13.3
transformers.__version__: 4.29.2
env: TOKENIZERS_PARALLELISM=true


In [2]:
from torch.cuda.amp import autocast, GradScaler
from sklearn import metrics
# from src.machine_learning_util import set_seed, set_device, init_logger, AverageMeter, to_pickle, unpickle, asMinutes, timeSince

In [3]:
class CFG:
    EXP_ID = '024'
    apex = True
    model ='microsoft/deberta-v3-large' # 'microsoft/deberta-large' # 'microsoft/deberta-v3-base' #'microsoft/deberta-v3-large' 
    seed = 2022 # 42 # 71
    n_splits = 4
    max_len = 640 + 2 # 1429 # 1024 # 512
    dropout = 0
    target_cols = "label"
    target_size = None
    n_accumulate=1
    print_freq = 100
    eval_freq = 780 * 2 # 390 # 170
    min_lr=1e-6
    scheduler = 'cosine'
    batch_size = 12 # 2 # 4
    num_workers = 0 #3
    lr = 5e-6 # 3e-6
    weigth_decay = 0.01
    epochs = 3
    n_fold = 4
    trn_fold = [i for i in range(n_fold)]
    train = True
    num_warmup_steps = 0
    num_cycles=0.5
    debug = False
    freezing = True
    gradient_checkpoint = True
    reinit_layers = 4 # 3
    tokenizer = AutoTokenizer.from_pretrained(model)
    max_norm = 1

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
           
seed_everything(CFG.seed)
# seed_everything(seed=60)

# Data Loading

In [5]:
test_report_file_path = "./NER_Dataset/private_data" #巡迴課程 val 資料集

dataset = load_dataset("csv", data_files=f"{test_report_file_path}/demo.tsv", delimiter='\t',
                       features = Features({
                              'fid': Value('string'), 'idx': Value('int64'),
                              'text': Value('string')}),
                              column_names=['fid', 'idx', 'text'], keep_default_na=False)

test_list = list(dataset["train"])

test_df = pd.DataFrame.from_dict(test_list)

In [6]:
test_df.head()

Unnamed: 0,fid,idx,text
0,1097,1,433475.RDC
1,1097,12,"Timmins, ELDEN"
2,1097,27,"43J47561,43J47561"
3,1097,46,Last edited : 7/9/2063 Page: 2
4,1097,78,CLINICAL:


In [7]:
len(test_df)

128

In [8]:
label_name_list = ["PATIENT","DOCTOR","USERNAME","PROFESSION","ROOM","DEPARTMENT","HOSPITAL",
              "ORGANIZATION","STREET","CITY","STATE","COUNTRY","ZIP","LOCATION-OTHER",
              "AGE","DATE","TIME","DURATION","SET","PHONE","FAX","EMAIL","URL","IPADDR",
              "SSN","MEDICALRECORD","HEALTHPLAN","ACCOUNT","LICENSE","VECHICLE","DEVICE",
              "BIOID","IDNUM","PHI"]

id_to_label = dict(enumerate(label_name_list))
label_to_id = {v: k for k, v in id_to_label.items()}

CFG.target_size = len(label_name_list)

# TestDataset

In [9]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text, 
        return_tensors=None, 
        add_special_tokens=True, 
        max_length=CFG.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
        
    return inputs

class TestDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['text'].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])

        # token_start = (inputs['attention_mask'] > 0).int()

        cls_token_index = 0

        sep_token_index = torch.count_nonzero(inputs['attention_mask']).item()-1

        token_start = (inputs['attention_mask'] > 0).int()

        token_start[cls_token_index] = -1
        token_start[sep_token_index] = -1
        
        return {
            'input_ids':inputs['input_ids'],
            'attention_mask':inputs['attention_mask'],
            'token_start': token_start
            }

    

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

In [10]:
testDataset = TestDataset(CFG, test_df)

In [11]:
testDataset[0]

{'input_ids': tensor([    1, 50759, 35872,   260, 77653,     2,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [12]:
test_loader = DataLoader(testDataset,
                              batch_size = CFG.batch_size,
                              shuffle=False,
                            #   collate_fn = collate_fn,
                              num_workers = CFG.num_workers,
                              pin_memory = True,
                              drop_last=False)

# for step, data in enumerate(test_loader):
#     print(step)

In [13]:
def freeze(module):
    """
    Freezes module's parameters.
    """

    for parameter in module.parameters():
        parameter.requires_grad = False



# NER Model

In [14]:
class NER_Model(nn.Module):
    def __init__(self, model_name):
        super(NER_Model, self).__init__()

        self.cfg = CFG
        self.config = AutoConfig.from_pretrained(model_name)
        self.config.hidden_dropout_prob = 0
        self.config.attention_probs_dropout_prob = 0

        self.model = AutoModel.from_pretrained(model_name, config=self.config)

        self.output = nn.Sequential(
            nn.LayerNorm(self.config.hidden_size),
            nn.Linear(self.config.hidden_size, self.cfg.target_size)
        )

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, ids, mask, token_type_ids=None, targets=None, input_token_starts = None):
        if token_type_ids:
            transformer_out = self.model(ids, mask, token_type_ids)
        else:
            transformer_out = self.model(ids, mask)
        
        sequence_output = transformer_out[0] # shape : (batch,length,dimension)

        # 去除[CLS]标签等位置，获得与label对齐的pre_label表示
        # token_sequence_output = [layer[starts.nonzero().squeeze(1)]
        #                           for layer, starts in zip(sequence_output, input_token_starts)]
        
        # 将sequence_output的pred_label维度padding到最大长度
        # padded_sequence_output = pad_sequence(token_sequence_output, batch_first=True)
        
        logits = self.output(sequence_output)

        return logits

In [16]:
# opensetid_model_path = "C:/Users/Lab000/Desktop/2023_AI_CUP秋季/code/NER/巡迴課程資料集/FB3_13th_solution/AI_CUP_1/model"
model = NER_Model(CFG.model)

state = torch.load(f"./AI_CUP_3/infer_model/deberta-v3-large_full-data-2_gpt-5/{CFG.model.replace('/', '-')}_best.pth",
                                   map_location=torch.device('cpu'))['model']

# state = model.load_state_dict(f"./AI_CUP_3/infer_model/deberta-v3-large_full-data-2_gpt-5/{CFG.model.replace('/', '-')}_best.pth")['model']

# state = torch.load(f"{opensetid_model_path}/{CFG.model.replace('/', '-')}_best.pth",
#                                    map_location=torch.device('cpu'))['model']
    
model.load_state_dict(state)
model.to(device)

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


NER_Model(
  (model): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 1024, padding_idx=0)
      (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
         

# Prediction

In [17]:
pred_tags = []

for step, data in enumerate(tqdm(test_loader)):
    model.eval()
    ids = data['input_ids'].to(device, dtype=torch.long)
    mask = data['attention_mask'].to(device, dtype=torch.long)
    token_start = data['token_start'].to(device, dtype=torch.long)
    with torch.no_grad():
        logits = model(ids, mask)

    token_position = token_start.gt(0)
    dims = token_position.shape
    
    logits = logits.detach().cpu().numpy()
    token_position = token_position.detach().cpu().numpy()

    for text_no in range(dims[0]):
        text_tags = []
        for pos in range(dims[1]):
            if token_position[text_no,pos]:
                
                text_tags.append(id_to_label[np.argmax(logits[text_no, pos])])
        # print(text_tags)
        pred_tags.append(text_tags)

    # active_loss = token_position.view(-1) == 1
    # active_logits = logits.view(-1, CFG.target_size)[active_loss]
    
    

100%|██████████| 11/11 [00:08<00:00,  1.31it/s]


In [18]:
pred_tags

[['MEDICALRECORD', 'MEDICALRECORD', 'MEDICALRECORD', 'MEDICALRECORD'],
 ['PATIENT', 'PATIENT', 'PATIENT', 'PATIENT'],
 ['IDNUM',
  'IDNUM',
  'IDNUM',
  'IDNUM',
  'PHI',
  'IDNUM',
  'IDNUM',
  'IDNUM',
  'IDNUM'],
 ['PHI',
  'PHI',
  'PHI',
  'DATE',
  'DATE',
  'DATE',
  'DATE',
  'DATE',
  'DATE',
  'PHI',
  'PHI',
  'PHI'],
 ['PHI', 'PHI'],
 ['PHI', 'PHI', 'PHI', 'PHI', 'PHI', 'PHI', 'PHI', 'PHI', 'PHI'],
 ['PHI', 'PHI', 'PHI', 'PHI', 'PHI'],
 ['PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI'],
 ['PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI'],
 ['PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI',
  'PHI'],
 ['PHI', 'PHI', 'PHI', 'PHI', 'PHI'],
 ['PHI',
  'DOCTOR',
  'PHI',
  'PHI',
  'DATE',
  'DATE'

# 文章總數

In [19]:
print("文章總數:",test_df["fid"].unique().shape[0])
print("文章總數:",len(test_df[(test_df["idx"]==0)]) + len(test_df[(test_df["idx"]==1)]) + len(test_df[(test_df["idx"]==2)])) #文章第一句起始為0、1或2

文章總數: 2
文章總數: 2


# 寫入answer.txt

In [20]:
for index,j in enumerate(pred_tags):
    pred_tags[index] = [label_to_id [k]for k in pred_tags[index]]

In [21]:
id_to_label

{0: 'PATIENT',
 1: 'DOCTOR',
 2: 'USERNAME',
 3: 'PROFESSION',
 4: 'ROOM',
 5: 'DEPARTMENT',
 6: 'HOSPITAL',
 7: 'ORGANIZATION',
 8: 'STREET',
 9: 'CITY',
 10: 'STATE',
 11: 'COUNTRY',
 12: 'ZIP',
 13: 'LOCATION-OTHER',
 14: 'AGE',
 15: 'DATE',
 16: 'TIME',
 17: 'DURATION',
 18: 'SET',
 19: 'PHONE',
 20: 'FAX',
 21: 'EMAIL',
 22: 'URL',
 23: 'IPADDR',
 24: 'SSN',
 25: 'MEDICALRECORD',
 26: 'HEALTHPLAN',
 27: 'ACCOUNT',
 28: 'LICENSE',
 29: 'VECHICLE',
 30: 'DEVICE',
 31: 'BIOID',
 32: 'IDNUM',
 33: 'PHI'}

In [22]:
pred_tags

[[25, 25, 25, 25],
 [0, 0, 0, 0],
 [32, 32, 32, 32, 33, 32, 32, 32, 32],
 [33, 33, 33, 15, 15, 15, 15, 15, 15, 33, 33, 33],
 [33, 33],
 [33, 33, 33, 33, 33, 33, 33, 33, 33],
 [33, 33, 33, 33, 33],
 [33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33],
 [33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33],
 [33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33],
 [33, 33, 33, 33, 33],
 [33, 1, 33, 33, 15, 15, 15, 15, 15, 33],
 [33, 33, 33, 33, 33, 33, 33, 33, 1, 1, 1, 33, 33],
 [33, 33, 33, 33, 33, 33, 33, 33, 33],
 [33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33],
 [33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33],
 [33, 33, 33, 33, 33, 33],
 [33],
 [33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  33,
  32,
  32,
  32,
  

In [23]:
answer = [0 if j==33 else j for j in pred_tags[1]]

In [24]:
def find_continuous_and_non_continuous_numbers_and_indices(input_list):
    result = []
    current_number = input_list[0]
    start_index = 0

    for i in range(1, len(input_list)):
        if input_list[i] != current_number:
            end_index = i - 1
            result.append((current_number, (start_index, end_index)))
            current_number = input_list[i]
            start_index = i

    # 處理最後一個連續或非連續數字序列
    end_index = len(input_list) - 1
    result.append((current_number, (start_index, end_index)))

    return result

In [25]:
test_df

Unnamed: 0,fid,idx,text
0,1097,1,433475.RDC
1,1097,12,"Timmins, ELDEN"
2,1097,27,"43J47561,43J47561"
3,1097,46,Last edited : 7/9/2063 Page: 2
4,1097,78,CLINICAL:
5,1097,88,Metastatic cancer ?colorectal primary.
6,1097,128,MACROSCOPIC:
7,1097,141,"Specimen labelled ""Omentum secondary"", consist..."
8,1097,230,On sectioning there are multiple fibrotic whit...
9,1097,312,Blocks: 1 to 5 - representative sections from ...


# post-process-ver-3

In [26]:
pre_label_name_list = ["IDNUM","MEDICALRECORD","PATIENT","CITY","STATE","ZIP","DEPARTMENT",
                  "HOSPITAL","DOCTOR","STREET","ORGANIZATION","AGE",
                  "DATE","TIME","PHONE"]
pre_label_id_list = [label_to_id[label_name] for label_name in pre_label_name_list]

In [27]:
import itertools
lengths = []
tk0 = tqdm(test_df['text'].fillna("").values, total=len(test_df))

final_pre_text = []


for index,text in enumerate(tk0):
    
    

    fid = test_df.loc[index]["fid"]
    idx = test_df.loc[index]["idx"]

    # token_list = CFG.tokenizer(text, add_special_tokens=False)['input_ids']
    
    encoded = CFG.tokenizer(text,
                            add_special_tokens=False,
                            return_offsets_mapping=True)
    
    token_list = encoded['input_ids']
    offset_mapping_list = encoded['offset_mapping']
    

    WithoutNULL = [-1 if j==33 else j for j in pred_tags[index]]

    pre_text_label_and_index = find_continuous_and_non_continuous_numbers_and_indices(WithoutNULL)
    
    for position, pre_word_label_and_index in enumerate(pre_text_label_and_index) :
        pre_label = pre_word_label_and_index[0]
        pre_word_start_end = pre_word_label_and_index[1]

        if pre_label != -1 and pre_label in pre_label_id_list:
        
            start = pre_word_start_end[0]
            end = pre_word_start_end[1]
            
            pre_word_text = CFG.tokenizer.decode(token_list[start:end+1])

            if pre_word_text == '':# [507] token decode 為 ''，如果預測為PHI，寫進答案會Submission Error
                continue

            offset_idx = text.find(pre_word_text)

            ########################################################################## 
            if pre_label == 13: # 針對LOCATION-OTHER，Decode時會少一個空白的處理
                white_space_position = []
                for i, offset_mapping in enumerate(offset_mapping_list):
                    if i == 0:
                        continue
                    previous_offset_start = offset_mapping_list[i-1][0]
                    previous_offset_end = offset_mapping_list[i-1][1]
                    current_offset_start = offset_mapping_list[i][0]
                    current_offset_end = offset_mapping_list[i][1]
                    if previous_offset_end!=current_offset_start:
                        white_space_position.append(previous_offset_end)

                if len(white_space_position)!=0:
                    for position in white_space_position:
                        pre_word_text = pre_word_text[:position] + ' ' + pre_word_text[position:]
                
                offset_idx = text.find(pre_word_text)
            ##########################################################################
            
                
            print(f"{fid}\t{id_to_label[pre_label]}\t{idx+offset_idx}\t{idx+offset_idx+len(pre_word_text)}\t{pre_word_text}")

            final_pre_text.append(f"{fid}\t{id_to_label[pre_label]}\t{idx+offset_idx}\t{idx+offset_idx+len(pre_word_text)}\t{pre_word_text}")
            





100%|██████████| 128/128 [00:00<00:00, 8533.82it/s]

1097	MEDICALRECORD	1	11	433475.RDC
1097	PATIENT	12	26	Timmins, ELDEN
1097	IDNUM	27	35	43J47561
1097	IDNUM	27	35	43J47561
1097	DATE	60	68	7/9/2063
1097	DOCTOR	396	398	IC
1097	DATE	402	408	5.9.63
1097	DOCTOR	438	446	L Bonnot
1097	IDNUM	835	842	43J4756
1135	IDNUM	13	23	23F340166Q
1135	MEDICALRECORD	24	35	2323401.RRQ
1135	PATIENT	37	55	Jourdan, WILLIEMAE
1135	IDNUM	66	74	23F34016
1135	STREET	75	82	Redacre
1135	CITY	83	88	COWRA
1135	STATE	90	92	WA
1135	ZIP	94	98	6021
1135	DATE	124	132	3/6/1989
1135	DATE	152	162	07/08/2062
1135	DEPARTMENT	183	198	ENVOI Pathology
1135	HOSPITAL	199	214	TEMORA HOSPITAL
1135	DOCTOR	218	234	ANTHONY ROCKHOLD
1135	DOCTOR	1606	1608	FA
1135	DATE	1612	1618	8/8/62
1135	DOCTOR	1649	1656	F Serpe





In [28]:
with open('./DemoAnswer/NER/NER_answer.txt','w', encoding='utf-8') as f:
        for final_pre in final_pre_text:
            f.write(final_pre)
            f.write('\n')