In [16]:
import os
import zipfile

# Location of the downloaded archive and the folder everything is unpacked into.
zip_file_path = "./microbat.zip"
extract_to_folder = "./data"

if not os.path.exists(zip_file_path):
    # Nothing to unpack: report the missing archive and leave the folder alone.
    print(f"Không tìm thấy file {zip_file_path}")
else:
    with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
        # Step 1: unpack the outer archive into the data folder.
        zip_ref.extractall(extract_to_folder)

        # Step 2: the outer archive carries a nested "MACCROBAT2018.zip";
        # unpack it into its own sub-folder (created on demand).
        inner_zip_path = os.path.join(extract_to_folder, "MACCROBAT2018.zip")
        with zipfile.ZipFile(inner_zip_path, "r") as inner_zip_ref:
            inner_extract_folder = os.path.join(extract_to_folder, "MACCROBAT2018")
            os.makedirs(inner_extract_folder, exist_ok=True)
            inner_zip_ref.extractall(inner_extract_folder)

        print(f"Đã giải nén tất cả các file vào {inner_extract_folder}")

Đã giải nén tất cả các file vào ./data\MACCROBAT2018


In [17]:
# Inspect the extracted dataset folder: show the names of the files it contains.
os.listdir('./data/MACCROBAT2018')

['15939911.ann',
 '15939911.txt',
 '16778410.ann',
 '16778410.txt',
 '17803823.ann',
 '17803823.txt',
 '18236639.ann',
 '18236639.txt',
 '18258107.ann',
 '18258107.txt',
 '18416479.ann',
 '18416479.txt',
 '18561524.ann',
 '18561524.txt',
 '18666334.ann',
 '18666334.txt',
 '18787726.ann',
 '18787726.txt',
 '18815636.ann',
 '18815636.txt',
 '19009665.ann',
 '19009665.txt',
 '19214295.ann',
 '19214295.txt',
 '19307547.ann',
 '19307547.txt',
 '19610147.ann',
 '19610147.txt',
 '19816630.ann',
 '19816630.txt',
 '19860006.ann',
 '19860006.txt',
 '19860007.ann',
 '19860007.txt',
 '19860925.ann',
 '19860925.txt',
 '20146086.ann',
 '20146086.txt',
 '20671919.ann',
 '20671919.txt',
 '20977862.ann',
 '20977862.txt',
 '21067996.ann',
 '21067996.txt',
 '21129213.ann',
 '21129213.txt',
 '21254744.ann',
 '21254744.txt',
 '21308977.ann',
 '21308977.txt',
 '21477357.ann',
 '21477357.txt',
 '21505579.ann',
 '21505579.txt',
 '21527041.ann',
 '21527041.txt',
 '21672201.ann',
 '21672201.txt',
 '21720478.ann

## Preprocessing

In [99]:
from typing import List, Dict, Tuple

class preprocessing_data():
    """Turn a folder of ``<id>.txt`` / ``<id>.ann`` pairs into BIO-tagged tokens.

    Only text-bound annotation lines (those starting with ``T``) are used.
    Each one looks like ``T1<TAB>Age 8 19<TAB>28-year-old``: a label followed
    by a start offset and an *exclusive* end offset into the ``.txt`` content.

    Attributes:
        list_id: document ids, taken from the ``.txt`` file names.
        tokenizer: object exposing ``tokenize(str) -> list`` (required by
            :meth:`process`).
        texts: raw text of every document, aligned with ``list_id``.
        tags: per document, a list of dicts with the keys ``"text"``,
            ``"label"``, ``"start"`` and ``"end"`` (offsets kept as strings).
        tokens / labels: tokens and labels of the document most recently
            handled by :meth:`process`.
    """

    def __init__(self, data_folder: str, tokenizer = None):
        """Read every ``.txt`` file in ``data_folder`` plus its ``.ann`` sibling.

        Args:
            data_folder: directory containing the ``.txt``/``.ann`` pairs.
            tokenizer: object with a ``tokenize`` method; only needed once
                :meth:`process` is called.
        """
        # Unique document ids come from the .txt file names.
        self.list_id = [name_file.split('.')[0] for name_file in os.listdir(
            data_folder) if name_file.endswith('.txt')]

        self.tokenizer = tokenizer
        self.texts: List[str] = []
        self.tags: List[List[Dict[str, str]]] = []

        for doc_id in self.list_id:
            # Decode explicitly: annotation offsets are character offsets, so a
            # platform-dependent default codec would shift them on non-ASCII text.
            with open(os.path.join(data_folder, doc_id + '.txt'), 'r', encoding='utf-8') as file:
                self.texts.append(file.read())
            with open(os.path.join(data_folder, doc_id + '.ann'), 'r', encoding='utf-8') as file:
                # Keep only text-bound annotations (ids starting with "T").
                text_bound_ann = [tag for tag in file.read().split('\n') if tag.startswith('T')]

            self.tags.append(self._parse_text_bound(text_bound_ann, doc_id))

    @staticmethod
    def _parse_text_bound(text_bound_ann: List[str], doc_id: str) -> List[Dict[str, str]]:
        """Parse raw ``T`` lines into tag dicts, skipping lines without plain integer offsets."""
        text_bound_lst = []
        for tag in text_bound_ann:
            try:
                text = tag.split('\t')      # ['T1', 'Age 8 19', '28-year-old']
                label = text[1].split(' ')  # ['Age', '8', '19']
                start_offset = int(label[1])
                end_offset = int(label[2])
            except (ValueError, IndexError):
                # Some annotations have no simple "start end" pair
                # (e.g. "8 19;25 30"); those are reported and skipped.
                print(
                    f"Warning: Skipping annotation in file {doc_id}.ann due to not number in line: {tag}")
                continue

            text_bound_lst.append({
                "text": text[-1],
                "label": label[0],
                "start": str(start_offset),
                "end": str(end_offset)
            })
        return text_bound_lst

    @staticmethod
    def _select_non_overlapping(tags: List[Dict[str, str]]) -> List[Dict[str, str]]:
        """Return ``tags`` sorted by start offset with overlapping/empty spans removed.

        A flat BIO sequence cannot express nested entities, and the merge step
        needs spans in ascending order. When spans overlap, the one that starts
        first is kept (the longer one when both start at the same offset).
        """
        ordered = sorted(tags, key=lambda t: (int(t["start"]), -int(t["end"])))
        kept = []
        last_end = 0
        for tag in ordered:
            start, end = int(tag["start"]), int(tag["end"])
            if end <= start or start < last_end:
                continue
            kept.append(tag)
            last_end = end
        return kept

    @staticmethod
    def _span_text(full_text: str, indices: List[int]) -> str:
        """Slice ``full_text`` over a run of consecutive indices (both ends included)."""
        return full_text[indices[0]:indices[-1] + 1]

    def process(self) -> Tuple[List[List[str]], List[List[str]]]:
        """Tokenize every document and assign a BIO label to each token.

        Labeled and unlabeled stretches of text are tokenized separately, in
        document order. Tokens of an entity get ``B-<label>`` then
        ``I-<label>``; everything else gets ``O``.

        Returns:
            ``(input_texts, input_labels)``: two lists aligned with
            ``self.list_id``; for each document the token list and the label
            list have the same length.
        """
        input_texts = []
        input_labels = []
        for fulltext, doc_tags in zip(self.texts, self.tags):
            tags = self._select_non_overlapping(doc_tags)

            # One list of character indices per entity; the end offset is
            # exclusive, so the character at "end" stays unlabeled.
            label_offset = [
                list(range(int(tag["start"]), int(tag["end"]))) for tag in tags
            ]  # e.g. [[8, ..., 18], [20, ..., 37]]
            covered = set()
            for offset in label_offset:
                covered.update(offset)

            # Every character not covered by an entity, grouped into runs.
            zero_offset = preprocessing_data.find_continuous_ranges(
                [index for index in range(len(fulltext)) if index not in covered]
            )  # e.g. [[0, ..., 7], [19], [38, ...]]

            self.tokens = []
            self.labels = []
            self._merge_offset(fulltext, tags, zero_offset, label_offset)

            # e.g. tokens ['Patient', 'is', '28', '-', 'year', '-', 'old', '.']
            #      labels ['O', 'O', 'B-Age', 'I-Age', 'I-Age', 'I-Age', 'I-Age', 'O']
            input_texts.append(self.tokens)
            input_labels.append(self.labels)

        return input_texts, input_labels

    def _merge_offset(self, full_text, tags, zero_offset, label_offset):
        """Emit unlabeled and labeled runs in ascending text order.

        Both ``zero_offset`` and ``label_offset`` must be sorted by their first
        index; ``label_offset[k]`` must correspond to ``tags[k]``.
        """
        # zero: [[0,1,2], [6,7]] label: [[3,4,5]]
        i = j = 0
        while i < len(zero_offset) and j < len(label_offset):
            if zero_offset[i][0] < label_offset[j][0]:
                self._add_zero(full_text, zero_offset, i)
                i += 1
            else:
                self._add_label(full_text, label_offset, j, tags)
                j += 1

        while i < len(zero_offset):
            self._add_zero(full_text, zero_offset, i)
            i += 1

        while j < len(label_offset):
            self._add_label(full_text, label_offset, j, tags)
            j += 1

    def _add_zero(self, full_text, offset, index):
        """Tokenize the unlabeled run ``offset[index]`` and label each token ``O``."""
        text_tokens = self.tokenizer.tokenize(self._span_text(full_text, offset[index]))

        self.tokens.extend(text_tokens)
        self.labels.extend(["O"] * len(text_tokens))

    def _add_label(self, full_text, offset, index, tags):
        """Tokenize the entity run ``offset[index]`` and label it ``B-``/``I-<label>``."""
        text_tokens = self.tokenizer.tokenize(self._span_text(full_text, offset[index]))
        if not text_tokens:
            # No tokens means no labels; adding a lone "B-" here would
            # misalign tokens and labels.
            return

        label = tags[index]['label']
        self.tokens.extend(text_tokens)
        self.labels.extend(
            [f"B-{label}"] + [f"I-{label}"] * (len(text_tokens) - 1)
        )

    @staticmethod
    def find_continuous_ranges(continuous_list:List[int]) -> List[List[int]]:
        """Group an ascending list of integers into runs of consecutive values.

        ``[0, 1, 2, 7, 8]`` becomes ``[[0, 1, 2], [7, 8]]``; an empty list
        yields an empty list.
        """
        if not continuous_list:
            return []

        ranges = []
        start = current = continuous_list[0]
        for value in continuous_list[1:]:
            if value - 1 == current:
                current = value
            else:
                ranges.append(list(range(start, current + 1)))
                start = current = value

        # The run still open when the loop ends must be emitted too.
        ranges.append(list(range(start, current + 1)))
        return ranges
    

In [101]:
from transformers import AutoTokenizer

# Instantiate the tokenizer for the "d4data/biomedical-ner-all" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")

# Folder with the .txt/.ann files that preprocessing_data reads.
dataset_folder = "./data/MACCROBAT2018"

# Load every document with its annotations, then tokenize and label them:
# input_texts holds one token list per document, input_labels the matching labels.
Maccrobat_builder = preprocessing_data(dataset_folder, tokenizer)
input_texts, input_labels = Maccrobat_builder.process()
# NOTE(review): both prints dump the tokens/labels of every document at once,
# which makes for a very large cell output; consider previewing one document.
print("tokens: ",input_texts)
print("labels: ", input_labels)

lõi {'text': '28-year-old', 'label': 'Age', 'start': '8', 'end': '19'}
lõi {'text': 'previously healthy', 'label': 'History', 'start': '20', 'end': '38'}
lõi {'text': 'man', 'label': 'Sex', 'start': '39', 'end': '42'}
lõi {'text': 'presented', 'label': 'Clinical_event', 'start': '43', 'end': '52'}
lõi {'text': 'healthy', 'label': 'Sign_symptom', 'start': '31', 'end': '38'}
lõi {'text': '6-week', 'label': 'Duration', 'start': '60', 'end': '66'}
lõi {'text': 'palpitations', 'label': 'Sign_symptom', 'start': '78', 'end': '90'}
lõi {'text': 'symptoms', 'label': 'Coreference', 'start': '96', 'end': '104'}
lõi {'text': 'rest', 'label': 'Clinical_event', 'start': '121', 'end': '125'}
lõi {'text': '2–3 times per week', 'label': 'Frequency', 'start': '127', 'end': '145'}
lõi {'text': 'dyspnea', 'label': 'Sign_symptom', 'start': '206', 'end': '213'}
lõi {'text': 'up to 30 minutes at a time', 'label': 'Detailed_description', 'start': '154', 'end': '180'}
lõi {'text': 'regurgitation murmur', 'labe