# Google Drive

In [None]:
# Mount Google Drive at /content/drive so the dataset paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Original data

In [None]:
# The TSV file originally provided with the dataset
origin_file_path = "/content/drive/MyDrive/dataset/opendid_valid/opendid_set1.tsv"

# Load the TSV file with pandas (the file has no header row, so column names are supplied)
df = pd.read_csv(origin_file_path, delimiter='\t', names=['file_name', 'start', 'content', 'answer'])

# Show the first rows
df.head(5)

Unnamed: 0,file_name,start,content,answer
0,10,1,Episode No: 09F016547J,IDNUM: 09F016547J
1,10,25,091016.NMT,MEDICALRECORD: 091016.NMT
2,10,37,"SIZAR, HOWARD","PATIENT: SIZAR, HOWARD"
3,10,52,Lab No: 09F01654,IDNUM: 09F01654
4,10,70,Runford,STREET: Runford


# Create and combine the training datasets

In [None]:
# 我自己做的訓練tsv
import pandas as pd
import os

def iterate_files_in_folder(folder_path):
    """Yield the path of every file under ``folder_path``, recursively.

    Paths are produced in a deterministic order: within each directory the
    files are yielded in sorted order, and sub-directories are visited in
    sorted order. ``os.walk`` on its own returns entries in whatever order
    the filesystem reports them, which makes the row order of any dataset
    built from this generator vary between machines and runs.

    Args:
        folder_path: Root directory to scan.

    Yields:
        str: ``os.path.join(root, file_name)`` for each file found.
        Nothing is yielded if ``folder_path`` does not exist.
    """
    for root, dirs, files in os.walk(folder_path):
        # Sorting in place controls the order in which os.walk descends.
        dirs.sort()
        for file_name in sorted(files):
            yield os.path.join(root, file_name)

In [None]:
def merge_content(row):
    """Return the row's content, with the normalized time appended when present.

    Args:
        row: Mapping-like row (e.g. a ``pd.Series``) with the keys
            ``'content'`` and ``'time_normalized'``.

    Returns:
        ``content + '=>' + time_normalized`` when ``time_normalized`` is not
        missing, otherwise ``content`` unchanged.

    The row itself is never modified: functions passed to
    ``DataFrame.apply`` must not mutate the object they receive.
    """
    if pd.notna(row['time_normalized']):
        return row['content'] + '=>' + row['time_normalized']
    return row['content']

def get_answer(path):
  """Load a tab-separated answer file into a DataFrame.

  The file has no header row; its columns are read as ``file_name``,
  ``label``, ``start``, ``end``, ``content`` and ``time_normalized``
  (rows may omit the last field).

  Args:
    path: Path to the answer file.

  Returns:
    A DataFrame with the columns ``file_name``, ``label``, ``start``, ``end``
    and ``content``. Where a row had a normalized time, ``content`` becomes
    ``"<content>=><time_normalized>"``; the ``time_normalized`` column itself
    is dropped.
  """
  columns = ['file_name', 'label', 'start', 'end', 'content', 'time_normalized']
  # Read the text columns as strings. Letting pandas infer the dtype would turn
  # an all-numeric ``file_name`` column into integers and silently strip leading
  # zeros ("0012" -> 12), so the ids would no longer match the text file names.
  text_columns = {'file_name': str, 'content': str, 'time_normalized': str}
  temp_df = pd.read_csv(path, sep='\t', header=None, names=columns, dtype=text_columns)

  # Vectorized merge: rows without a normalized time keep their content as is.
  merged = temp_df['content'] + '=>' + temp_df['time_normalized']
  temp_df['content'] = temp_df['content'].where(temp_df['time_normalized'].isna(), merged)

  temp_df['file_name'] = temp_df['file_name'].astype(str)
  return temp_df.drop(columns=['time_normalized'])

In [None]:
# Merge the txt files in a folder with their answer file into one training DataFrame
def txt_to_trainingData(txt_path, ans_path):
  """Build a line-level training DataFrame from text files and their answers.

  Every file under ``txt_path`` is read line by line. Each line that is not
  one of the recognised blank lines becomes one output row holding the file
  id, the character offset of the line within the file, the line text
  (without its newline) and the expected answer string.

  Answers for a file are consumed strictly in the order they appear in the
  answer file: while the next unconsumed answer's text (the part before
  ``'=>'``) is a substring of the current line, it is attached to that line
  as ``"<label>: <content>"``; several answers on one line are joined with
  ``'\\n'``. A line with no matching answer gets ``'PHI: NULL'``.

  NOTE: matching is by substring and order only (the ``start``/``end``
  columns of the answer file are not used), so an answer that never matches
  any line prevents all later answers of that file from being attached.

  Args:
    txt_path: Folder containing the text files (searched recursively).
    ans_path: Path to the tab-separated answer file read by ``get_answer``.

  Returns:
    A DataFrame with the columns ``file_name``, ``start``, ``content`` and
    ``answer``. The number of rows and answers is also printed.
  """
  ans_df = get_answer(ans_path)

  file_name = []
  start_pos = []
  origin_contents = []
  expect_ans = []
  time_sym = '=>'
  # Lines treated as blank: they produce no row but still occupy characters.
  blank_lines = ('\n', ' ', '\t\n', '  \n')

  for file_path in iterate_files_in_folder(txt_path):
    # The text before the first '.' of the file name is the id used in the answer file.
    file_id = os.path.basename(file_path).split('.')[0]

    sub_ans_df = ans_df[ans_df['file_name'] == file_id].reset_index(drop=True)
    labels = sub_ans_df['label'].values
    answers = [str(value) for value in sub_ans_df['content'].values]
    ans_num = len(answers)

    with open(file_path, 'r', encoding='utf-8') as file:
      contents = file.readlines()

    curr_pos = 0
    ans_id = 0

    for content in contents:
      if content not in blank_lines:
        file_name.append(file_id)
        start_pos.append(curr_pos)
        origin_contents.append(content.split('\n')[0])

        # Consume consecutive answers whose text occurs in this line.
        line_answers = []
        while ans_id < ans_num and answers[ans_id].split(time_sym)[0] in content:
          line_answers.append(labels[ans_id] + ": " + answers[ans_id])
          ans_id += 1
        expect_ans.append('\n'.join(line_answers) if line_answers else 'PHI: NULL')

      # Advance by the real line length for every line, blank or not; adding a
      # fixed 1 for skipped lines such as '\t\n' would shift all later offsets.
      curr_pos += len(content)

  print(len(file_name), len(expect_ans))

  new_df = pd.DataFrame({
      'file_name': file_name,
      'start': start_pos,
      'content': origin_contents,
      'answer': expect_ans
  })
  return new_df

In [None]:
# First-phase release: folder of text files and the matching answer file.
txt_path = "/content/drive/MyDrive/dataset/First_Phase_ReleaseCorrection/First_Phase_Release(Correction)/First_Phase_Text_Dataset"
ans_path = "/content/drive/MyDrive/dataset/First_Phase_ReleaseCorrection/First_Phase_Release(Correction)/answer.txt"

# Build the line-level training frame for the first phase and display it.
df1 = txt_to_trainingData(txt_path, ans_path)
df1

50428 50428


Unnamed: 0,file_name,start,content,answer
0,file14520,0,SPR no: 61R779861S,IDNUM: 61R779861S
1,file14520,19,MRN no: 61677986,MEDICALRECORD: 61677986
2,file14520,36,Site_name: KALGOORLIE HEALTH CAMPUS,HOSPITAL: KALGOORLIE HEALTH CAMPUS
3,file14520,72,Facility_id: 016,PHI: NULL
4,file14520,89,Specimen_type: Fresh Tissue,PHI: NULL
...,...,...,...,...
50423,file14786,5293,9.7.71,DATE: 9.7.71=>2071-09-07
50424,file14786,5305,SpecimenReceivedDate,PHI: NULL
50425,file14786,5326,2818-10-23 00:00:00,TIME: 2818-10-23 00:00:00=>2818-10-23T00:00:00
50426,file14786,5348,LastName,PHI: NULL


In [None]:
# Second-phase dataset: folder of text files and the matching answer file.
# NOTE: this rebinds txt_path / ans_path from the first-phase cell.
txt_path = "/content/drive/MyDrive/dataset/Second_Phase_Dataset/Second_Phase_Dataset/Second_Phase_Text_Dataset"
ans_path = "/content/drive/MyDrive/dataset/Second_Phase_Dataset/Second_Phase_Dataset/answer.txt"

# Build the line-level training frame for the second phase and display it.
df2 = txt_to_trainingData(txt_path, ans_path)
df2

27840 27840


Unnamed: 0,file_name,start,content,answer
0,1093,1,Episode No: 48B915480A,IDNUM: 48B915480A
1,1093,25,4809154.WAA,MEDICALRECORD: 4809154.WAA
2,1093,38,"Otterbine, Laverne","PATIENT: Otterbine, Laverne"
3,1093,58,"Lab No: 48B91548,48B91548",IDNUM: 48B91548\nIDNUM: 48B91548
4,1093,85,Legend Manor,STREET: Legend Manor
...,...,...,...,...
27835,2029,1504,Sections of the distal pancreas show a multilo...,PHI: NULL
27836,2029,2255,DIAGNOSIS:,PHI: NULL
27837,2029,2266,Distal pancreas and spleen:,PHI: NULL
27838,2029,2294,"- Mucinous cystic neoplasm of the pancreas, lo...",PHI: NULL


In [None]:
# Row counts of the two phase frames, one per line.
print(len(df1), len(df2), sep='\n')

50428
27840


In [None]:
# Concatenate the two DataFrames; ignore_index=True renumbers the rows from 0.
df_combined = pd.concat([df1, df2], ignore_index=True)
df_combined

Unnamed: 0,file_name,start,content,answer
0,file14520,0,SPR no: 61R779861S,IDNUM: 61R779861S
1,file14520,19,MRN no: 61677986,MEDICALRECORD: 61677986
2,file14520,36,Site_name: KALGOORLIE HEALTH CAMPUS,HOSPITAL: KALGOORLIE HEALTH CAMPUS
3,file14520,72,Facility_id: 016,PHI: NULL
4,file14520,89,Specimen_type: Fresh Tissue,PHI: NULL
...,...,...,...,...
78263,2029,1504,Sections of the distal pancreas show a multilo...,PHI: NULL
78264,2029,2255,DIAGNOSIS:,PHI: NULL
78265,2029,2266,Distal pancreas and spleen:,PHI: NULL
78266,2029,2294,"- Mucinous cystic neoplasm of the pancreas, lo...",PHI: NULL


In [None]:
# Total number of rows in the combined frame.
len(df_combined)

78268

In [None]:
tsv_file_path = 'merge_df.tsv'  # replace with the actual file path

# Save the DataFrame as a TSV file with to_csv (no index column, no header row)
df_combined.to_csv(tsv_file_path, sep='\t', index=False ,header=False)

# Test whether the TSV file loads correctly

In [None]:
!pip install transformers
!pip install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.5-py3-none-any.whl (7.8 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.5


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Name of the pretrained model whose tokenizer is loaded below.
plm = "EleutherAI/pythia-70m-deduped"

# Special token strings registered with the tokenizer.
bos = '<|endoftext|>'
eos = '<|END|>'
pad = '<|pad|>'
sep ='\n\n####\n\n'

special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': pad, 'sep_token': sep}

# Load the tokenizer at the "step3000" revision and register the special tokens;
# num_added_toks receives the value returned by add_special_tokens.
tokenizer = AutoTokenizer.from_pretrained(plm, revision="step3000")
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
# Pad on the left side of each sequence.
tokenizer.padding_side = 'left'

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [None]:
from datasets import load_dataset, Features, Value

# Alternative input kept for reference: the originally provided validation TSV.
# data_path = "/content/drive/MyDrive/AICUP_data/opendid_valid/opendid_set1.tsv"
data_path = "merge_df.tsv"


# Load the header-less TSV with explicit column names and types.
# NOTE(review): keep_default_na=False is presumably forwarded to the pandas CSV
# reader so that fields such as "NULL" or "NA" stay strings — confirm.
dataset = load_dataset("csv", data_files = data_path, delimiter='\t',
                       features = Features({
                              'fid': Value('string'), 'idx': Value('int64'),
                              'content': Value('string'), 'label': Value('string')}),
                       column_names=['fid', 'idx', 'content', 'label'], keep_default_na=False)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# NOTE(review): this reads a merge_df.tsv stored on Google Drive, not the local
# 'merge_df.tsv' written by to_csv earlier in this notebook. The displayed row
# count does not match the combined frame's, so it looks like a different file
# — confirm which one is intended.
path = '/content/drive/MyDrive/AICUP_data/My_data/merge_df.tsv'


# Read the header-less TSV back with pandas and display it.
test = pd.read_csv(path, sep='\t', header=None, names=['file_name', 'start', 'content', 'answer'])
test

Unnamed: 0,file_name,start,content,answer
0,10,1,Episode No: 09F016547J,IDNUM: 09F016547J
1,10,25,091016.NMT,MEDICALRECORD: 091016.NMT
2,10,37,"SIZAR, HOWARD","PATIENT: SIZAR, HOWARD"
3,10,52,Lab No: 09F01654,IDNUM: 09F01654
4,10,70,Runford,STREET: Runford
...,...,...,...,...
112991,2069,939,Sections show a fatty tumour.There is moderate...,PHI: NULL
112992,2069,1621,The mass is partly surrounded by a thin fibrou...,PHI: NULL
112993,2069,1868,The slides have also been viewed by Dr H Vire.,PHI: NULL
112994,2069,1917,DIAGNOSIS:,PHI: NULL
