In [2]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv('1yr_data.csv')

# Keep only the rows in which at least one of the three text columns contains
# a CJK ideograph (U+4E00-U+9FFF). astype(str) lets .str.contains run on
# missing values as well (they become the string 'nan', which never matches).
has_chinese = (
    df[['主訴(S)', '診斷(A)', '計畫(P)']]
    .astype(str)
    .apply(lambda x: x.str.contains(r'[\u4e00-\u9fff]', regex=True))
    .any(axis=1)
)
# .copy() makes the subset an independent frame: later cells assign into its
# columns, which on a boolean-mask slice triggers SettingWithCopyWarning.
df = df[has_chinese].copy()

#### Preprocessing and standardization
- Remove the Chinese text and append its English translation to the end of the content, so that at least the majority of the meaning is retained and accurate. <br>
- Open question: do the sequence and position of the translated text matter?

In [None]:
# Disable column-width truncation so the full text of each cell is displayed.
pd.set_option('display.max_colwidth', None)
# Preview the first 20 rows of the Chinese-containing subset.
df.head(20)

In [None]:
# # subset for testing
# df = df.head(5)
# # Disable truncation for columns
# pd.set_option('display.max_colwidth', None)
# df

In [6]:
import re

def remove_chinese(text):
    """Split ``text`` into its Chinese runs and the remaining text.

    Returns a ``(chinese_parts, cleaned_text)`` tuple. ``chinese_parts`` is
    every run of CJK ideographs (U+4E00-U+9FFF) and the listed full-width
    punctuation marks, joined with single spaces in order of appearance.
    ``cleaned_text`` is ``text`` with those runs deleted (nothing is inserted
    in their place). Non-string input yields ``("", text)`` with the value
    passed through untouched.
    """
    if isinstance(text, str):
        zh_run = re.compile(r'[\u4e00-\u9fff，。！？、；：]+')
        # Collect the Chinese runs first, then delete them from the text.
        extracted = zh_run.findall(text)
        remainder = zh_run.sub("", text)
        return " ".join(extracted), remainder
    return "", text

In [7]:
from transformers import MarianMTModel, MarianTokenizer

# Load the Helsinki-NLP/opus-mt-zh-en MarianMT tokenizer and model once at
# module level; translate_chinese_to_english reads both as globals.
tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-zh-en')
model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-zh-en')

def translate_chinese_to_english(text):
    """Return the English translation of a Chinese string.

    Blank or whitespace-only input short-circuits to ``""`` without touching
    the model. Otherwise the text is encoded by the module-level ``tokenizer``
    (PyTorch tensors, padding and truncation enabled), passed through
    ``model.generate``, and the first generated sequence is decoded with
    special tokens stripped.
    """
    if text.strip():
        # NOTE(review): truncation=True means over-long input is clipped rather
        # than rejected — presumably at the model's maximum length; confirm
        # that long notes are not losing content here.
        encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        generated = model.generate(**encoded)
        first_sequence = generated[0]
        return tokenizer.decode(first_sequence, skip_special_tokens=True)
    return ""



In [8]:
def replace_chinese_with_translation(text):
    """Move the Chinese content of ``text`` to the end, translated to English.

    The Chinese runs are pulled out with ``remove_chinese``. When there are
    none, the remaining text is returned exactly as ``remove_chinese`` produced
    it (no stripping). Otherwise the runs are translated with
    ``translate_chinese_to_english`` and the result is the remaining text, a
    single space, and the translation, stripped of surrounding whitespace.
    """
    zh_segment, remainder = remove_chinese(text)
    if not zh_segment:
        return remainder
    english = translate_chinese_to_english(zh_segment)
    return f"{remainder} {english}".strip()

In [None]:
# Columns whose Chinese content is replaced by an English translation.
columns_to_process = ['主訴(S)', '診斷(A)', '計畫(P)']

from tqdm import tqdm
# Registers progress_apply on pandas objects so each column shows a progress bar.
tqdm.pandas()

# Apply the function to each column
# NOTE(review): df is overwritten column by column and is not saved in this
# cell; a later cell reads 'translated_data.csv' — presumably this result
# written to disk in a step that is not shown; confirm.
for col in columns_to_process:
    df[col] = df[col].progress_apply(replace_chinese_with_translation)

#### Concatenate the dataframes into an all-English dataset

In [19]:
# Reload the raw export and keep the complementary subset: rows in which none
# of the three text columns contains a CJK ideograph (note the leading ~).
df_eng = pd.read_csv('1yr_data.csv')
df_eng = df_eng[~df_eng[['主訴(S)', '診斷(A)', '計畫(P)']].astype(str).apply(lambda x: x.str.contains(r'[\u4e00-\u9fff]', regex=True)).any(axis=1)]

In [21]:
# (rows, columns) of the subset without Chinese text.
df_eng.shape

(1508, 5)

In [None]:
# NOTE(review): 'translated_data.csv' is not written by any visible cell —
# presumably an export of the translated df from the loop above; confirm its
# origin so the notebook can be re-run from a fresh kernel.
df_clean = pd.read_csv('translated_data.csv')
# df_clean

In [23]:
# Stack the rows without Chinese text and the translated rows into one frame;
# ignore_index=True renumbers the rows 0..n-1.
df_new = pd.concat([df_eng, df_clean], ignore_index = True)
df_new.shape

(3702, 5)

In [25]:
# Write the combined frame to disk without the row index.
df_new.to_csv('english_1yr_data.csv', index = False)

In [None]:
# Disable column-width truncation, then preview the first 20 combined rows.
pd.set_option('display.max_colwidth', None)
df_new.head(20)

#### remove all measurements from assessment

In [None]:
# remove CDR
import re

def remove_cdr_scores(text):
    """Strip CDR score mentions such as ``CDR 0.5`` or ``cdr1-2`` from ``text``.

    A mention is the token ``cdr`` (any case) followed by optional whitespace,
    a run of digits/periods and, optionally, a hyphenated second run. After
    removal, runs of whitespace are collapsed to single spaces and the result
    is stripped.

    Non-string values (e.g. NaN from empty CSV cells) are returned unchanged
    instead of raising ``TypeError`` from ``re.sub``.
    """
    if not isinstance(text, str):
        # Nothing to clean; hand missing / non-text values back untouched.
        return text
    cleaned_text = re.sub(r'\b(?:cdr|CDR)\s*[\d\.]+(-[\d\.]+)?\b', '', text, flags=re.IGNORECASE)
    # Deleting a mention can leave doubled spaces behind; normalise them.
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

# Overwrite the 診斷(A) column with its CDR-stripped version, then preview.
df_new['診斷(A)'] = df_new['診斷(A)'].apply(remove_cdr_scores)
df_new.head()

In [None]:
# remove all measurements

def remove_measurements(text):
    """Strip "word + number" measurement mentions (e.g. ``MMSE 24/30``) from ``text``.

    A mention is an optional alphabetic word, optional whitespace, a run of
    digits/periods that contains at least one digit, and an optional
    ``/denominator`` part. After removal, runs of whitespace are collapsed to
    single spaces and the result is stripped.

    The numeric run must contain a digit: a bare period between two words
    (``Dr.Smith``) is punctuation, not a measurement, and is left alone.

    Non-string values (e.g. NaN from empty CSV cells) are returned unchanged
    instead of raising ``TypeError`` from ``re.sub``.
    """
    if not isinstance(text, str):
        # Nothing to clean; hand missing / non-text values back untouched.
        return text
    # find words with numbers combo
    # `\.*\d[\d\.]*` = digits and periods with at least one digit in the run.
    cleaned_text = re.sub(r'\b(?:[a-zA-Z]+)?\s*\.*\d[\d\.]*(?:/[a-zA-Z\d\.]+)?\b', '', text)
    # Deleting a mention can leave doubled spaces behind; normalise them.
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

# Overwrite the 診斷(A) column with its measurement-stripped version, then preview.
df_new["診斷(A)"] = df_new["診斷(A)"].apply(remove_measurements)
df_new.head()

In [48]:
# Write the frame with measurements removed to disk without the row index.
df_new.to_csv('wo_measurements_1yr.csv', index = False)

#### Load 病摘 (medical summaries)

In [3]:
# Read the file as Big5. encoding_errors='ignore' drops bytes that cannot be
# decoded and on_bad_lines='skip' drops malformed rows, so the loaded frame
# may silently contain less than the file does.
medSum = pd.read_csv('病摘.csv', encoding='big5', encoding_errors='ignore', on_bad_lines = 'skip')  
# Keep only the two columns that are translated below.
medSum = medSum[['醫囑指示', '病理報告']]
# Drop rows where either of the two columns is missing.
medSum = medSum.dropna(subset=['醫囑指示','病理報告'])
medSum.shape

(6083, 2)

In [4]:
# NOTE(review): this import rebinds replace_chinese_with_translation to the
# version in the local `translation` module, shadowing the function defined
# earlier in this notebook — presumably the same code moved to a .py file; confirm.
from translation import replace_chinese_with_translation
columns_translated = ['醫囑指示', '病理報告']

from tqdm import tqdm
# Registers progress_apply on pandas objects so each column shows a progress bar.
tqdm.pandas()

# Overwrite each column with its translated version.
for col in columns_translated:
    medSum[col] = medSum[col].progress_apply(replace_chinese_with_translation)

100%|██████████| 6083/6083 [1:04:00<00:00,  1.58it/s]
100%|██████████| 6083/6083 [03:08<00:00, 32.33it/s] 


In [5]:
# Write the translated summaries to disk without the row index.
medSum.to_csv('medical_summary_english.csv', index = False)