In [82]:
import os
import re
import json

from pathlib import Path
import pandas as pd

In [83]:
# report id -> body part, content, conclusion, diseases
#    ok            ok       ok         ok

In [84]:
def txt_file(filepath):
    if filepath.endswith('TOTAL.txt'):
        pass
    else:
        return filepath.lower().endswith('.txt')
    
def image_file(filepath):
    return filepath.lower().endswith('.jpg')

def check_kesan(text):
    # split the text to each line and strip the space
    text = [t.lower().strip() for t in text.splitlines()]
    if 'kesan:' not in text:
        if 'kesan' not in text:
            return True

def remove_start_end(text):
    location_sentence = ''
    # remove empty lines, start and end sentences that bear no meaning
    sentences = [s.strip() for s in text.splitlines() if s]
    # if there is kesan
    try:
        conclusion_index = sentences.index('kesan')
    # if there is no kesan
    except:
        conclusion_index = -1

    # remove conclusion beginning with ratio   
    for i, s in enumerate(sentences):
        if s.startswith('ratio') or s.startswith('a/n'):
            sentences.pop(i)
            break

    end_word = sentences[-1].split()[0].lower()
    start_word = sentences[2].split()[0].lower()

    if start_word in ['pemeriksaan', 'foto', 'perbandingan']:
        location_sentence = sentences[2]
    else:
        location_sentence = sentences[1]

    if end_word in ['terima', 'atas', 'btk']:
        start_index = 3 if start_word in ['pemeriksaan', 'foto', 'perbandingan'] else 2
        text = os.linesep.join(sentences[start_index:-1])
        if conclusion_index == -1:
            conclusion_text = os.linesep.join([sentence + '.' if not sentence.endswith('.') else sentence for sentence in sentences[start_index:start_index+2] if sentence.strip()])
        else:
            # conclusion_text = os.linesep.join(sentences[conclusion_index:-1])
            conclusion_text = os.linesep.join([sentence + '.' if not sentence.endswith('.') else sentence for sentence in sentences[conclusion_index:-1] if sentence.strip()])
    else:
        start_index = 3 if start_word in ['pemeriksaan', 'foto', 'perbandingan'] else 2
        text = os.linesep.join(sentences[start_index:])
        if conclusion_index == -1:
            conclusion_text = os.linesep.join([sentence + '.' if not sentence.endswith('.') else sentence for sentence in sentences[start_index:start_index+2] if sentence.strip()])
        else:
            # conclusion_text = os.linesep.join(sentences[conclusion_index:])
            conclusion_text = os.linesep.join([sentence + '.' if not sentence.endswith('.') else sentence for sentence in sentences[conclusion_index:] if sentence.strip()])
    # content_text = os.linesep.join(sentences[start_index:conclusion_index])

    # Check if there isn't a full stop at the end of each line; if not, add a full stop
    content_text = os.linesep.join([sentence + '.' if not sentence.endswith('.') else sentence for sentence in sentences[start_index:conclusion_index] if sentence.strip()])
    return text, content_text, conclusion_text, location_sentence

def remove_punctuations(text):
    punctuations = '''!()[]{};:'"\,>?@#$%^&*~''' # / is not removed for fraction, - is not removed for range (ex: l3-l4), . is not removed, < is not removed (kurang dari)
    new_text = ''
    for char in text:
        if char not in punctuations: 
            new_text += char
    return new_text

def remove_unnecessary_word(content_text, conclusion_text):
    # List of characters and words to remove
    to_remove = ['kesan.', '\r', '\n', 'usul ct scan abdomen.']

    # Function to remove unwanted characters, sentences, and words
    def remove_unwanted(text):    
        text = re.sub(r'(\.\s*|\s*)mohon konfirmasi dengan parameter klinis dan laboratorium', '', text)
        text = re.sub(r'(\.\s*|\s*)terimakasih atas kerjasamanya', '', text)
        for item in to_remove:
            text = text.replace(item, ' ')
        text = re.sub(r'\s+', ' ', text)  # Remove double spaces
        return text.strip()
    
    def remove_strip(text):
        text = re.sub(r'^\s*-\s*', '', text)
        text = re.sub(r'(\.\s*-\s*|\s*-\s*)', '. ', text)
        return text.strip()

    # Clean the content and conclusion texts
    cleaned_content = remove_unwanted(content_text)
    cleaned_conclusion = remove_unwanted(conclusion_text)

    # Clean '-' in kesan
    if cleaned_conclusion.startswith('- '):
        cleaned_conclusion = remove_strip(cleaned_conclusion)

    return cleaned_content, cleaned_conclusion

def decimal_to_text(decimal_number):
    numbers_in_indonesian = {
        '0': 'nol',
        '1': 'satu',
        '2': 'dua',
        '3': 'tiga',
        '4': 'empat',
        '5': 'lima',
        '6': 'enam',
        '7': 'tujuh',
        '8': 'delapan',
        '9': 'sembilan'
    }
    integer_part, fractional_part = str(decimal_number).split('.')

    # Convert the integer part to Indonesian
    integer_text = ''
    for digit in integer_part:
        integer_text += numbers_in_indonesian[digit] + ' '

    # Convert the fractional part to Indonesian
    fractional_text = ''
    for digit in fractional_part:
        fractional_text += numbers_in_indonesian[digit] + ' '

    result = integer_text.strip() + ' koma ' + fractional_text.strip()
    return result

def clean_words(content_text, conclusion_text):

    def clean_typos(text):
        # convert t 4 to t4 and so on
        pattern = r"t\s(\d)"  
        text = re.sub(pattern, r"t\1", text)

        # convert 0.8/8
        pattern = r"0.8/8"  
        text = re.sub(pattern, "nol koma delapan per delapan", text)

        # convert l1-5
        pattern = r"l1-5"  
        text = re.sub(pattern, "l1 sampai l5", text)

        # convert pada1/3 and komplit1/3
        pattern = r"(pada|komplit)1/3"  
        text = re.sub(pattern, r"\1 1/3", text)

        # convert 0.5 to setengah
        pattern = r"0\.5"  
        text = re.sub(pattern, r"setengah", text)

        # convert decimal to text
        pattern = r"\d+\.(\d+)"
        decimal_exist = re.search(pattern, text)
        if decimal_exist:
            decimal_number = decimal_exist.group()
            decimal_number = float(decimal_number)
            converted_decimal = decimal_to_text(decimal_number)
            text = re.sub(pattern, converted_decimal, text)

        # convert 0.5 to setengah
        pattern = r"0\.5"  
        text = re.sub(pattern, r"setengah", text)


        modified_lines = []
        lines = text.split(".")

        with open('corrections.json', 'r') as file:
            json_data = file.read()

        corrections = json.loads(json_data)

        for line in lines:
            pattern = r"dd/(?!\s)"  # Matches "dd/" not followed by a space
            line = re.sub(pattern, r"dd/ ", line) # replace with "dd/ "

            pattern = r"([a-zA-Z]+)/([a-zA-Z]+)"  # Matches "/" between words
            line = re.sub(pattern, r"\1 / \2", line) # replace with " /  "

            pattern = r"([0-9]+)-([a-zA-Z]+)"  # Matches "-" between positions
            line = re.sub(pattern, r"\1 - \2", line) # replace with " - "

            words = line.split()
            modified_words = []
            
            for word in words:
                word = word.replace('.', '')
             
                # Check for "dd/" pattern
                dd_pattern = r"(dd/)(\S+)"
                dd_matches = re.findall(dd_pattern, word)
                if dd_matches:
                    # Replace "dd/" with "atau"
                    dd_replaced = ["atau", dd_matches[0][1]]
                    modified_words.extend(dd_replaced)
                    continue  # Skip further processing for this word

                # Check for "{text}/{text}" pattern
                # text_pattern = r"(\S+)/(\S+)"
                # text_matches = re.findall(text_pattern, word)
                # if text_matches:
                #     # Replace "/" with "atau"
                #     text_replaced = ["atau" if w == "/" else w for w in word.split('/')]
                #     modified_words.extend(text_replaced)
                #     continue  # Skip further processing for this word
                
                # Check if the word is a typo and replace it with the correct word
                corrected_word = corrections.get(word, word)
                modified_words.append(corrected_word)
             
            # Join the modified words back into a line and add to the list of modified lines
            modified_lines.append(" ".join(modified_words))
        return ". ".join(modified_lines)
    
    # Clean the content and conclusion texts
    cleaned_content = clean_typos(content_text).strip()
    cleaned_conclusion = clean_typos(conclusion_text).strip()

    return cleaned_content, cleaned_conclusion

def identify_disease(text):
    if 'tak tampak kelainan radiologik pada jantung dan paru.' in text and 'elevasi diafragma kanan' in text:
        return 1, text
    # Regex pattern to match various normal phrases
    pattern = r'(tak|tidak)\s+tampak\s+(kelainan)\s+(radiologik|radiologis)|dalam\s+batas\s+normal|tulang\s+tulang\s+intak\.$'
    pattern2 = r'tidak tampak kelainan.*'
    pattern3 = r'struktur tulang relatif baik.*'
    if bool(re.search(pattern, text)) or bool(re.search(pattern2, text)) or bool(re.search(pattern3, text)):
        return 0, None
    elif 'tulang tulang intak.' in text and len(text.split('.')) > 1:
        return 1, text
    
    return 1, text

def detect_location(location_sentence, body_part):
    skip_body_part = ['abdomen', 'kepala', 'os_sacro_coccygeus', 'pelvis', 'thorax', 'vertebra_cervical', 'vertebra_lumbosakral', 'vertebra_thoracolumbal']
    print('llll',location_sentence)
    if body_part in skip_body_part:
        return None
    elif len(location_sentence) > 0:
        
        pattern_left = '(sinistra|kiri)'
        pattern_right = '(dextra|kanan)'
        pattern_bilateral = 'bilateral'
        if re.search(pattern_left, location_sentence):
            return 'sinistra'
        elif re.search(pattern_right, location_sentence):
            return 'dextra'
        elif re.search(pattern_bilateral, location_sentence):
            return 'bilateral'
    else:
        return None


def cleaning(text, base_dir, filename, body_part):
    # generate_separate_report(text, base_dir, filename)
    text = text.strip()
    text = text.lower()
    text = remove_punctuations(text)
    text, content_text, conclusion_text, location_sentence = remove_start_end(text)
    content_text, conclusion_text = remove_unnecessary_word(content_text, conclusion_text)
    content_text, conclusion_text = clean_words(content_text, conclusion_text)
    # print(content_text, conclusion_text)
    disease_status, disease_category = identify_disease(conclusion_text)
    location = detect_location(location_sentence, body_part)
    return text, content_text, conclusion_text, disease_status, disease_category, location

In [85]:
# directories
base_dir = os.path.join('..','datasets','radiology_clean')
word_list_dir = os.path.join('..', 'datasets_wordlist')
# missing_kesan_file = os.path.join('..','report_analysis','all_files_missing_kesan.txt')

# with open(missing_kesan_file, 'r', encoding='utf-8') as file:
#     missing_kesan_file_list = [line.strip() for line in file]

# aggregated_report_list = ['BNO-231027-report-1.txt', 'clavicual_kanan-Series_1001_0000-gabung-report-1.txt', 'pelvis-2310022-gabung-report-1.txt', 'pelvis-2310024-report-1.txt', 'pelvis-2310025-report-1.txt', 'pelvis-2310300066-report-1.txt', 'Thorax-231020_Brobkhopneumonia-report-1.txt', 'Thorax-231020_Cardiomegaly-report-1.txt', 'Thorax-231020_Pneumonia-Pneumonia-report-1.txt', 'Thorax-231021_231022-Cardiomegaly-report-1.txt', 'Thorax-231021_231022-diagnosa_gabung-report-3.txt', 'Thorax-231021_231022-Pneumonia-report-2.txt', 'Thorax-231021_231022_Cardiomegaly-Cardiomegaly-report-1.txt', 'Thorax-231021_231022_Pneumonia-Pneumonia-report-1.txt', 'Thorax-231023-Bronkhopneumonia-report-1.txt', 'Thorax-231023-gabung-report-3.txt', 'Thorax-231023-Pneumonia-report-2.txt', 'Thorax-231024-pneumonia-report-2.txt', 'Thorax-231024-report-3.txt', 'Thorax-231025-gabung-report-1.txt', 'Thorax-231025-pneumonia-report-3.txt', 'Thorax-231025-pneumonia_cardiomegaly-report-2.txt', 'Thorax-231026-gabung-report-2.txt', 'Thorax-231026-pneumonia-report-3.txt', 'Thorax-231027-pneumonia-report-3.txt', 'Thorax-231028-pneumonia-report-3.txt', 'Thorax-231028-report-4.txt', 'Thorax-231029-gabung-report-1.txt', 'Thorax-231030-report-1.txt', 'Thorax-231101-report-1.txt', 'Thorax-231103-report-1.txt', 'Thorax-231105-report-1.txt', 'Thorax-231106-report-1.txt', 'Thorax-231107-report-1.txt', 'Thorax-231108-report-1.txt', 'Thorax-231109-report-1.txt', 'Thorax-231110-report-1.txt', 'Thorax-231111-report-1.txt', 'Thorax-231112-report-1.txt', 'Thorax-231113-report-1.txt', 'Thorax-231114-report-1.txt', 'Thorax-231115-report-1.txt', 'Thorax-231116-report-1.txt', 'Thorax-231118-report-1.txt', 'Thorax-231119-report-1.txt', 'Thorax-231120-report-1.txt', 'Thorax-231121-report-1.txt', 'Thorax-231122-report-1.txt', 'Thorax-231123-report-1.txt', 'Thorax-231124-report-1.txt', 'Thorax-231125-report-1.txt', 'Thorax-231126-report-1.txt', 'Thorax-231127-report-1.txt', 'Thorax-231128-report-1.txt', 'Thorax-231129-report-1.txt', 'Thorax-231130-report-1.txt', 'Thorax-231201-report-1.txt', 'Thorax-231202-report-1.txt', 'Thorax-231203-report-1.txt', 'Thorax-231204-report-1.txt', 'Thorax-231205-report-1.txt', 'Thorax-231206-report-1.txt', 'Thorax-231207-report-1.txt', 'Thorax-231208-report-1.txt', 'Thorax-231209-report-1.txt', 'Thorax-231210-report-1.txt', 'Thorax-231211-report-1.txt', 'Thorax-231212-report-1.txt', 'Thorax-231213-report-1.txt', 'Thorax-231214-report-1.txt', 'Thorax-231215-report-1.txt', 'Thorax-231216-report-1.txt', 'Thorax-231217-report-1.txt', 'Thorax-231218-report-1.txt', 'Thorax-231222-report-1.txt', 'Thorax-240101-cardiomegaly-report-2.txt', 'Thorax-240101-report-4.txt', 'Thorax-240102-report-1.txt', 'Thorax-240103-report-1.txt', 'Thorax-240104-report-1.txt', 'Thorax-240105-report-1.txt', 'Thorax-240106-report-1.txt', 'Thorax-240107-report-1.txt', 'Thorax-240108-report-1.txt', 'Thorax-240109-report-1.txt', 'Thorax-240110-report-1.txt', 'Thorax-240111-report-1.txt', 'Thorax-240112-report-1.txt', 'Thorax-240113-report-1.txt', 'Thorax-240114-report-1.txt', 'Thorax-240115-report-1.txt', 'Thorax-240117-report-1.txt', 'Thorax-240119-report-1.txt', 'Thorax-240120-report-1.txt', 'Thorax-240121-report-1.txt', 'Thorax-240122-report-1.txt', 'Thorax-240124-report-1.txt', 'Thorax-240126-report-1.txt', 'Thorax-240127-report-1.txt', 'Thorax-240128-report-1.txt']

# body part grouping
body_part_group = [f.lower() for f in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, f))]
print("body part group:", body_part_group, "\n")

df_rows = []

id_counter = 1 
folder_ids = {}
single_file_report = {}

body_part_renaming = {
    'abodemen_3_posisi': 'abdomen',
    'BNO': 'abdomen',
    'thorax_clean': 'thorax',
    'clavicula_clean': 'bahu',
    'ankle_bilateral': 'ankle',        
    'ankle_dextra': 'ankle',             
    'ankle_sinistra': 'ankle',    
    'calvaria': 'kepala',
    'schedel4': 'kepala',
    'femur_bilateral': 'femur',
    'femur_dextra': 'femur',
    'femur_sinistra': 'femur',
    'humerus_biateral': 'humerus',     
    'humerus_dextra': 'humerus',    
    'humerus_sinistra': 'humerus',
    'pedis_bilateral': 'pedis',       
    'pedis_dextra': 'pedis',
    'pedis_sinistra': 'pedis',
    "sinus_paranasal": "kepala",
    'wrist_joint_bilateral': 'wrist',       
    'wrist_joint_dextra': 'wrist',   
    'wrist_joint_sinistra': 'wrist'
}

for root, dirs, files in os.walk(base_dir):
    for file in files:
        # get full filename
        filename = os.path.join(root, file)
        # get body part
        body_part = Path(filename).parent.parent.name
        folder_name = Path(filename).parent.name
        folder_name_with_body_part = body_part+folder_name

        if body_part in ['Thorax', 'clavicual_kanan', 'ankle_joint_bilateral', 'ankle_joint_dextra', 'humerus', 'appendicogram', 'mastoid_bilateral']: break

        body_part = body_part_renaming.get(body_part, body_part)

        # if txt file -> report
        if txt_file(filename):
            # if file in missing_kesan_file_list: break
            # if file in aggregated_report_list: break
            f = open(filename, "r", encoding="unicode_escape")
            text = f.read()
            text, content_text, conclusion_text, disease_status, disease_category, location = cleaning(text, base_dir, filename, body_part)

            # Assign an ID to every folder; all txt files in that folder will have the same ID
            if folder_name_with_body_part not in folder_ids:
                folder_ids[folder_name_with_body_part] = id_counter
                id_counter += 1

            df_rows.append({
                'id': folder_ids[folder_name_with_body_part],  
                'report_file': file,                  
                'body_part': body_part,
                'content': content_text,
                'conclusion': conclusion_text,
                'disease_status': disease_status, # 0 for normal, 1 for disease
                'disease_type': disease_category, # None for normal, text for disease
                'location': location
            })
                
report_df = pd.DataFrame(df_rows)
print(report_df.groupby('body_part').size())
print(report_df['body_part'].nunique())

body part group: ['abodemen_3_posisi', 'ankle_bilateral', 'ankle_dextra', 'ankle_joint_bilateral', 'ankle_joint_dextra', 'ankle_sinistra', 'antebrachii', 'appendicogram', 'bahu', 'bno', 'calvaria', 'clavicual_kanan', 'clavicula_clean', 'cruris', 'cubiti', 'femur_bilateral', 'femur_dextra', 'femur_sinistra', 'genu', 'hip_joint', 'humerus', 'humerus_biateral', 'humerus_dextra', 'humerus_sinistra', 'manus', 'mastoid_bilateral', 'os_sacro_coccygeus', 'pedis_bilateral', 'pedis_dextra', 'pedis_sinistra', 'pelvis', 'schedel4', 'sinus_paranasal', 'thorax', 'thorax_clean', 'vertebra_cervical', 'vertebra_lumbosakral', 'vertebra_thoracolumbal', 'wrist_joint_bilateral', 'wrist_joint_dextra', 'wrist_joint_sinistra'] 

llll pemeriksaan abdomen 3 posisi dengan hasil sebagai berikut
llll pemeriksaan abdomen 3 posisi dengan hasil sebagai berikut
llll pemeriksaan abdomen 3 posisi dengan hasil sebagai berikut
llll pemeriksaan abdomen 3 posisi dengan hasil sebagai berikut
llll pemeriksaan abdomen 3 posisi

In [86]:
report_df['content'].get(16)

'preperitoneal fat dan otot psoas line simetris. distribusi udara usus minimal di distal colon. tidak tampak dilatasi usus. tidak dijumpai penebalan dinding usus. tak tampak gambaran udara di luar permukaan usus atau udara bebas sub diafragma. tidak dijumpai adanya kalsifikasi kelainan di dalam abdominal. tulang vertebra dan pelvis intak.'

In [87]:
report_df['conclusion'].get(16)

'tidak tampak kelainan radiologik.'

In [88]:
pd.options.display.max_colwidth = 2000
report_df[report_df['id'] == 830]

Unnamed: 0,id,report_file,body_part,content,conclusion,disease_status,disease_type,location
829,830,pelvis-2312080137_Series_1001-report-1.txt,pelvis,tulang pelvis dalam batas normal. dilakukan pemasangan kateter pada distal uretra dilakukan pengisian kontras sebanyak 17 ml tampak kontras mengisi distal urethra sampai ke kantung urinaria. tampak penyempitan lumen urethra pada belakang urethra dengan dilatasi lumen pada distal urethra. tidak terlihat extravasasi kontras ke extra lumen urethra.,gambaran penyempitan belakang urethra.,1,gambaran penyempitan belakang urethra.,


In [89]:
# report id -> filename, image_path
#   ok          ok          ok

In [90]:
image_df_rows = []
id_counter = 0
prev_folder = None
base_dir = os.path.join('..','datasets','radiology_clean')
trigger = False

def sort_key(filepath):
    # Return a tuple where the first element is 0 if it ends with ".txt", otherwise 1
    return (0 if filepath.endswith(".txt") else 1, filepath)


for root, dirs, files in os.walk(base_dir):
    sorted_files = sorted(files, key=sort_key)
    trigger = True
    for file in sorted_files:
        # get full filename
        filename = os.path.join(root, file)
        folder_name = Path(filename).parent.name
        body_part = Path(filename).parent.parent.name

        if body_part in ['Thorax', 'clavicual_kanan', 'ankle_joint_bilateral', 'ankle_joint_dextra', 'humerus', 'appendicogram', 'mastoid_bilateral']: break

        body_part = body_part_renaming.get(body_part, body_part)

        # missing body txtfile
        if folder_name in ['2401100109_Series_1001']: trigger = False

        # remove files in missing kesan or aggregated report
        # if txt_file(filename) :
        #     if file in missing_kesan_file_list or file in aggregated_report_list: 
        #         trigger = False
        #     else: trigger = True

        # get body part
        if image_file(filename) and trigger != False:
            folder_name = Path(filename).parent.name

            # Check if the folder has changed
            if folder_name != prev_folder:
                id_counter += 1  # Increment the counter for a new folder
                prev_folder = folder_name  # Update the previous folder
            else:
                id_counter = id_counter  # Keep the same ID for files in the same folder

            filename = os.path.normpath(filename).replace('\\', '/')

            # Append the data to the DataFrame rows
            image_df_rows.append({
                'id': id_counter,
                'filename': file,
                'image_path': filename
            })
           
images_df = pd.DataFrame(image_df_rows)
print(images_df['id'].max())
print(report_df['id'].max())

1719
1719


In [91]:
def export_to_csv(df, file_name):
    dataset_dir = os.path.join('..', 'datasets')
    output_path = os.path.join(dataset_dir, file_name)
    df.to_csv(output_path, index=False)

In [92]:
# export_to_csv(report_df, 'report_dataset_w_fullstops.csv')
# export_to_csv(images_df, 'images_dataset.csv')
# export_to_csv(images_df, 'images_dataset_clean4.csv')
export_to_csv(report_df, 'report_dataset_clean5.csv')

In [93]:
import pandas as pd
df = pd.read_csv('../datasets/report_dataset_clean4.csv')
print(df.groupby('body_part').size())

body_part
abdomen                   107
ankle                      51
antebrachii                42
bahu                       75
cruris                     56
cubiti                     18
femur                      56
genu                      149
hip_joint                  37
humerus                    46
kepala                     39
manus                      51
os_sacro_coccygeus         19
pedis                      78
pelvis                     66
thorax                    406
vertebra_cervical          92
vertebra_lumbosakral      226
vertebra_thoracolumbal     51
wrist                      54
dtype: int64
