In [1]:
from google.colab import drive
import os
import json

drive.mount('/content/drive')

# drive.flush_and_unmount()

!ls '/content/drive/My Drive'

Mounted at /content/drive
 checkpoints  'Colab Notebooks'   Efficient+MEdbert.ipynb   GP1   GP2


##**First text augmentation (Synonym Replacement)**

In [None]:
import json
import random
import re

input_file = "/content/drive/My Drive/GP2/Balanced_data/augmented_json/image_balanced.json"
output_file = "/content/drive/My Drive/GP2/Balanced_data/augmented_json/text_augmented1.json"

with open(input_file, "r") as file:
    data = json.load(file)

categories_to_augment = [
    "Trauma & Physical Injuries",
    "Neoplasm - Benign & Sarcoma",
    "Vascular & Circulatory",
    "Infections",
    "Neoplasm - Carcinoma",
    "Inflammatory & Autoimmune",
    "Metabolic & Endocrine",
    "Cysts & Degenerative Conditions",
    "Obstruction & Structural Abnormalities"
]

In [None]:
word_swaps = {

    "left": "right",
    "right": "left",
    "anterior": "posterior",
    "posterior": "anterior",
    "upper": "lower",
    "lower": "upper",
    "pain": "ache",
    "ache": "pain",
    "swelling": "inflammation",
    "inflammation": "swelling",
    "shortness of breath": "breathing difficulty",
    "breathing difficulty": "shortness of breath",
    "exam": "evaluation",
    "evaluation": "exam",
    "findings": "observations",
    "observations": "findings",
    "CT": "computed tomography",
    "computed tomography": "CT",
    "MRI": "magnetic resonance imaging",
    "magnetic resonance imaging": "MRI",
    "imaging": "scan",
    "scan": "imaging",
    "biopsy": "tissue sampling",
    "tissue sampling": "biopsy",
    "treatment": "therapy",
    "therapy": "treatment",
    "surgery": "operation",
    "operation": "surgery",
    "patient": "individual",
    "individual": "patient",
    "reported": "described",
    "described": "reported",
    "complains of": "mentions",
    "mentions": "complains of",
    "lesion": "abnormality",
    "abnormality": "lesion",
    "tumor": "mass",
    "mass": "tumor",
    "infection": "inflammation",
    "inflammation": "infection",
    "blockage": "obstruction",
    "obstruction": "blockage",
    "severe": "intense",
    "intense": "severe",
    "mild": "slight",
    "slight": "mild",
    "normal": "unremarkable",
    "unremarkable": "normal"
}

In [None]:
def normalize_and_modify_ages(text):

    def adjust_age(match):
        age = int(match.group(1))
        adjustment = random.choice([-1, 1])
        return f"{age + adjustment}-year-old"

    text = re.sub(r"(\d+)\s*yo", adjust_age, text)
    text = re.sub(r"(\d+)-year-old", adjust_age, text)
    text = re.sub(r"(\d+)\s*years?\s*old", adjust_age, text)
    return text

def apply_word_swaps(text, swaps):

    for word, replacement in swaps.items():
        text = text.replace(word, replacement)
    return text

def modify_image_filenames(images, round_number):
    return [f"{image}_{round_number}" for image in images]

In [None]:
augmented_data = []
for idx, case in enumerate(data):
    label = case.get("Class/Label")
    if label in categories_to_augment:

        augmented_case = case.copy()
        if "Case" in augmented_case:
            for key in augmented_case["Case"]:
                if key != "Title" and isinstance(augmented_case["Case"][key], str):

                    text = augmented_case["Case"][key]
                    text = normalize_and_modify_ages(text)
                    augmented_case["Case"][key] = apply_word_swaps(text, word_swaps)
        if "TAC" in augmented_case:
            augmented_case["TAC"] = modify_image_filenames(augmented_case["TAC"], 1)
        if "MRI" in augmented_case:
            augmented_case["MRI"] = modify_image_filenames(augmented_case["MRI"], 1)
        augmented_data.append(augmented_case)


with open(output_file, "w") as file:
    json.dump(augmented_data, file, indent=4)

print(f"Augmented data for selected categories saved to {output_file}")

Augmented data for selected categories saved to /content/drive/My Drive/GP2/Balanced_data/augmented_json/text_augmented1.json


In [None]:
import json

def count_records_in_json(file_path):
  # Open and load the JSON file
  with open(file_path, 'r') as file:
      data = json.load(file)

  if isinstance(data, list):
      return len(data)

def count_instances_in_json(file_path):
  # Open and load the JSON file
  with open(file_path, 'r') as file:
      data = json.load(file)
  # Initialize a dictionary to store counts for each class label
  class_counts = {}

  # Iterate through the records and count instances for each class label
  for record in data:
      label = record["Class/Label"]
      if label in class_counts:
          class_counts[label] += 1
      else:
          class_counts[label] = 1

  for label, count in class_counts.items():
      print(f"{label}: {count}")


print("Class Label Counts before balancing:\n")
file_path = "/content/drive/My Drive/GP2/Balanced_data/augmented_json/image_balanced.json"
count_instances_in_json(file_path)
count = count_records_in_json(file_path)
print(f"\nTotal number of records: {count}")

print("\n",50*'=',"\n")

print("Class Label Counts after balancing:\n")
file_path = "/content/drive/My Drive/GP2/Balanced_data/augmented_json/text_augmented1.json"
count_instances_in_json(file_path)
count = count_records_in_json(file_path)
print(f"\nTotal number of records: {count}")

Class Label Counts before balancing:

Miscellaneous Conditions: 172
Obstruction & Structural Abnormalities: 45
Neoplasm - Carcinoma: 108
Inflammatory & Autoimmune: 72
Neoplasm - Benign & Sarcoma: 114
Neoplasm - Other Malignant: 202
Congenital & Genetic: 88
Metabolic & Endocrine: 51
Vascular & Circulatory: 90
Infections: 82
Trauma & Physical Injuries: 118
Cysts & Degenerative Conditions: 48

Total number of records: 1190


Class Label Counts after balancing:

Obstruction & Structural Abnormalities: 45
Neoplasm - Carcinoma: 108
Inflammatory & Autoimmune: 72
Neoplasm - Benign & Sarcoma: 114
Metabolic & Endocrine: 51
Vascular & Circulatory: 90
Infections: 82
Trauma & Physical Injuries: 118
Cysts & Degenerative Conditions: 48

Total number of records: 728


##**Second text augmentation (Sentence Reordering)**

In [None]:
input_file = "/content/drive/My Drive/GP2/Balanced_data/augmented_json/image_balanced.json"
output_file = "/content/drive/My Drive/GP2/Balanced_data/augmented_json/text_augmented2.json"

with open(input_file, "r") as file:
    data = json.load(file)


categories_to_augment = [
    "Inflammatory & Autoimmune",
    "Metabolic & Endocrine",
    "Cysts & Degenerative Conditions",
    "Obstruction & Structural Abnormalities"
]

In [None]:
def active_to_passive(text):

    if "The doctor" in text:
        return text.replace("The doctor", "The diagnosis was made by the doctor")
    if "performed" in text:
        return text.replace("performed", "was performed by")
    if "treated" in text:
        return text.replace("treated", "was treated by")
    return text

def passive_to_active(text):

    if "was made by the doctor" in text:
        return text.replace("was made by the doctor", "The doctor")
    if "was performed by" in text:
        return text.replace("was performed by", "performed")
    if "was treated by" in text:
        return text.replace("was treated by", "treated")
    return text

def reorder_or_convert_sentences(text):

    sentences = text.split('. ')
    if len(sentences) > 1:
        random.shuffle(sentences)
        return '. '.join(sentences)
    else:

        if random.choice([True, False]):
            return active_to_passive(text)
        else:
            return passive_to_active(text)

def modify_image_filenames(images, round_number):
    return [f"{image}_{round_number}" for image in images]

In [None]:
augmented_data = []
for idx, case in enumerate(data):
    label = case.get("Class/Label")
    if label in categories_to_augment:
        print(f"Processing Case {idx + 1}/{len(data)}: UID = {case.get('UID', 'Unknown')}")

        augmented_case = case.copy()
        if "Case" in augmented_case:
            for key in augmented_case["Case"]:
                if key != "Title" and isinstance(augmented_case["Case"][key], str):
                    augmented_case["Case"][key] = reorder_or_convert_sentences(augmented_case["Case"][key])
        if "TAC" in augmented_case:
            augmented_case["TAC"] = modify_image_filenames(augmented_case["TAC"], 2)
        if "MRI" in augmented_case:
            augmented_case["MRI"] = modify_image_filenames(augmented_case["MRI"], 2)
        augmented_data.append(augmented_case)

with open(output_file, "w") as file:
    json.dump(augmented_data, file, indent=4)

print(f"Augmented data for selected categories with sentence reordering and active-passive conversion saved to {output_file}")

Processing Case 2/1190: UID = Unknown
Processing Case 3/1190: UID = Unknown
Processing Case 4/1190: UID = Unknown
Processing Case 10/1190: UID = Unknown
Processing Case 11/1190: UID = Unknown
Processing Case 12/1190: UID = Unknown
Processing Case 19/1190: UID = Unknown
Processing Case 20/1190: UID = Unknown
Processing Case 21/1190: UID = Unknown
Processing Case 35/1190: UID = Unknown
Processing Case 36/1190: UID = Unknown
Processing Case 37/1190: UID = Unknown
Processing Case 51/1190: UID = Unknown
Processing Case 52/1190: UID = Unknown
Processing Case 53/1190: UID = Unknown
Processing Case 57/1190: UID = Unknown
Processing Case 58/1190: UID = Unknown
Processing Case 59/1190: UID = Unknown
Processing Case 67/1190: UID = Unknown
Processing Case 68/1190: UID = Unknown
Processing Case 69/1190: UID = Unknown
Processing Case 77/1190: UID = Unknown
Processing Case 78/1190: UID = Unknown
Processing Case 79/1190: UID = Unknown
Processing Case 126/1190: UID = Unknown
Processing Case 127/1190: U

In [None]:
print("Class Label Counts before balancing:\n")
file_path = "/content/drive/My Drive/GP2/Balanced_data/augmented_json/image_balanced.json"
count_instances_in_json(file_path)
count = count_records_in_json(file_path)
print(f"\nTotal number of records: {count}")

print("\n",50*'=',"\n")

print("Class Label Counts after balancing:\n")
file_path = "/content/drive/My Drive/GP2/Balanced_data/augmented_json/text_augmented2.json"
count_instances_in_json(file_path)
count = count_records_in_json(file_path)
print(f"\nTotal number of records: {count}")

Class Label Counts before balancing:

Miscellaneous Conditions: 172
Obstruction & Structural Abnormalities: 45
Neoplasm - Carcinoma: 108
Inflammatory & Autoimmune: 72
Neoplasm - Benign & Sarcoma: 114
Neoplasm - Other Malignant: 202
Congenital & Genetic: 88
Metabolic & Endocrine: 51
Vascular & Circulatory: 90
Infections: 82
Trauma & Physical Injuries: 118
Cysts & Degenerative Conditions: 48

Total number of records: 1190


Class Label Counts after balancing:

Obstruction & Structural Abnormalities: 45
Inflammatory & Autoimmune: 72
Metabolic & Endocrine: 51
Cysts & Degenerative Conditions: 48

Total number of records: 216


##**Third text augmentation (Back Translation)**

In [None]:
# !pip install googletrans

In [None]:
from googletrans import Translator
import json

input_file = "/content/drive/My Drive/GP2/Balanced_data/augmented_json/image_balanced.json"
output_file = "/content/drive/My Drive/GP2/Balanced_data/augmented_json/text_augmented3.json"

with open(input_file, "r", encoding="utf-8") as file:
    data = json.load(file)

translator = Translator()

categories_to_augment = [
    "Metabolic & Endocrine",
    "Cysts & Degenerative Conditions",
    "Obstruction & Structural Abnormalities"
]

In [None]:
def translate_to_french_and_back_googletrans(text):

    try:
        arabic_text = translator.translate(text, src="en", dest="fr").text
        english_text = translator.translate(arabic_text, src="fr", dest="en").text
        return english_text
    except Exception as e:
        print(f"Translation failed for text: {text[:30]}... Error: {e}")
        return text

def modify_image_filenames(images, round_number):
    return [f"{image}_{round_number}" for image in images]

In [None]:
augmented_data = []
for idx, case in enumerate(data):
    label = case.get("Class/Label")
    if label in categories_to_augment:
        print(f"Translating Case {idx + 1}/{len(data)}: UID = {case.get('UID', 'Unknown')}")

        augmented_case = case.copy()
        if "Case" in augmented_case:
            for key in augmented_case["Case"]:
                if key != "Title" and isinstance(augmented_case["Case"][key], str):

                    augmented_case["Case"][key] = translate_to_french_and_back_googletrans(augmented_case["Case"][key])
        if "TAC" in augmented_case:
            augmented_case["TAC"] = modify_image_filenames(augmented_case["TAC"], 3)
        if "MRI" in augmented_case:
            augmented_case["MRI"] = modify_image_filenames(augmented_case["MRI"], 3)
        augmented_data.append(augmented_case)

with open(output_file, "w", encoding="utf-8") as file:
    json.dump(augmented_data, file, indent=4, ensure_ascii=False)

print(f"Translated data for selected categories saved to {output_file}")

Translating Case 2/1190: UID = Unknown
Translation failed for text: 60-year-old woman presents wit... Error: 'coroutine' object has no attribute 'text'
Translation failed for text: â¢  PA chest radiograph demon... Error: 'coroutine' object has no attribute 'text'
Translation failed for text: This combination of radiograph... Error: 'coroutine' object has no attribute 'text'
Translation failed for text: Left upper lobe collapse cause... Error: 'coroutine' object has no attribute 'text'
Translation failed for text: Lung biopsy proven Small Cell ... Error: 'coroutine' object has no attribute 'text'
Translation failed for text: This woman had known small cel... Error: 'coroutine' object has no attribute 'text'
Translating Case 3/1190: UID = Unknown
Translation failed for text: 60-year-old woman presents wit... Error: 'coroutine' object has no attribute 'text'
Translation failed for text: â¢  PA chest radiograph demon... Error: 'coroutine' object has no attribute 'text'
Translation failed

  return text
