In [None]:
# pip install tensorflow

In [None]:
# pip install tensorflow pandas

**Mounting the drive containing the images**


In [1]:
from google.colab import drive
import os
import json

# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

# Uncomment to flush and unmount the drive when finished:
# drive.flush_and_unmount()

# List the top level of the drive to confirm the mount worked (IPython shell escape).
!ls '/content/drive/My Drive'

Mounted at /content/drive
 Balanced_Images  'Colab Notebooks'   Final_balanced_cleaned_data.json	 Final_Balanced_Images.zip


**Extracting the balanced images from the zipped file**

In [None]:
import zipfile

# Archive to unpack and the folder that receives its contents.
zip_file_path = '/content/drive/My Drive/Final_Balanced_Images.zip'
extract_dir = '/content/drive/My Drive/Balanced_Images'

# The destination must exist before extraction; exist_ok keeps re-runs harmless.
os.makedirs(extract_dir, exist_ok=True)

try:
    with zipfile.ZipFile(zip_file_path, mode='r') as archive:
        archive.extractall(path=extract_dir)
    print(f"Images extracted successfully to {extract_dir}")

    # Show what landed in the destination so the result can be eyeballed.
    extracted_files = os.listdir(extract_dir)
    print("Extracted files:", extracted_files)
except Exception as err:
    # Best-effort cell: report the failure instead of stopping the notebook.
    print(f"Error extracting zip file: {err}")


Images extracted successfully to /content/drive/My Drive/Balanced_Images
Extracted files: ['images']


**Counting number of images to make sure**

In [None]:
extract_dir = '/content/drive/My Drive/Balanced_Images'
valid_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff')

# Only direct entries of extract_dir are inspected (no recursion); names are
# lower-cased first so the extension check ignores case.
image_count = len([
    entry
    for entry in os.listdir(extract_dir)
    if entry.lower().endswith(valid_extensions)
])

print(f"Number of images in the directory: {image_count}")


Number of images in the directory: 3347


**Counting number of cases**

In [None]:
file_path = '/content/drive/My Drive/Final_Combined_Augmented_Data.json'

with open(file_path, 'r') as handle:
    data = json.load(handle)

# A top-level list or dict is sized with len(); any other JSON value
# (string, number, null, ...) is reported as zero records.
record_count = len(data) if isinstance(data, (list, dict)) else 0

print(f"Number of records: {record_count}")

Number of records: 1101


# **Image augmentation and duplication of the corresponding records**
A new folder named "Augmented_Balanced_Images" is created. For each original image, the folder contains 4 new versions:
- RR (rotated right)
- RL (rotated left)
- FL (flipped horizontally)
- BR (brightness change)

In [3]:
import os
from tensorflow.keras.preprocessing.image import load_img
from PIL import Image, ImageEnhance

# Source folder holding the original images, and the destination folder that
# receives their augmented variants.
image_dir = '/content/drive/My Drive/Balanced_Images' 
augmented_dir = '/content/drive/My Drive/Augmented_Balanced_Images'

# Create the destination up front; exist_ok=True keeps re-runs from failing.
os.makedirs(augmented_dir, exist_ok=True)

def augment_image(image_path, base_name, save_dir):
    """
    Write four augmented PNG variants of one image into ``save_dir``.

    The files are named ``<base_name>_RR.png`` (rotated by -30 degrees, i.e.
    to the right), ``<base_name>_RL.png`` (rotated by +30 degrees, i.e. to the
    left), ``<base_name>_FL.png`` (mirrored left-to-right) and
    ``<base_name>_BR.png`` (brightness factor 1.5).

    Any exception is caught and reported with ``print``; nothing is raised,
    and variants written before the failure are left in place.

    :param image_path: Path to the original image
    :param base_name: Base name of the image (without file extension)
    :param save_dir: Directory to save augmented images
    """
    def _target(suffix):
        # Destination path for one variant, e.g. <save_dir>/<base_name>_RR.png
        return os.path.join(save_dir, f"{base_name}_{suffix}.png")

    try:
        source = load_img(image_path)

        # (file suffix, factory producing the transformed image), in save order.
        variants = (
            ("RR", lambda: source.rotate(-30)),  # rotate right 30 degrees
            ("RL", lambda: source.rotate(30)),   # rotate left 30 degrees
            ("FL", lambda: source.transpose(method=Image.FLIP_LEFT_RIGHT)),  # horizontal flip
            ("BR", lambda: ImageEnhance.Brightness(source).enhance(1.5)),  # +50% brightness
        )
        for suffix, make_variant in variants:
            make_variant().save(_target(suffix))

        print(f"Augmented images for {base_name} saved.")

    except Exception as err:
        print(f"Error processing image {base_name}: {err}")

# Augment every image that sits directly inside image_dir.
# The extension check is done on the lower-cased name so files such as
# "SCAN.JPG" are not silently skipped (the image-counting cell lower-cases
# names the same way).
for image_name in os.listdir(image_dir):
    if image_name.lower().endswith(('.png', '.jpg', '.jpeg')):
        # The augmented files are named after the original file name minus its extension.
        base_name = os.path.splitext(image_name)[0]
        image_path = os.path.join(image_dir, image_name)
        augment_image(image_path, base_name, augmented_dir)

print("Image augmentation completed.")

Image augmentation completed. New JSON file created with augmented data.


**A new JSON file named "Augmented_Data.json" is created. It contains 5× the instances of the original JSON file: each patient keeps their original record and gains four new records whose `U_id` and image names carry the suffixes "_RR", "_RL", "_FL" and "_BR".**

In [10]:
import os
import json

# Input/output locations for building the augmented JSON
augmented_dir = '/content/drive/My Drive/Augmented_Balanced_Images'
data_file = '/content/drive/My Drive/Final_balanced_cleaned_data.json'
new_json_file = '/content/drive/My Drive/Augmented_Data.json'

# Decode explicitly as UTF-8 (as the notebook's other JSON reads do) rather
# than relying on the platform's default encoding.
with open(data_file, 'r', encoding='utf-8') as f:
    patient_data = json.load(f)

def create_augmented_json(patient_data, augmented_dir):
    """
    Expand each patient record into five records: the original plus one per augmentation.

    For every patient the output holds, in order, a copy of the original record
    followed by one copy for each of "RR", "RL", "FL" and "BR". In an augmented
    copy ``U_id`` and every name listed under "TAC" and "MRI" get ``_<aug_type>``
    appended. All five records always carry both "TAC" and "MRI" keys, so the
    augmented copies have the same schema as the original copy even when the
    source patient lacks one of them (the value is then an empty list).

    The input records are not modified.

    NOTE(review): the suffix is appended to the image name exactly as stored;
    if the stored names include a file extension the result will not match the
    ``<base>_<aug>.png`` files written by the augmentation cell -- confirm the
    naming convention used in the JSON.

    :param patient_data: Original patient data (iterable of dicts with a "U_id" key)
    :param augmented_dir: Directory containing augmented images (currently unused;
        kept so existing callers keep working)
    :return: List of augmented patient data
    :raises KeyError: If a patient record has no "U_id"
    """
    augmentation_types = ("RR", "RL", "FL", "BR")
    augmented_data = []

    for patient in patient_data:
        tac_images = patient.get("TAC", [])
        mri_images = patient.get("MRI", [])

        # Original record, normalised so both modality keys are present.
        original_patient = patient.copy()
        original_patient["TAC"] = tac_images
        original_patient["MRI"] = mri_images
        augmented_data.append(original_patient)

        for aug_type in augmentation_types:
            new_patient = patient.copy()
            new_patient["U_id"] = f"{patient['U_id']}_{aug_type}"

            # Always set both keys; an empty/missing modality mirrors the
            # value stored on the original record instead of being omitted.
            new_patient["TAC"] = (
                [f"{image_name}_{aug_type}" for image_name in tac_images]
                if tac_images else tac_images
            )
            new_patient["MRI"] = (
                [f"{image_name}_{aug_type}" for image_name in mri_images]
                if mri_images else mri_images
            )

            augmented_data.append(new_patient)

    return augmented_data

# Build the expanded record list (original + augmented copies per patient).
augmented_data = create_augmented_json(patient_data, augmented_dir)

# Persist it as pretty-printed JSON.
with open(new_json_file, 'w') as out_file:
    json.dump(obj=augmented_data, fp=out_file, indent=4)

print("New JSON file created with augmented data including original images.")

New JSON file created with augmented data including original images.


**Counting Instances to make sure**

In [8]:
import json

# JSON file that fed the augmentation step, and the JSON file that step wrote.
original_json_file = '/content/drive/My Drive/Final_balanced_cleaned_data.json'
augmented_json_file = '/content/drive/My Drive/Augmented_Data.json'

def load_json(file_path):
    """
    Read the JSON document at ``file_path`` and return the parsed value.

    The file is decoded as UTF-8 explicitly instead of with the platform's
    default encoding, so non-ASCII text loads identically on every machine.

    :param file_path: Path of the JSON file to read
    :return: The deserialised JSON value (e.g. a list or dict)
    :raises OSError: If the file cannot be opened
    :raises json.JSONDecodeError: If the content is not valid JSON
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def count_instances(json_data):
    """
    Return the number of top-level entries in a parsed JSON value.

    :param json_data: A sized container such as a list or dict
    :return: ``len(json_data)``
    """
    total = len(json_data)
    return total

# Load both files (original first, then augmented).
original_data, augmented_data = (
    load_json(json_path) for json_path in (original_json_file, augmented_json_file)
)

# Size each dataset.
original_count, augmented_count = map(count_instances, (original_data, augmented_data))

print(f"Number of instances in the original JSON file: {original_count}")
print(f"Number of instances in the augmented JSON file: {augmented_count}")

Number of instances in the original JSON file: 1101
Number of instances in the augmented JSON file: 5505


**Adding "Location" and "Location Category" from "Descriptions.json" to the case records in "Final JSON data.json"**

In [None]:
import json


# Descriptions file: the source of the "Location" / "Location Category" fields.
# Read as UTF-8 explicitly, matching how the case file below is read.
file_path = 'Descriptions.json'
with open(file_path, 'r', encoding="utf-8") as file:
    data = json.load(file)

# Case file that receives the location fields (updated in place further down).
file_path = 'Final JSON data.json'
with open(file_path, 'r', encoding="utf-8") as file1:
    data1 = json.load(file1)

# One lookup entry per description record.
loc = [
    {
        "u_id": description['U_id'],
        "location": description['Location'],
        "location Category": description['Location Category'],
    }
    for description in data
]

# Copy the location fields onto every case whose U_id contains the description's U_id.
# When several descriptions match one case, the last one in file order wins.
# NOTE(review): this is a substring test, so an id such as "12" also matches
# "123" or "512" -- confirm the U_id format makes that impossible, or switch
# to an exact/prefix comparison.
for entry in loc:
    for record in data1:
        if entry['u_id'] in record['U_id']:
            record['Location'] = entry['location']
            record['Location Category'] = entry['location Category']

# Overwrite the case file with the enriched records.
output_path = 'Final JSON data.json'
with open(output_path, 'w', encoding="utf-8") as file1:
    json.dump(data1, file1, indent=4)

print(f"Updated JSON saved to: {output_path}")

**Removing words containing '\u' (Unicode escape sequences)**

In [None]:
import json

def remove_unicode_words(text):
    """
    Drop every whitespace-separated word that contains a backslash followed by "u".

    The text is split on any run of whitespace and the surviving words are
    re-joined with single spaces, so leading/trailing whitespace, newlines and
    repeated spaces are normalised as a side effect.

    :param text: String to clean
    :return: The remaining words joined by single spaces
    """
    kept_words = []
    for token in text.split():
        if "\\u" in token:
            continue  # word carries a literal backslash-u sequence; discard it
        kept_words.append(token)
    return " ".join(kept_words)

# Parse the JSON document to be cleaned.
with open("input.json", mode="r", encoding="utf-8") as source:
    data = json.loads(source.read())

def clean_json(obj):
    """
    Return a copy of a parsed JSON value with every string value cleaned.

    Dicts and lists are rebuilt recursively; string values are passed through
    ``remove_unicode_words``; dict keys and all other values are returned
    unchanged.

    :param obj: Parsed JSON value (dict, list, str or scalar)
    :return: The cleaned value
    """
    if isinstance(obj, str):
        return remove_unicode_words(obj)
    if isinstance(obj, list):
        return [clean_json(item) for item in obj]
    if isinstance(obj, dict):
        cleaned = {}
        for key, value in obj.items():
            cleaned[key] = clean_json(value)
        return cleaned
    # Numbers, booleans, None and anything else pass through untouched.
    return obj

# Clean every string value in the loaded document.
cleaned_data = clean_json(data)

# ensure_ascii=False writes non-ASCII characters as-is instead of as escapes.
with open("cleaned_output.json", mode="w", encoding="utf-8") as sink:
    json.dump(obj=cleaned_data, fp=sink, indent=4, ensure_ascii=False)

print("Words containing '\\u' have been removed and the cleaned JSON has been saved.")

**Removing corrupted non-ASCII characters such as 'â¢' (a mis-encoded bullet)**

In [None]:
import json
import re

def clean_corrupted_text(text):
    """
    Strip every non-ASCII character from ``text``.

    Each run of characters outside the range 0x00-0x7F is deleted (not
    replaced by a space), so surrounding text is joined together.

    :param text: String to clean
    :return: ``text`` with only its ASCII characters remaining
    """
    ascii_only = re.sub(r'[^\x00-\x7F]+', '', text)
    # U+2022 (bullet) is itself non-ASCII, so the substitution above has
    # already removed it; this call never finds a match.
    return ascii_only.replace('\u2022', '')

def clean_json(obj):
    """
    Return a copy of a parsed JSON value with every string value stripped of
    corrupted characters.

    Containers are rebuilt recursively; string values go through
    ``clean_corrupted_text``; dict keys and all other values are returned
    unchanged. This definition replaces any earlier ``clean_json`` defined in
    the same kernel session.

    :param obj: Parsed JSON value (dict, list, str or scalar)
    :return: The cleaned value
    """
    if isinstance(obj, str):
        return clean_corrupted_text(obj)
    if isinstance(obj, dict):
        return dict((key, clean_json(value)) for key, value in obj.items())
    if isinstance(obj, list):
        return list(map(clean_json, obj))
    # Scalars (numbers, booleans, None) need no cleaning.
    return obj

# Load the case file that needs cleaning.
with open("Final JSON data.json", mode="r", encoding="utf-8") as source:
    data = json.load(source)

# Strip corrupted characters from every string value.
cleaned_data = clean_json(data)

# The cleaned result overwrites the same file it was read from.
output_path = "Final JSON data.json"
with open(output_path, mode="w", encoding="utf-8") as sink:
    json.dump(cleaned_data, sink, ensure_ascii=False, indent=4)

print("Corrupted characters have been removed and the cleaned JSON has been saved.")