# Extra steps if I were to rejoin the dataset then split it up again

### <u>I did not use this notebook; it is only here to show the assessor that I can join and split the data.</u>

The dataset already came split up when I downloaded it from Kaggle. It was split into train, validation and test sets. 

I made a separate notebook to show that I can join the datasets and then split them up again. I created new folders, joined the datasets into them, and then split them up again. 

The dataset contains some duplicate images that are named differently. To avoid any issues later on, I will use the downloaded train, test and validation sets. Below is the code I would use if I joined the datasets and continued from there.

---

# Import packages

In [None]:
%pip install -r /workspace/Bone-Fracture-Detection/requirements.txt

In [None]:
import numpy
import os
import numpy as np

---

# Change working directory

### Change the working directory from its current folder to its parent folder.

In [None]:
current_dir = os.getcwd()
current_dir

### Make the parent of the current directory the new current directory.

In [None]:
os.chdir(os.path.dirname(current_dir))

### Confirm the new current directory.

In [None]:
current_dir = os.getcwd()
current_dir

---

# Kaggle

### Install Kaggle

In [None]:
!pip install kaggle

### Change the Kaggle configuration directory to the current working directory and set the permissions of the Kaggle authentication JSON file.

In [None]:
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()
! chmod 600 kaggle.json

### Set Kaggle Dataset and Download it.

In [None]:
KaggleDatasetPath = "bmadushanirodrigo/fracture-multi-region-x-ray-data"
DestinationFolder = "inputs/fracture_dataset"   
! kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

### Unzip the downloaded file, delete the zip file.

In [None]:
import zipfile
# Extract every member of the downloaded archive into DestinationFolder.
with zipfile.ZipFile(DestinationFolder + '/fracture-multi-region-x-ray-data.zip', 'r') as zip_ref:
    zip_ref.extractall(DestinationFolder)

# The archive is no longer needed once its contents have been extracted.
os.remove(DestinationFolder + '/fracture-multi-region-x-ray-data.zip')

---

# Join datasets

### Set input directory

In [None]:
# Root of the extracted dataset; the train/test/val folders are read from here.
input_dir = 'inputs/fracture_dataset/bone_fracture/bone_fracture'
# Name of the temporary folder that will hold the joined dataset.
parent_folder = 'bones_folder'
# One sub-folder per class label.
child_folders = ['fractured', 'unfractured']
# Full path of the temporary joined-dataset folder.
new_folder = os.path.join(input_dir, parent_folder)

### This function creates a new folder, bones_folder, that holds two folders, fractured and unfractured.

In [None]:
def make_new_folder(parent_folder_path, folder_name):
    """Create ``folder_name`` inside ``parent_folder_path``.

    Missing intermediate directories are created as well. If the folder
    cannot be created (for example because it already exists), the
    ``OSError`` is printed instead of being raised, so the notebook cell
    can be re-run safely.
    """
    target_dir = os.path.join(parent_folder_path, folder_name)

    try:
        os.makedirs(target_dir)
    except OSError as exc:
        # Best-effort: report the problem and carry on.
        print(exc)


# Create the bones_folder container inside input_dir.
make_new_folder(input_dir, parent_folder)


# Create one class sub-folder per entry of child_folders inside bones_folder.
for child in child_folders:
    make_new_folder(new_folder, child)

### Move all the images from the presplit folders (train, test and val folders) into the new folders created (bones_folder folder).

In [None]:
import shutil


def move_files_to_bones_folder(bone_type):
    """Move every entry of one class out of the pre-split folders.

    For each of the ``test``, ``train`` and ``val`` folders under the
    module-level ``input_dir``, everything inside its ``bone_type``
    sub-folder is moved into ``new_folder/bone_type``.

    Args:
        bone_type: Name of the class sub-folder to gather.
    """
    destination_dir = new_folder + '/' + bone_type

    for split_name in ('test', 'train', 'val'):
        source_dir = input_dir + '/' + split_name + '/' + bone_type

        for file_name in os.listdir(source_dir):
            source_path = os.path.join(source_dir, file_name)
            destination_path = os.path.join(destination_dir, file_name)
            shutil.move(source_path, destination_path)

In [None]:
move_files_to_bones_folder('fractured')
move_files_to_bones_folder('unfractured')

---

# Data Preparation

### Data cleaning: check for and remove non-image files.

In [None]:
def remove_non_image_files(dataset_path):
    """Delete files that are not images from each sub-folder of a dataset.

    Every directory directly inside ``dataset_path`` is scanned. Regular
    files whose name does not end in ``.png``, ``.jpg`` or ``.jpeg``
    (case-insensitive) are deleted. For each scanned folder the number of
    images kept and the number of non-image files removed is printed.

    Entries that are not regular files (for example nested directories) are
    left untouched and not counted: ``os.remove`` cannot delete a directory
    and would raise. Stray files placed directly in ``dataset_path`` are
    skipped for the same reason, since they cannot be listed as a folder.

    Args:
        dataset_path: Folder whose immediate sub-folders hold the images.
    """
    image_extension = ('.png', '.jpg', '.jpeg')

    for folder in os.listdir(dataset_path):
        folder_path = dataset_path + '/' + folder
        if not os.path.isdir(folder_path):
            # A plain file at the top level has nothing to scan.
            continue

        image_count = 0
        removed_count = 0
        for file in os.listdir(folder_path):
            file_path = folder_path + '/' + file
            if not os.path.isfile(file_path):
                # Nested directories are neither images nor removable files.
                continue

            if file.lower().endswith(image_extension):
                image_count += 1
            else:
                os.remove(file_path)
                removed_count += 1

        print(f"Folder: {folder} - has {image_count} images")
        print(f"Folder: {folder} - has {removed_count} non-images")

In [None]:
# Clean the joined dataset: the images now live in the class sub-folders of
# new_folder (bones_folder). input_dir's own sub-folders (train/test/val/
# bones_folder) contain class directories rather than image files.
remove_non_image_files(new_folder)

### Change image type, colour and size

We change all the images to JPG files, then save the images.

In [None]:
def change_image_type_and_save(bone_folder):
    """Give every file under ``new_folder/bone_folder`` a ``.jpg`` extension.

    The folder is walked recursively and each file whose extension is not
    already exactly ``.jpg`` is renamed to ``<stem>.jpg``. Only the file name
    changes here; the file contents are not re-encoded by this function.

    A file is skipped, with a printed notice, when ``<stem>.jpg`` already
    exists, because ``os.rename`` may silently replace an existing file.

    Args:
        bone_folder: Name of the class sub-folder inside ``new_folder``.
    """
    for path, subdirs, files in os.walk(new_folder + '/' + bone_folder):
        for name in files:
            file_name, file_ext = os.path.splitext(name)
            # splitext keeps the leading dot, so compare against ".jpg".
            if file_ext == ".jpg":
                continue

            source = os.path.join(path, name)
            target = os.path.join(path, file_name + ".jpg")
            if os.path.exists(target):
                # Do not clobber a different image that shares the same stem.
                print(f"Skipping {source}: {target} already exists")
                continue

            os.rename(source, target)

In [None]:
change_image_type_and_save('fractured')
change_image_type_and_save('unfractured')

We transform the image colors to grayscale and change the image size, then save the images.

In [None]:
from PIL import Image, ImageFile


# Fixes truncated oserror
ImageFile.LOAD_TRUNCATED_IMAGES = True


def convert_images_to_grayscale(bone_folder):
    """Convert every image of one class to 150x150 grayscale, in place.

    Each entry of ``new_folder/bone_folder`` is opened, converted to mode
    ``'L'``, resized to 150x150 pixels and saved back over the original
    path.

    Args:
        bone_folder: Name of the class sub-folder inside ``new_folder``.
    """
    folder_path = new_folder + '/' + bone_folder

    for picture in os.listdir(folder_path):
        picture_path = folder_path + '/' + picture
        grayscale = Image.open(picture_path).convert('L')
        resized = grayscale.resize((150, 150))
        # Overwrite the source file with the processed version.
        resized.save(picture_path)

In [None]:
convert_images_to_grayscale('fractured')
convert_images_to_grayscale('unfractured')

# Resplit the datasets

In [None]:
import random
import joblib


def split_bones_folder_into_train_test_val(new_path, train_set_ratio, validation_set_ratio, test_set_ratio):
    """Randomly move the joined images back into train, val and test folders.

    Each sub-folder of ``new_path`` is treated as a class label. Its files
    are shuffled and moved to ``input_dir/train/<label>``,
    ``input_dir/val/<label>`` and ``input_dir/test/<label>`` (``input_dir``
    is the module-level dataset root). The train and validation counts are
    ``int(len(files) * ratio)``; the test set receives whatever remains, so
    ``test_set_ratio`` is only used to validate the ratios.

    The ratios are compared to 1.0 with a tolerance, because sums of binary
    floats such as ``0.7 + 0.2 + 0.1`` are not exactly 1.0.

    Args:
        new_path: Folder holding one sub-folder per class label.
        train_set_ratio: Fraction of each class moved to the train set.
        validation_set_ratio: Fraction of each class moved to the val set.
        test_set_ratio: Fraction intended for the test set.

    Returns:
        None. A message is printed and nothing is moved when the ratios do
        not sum to 1.0. Nothing is moved either when ``new_path`` already
        contains an entry named ``test``.
    """
    import math

    ratio_total = train_set_ratio + validation_set_ratio + test_set_ratio
    if not math.isclose(ratio_total, 1.0):
        print("train_set_ratio + validation_set_ratio + test_set_ratio should sum up to 1.0")
        return

    labels = os.listdir(new_path)
    if 'test' in labels:
        # new_path already looks like a split dataset; leave it alone.
        return

    for label in labels:
        source_dir = new_path + '/' + label
        files = os.listdir(source_dir)
        random.shuffle(files)

        train_end = int(len(files) * train_set_ratio)
        validation_end = train_end + int(len(files) * validation_set_ratio)

        # Make sure every destination exists before moving anything into it.
        for split_name in ('train', 'val', 'test'):
            os.makedirs(input_dir + '/' + split_name + '/' + label, exist_ok=True)

        for index, file_name in enumerate(files):
            if index < train_end:
                split_name = 'train'
            elif index < validation_end:
                split_name = 'val'
            else:
                split_name = 'test'

            shutil.move(source_dir + '/' + file_name,
                        input_dir + '/' + split_name + '/' + label + '/' + file_name)

In [None]:
split_bones_folder_into_train_test_val(new_path = new_folder,
                                    train_set_ratio = 0.7,
                                    validation_set_ratio = 0.1,
                                    test_set_ratio = 0.2
                                    )

---

# Delete the created folder, bones_folder.

In [None]:
# Remove each class sub-folder of bones_folder. os.rmdir only removes empty
# directories, so this raises if any file was left behind by the split.
for child in child_folders:
    os.rmdir(new_folder + '/' + child)


# Finally remove the bones_folder container itself.
os.rmdir(new_folder)