In [1]:
import cv2
from matplotlib import pyplot as plt
import numpy as np
import os
import pandas as pd
import random
from skimage import io
from shutil import copyfile
import sys
import time

import tensorflow as tf
from tensorflow.keras.preprocessing.image import load_img, img_to_array

In [2]:
# Load the bounding-box annotation table for the images under dataset/train
annotation_csv = pd.read_csv("dataset/train/_annotations.csv")

In [4]:
# Number of rows and columns in the annotation table
annotation_csv.shape

(3888, 8)

In [6]:
# Preview the first rows of the annotation table
annotation_csv.head()

Unnamed: 0,filename,width,height,class,xmin,ymin,xmax,ymax
0,ID_1c731d89f_png_jpg.rf.bcca0c50f0d029a8ee7946...,640,640,Subdural,244,433,363,519
1,ID_1c731d89f_png_jpg.rf.bcca0c50f0d029a8ee7946...,640,640,Subarachnoid,326,306,409,400
2,ID_1d36531b9_png_jpg.rf.bca8c132a8312f50c45039...,640,640,Subdural,415,278,469,337
3,ID_d232cb12d_png_jpg.rf.bd6882218498b7fda44f58...,640,640,Subdural,381,281,483,471
4,ID_2029501f2_png_jpg.rf.beb9180cccc1197e13fc13...,640,640,Subdural,112,245,207,424


In [25]:
# Distinct labels present in the 'class' column
annotation_csv['class'].unique()

array(['Subdural', 'Subarachnoid', 'Intraparenchymal', 'Intraventricular'],
      dtype=object)

In [15]:
# Keep the 'class' column (the injury label of each annotation row) as its own Series
class_name = annotation_csv['class']

In [16]:
# Inspect the label Series
class_name

0           Subdural
1       Subarachnoid
2           Subdural
3           Subdural
4           Subdural
            ...     
3883        Subdural
3884        Subdural
3885        Subdural
3886        Subdural
3887        Subdural
Name: class, Length: 3888, dtype: object

In [19]:
# Annotation rows labelled 'Subdural'
subdural_pd = annotation_csv.loc[annotation_csv['class'].eq('Subdural')]

In [26]:
# One frame per remaining injury label, selected with a boolean mask on 'class'
subarachnoid_pd = annotation_csv.loc[annotation_csv['class'].eq('Subarachnoid')]
intrapar_pd = annotation_csv.loc[annotation_csv['class'].eq('Intraparenchymal')]
intraven_pd = annotation_csv.loc[annotation_csv['class'].eq('Intraventricular')]

In [47]:
# Report how many annotation rows each injury type has, one line per type
print('\n'.join(
    template % len(frame)
    for template, frame in (
        ('There are %d subdural in the dataset', subdural_pd),
        ('There are %d subarachnoid in the dataset', subarachnoid_pd),
        ('There are %d intraparanchymal in the dataset', intrapar_pd),
        ('There are %d intraventricular in the dataset', intraven_pd),
    )
))

There are 2911 subdural in the dataset
There are 405 subarachnoid in the dataset
There are 469 intraparanchymal in the dataset
There are 103 intraventricular in the dataset


In [53]:
def get_random_images_with_specific_injury_samples(df, samples_per_injury=10, total_samples=40, random_state=42):
    """Draw a reproducible random subset of annotation rows, balanced by injury type.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table with at least the columns 'class' and 'filename'.
    samples_per_injury : int, optional
        Maximum number of rows sampled for each distinct value of 'class'.
        A class with fewer rows contributes all of its rows.
    total_samples : int, optional
        Upper bound on the number of unique filenames in the result.
    random_state : int or None, optional
        Seed used for the per-class sampling and for the final trim, so the
        same input always yields the same selection. ``None`` disables seeding.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with one row per filename (when an image was drawn
        more than once, only its first sampled row is kept). The result holds
        at most ``total_samples`` rows and may hold fewer.
    """
    # One sampled frame per injury type
    selected_images = []

    for injury_type in df['class'].unique():
        # Rows annotated with the current injury type
        injury_images = df[df['class'] == injury_type]

        # Cap the request at the class size: DataFrame.sample raises ValueError
        # when asked for more rows than are available (without replacement).
        n_rows = min(samples_per_injury, len(injury_images))
        selected_images.append(injury_images.sample(n=n_rows, random_state=random_state))

    # pd.concat refuses an empty list, so an empty input yields an empty frame
    # with the same columns.
    if not selected_images:
        return df.iloc[0:0].copy()

    # Merge the per-class samples and keep a single row per image file
    result_df = pd.concat(selected_images).drop_duplicates(subset='filename')

    # Trim to at most `total_samples` images. A dedicated seeded generator keeps
    # the trim reproducible and leaves the global `random` state untouched.
    unique_images = result_df['filename'].unique()
    if len(unique_images) > total_samples:
        kept_images = random.Random(random_state).sample(list(unique_images), total_samples)
        result_df = result_df[result_df['filename'].isin(kept_images)]

    return result_df

# Sample with the function defaults (up to 10 rows per injury type, at most 40
# unique images) and list the selected filename/class pairs
specific_random_images_df = get_random_images_with_specific_injury_samples(annotation_csv)
specific_random_images_df[['filename', 'class']].drop_duplicates()


Unnamed: 0,filename,class
248,ID_d00ccb179_png_jpg.rf.cdb9f7b65dd7df5191ef83...,Subdural
914,ID_252309b5b_png_jpg.rf.fbfad8226e8be819f94466...,Subdural
3838,ID_8a8da7d04_png_jpg.rf.61d24743d260286efbcdca...,Subdural
2081,ID_5b26226a1_png_jpg.rf.263efd0b09f1ec46ccf19b...,Subdural
3883,ID_7927e4fd8_png_jpg.rf.64a5f071c3ddf23f44fdb1...,Subdural
2875,ID_4956e3fe4_png_jpg.rf.86ee6e00f324f6a0bdb980...,Subdural
982,ID_0338611a4_png_jpg.rf.90daffe91456e080e9eb1e...,Subdural
604,ID_0f856b98b_png_jpg.rf.e68b0d9e757132bebbd8a9...,Subdural
2307,ID_7faf22a5b_png_jpg.rf.31fdfbe475e3393e30576e...,Subdural
3141,ID_1eaddec92_png_jpg.rf.07ef8b3b9bcb73903f88b4...,Subdural


In [55]:
# Save the sampled rows, with all of their columns, to a new CSV file.
# index=False leaves the original row index out of the file.
filtered_file_path = 'selected_injury_images.csv'
specific_random_images_df.to_csv(filtered_file_path, index=False)


In [59]:
# Rename the 'filename' column to 'FileName', modifying the frame in place
specific_random_images_df.rename(columns = {'filename':'FileName'}, inplace = True)

In [68]:
# Rename 'class' to 'ClassName' in place. From here on the label lives under
# 'ClassName' in this frame and in anything derived from it afterwards, so
# later code must read that name rather than 'class'.
specific_random_images_df.rename(columns = {'class':'ClassName'}, inplace = True)

In [69]:
# Display the sampled annotation rows
specific_random_images_df

Unnamed: 0,FileName,width,height,ClassName,xmin,ymin,xmax,ymax
248,ID_d00ccb179_png_jpg.rf.cdb9f7b65dd7df5191ef83...,640,640,Subdural,55,142,353,538
914,ID_252309b5b_png_jpg.rf.fbfad8226e8be819f94466...,640,640,Subdural,203,262,358,476
3838,ID_8a8da7d04_png_jpg.rf.61d24743d260286efbcdca...,640,640,Subdural,141,379,286,527
2081,ID_5b26226a1_png_jpg.rf.263efd0b09f1ec46ccf19b...,640,640,Subdural,299,399,391,580
3883,ID_7927e4fd8_png_jpg.rf.64a5f071c3ddf23f44fdb1...,640,640,Subdural,207,161,324,505
2875,ID_4956e3fe4_png_jpg.rf.86ee6e00f324f6a0bdb980...,640,640,Subdural,365,185,520,434
982,ID_0338611a4_png_jpg.rf.90daffe91456e080e9eb1e...,640,640,Subdural,474,288,536,413
604,ID_0f856b98b_png_jpg.rf.e68b0d9e757132bebbd8a9...,640,640,Subdural,324,375,405,499
2307,ID_7faf22a5b_png_jpg.rf.31fdfbe475e3393e30576e...,640,640,Subdural,134,228,355,438
3141,ID_1eaddec92_png_jpg.rf.07ef8b3b9bcb73903f88b4...,640,640,Subdural,165,365,275,526


In [67]:
import os
import shutil
from sklearn.model_selection import train_test_split

# Paths to the folders where images will be saved
train_folder = 'train'
test_folder = 'test'

# Create the folders if they don't exist
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

# Split the DataFrame into training and testing sets. An integer test_size is an
# absolute row count, so 8 rows go to the test set and the rest to training.
# NOTE(review): no stratify argument is passed, so how the injury classes are
# spread over the two sets is left to chance -- confirm this is acceptable.
train_df, test_df = train_test_split(specific_random_images_df, test_size=8, random_state=42)

# Define a function to copy images to their respective folders
def copy_images(df, folder, src_folder="dataset/train"):
    """Copy every image listed in ``df['FileName']`` into ``folder``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'FileName' column holding image file names.
    folder : str
        Destination directory. It is created when it does not exist yet.
    src_folder : str, optional
        Directory the images are read from. Defaults to "dataset/train".

    Returns
    -------
    None
        Files that are absent from ``src_folder`` are reported with a printed
        message and skipped; the remaining files are still copied.
    """
    # Make the function usable on its own, without a separate setup step
    os.makedirs(folder, exist_ok=True)

    for filename in df['FileName']:
        # Source and destination paths for this image file
        src_path = os.path.join(src_folder, filename)
        dst_path = os.path.join(folder, filename)

        # Copy the image to the target folder
        if os.path.exists(src_path):
            shutil.copy(src_path, dst_path)
        else:
            print(f"File {filename} not found in source directory.")

# Populate the train and test folders from their respective splits
for split_df, split_folder in ((train_df, train_folder), (test_df, test_folder)):
    copy_images(split_df, split_folder)


In [70]:
# Persist both splits as CSV files (row index omitted): train.csv and test.csv
train_csv_path, test_csv_path = 'train.csv', 'test.csv'
for split_frame, csv_path in ((train_df, train_csv_path), (test_df, test_csv_path)):
    split_frame.to_csv(csv_path, index=False)

print("train.csv and test.csv files have been created successfully.")


train.csv and test.csv files have been created successfully.


In [73]:
# Display the rows assigned to the test split
test_df

Unnamed: 0,FileName,width,height,class,xmin,ymin,xmax,ymax
2675,ID_1bd692580_png_jpg.rf.7d79ec41a9004accc0af52...,640,640,Subarachnoid,362,233,547,483
1200,ID_bb62bea9e_png_jpg.rf.a0c17951bdba256ca78b77...,640,640,Subarachnoid,212,413,290,474
718,ID_2b0b547ab_png_jpg.rf.ee21d4fd044bdead19a96e...,640,640,Subarachnoid,404,356,491,457
1692,ID_f22d6c931_png_jpg.rf.09df10f3f9a7f6b78db388...,640,640,Intraparenchymal,150,108,311,280
3883,ID_7927e4fd8_png_jpg.rf.64a5f071c3ddf23f44fdb1...,640,640,Subdural,207,161,324,505
3685,ID_555a877b8_png_jpg.rf.55afbccc3fecb8b91d8fc3...,640,640,Subarachnoid,201,411,270,538
1679,ID_104b09f8d_png_jpg.rf.0a7a08506ed3dcd1baca1c...,640,640,Intraventricular,290,174,371,293
66,ID_2f0772df9_png_jpg.rf.c149b1236c8cace980bb2b...,640,640,Intraparenchymal,194,291,219,341


In [75]:
# Write train.csv to annotation.txt, one line per bounding box:
#   <image path>,<x1>,<y1>,<x2>,<y2>,<class name>

train_df = pd.read_csv('train.csv')

# Raw string: in a normal literal the '\t' of '\train' is a TAB escape, which
# silently corrupts every path written to the annotation file.
# NOTE(review): absolute, machine-specific location -- confirm it matches where
# the consumer of annotation.txt expects the images.
train_file_path = r'D:\Project\train'

# The label column is 'class' in the original annotations but is 'ClassName'
# when the rename cell has run before the split; accept either.
class_column = 'ClassName' if 'ClassName' in train_df.columns else 'class'

# for training
with open("annotation.txt", "w") as f:
    for idx, row in train_df.iterrows():
        # Box corners in the CSV are already absolute pixel positions
        # (e.g. xmin=244 in a 640-pixel-wide image), so they are written as-is.
        # Multiplying them by the image size would push them far outside the image.
        x1 = int(row['xmin'])
        x2 = int(row['xmax'])
        y1 = int(row['ymin'])
        y2 = int(row['ymax'])

        fileName = os.path.join(train_file_path, row['FileName'])
        className = row[class_column]
        f.write(','.join([fileName, str(x1), str(y1), str(x2), str(y2), className]) + '\n')

In [76]:
# Write test.csv to test_annotation.txt, one line per bounding box:
#   <image path>,<x1>,<y1>,<x2>,<y2>,<class name>
test_df = pd.read_csv('test.csv')

# Raw string: in a normal literal the '\t' of '\test' is a TAB escape, which
# silently corrupts every path written to the annotation file.
# NOTE(review): absolute, machine-specific location -- confirm it matches where
# the consumer of test_annotation.txt expects the images.
test_file_path = r'D:\Project\test'

# The label column is 'class' in the original annotations but is 'ClassName'
# when the rename cell has run before the split; accept either.
class_column = 'ClassName' if 'ClassName' in test_df.columns else 'class'

# for test
with open("test_annotation.txt", "w") as f:
    for idx, row in test_df.iterrows():
        # Overwrite a single progress line with the current row index
        sys.stdout.write(str(idx) + '\r')
        sys.stdout.flush()

        # Box corners in the CSV are already absolute pixel positions
        # (e.g. xmin=244 in a 640-pixel-wide image), so they are written as-is.
        # Multiplying them by the image size would push them far outside the image.
        x1 = int(row['xmin'])
        x2 = int(row['xmax'])
        y1 = int(row['ymin'])
        y2 = int(row['ymax'])

        fileName = os.path.join(test_file_path, row['FileName'])
        className = row[class_column]
        f.write(','.join([fileName, str(x1), str(y1), str(x2), str(y2), className]) + '\n')

7