In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import os
import random
from PIL import Image
from resizeimage import resizeimage
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Root directory of the training images, given relative to the notebook's
# working directory. It is the starting point passed to createFileList below.
BASE_DIR = r"Data/Training"

# Function to create a list of file paths for all image files in a directory
# and also create a list of corresponding labels for each image
def createFileList(myDir, format='.jpg'):
    """Recursively collect image file paths under ``myDir`` with their labels.

    Parameters
    ----------
    myDir : str
        Directory to walk; every nested sub-directory is visited.
    format : str or tuple of str, optional
        File-name suffix(es) to accept. Matching is case-insensitive, so the
        default ``'.jpg'`` also picks up files ending in ``'.JPG'``.

    Returns
    -------
    fileList : list of str
        Paths of the matching files, sorted so that the order does not depend
        on the order in which the file system lists directory entries.
    labels : list of str
        ``labels[k]`` is the name of the directory that directly contains
        ``fileList[k]``.
    """
    # str.endswith accepts a tuple, so normalise both forms to a lower-cased tuple.
    suffixes = (format,) if isinstance(format, str) else tuple(format)
    suffixes = tuple(suffix.lower() for suffix in suffixes)

    matches = []
    for root, _dirs, files in os.walk(myDir, topdown=False):
        label = os.path.basename(root)  # The label is the name of the directory
        for name in files:
            if name.lower().endswith(suffixes):
                matches.append((os.path.join(root, name), label))

    # Sort by path so downstream steps see a stable order.
    matches.sort()
    fileList = [path for path, _ in matches]
    labels = [label for _, label in matches]

    return fileList, labels

# Walk the training directory once, collecting every image path together
# with the class label taken from its parent directory
ImageList, Label = createFileList(BASE_DIR)

# Distinct class names; built once and reused for the count and the listing
unique_labels = set(Label)

print(BASE_DIR)
print("Total images found:", len(ImageList))
print("Total labels found:", len(unique_labels))
print('Labels:')
for class_name in unique_labels:
    print(class_name)

Data/Training
Total images found: 5575
Total labels found: 4
Labels:
meningioma
notumor
glioma
pituitary


We have 5575 images categorized into four classes: pituitary, meningioma, notumor, and glioma.


In [3]:
# Draw a random subset of the images to keep the later steps fast.
# A dedicated, seeded generator makes the draw repeatable for a given
# ImageList order without touching the global `random` state.
SAMPLE_SEED = 42
MAX_SAMPLE_SIZE = 700
sample_size = min(MAX_SAMPLE_SIZE, len(ImageList))  # never request more images than exist
if sample_size == 0:
    # Fail with a clear message instead of a NameError on `width` further down.
    raise ValueError("ImageList is empty: no images available to sample")
myFileList = random.Random(SAMPLE_SEED).sample(ImageList, sample_size)

# Print some original image sizes for verification
for count, path in enumerate(myFileList, start=1):
    # The context manager releases the file handle as soon as the size is read.
    with Image.open(path) as img_file:
        width, height = img_file.size
    if count < 10:
        print(width, height)

# NOTE: width/height hold the size of the LAST sampled image only. The recorded
# output of this cell shows a 225x225 image among 512x512 ones, so the figures
# below describe that single image, not every image in the sample.
n_pixels = width * height
n_classes = len(set(Label))

print("Number of images: {}".format(len(myFileList)))
print("Number of classes: {}".format(n_classes))
print("Original image sizes: {} pixels by {} pixels".format(height, width))


512 512
512 512
512 512
512 512
225 225
512 512
512 512
512 512
512 512
Number of images: 700
Number of classes: 4
Original image sizes: 512 pixels by 512 pixels


In [4]:
# Build the pixel table: one column per sampled image (keyed by its file path)
# and one row per pixel of the resized grayscale image.
TARGET_SIZE = [50, 50]  # every image is resized to 50x50 before flattening

# Collect the columns in a dict and build the DataFrame once at the end;
# inserting hundreds of columns one by one into an existing frame is slow
# and fragments it.
pixel_columns = {}
for count, path in enumerate(myFileList, start=1):
    # Do all the image work inside the context manager so the source file
    # handle is closed as soon as the pixel values have been extracted.
    with Image.open(path) as img_file:
        resized = resizeimage.resize_contain(img_file, TARGET_SIZE)
        width, height = resized.size
        img_gray = resized.convert('L')  # 'L' = single-channel 8-bit grayscale
        # Converting the image directly yields a (height, width) array.
        value1 = np.asarray(img_gray, dtype=int)
    if count < 10:
        print(img_gray.size[1], img_gray.size[0])
    pixel_columns[path] = value1.flatten()

df = pd.DataFrame(pixel_columns)

50 50
50 50
50 50
50 50
50 50
50 50
50 50
50 50
50 50


In [5]:
# Reshape to one row per image and attach the class labels.
# The previous cell leaves `df` with one column per image (keyed by file path).
# Transpose only while the frame is still in that layout, so that re-running
# this cell does not flip it back and break the label extraction below.
if 'Label' not in df.columns:
    df = df.T

# After the transpose the index holds the file paths; the class label is the
# name of the directory that directly contains each file.
df['Label'] = [os.path.basename(os.path.dirname(path)) for path in df.index]

# Filter out rows with 'unknown' labels (if necessary).
# .copy() makes the result an independent frame rather than a slice of the old one.
df = df[df['Label'] != 'unknown'].copy()

# Save the DataFrame (the file-path index is written as the first CSV column)
df.to_csv('preprocessed_data.csv')

# Check the first few rows of the DataFrame to confirm
print(df.head())
print(df.columns)

                                        0  1  2  3  4  5  6  7  8  9  ...  \
Data/Training/glioma/Tr-gl_1108.jpg     1  1  1  1  1  1  1  1  2  1  ...   
Data/Training/pituitary/Tr-pi_0213.jpg  1  1  2  2  2  3  3  4  4  4  ...   
Data/Training/pituitary/Tr-pi_1023.jpg  0  0  0  0  0  0  0  0  0  0  ...   
Data/Training/pituitary/Tr-pi_0661.jpg  0  0  0  0  0  0  0  0  0  0  ...   
Data/Training/notumor/Tr-no_0914.jpg    0  0  0  0  0  1  2  2  2  2  ...   

                                        2491  2492  2493  2494  2495  2496  \
Data/Training/glioma/Tr-gl_1108.jpg        0     1     1     0     0     0   
Data/Training/pituitary/Tr-pi_0213.jpg    17    17     8     4     3     3   
Data/Training/pituitary/Tr-pi_1023.jpg     0     0     0     0     0     0   
Data/Training/pituitary/Tr-pi_0661.jpg     0     0     0     0     0     0   
Data/Training/notumor/Tr-no_0914.jpg       0     0     1     2     1     0   

                                        2497  2498  2499      Label 

In [6]:
# Write the frame to disk as CSV, keeping the row index as the first column
df.to_csv(path_or_buf='preprocessed_data.csv', index=True)