In [None]:
%load_ext autoreload
%autoreload 2

### the above lines are to specify that changes in our py files are loaded before every execution. helpful if you're modifying preprocessing.

In [1]:
import pandas as pd
import numpy as np
import sklearn
import plotly.express as px
import matplotlib.pyplot as plt
import os
from PIL import Image
import numpy as np
import random
import sys


# Importing Preprocessing

- if you code in a .py file, you can just use relative imports.

- otherwise, if you're in a notebook, use the below snippet to import preprocessing.
    - notebooks dont allow relative imports outside of known packages. we're just adding the neccessary directory to our syspath


    - you're going to need to have an \_\_init\_\_.py file in your directory if you want to import

In [2]:
current_dir = os.getcwd()

parent_dir = os.path.dirname(current_dir)

if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from Preprocessing.preprocessing import DATA_ROOT, DATA_FOLDERS, DATA_SUBFOLDERS # Some helpful lists for data access.
from Preprocessing.preprocessing import load_labels, image_generator, image_path_generator, batch_generator

In [3]:
# however, notice that our working directory is unchanged. we just added to system path for the above code.
print(os.getcwd())

/Users/KevinLu/Downloads/Parent/MATH4570-final-project-1/Preprocessing


# About the data:
 - Data is stored in the renders file.
 - DATA_FOLDERS is the folders in the renders file.
 - each folder in DATA_FOLDERS is divided into 4 subfolders, which contain images. 
    - Those 4 subfolders have fairly self-explanatory names.

- each image's file name is it's ID. you can map this to the hemorrhage-labels.csv for the true labels for classification

In [4]:
DATA_ROOT

'data'

In [5]:
DATA_FOLDERS

['epidural',
 'intraparenchymal',
 'intraventricular',
 'multi',
 'normal',
 'subarachnoid',
 'subdural']

In [6]:
DATA_SUBFOLDERS

['brain_bone_window', 'brain_window', 'max_contrast_window', 'subdural_window']

In [7]:
# all the folders that data is stored in.
for folder in DATA_FOLDERS:
    for subfolder in DATA_SUBFOLDERS:
        print(f"{DATA_ROOT}/{folder}/{subfolder}")

data/epidural/brain_bone_window
data/epidural/brain_window
data/epidural/max_contrast_window
data/epidural/subdural_window
data/intraparenchymal/brain_bone_window
data/intraparenchymal/brain_window
data/intraparenchymal/max_contrast_window
data/intraparenchymal/subdural_window
data/intraventricular/brain_bone_window
data/intraventricular/brain_window
data/intraventricular/max_contrast_window
data/intraventricular/subdural_window
data/multi/brain_bone_window
data/multi/brain_window
data/multi/max_contrast_window
data/multi/subdural_window
data/normal/brain_bone_window
data/normal/brain_window
data/normal/max_contrast_window
data/normal/subdural_window
data/subarachnoid/brain_bone_window
data/subarachnoid/brain_window
data/subarachnoid/max_contrast_window
data/subarachnoid/subdural_window
data/subdural/brain_bone_window
data/subdural/brain_window
data/subdural/max_contrast_window
data/subdural/subdural_window


# Let's access the hemorrhage-labels.csv file to get our y_true

In [8]:
# you can directly load the data into a csv like this if you really want.
df = pd.read_csv("../../hemorrhage-labels.csv", index_col="Image")
df

Unnamed: 0_level_0,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
Image,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ID_000012eaf,0,0,0,0,0,0
ID_000039fa0,0,0,0,0,0,0
ID_00005679d,0,0,0,0,0,0
ID_00008ce3c,0,0,0,0,0,0
ID_0000950d7,0,0,0,0,0,0
...,...,...,...,...,...,...
ID_ffff82e46,0,0,0,0,0,0
ID_ffff922b9,1,0,0,1,0,0
ID_ffffb670a,1,0,0,0,1,0
ID_ffffcbff8,0,0,0,0,0,0


In [9]:
# but i recommend using this to load the data into a dict. it's simple, quick (for access), and effective.

labels_dict = load_labels("../../hemorrhage-labels.csv")

In [10]:
# get labels by ID. our batch generator will give IDs attached to the images.
labels_dict["ID_ffff82e46"]

{'any': 0,
 'epidural': 0,
 'intraparenchymal': 0,
 'intraventricular': 0,
 'subarachnoid': 0,
 'subdural': 0}

# We'll use Generators to access data. 

 - Image data is total 10-11 GB of data
 - Infeasible to load that into memory for every computer.

 - Generator functions are defined in preprocessing.py in the root directory.

# Ensure that the data is loaded in the correct spots locally

In [11]:
# Should look like _______ MATH4570-final-project-1\Preprocessing
# unzip the contents of the OneDrive folder into the parent directory of the git repo.
# so the 3 files should be renders, dcms (empty), hemhorrage-labels
print(os.getcwd())

/Users/KevinLu/Downloads/Parent/MATH4570-final-project-1/Preprocessing


In [12]:
# generators produce an iterable object, which we use to get data as we use it.
ep_bbw = image_generator("../../data/epidural/brain_bone_window")


In [13]:
# this is how you get samples from the single-generator
sample_img, sample_id = next(ep_bbw)
second_img, second_id = next(ep_bbw)

In [14]:
print(len(next(ep_bbw)[0]))

512


# Some different ways to visualize a specific RGB image

In [15]:
# Using plotly express for more in-depth imaging.
px.imshow(sample_img, title=sample_id)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

# How to use batch generator

- set batch_size to the desired size
- you'll get given increments of data in specific batches. you can then use this however you want!
  - treat each batch as an epoch! etc.

In [16]:
# instantiate batch generator object
# we'll set the size of batches here.

batch_gen = batch_generator('../../data/epidural/max_contrast_window', batch_size=9)

In [17]:
# this is the first batch of 9. If you re-run this code you'll get different results.
images, ids = next(batch_gen)

In [18]:
images, ids = next(batch_gen)

In [19]:
BATCH_SIZE = 9
print(BATCH_SIZE)

9


In [20]:
batch_gen = batch_generator('../../data/epidural/max_contrast_window', batch_size=BATCH_SIZE)

In [21]:
# # all the folders that data is stored in.
# batch_generators = []
# for folder in DATA_FOLDERS:
#     for subfolder in DATA_SUBFOLDERS:
#         #new_generator = batch_generator(f"../../{DATA_ROOT}/{folder}/{subfolder}", batch_size=BATCH_SIZE)
#         new_generator = batch_generator('../../data/epidural/max_contrast_window', batch_size=BATCH_SIZE)
#         batch_generators.append(new_generator)
#         print(f"{DATA_ROOT}/{folder}/{subfolder}")

In [22]:
train_data = []
first_batch, ids = next(batch_generators[4])
# print(len(first_batch[0]))
# print(ids)
next(batch_generators[4])

NameError: name 'batch_generators' is not defined

In [None]:
px.imshow(first_batch[0], title=sample_id)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [24]:
BATCH_SIZE = 300
batch_generators = []
batch_generators.append(batch_generator("../../data/epidural/max_contrast_window", batch_size=BATCH_SIZE))
batch_generators.append(batch_generator("../../data/intraparenchymal/max_contrast_window", batch_size=BATCH_SIZE))
#batch_generators.append(batch_generator("../../data/normal/brain_bone_window", batch_size=BATCH_SIZE))
batch_generators.append(batch_generator("../../data/intraventricular/max_contrast_window", batch_size=BATCH_SIZE))
#batch_generators.append(batch_generator("../../data/subdural/brain_bone_window", batch_size=BATCH_SIZE))
batch_generators.append(batch_generator("../../data/subarachnoid/max_contrast_window", batch_size=BATCH_SIZE))


# for folder in DATA_FOLDERS:
#     for subfolder in DATA_SUBFOLDERS:
#         new_generator = batch_generator(f"../../{DATA_ROOT}/{folder}/{subfolder}", batch_size=BATCH_SIZE)
#         #new_generator = batch_generator('../../data/epidural/brain_bone_window', batch_size=BATCH_SIZE)
#         batch_generators.append(new_generator)
#         print(f"../../{DATA_ROOT}/{folder}/{subfolder}")

In [25]:
train_data = []
train_ids = []
valid_data = []
valid_ids = []
test_data = []
test_ids = []
for id in range(len(batch_generators)):
    print(id)
    new_train_data, new_train_ids = next(batch_generators[id])
    train_data.extend(new_train_data)
    train_ids.extend(new_train_ids)
    
    new_valid_data, new_valid_ids = next(batch_generators[id])
    valid_data.extend(new_valid_data)
    valid_ids.extend(new_valid_ids)
    
    new_test_data, new_test_ids = next(batch_generators[id])
    test_data.extend(new_test_data)
    test_ids.extend(new_test_ids)

0
1
2
3


In [35]:
import numpy as np
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
# Step 1: Flatten the images
train_data_flat = [image.flatten() for image in train_data][:100]
valid_data_flat = [image.flatten() for image in valid_data][:100]
test_data_flat = [image.flatten() for image in test_data][:100]



In [36]:

# Step 2: Prepare labels using one-hot encoding
train_labels = [labels_dict[train_id] for train_id in train_ids][:100]
valid_labels = [labels_dict[valid_id] for valid_id in valid_ids][:100]
test_labels = [labels_dict[test_id] for test_id in test_ids][:100]

print(train_labels)

[{'any': 1, 'epidural': 1, 'intraparenchymal': 0, 'intraventricular': 0, 'subarachnoid': 0, 'subdural': 0}, {'any': 1, 'epidural': 1, 'intraparenchymal': 0, 'intraventricular': 0, 'subarachnoid': 0, 'subdural': 0}, {'any': 1, 'epidural': 1, 'intraparenchymal': 0, 'intraventricular': 0, 'subarachnoid': 0, 'subdural': 0}, {'any': 1, 'epidural': 1, 'intraparenchymal': 0, 'intraventricular': 0, 'subarachnoid': 0, 'subdural': 0}, {'any': 1, 'epidural': 1, 'intraparenchymal': 0, 'intraventricular': 0, 'subarachnoid': 0, 'subdural': 0}, {'any': 1, 'epidural': 1, 'intraparenchymal': 0, 'intraventricular': 0, 'subarachnoid': 0, 'subdural': 0}, {'any': 1, 'epidural': 1, 'intraparenchymal': 0, 'intraventricular': 0, 'subarachnoid': 0, 'subdural': 0}, {'any': 1, 'epidural': 1, 'intraparenchymal': 0, 'intraventricular': 0, 'subarachnoid': 0, 'subdural': 0}, {'any': 1, 'epidural': 1, 'intraparenchymal': 0, 'intraventricular': 0, 'subarachnoid': 0, 'subdural': 0}, {'any': 1, 'epidural': 1, 'intrapare

In [29]:
hemorrage_dict = {'any': 1, 'epidural': 1, 'intraparenchymal': 0, 'intraventricular': 0, 'subarachnoid': 0, 'subdural': 0}
hemorrhage_array = np.array([hemorrage_dict[key] for key in hemorrage_dict.keys()])
hemorrhage_array

array([1, 1, 0, 0, 0, 0])

In [30]:
print(train_labels[0])


{'any': 1, 'epidural': 1, 'intraparenchymal': 0, 'intraventricular': 0, 'subarachnoid': 0, 'subdural': 0}


In [31]:
import numpy as np
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier

# Step 2: Prepare labels
hemorrhage_types = ['epidural', 'intraparenchymal', 'intraventricular', 'subdural']
train_labels = np.array([[labels_dict[train_id][hem] for hem in hemorrhage_types] for train_id in train_ids])
valid_labels = np.array([[labels_dict[valid_id][hem] for hem in hemorrhage_types] for valid_id in valid_ids])
test_labels = np.array([[labels_dict[test_id][hem] for hem in hemorrhage_types] for test_id in test_ids])

hemorrage_dict = {'epidural': 1, 'intraparenchymal': 0, 'intraventricular': 0, 'subarachnoid': 0, 'subdural': 0}
hemorrhage_array = np.array([hemorrage_dict[key] for key in hemorrage_dict.keys()])
hemorrhage_array

print(train_labels[310])


[0 1 0 0]


In [33]:
# Step 2: Convert labels to NumPy arrays
train_labels_array = np.array(train_labels)
valid_labels_array = np.array(valid_labels)
test_labels_array = np.array(test_labels)


# Save multiple arrays in a single .npz file
np.savez('max_contrast.npz', 
         train_data=train_data_flat, 
         valid_data=valid_data_flat, 
         test_data=test_data_flat, 
         train_labels=train_labels_array, 
         valid_labels=valid_labels_array, 
         test_labels=test_labels_array)

In [37]:
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np

num_clusters = 10  # Choose the number of clusters
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(train_data_flat)  # Assuming train_data_flat contains flattened images

# Step 2: Represent each image using bag-of-visual-words
train_data_bow = kmeans.predict(train_data_flat)
valid_data_bow = kmeans.predict(valid_data_flat)
test_data_bow = kmeans.predict(test_data_flat)

In [38]:
# Flatten the one-hot encoded labels
train_labels_flat = np.array([np.argmax(label) for label in train_labels])
valid_labels_flat = np.array([np.argmax(label) for label in valid_labels])
test_labels_flat = np.array([np.argmax(label) for label in test_labels])

print("Flattened train labels shape:", train_labels_flat.shape)
print("Flattened train labels:", train_labels_flat)

Flattened train labels shape: (100,)
Flattened train labels: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [39]:
from sklearn.svm import SVC


train_data_bow = train_data_bow.reshape(-1, 1)
valid_data_bow = valid_data_bow.reshape(-1, 1)
test_data_bow = test_data_bow.reshape(-1, 1)

# Step 3: Train a classifier using bag-of-visual-words representations
classifier = SVC(kernel='linear')  # Using a linear kernel for simplicity
# If the dataset is large, you might consider using 'rbf' kernel instead, but it's computationally more expensive

# Train the classifier
classifier.fit(train_data_flat, train_labels_flat)

# Step 4: Evaluate on validation data
valid_accuracy = classifier.score(valid_data_flat, valid_labels_flat)
print("Validation Accuracy:", valid_accuracy)

# Step 5: Predict on test data
test_accuracy = classifier.score(test_data_flat, test_labels_flat)
print("Test Accuracy:", test_accuracy)

ValueError: The number of classes has to be greater than one; got 1 class

In [None]:
import pandas as pd

# Create DataFrames for train, validation, and test data, IDs, and labels
train_df = pd.DataFrame({
    'Data_ID': train_ids,
    'Label_1': train_labels[:, 0],  # Assuming train_labels is a 2D array with one-hot encoding
    'Label_2': train_labels[:, 1],
    'Label_3': train_labels[:, 2],
    'Label_4': train_labels[:, 3],
    'Label_5': train_labels[:, 4],
})

valid_df = pd.DataFrame({
    'Data_ID': valid_ids,
    'Label_1': valid_labels[:, 0],  # Assuming valid_labels is a 2D array with one-hot encoding
    'Label_2': valid_labels[:, 1],
    'Label_3': valid_labels[:, 2],
    'Label_4': valid_labels[:, 3],
    'Label_5': valid_labels[:, 4],
})

test_df = pd.DataFrame({
    'Data_Path': test_data_paths,  # Assuming test_data_paths contains paths to images
    'Data_ID': test_ids,
    'Label_1': test_labels[:, 0],  # Assuming test_labels is a 2D array with one-hot encoding
    'Label_2': test_labels[:, 1],
    'Label_3': test_labels[:, 2],
    'Label_4': test_labels[:, 3],
    'Label_5': test_labels[:, 4],
    'Set': ['test'] * len(test_data_paths)  # Add a column to identify the set
})

# Concatenate all DataFrames into a single DataFrame
all_data_df = pd.concat([train_df, valid_df, test_df], ignore_index=True)

# Save DataFrame to CSV
all_data_df.to_csv('all_data.csv', index=False)


In [None]:
import pandas as pd

# Create DataFrames for train, validation, and test data, IDs, and labels
train_df = pd.DataFrame({
    'Data_Path': train_data_paths,  # Assuming train_data_paths contains paths to images
    'Data_ID': train_ids,
    'Label_1': train_labels[:, 0],  # Assuming train_labels is a 2D array with one-hot encoding
    'Label_2': train_labels[:, 1],
    'Label_3': train_labels[:, 2],
    'Label_4': train_labels[:, 3],
    'Label_5': train_labels[:, 4],
    'Set': ['train'] * len(train_data_paths)  # Add a column to identify the set
})

valid_df = pd.DataFrame({
    'Data_Path': valid_data_paths,  # Assuming valid_data_paths contains paths to images
    'Data_ID': valid_ids,
    'Label_1': valid_labels[:, 0],  # Assuming valid_labels is a 2D array with one-hot encoding
    'Label_2': valid_labels[:, 1],
    'Label_3': valid_labels[:, 2],
    'Label_4': valid_labels[:, 3],
    'Label_5': valid_labels[:, 4],
    'Set': ['valid'] * len(valid_data_paths)  # Add a column to identify the set
})

test_df = pd.DataFrame({
    'Data_Path': test_data_paths,  # Assuming test_data_paths contains paths to images
    'Data_ID': test_ids,
    'Label_1': test_labels[:, 0],  # Assuming test_labels is a 2D array with one-hot encoding
    'Label_2': test_labels[:, 1],
    'Label_3': test_labels[:, 2],
    'Label_4': test_labels[:, 3],
    'Label_5': test_labels[:, 4],
    'Set': ['test'] * len(test_data_paths)  # Add a column to identify the set
})

# Concatenate all DataFrames into a single DataFrame
all_data_df = pd.concat([train_df, valid_df, test_df], ignore_index=True)

# Save DataFrame to CSV
all_data_df.to_csv('all_data.csv', index=False)


In [None]:
# Step 3: Train SVM model
svm_model = OneVsRestClassifier(svm.SVC(kernel='linear'))
svm_model.fit(train_data_flat, train_labels)

# Step 4: Evaluate on validation data
valid_predictions = svm_model.predict(valid_data_flat)
valid_accuracy = np.mean(valid_predictions == valid_labels)
print("Validation Accuracy:", valid_accuracy)

# Step 5: Predict on test data
test_predictions = svm_model.predict(test_data_flat)
test_accuracy = np.mean(test_predictions == test_labels)
print("Test Accuracy:", test_accuracy)


In [None]:
from sklearn.multioutput import MultiOutputClassifier

# Step 3: Train SVM model with MultiOutputClassifier
svm_model_multi = MultiOutputClassifier(svm.SVC(kernel='linear'))
svm_model_multi.fit(train_data_flat, train_labels)

# Step 4: Evaluate on validation data
valid_accuracy = svm_model_multi.score(valid_data_flat, valid_labels)
print("Validation Accuracy:", valid_accuracy)

# Step 5: Predict on test data
test_accuracy = svm_model_multi.score(test_data_flat, test_labels)
print("Test Accuracy:", test_accuracy)

ValueError: The number of classes has to be greater than one; got 1 class