In [1]:
import os
import glob
import pandas as pd
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import pydicom

In [2]:
# read data
INPUT_DIR = 'rsna-2024-lumbar-spine-degenerative-classification'

train = pl.read_csv(f'{INPUT_DIR}/train.csv')
print(train.head())
train_label = pl.read_csv(f'{INPUT_DIR}/train_label_coordinates.csv')
print(train_label[1])
train_desc = pl.read_csv(f'{INPUT_DIR}/train_series_descriptions.csv')
print(train_desc)

shape: (5, 26)
┌──────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ study_id ┆ spinal_ca ┆ spinal_ca ┆ spinal_ca ┆ … ┆ right_sub ┆ right_sub ┆ right_sub ┆ right_sub │
│ ---      ┆ nal_steno ┆ nal_steno ┆ nal_steno ┆   ┆ articular ┆ articular ┆ articular ┆ articular │
│ i64      ┆ sis_l1_l2 ┆ sis_l2_l3 ┆ sis_l3_l4 ┆   ┆ _stenosis ┆ _stenosis ┆ _stenosis ┆ _stenosis │
│          ┆ ---       ┆ ---       ┆ ---       ┆   ┆ _l2…      ┆ _l3…      ┆ _l4…      ┆ _l5…      │
│          ┆ str       ┆ str       ┆ str       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│          ┆           ┆           ┆           ┆   ┆ str       ┆ str       ┆ str       ┆ str       │
╞══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 4003253  ┆ Normal/Mi ┆ Normal/Mi ┆ Normal/Mi ┆ … ┆ Normal/Mi ┆ Normal/Mi ┆ Normal/Mi ┆ Normal/Mi │
│          ┆ ld        ┆ ld        ┆ ld        ┆   ┆ ld        ┆ ld        ┆

In [None]:
def graph_plot(study_id, series_id):
    train_label_combinations = pl.DataFrame()
    for row in train_label.iter_rows():
        if row[0]==study_id:
            print(pl.DataFrame(row[:3]).transpose())
            data = pl.DataFrame(row[:3]).transpose()
            train_label_combinations=pl.concat([train_label_combinations, data])
    print(train_label_combinations)
    
    #rename columns
    train_label_combinations = train_label_combinations.rename({"column_0":"study_id", "column_1":"series_id", "column_2":"instance_number"})
    #extract unique combination
    train_label_combinations = train_label_combinations.unique(subset=["study_id", "series_id", "instance_number"]).sort(["study_id", "series_id", "instance_number"])
    
    instance_number_list = train_label_combinations.filter((pl.col("study_id")==study_id) & (pl.col("series_id")==series_id)).get_column("instance_number")
    #instance_number_list

    for instance_number in instance_number_list:
        #print(instance_number)
        print(f"=====study_id:{study_id}, series_id:{series_id}, instance_number:{instance_number}=====")
        #read image
        ds = pydicom.read_file(f'{INPUT_DIR}/train_images/{study_id}/{series_id}/{instance_number}.dcm')
        #draw original image
        df_plt = train_label.filter(
            (pl.col('study_id')==study_id)
            &(pl.col('series_id')==series_id)
            &(pl.col('instance_number')==instance_number)
        )
        plt.subplot(1,2,1)
        plt.imshow(ds.pixel_array, cmap='bone')
        #plt.title(f"study_id:{study_id}, series_id:{series_id}, instance_number:{instance_number}")

        #draw original image + label
        #draw image
        df_plt = train_label.filter(
            (pl.col('study_id')==study_id)
            &(pl.col('series_id')==series_id)
            &(pl.col('instance_number')==instance_number)
        )
        plt.subplot(1,2,2)
        plt.imshow(ds.pixel_array, cmap='bone')
        #plt.title(f"study_id:{study_id}, series_id:{series_id}, instance_number:{instance_number}")
        #draw rabel
        for row in df_plt.iter_rows():
            plt.scatter(row[-2], row[-1], color='red')
        plt.show()

study_id, series_id = 4290709089, 3274612423
graph_plot(study_id, series_id)

# Visualize the pixel array

In [None]:
from PIL import Image

def visualizeImage(directory):
    images = [f for f in os.listdir(directory) if f.endswith('.dcm')]

    # Number of images
    grid_size = len(images)

    grid_size = int(grid_size ** 0.5) + 1

    fig, axes = plt.subplots(grid_size, grid_size, figsize=(15,15))

    axes = axes.flatten()

    for idx, file in enumerate(images):
        ds = pydicom.read_file(os.path.join(directory, file))
        axes[idx].imshow(ds.pixel_array, cmap='bone')
        axes[idx].set_title(file)
        axes[idx].axis('off')

    # Hide any remaining empty subplots
    for i in range(idx + 1, len(axes)):
        axes[i].axis('off')

    plt.tight_layout()
    plt.show

directory =f'{INPUT_DIR}/train_images/{study_id}/{series_id}'
visualizeImage(directory)


In [None]:
def list_of_picture(directory):
    all_the_pic = []
    # path_dir = os.path.join(direc)

    for i in os.listdir(directory):
        all_the_pic.append(i)
    return all_the_pic

path_dir = f'{INPUT_DIR}/train_images/{study_id}/{series_id}'
list_of_picture(path_dir)

# Check for missing data

In [None]:
data = pd.read_csv("rsna-2024-lumbar-spine-degenerative-classification/train.csv")
missing_values_count = pd.isnull(data).sum()
print(missing_values_count)

# Checking the distribution of train dataset

In [None]:
df = pd.read_csv(f"{INPUT_DIR}/train.csv")

# Function for melting columns
def melting_columns(df):
    df_melted = pd.melt(df, 
                            id_vars=['study_id'], 
                            value_vars=[
                                'spinal_canal_stenosis_l1_l2', 'spinal_canal_stenosis_l2_l3', 'spinal_canal_stenosis_l3_l4', 
                                'spinal_canal_stenosis_l4_l5', 'spinal_canal_stenosis_l5_s1', 'left_neural_foraminal_narrowing_l1_l2', 
                                'left_neural_foraminal_narrowing_l2_l3', 'left_neural_foraminal_narrowing_l3_l4', 
                                'left_neural_foraminal_narrowing_l4_l5', 'left_neural_foraminal_narrowing_l5_s1', 
                                'right_neural_foraminal_narrowing_l1_l2', 'right_neural_foraminal_narrowing_l2_l3', 
                                'right_neural_foraminal_narrowing_l3_l4', 'right_neural_foraminal_narrowing_l4_l5', 
                                'right_neural_foraminal_narrowing_l5_s1', 'left_subarticular_stenosis_l1_l2', 
                                'left_subarticular_stenosis_l2_l3', 'left_subarticular_stenosis_l3_l4', 
                                'left_subarticular_stenosis_l4_l5', 'left_subarticular_stenosis_l5_s1', 
                                'right_subarticular_stenosis_l1_l2', 'right_subarticular_stenosis_l2_l3', 
                                'right_subarticular_stenosis_l3_l4', 'right_subarticular_stenosis_l4_l5', 
                                'right_subarticular_stenosis_l5_s1'
                            ], 
                            var_name='condition', 
                            value_name='severity')
    return df_melted

# Visualize the distribution
def distribution_graph(ax, df, title):
    df_melted = melting_columns(df) #From avobe functions

    print(df_melted.head())
    print(len(df_melted))

    # Check the distribution of severity levels
    severity_counts = df_melted['severity'].value_counts()
    print(severity_counts)

    # Plot pie chart
    ax.pie(severity_counts, 
           labels=severity_counts.index, 
           autopct='%1.1f%%', 
           startangle=90, 
           colors=plt.get_cmap('Set2').colors)
    ax.set_title(title)
    



# Percentage of distribution each severity on Original Data

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))

distribution_graph(ax, df, "Distrbution on original dataset")

# Impute data as most recent value using Skit Learn on train.csv

In [None]:
from sklearn.impute import SimpleImputer

df_copy = df.copy()

categorical_columns = df_copy.select_dtypes(include=['object']).columns

categorical_imputer = SimpleImputer(strategy='most_frequent')
df_copy[categorical_columns] = categorical_imputer.fit_transform(df_copy[categorical_columns])

# Checking missing data again
print(df_copy.isnull().sum())


df_melted_copy = melting_columns(df_copy)


# Compare the before and after imputation

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(15,10))

distribution_graph(axes[0], df, "Distribution Graph on Original Dataset")
distribution_graph(axes[1], df_copy, "Distribution Graph on Imputed Dataset")

# Display the plots
plt.tight_layout()
plt.show()

# Creating Custom Dataset from given image data and coordinates

In [4]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
import pydicom
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

def structure_for_train_csv(condition):
        condition = condition.lower().replace(' ', '_').replace('/', '_')
        return condition


class SpinalDataset(Dataset):
    def __init__(self, root_dir, coordinates_file, train, train_data, transform=None): # Setup the necessary attributes
        self.root_dir = root_dir
        self.coordinates= pd.read_csv(coordinates_file)
        self.train_data = pd.read_csv(train_data)
        self.train = train
        self.transform = transform

        # Define label encoder and one hot encoder
        self.label_encoder = LabelEncoder()
        self.onehot_encoder = OneHotEncoder(sparse_output=False)

        # Fit the label encoder and one hot encoder
        conditions = ["Normal/Mild", "Moderate", "Severe"]

        self.label_encoder.fit(conditions)
        integer_encoded = self.label_encoder.transform(conditions).reshape(-1, 1)
        self.onehot_encoder.fit(integer_encoded)

        # Define sample weights
        self.weights = {"Normal/Mild": 1, "Moderate": 2, "Severe": 4}

    def __len__(self): # Returns the length of the Dataframe. More specifically numbers of rows in the dataset
        return len(self.coordinates)
    
    def __getitem__(self, idx): # This method retrieves a single sample (images and label) from the dataset at the specified index (idx).
        row = self.coordinates.iloc[idx]
        study_id = row['study_id']
        series_id = row['series_id']
        instance = row['instance_number']
        condition = row['condition']
        level = row['level']
        x = row['x']
        y = row['y'] 

        # Construct the path to the DICOM
        dicom_file_path = os.path.join(self.root_dir, self.train, str(study_id), str(series_id), f"{instance}.dcm")
        

        # Load the DICOM images
        images = self.load_dicom_image(dicom_file_path)
      
        

        if self.transform:
            images = self.transform(images)

         # Extract condition for the specified level
        condition_column = f'{condition}_{level}'
        condition_column = structure_for_train_csv(condition_column)
        label_str = self.train_data.loc[self.train_data['study_id'] == study_id, condition_column].values[0]
        # Encode the label
        label_encoded = self.label_encoder.transform([label_str])
        label_onehot = self.onehot_encoder.transform(label_encoded.reshape(-1, 1))
        label = torch.tensor(label_onehot, dtype=torch.float32).squeeze()


        print(f"Label string: {label_str}")
        # Calculate weight for the sample
        weight = self.weights.get(label_str, 1)  # Default to 1 if condition not found

        return images, label, weight  
        

        # Load the DICOM images
    def load_dicom_image(self, file_path):
        dicom = pydicom.dcmread(file_path)
        image = dicom.pixel_array
        normalize = image.astype(np.float32) / image.max()
        return normalize


# Example usage
dataset = SpinalDataset(root_dir=INPUT_DIR, 
                        coordinates_file=f'{INPUT_DIR}/train_label_coordinates.csv', 
                        train='train_images',
                        train_data=f'{INPUT_DIR}/train.csv')

dataloader = DataLoader(dataset, batch_size=5, shuffle=True)

for images, labels, weights in dataloader:
    print("Images shape:", images.shape)
    print("Labels:", labels)
    print("Weights:", weights)
    break

Label string: Moderate
Label string: Normal/Mild
Label string: Normal/Mild
Label string: Normal/Mild
Label string: Severe


RuntimeError: stack expects each tensor to be equal size, but got [320, 320] at entry 0 and [512, 512] at entry 1

# Testing OneHotEncoding

In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Define the conditions
conditions = ["Normal/Mild", "Moderate", "Severe"]

# Initialize and fit the LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(conditions)

# Convert labels to numerical values
integer_encoded = label_encoder.transform(conditions).reshape(-1, 1)

# Initialize and fit the OneHotEncoder
onehot_encoder = OneHotEncoder(sparse_output=False)
onehot_encoder.fit(integer_encoded)

# Example labels to encode
labels = ["Normal/Mild", "Moderate", "Severe"]

# Convert to numerical labels
integer_encoded = label_encoder.transform(labels).reshape(-1, 1)

# Convert to one-hot encoded vectors
onehot_encoded = onehot_encoder.transform(integer_encoded)

print("Integer Encoded:")
print(integer_encoded)

print("One-Hot Encoded:")
print(onehot_encoded)

# Find minimum Image size minwidth and minheight

In [8]:
# Path to image directory
image_dir = "rsna-2024-lumbar-spine-degenerative-classification/train_images/"

def findingShape():
    # Initialize variable to store minimum width and minimum height
    # min_width, min_height = float('inf'), float('inf')
    total_height, total_width, num_images = 0, 0, 0

    # Iterate through all images in the directoey
    for image_name in os.listdir(image_dir):
        study_dir = os.path.join(image_dir, image_name)
        if not os.path.isdir(study_dir):
            continue

        for series_id in os.listdir(study_dir):
            series_dir = os.path.join(study_dir, series_id)
            if not os.path.isdir(series_dir):
                continue

            for dicom_file in os.listdir(series_dir):
                dicom_path = os.path.join(series_dir, dicom_file)

                try:
                    ds = pydicom.dcmread(dicom_path)
                    image_array = ds.pixel_array
                    
                    if len(image_array.shape) == 2:  # Grayscale size
                        height, width = image_array.shape

                    elif len(image_array.shape) == 3:  # Color including 
                        height, width, _ = image_array.shape
                        
                    else:
                        raise ValueError(f"Unexpected image shape: {image_array.shape}")

                    # Accumulate total size
                    total_width += width
                    total_height += height
                    num_images += 1
                except Exception as e:
                    print(f"Failed to process {dicom_path}: {e}")



    # Calculate average 
    if num_images > 0:
        avg_width = total_width // num_images
        avg_height = total_height // num_images
        print(f"Average Width: {avg_width}, Average Height: {avg_height}")
    else: 
        print("No images")

findingShape()

Average Width: 446, Average Height: 451


# Normalize the images using 

In [None]:
total_height, total_width, num_images = 0, 0, 0

# Iterate through all images in the directoey
for image_name in os.listdir(image_dir):
    study_dir = os.path.join(image_dir, image_name)
    if not os.path.isdir(study_dir):
        continue

    for series_id in os.listdir(study_dir):
        series_dir = os.path.join(study_dir, series_id)
        if not os.path.isdir(series_dir):
            continue

        for dicom_file in os.listdir(series_dir):
            dicom_path = os.path.join(series_dir, dicom_file)
