In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

# File path for Kaggle dataset
file_path = "/kaggle/input/brats2020-training-data/BraTS2020_training_data/content/data/meta_data.csv"

# Step 1: Load the original dataset (left untouched so it can be compared with the cleaned copy)
original_meta_data = pd.read_csv(file_path)

# Step 2: Show missing values before cleaning
print("Missing values before cleaning:\n", original_meta_data.isnull().sum())

# Step 3: Clean the data (drop duplicates, ensure numeric columns, then drop incomplete rows).
# The numeric coercion must run BEFORE dropna(): errors='coerce' turns unparseable
# entries into NaN, and those rows would otherwise survive into the "cleaned" frame.
meta_numeric_columns = ['target', 'volume', 'slice']
meta_data = original_meta_data.drop_duplicates().copy()
for numeric_column in meta_numeric_columns:
    meta_data[numeric_column] = pd.to_numeric(meta_data[numeric_column], errors='coerce')
meta_data = meta_data.dropna()

# Step 4: Missing values after cleaning (every count should now be zero)
print("\nMissing values after cleaning:\n", meta_data.isnull().sum())

# Step 5: Compare row count before and after cleaning
print(f"\nOriginal dataset row count: {original_meta_data.shape[0]}")
print(f"Cleaned dataset row count: {meta_data.shape[0]}")

# Step 6: Plot distributions of numerical columns for comparison (before vs after cleaning).
# Left column of the grid shows the original frame, right column the cleaned one;
# each row is one variable.
fig, axes = plt.subplots(3, 2, figsize=(12, 12))
stages = ((0, original_meta_data), (1, meta_data))

# Row 0: count of rows per 'target' value
target_titles = ('Original Target Distribution', 'Cleaned Target Distribution')
for column_index, frame in stages:
    target_ax = axes[0, column_index]
    sns.countplot(x='target', data=frame, ax=target_ax)
    target_ax.set_title(target_titles[column_index])

# Rows 1-2: histograms (with KDE overlay) of 'volume' and 'slice'
histogram_rows = (
    (1, 'volume', 'blue', ('Original Volume Distribution', 'Cleaned Volume Distribution')),
    (2, 'slice', 'green', ('Original Slice Distribution', 'Cleaned Slice Distribution')),
)
for row_index, column_name, colour, titles in histogram_rows:
    for column_index, frame in stages:
        hist_ax = axes[row_index, column_index]
        sns.histplot(frame[column_name], kde=True, ax=hist_ax, color=colour)
        hist_ax.set_title(titles[column_index])

plt.tight_layout()
plt.show()

# Step 7: Save the cleaned data (optional)
cleaned_file_path = "/kaggle/working/meta_data_cleaned.csv"
meta_data.to_csv(cleaned_file_path, index=False)
print(f"\nPreprocessed data saved to: {cleaned_file_path}")


In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

# File path for Kaggle dataset
file_path = "/kaggle/input/brats2020-training-data/BraTS2020_training_data/content/data/name_mapping.csv"

# Step 1: Load the original dataset (left untouched so it can be compared with the cleaned copy)
original_name_mapping = pd.read_csv(file_path)

# Step 2: Show missing values before cleaning
print("Missing values before cleaning:\n", original_name_mapping.isnull().sum())

# Steps 3-4: Remove duplicates, then drop rows with any missing value (for simplicity).
# NOTE(review): dropna() with no subset also discards subjects that merely lack an ID
# for one of the earlier challenge years -- confirm that is intended.
name_mapping = original_name_mapping.drop_duplicates().dropna().copy()

# Step 5: Store every subject-ID column as a string
id_columns = [
    'BraTS_2017_subject_ID',
    'BraTS_2018_subject_ID',
    'BraTS_2019_subject_ID',
    'BraTS_2020_subject_ID',
]
for id_column in id_columns:
    name_mapping[id_column] = name_mapping[id_column].astype(str)

# Step 6: Validate the subject IDs.
# These columns hold subject identifiers, not filesystem paths, so testing them with
# os.path.exists() would flag every row as invalid. Instead, flag rows whose IDs are
# blank or whose 2020 ID (presumably the key for this release -- verify) is repeated.
blank_id_rows = pd.Series(False, index=name_mapping.index)
for id_column in id_columns:
    blank_id_rows |= name_mapping[id_column].str.strip().eq('')
duplicate_id_rows = name_mapping['BraTS_2020_subject_ID'].duplicated(keep=False)
invalid_ids = name_mapping.loc[blank_id_rows | duplicate_id_rows, 'BraTS_2020_subject_ID'].tolist()
if invalid_ids:
    print(f"Invalid subject IDs found: {len(invalid_ids)}")
else:
    print("All subject IDs are valid.")

# Step 7: Missing values after cleaning
print("\nMissing values after cleaning:\n", name_mapping.isnull().sum())

# Step 8: Compare row count before and after cleaning
print(f"\nOriginal dataset row count: {original_name_mapping.shape[0]}")
print(f"Cleaned dataset row count: {name_mapping.shape[0]}")

# Step 9: Visualize distribution of 'Grade' (or other categorical features)
plt.figure(figsize=(8, 6))
sns.countplot(x='Grade', data=name_mapping)
plt.title("Distribution of Grades in the Cleaned Data")
plt.xlabel('Grade')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

# Step 10: Save the cleaned data
cleaned_file_path = "/kaggle/working/name_mapping_cleaned.csv"
name_mapping.to_csv(cleaned_file_path, index=False)
print(f"\nPreprocessed data saved to: {cleaned_file_path}")


In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

# File path for Kaggle dataset
file_path = "/kaggle/input/brats2020-training-data/BraTS2020_training_data/content/data/survival_info.csv"

# Step 1: Load the original dataset
original_survival_info = pd.read_csv(file_path)

# Step 2: Show missing values before cleaning
print("Missing values before cleaning:\n", original_survival_info.isnull().sum())

# Steps 3-6: One pipeline -- drop duplicates, drop incomplete rows, coerce `Age` and
# `Survival_days` to numbers (unparseable entries become NaN), then drop the rows
# that the coercion invalidated.
survival_info = (
    original_survival_info
    .drop_duplicates()
    .dropna()
    .assign(
        Age=lambda frame: pd.to_numeric(frame['Age'], errors='coerce'),
        Survival_days=lambda frame: pd.to_numeric(frame['Survival_days'], errors='coerce'),
    )
    .dropna()
)

# Step 7: Missing values after cleaning
print("\nMissing values after cleaning:\n", survival_info.isnull().sum())

# Step 8: Compare row count before and after cleaning
print(f"\nOriginal dataset row count: {original_survival_info.shape[0]}")
print(f"Cleaned dataset row count: {survival_info.shape[0]}")

# Step 9: Visualize the distribution of 'Age' and 'Survival_days' side by side
fig, (age_ax, survival_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Age Distribution
sns.histplot(survival_info['Age'], kde=True, color='blue', ax=age_ax)
age_ax.set_title('Age Distribution')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Frequency')

# Survival Days Distribution
sns.histplot(survival_info['Survival_days'], kde=True, color='green', ax=survival_ax)
survival_ax.set_title('Survival Days Distribution')
survival_ax.set_xlabel('Survival Days')
survival_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# Step 10: Visualizing the Extent of Resection
fig, resection_ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Extent_of_Resection', data=survival_info, palette="Set2", ax=resection_ax)
resection_ax.set_title("Extent of Resection Distribution")
resection_ax.set_xlabel('Extent of Resection')
resection_ax.set_ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Step 11: Save the cleaned data
cleaned_file_path = "/kaggle/working/survival_info_cleaned.csv"
survival_info.to_csv(cleaned_file_path, index=False)
print(f"\nPreprocessed data saved to: {cleaned_file_path}")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# File path for the BraTS Training Metadata dataset
file_path = '/kaggle/input/brats2020-training-data/BraTS20 Training Metadata.csv'

# Step 1: Load the dataset
metadata = pd.read_csv(file_path)

# Step 2: Display the first few rows of the dataset to understand its structure
print(metadata.head())

# Step 3: Check for missing values
print("\nMissing values before cleaning:\n", metadata.isnull().sum())

# Step 4: Remove duplicates if any
metadata = metadata.drop_duplicates().copy()

# Step 5: Convert data types where necessary.
# This runs BEFORE the NaN drop below: errors='coerce' turns unparseable entries into
# NaN, and dropping first would leave those rows in the "cleaned" frame.
pixel_stat_columns = ['label0_pxl_cnt', 'label1_pxl_cnt', 'label2_pxl_cnt', 'background_ratio']
for stat_column in pixel_stat_columns:
    metadata[stat_column] = pd.to_numeric(metadata[stat_column], errors='coerce')

# Step 6: Handle missing values by dropping rows with NaN (optional: use imputation for specific columns)
metadata = metadata.dropna()

# Step 7: Check if data is now clean (every count should be zero)
print("\nMissing values after cleaning:\n", metadata.isnull().sum())

# Step 8: Visualize distributions of key columns (e.g., pixel counts, background ratio)
# One panel per label pixel-count column, driven by a small spec table.
fig, label_axes = plt.subplots(1, 3, figsize=(18, 6))
label_panels = (
    ('label0_pxl_cnt', 'blue', 'Label 0 Pixel Count Distribution', 'Label 0 Pixel Count'),
    ('label1_pxl_cnt', 'green', 'Label 1 Pixel Count Distribution', 'Label 1 Pixel Count'),
    ('label2_pxl_cnt', 'red', 'Label 2 Pixel Count Distribution', 'Label 2 Pixel Count'),
)
for panel_ax, (column_name, colour, panel_title, x_label) in zip(label_axes, label_panels):
    sns.histplot(metadata[column_name], kde=True, color=colour, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# Step 9: Visualize background ratio distribution
fig, ratio_ax = plt.subplots(figsize=(8, 6))
sns.histplot(metadata['background_ratio'], kde=True, color='purple', ax=ratio_ax)
ratio_ax.set_title('Background Ratio Distribution')
ratio_ax.set_xlabel('Background Ratio')
ratio_ax.set_ylabel('Frequency')
plt.show()

# Step 10: Correlation heatmap for numerical features
numeric_features = ['label0_pxl_cnt', 'label1_pxl_cnt', 'label2_pxl_cnt', 'background_ratio']
correlation_matrix = metadata[numeric_features].corr()
fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=heatmap_ax)
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

# Step 11: Pairplot for numerical features
# (pairplot builds its own figure, so the super-title is applied to the current figure)
sns.pairplot(metadata[numeric_features])
plt.suptitle('Pairplot for Pixel Counts and Background Ratio', y=1.02)
plt.show()

# Step 12: Countplot for categorical features ('target' and 'slice'), one figure each
count_panels = (
    ('target', 'Distribution of Target Class', 'Target'),
    ('slice', 'Distribution of Slices', 'Slice'),
)
for column_name, panel_title, x_label in count_panels:
    fig, count_ax = plt.subplots(figsize=(10, 6))
    sns.countplot(x=column_name, data=metadata, palette='Set2', ax=count_ax)
    count_ax.set_title(panel_title)
    count_ax.set_xlabel(x_label)
    count_ax.set_ylabel('Count')
    plt.show()

# Step 13: Violin Plot for the label pixel counts
fig, violin_ax = plt.subplots(figsize=(18, 6))
sns.violinplot(x='target', y='label0_pxl_cnt', data=metadata, palette='Blues', ax=violin_ax)
violin_ax.set_title('Violin Plot of Label 0 Pixel Count by Target Class')
violin_ax.set_xlabel('Target')
violin_ax.set_ylabel('Label 0 Pixel Count')
plt.show()

# Step 14: Scatter plot between label pixel counts, coloured by target
fig, scatter_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='label0_pxl_cnt', y='label1_pxl_cnt', data=metadata, hue='target', palette='Set2', ax=scatter_ax)
scatter_ax.set_title('Scatter Plot of Label 0 vs Label 1 Pixel Count')
scatter_ax.set_xlabel('Label 0 Pixel Count')
scatter_ax.set_ylabel('Label 1 Pixel Count')
plt.show()

# Step 15: Save the cleaned dataset
cleaned_metadata_file_path = '/kaggle/working/BraTS20_Training_Metadata_cleaned.csv'
metadata.to_csv(cleaned_metadata_file_path, index=False)
print(f"\nCleaned dataset saved to: {cleaned_metadata_file_path}")


In [None]:
import h5py
import numpy as np

# File path to the HDF5 file on Kaggle (adjust the path as needed)
h5_file_path = "/kaggle/input/brats2020-training-data/BraTS2020_training_data/content/data/volume_100_slice_0.h5"


def _normalize_by_max(values):
    """Divide `values` by its maximum, leaving it unscaled when that maximum is 0.

    Dividing by a zero maximum (e.g. an all-zero array) would fill the result with
    NaN/inf, so in that case, and for an empty array, the divisor falls back to 1.
    """
    peak = np.max(values) if values.size else 0
    divisor = peak if peak != 0 else 1
    return values / divisor


def _inspect_and_normalize(h5_file, dataset_name, kind, save_path):
    """Report on one dataset of an open HDF5 file, normalize it and save the result.

    Args:
        h5_file: Open, readable h5py.File to look the dataset up in.
        dataset_name: Key of the dataset inside `h5_file`.
        kind: Short label ('image' or 'mask') used in the printed messages and in the
            name of the saved dataset (`normalized_<kind>`).
        save_path: Destination HDF5 file for the normalized array.

    Returns:
        The normalized array, or None when the dataset is missing or is neither 2D
        nor 3D (in which case nothing is written).
    """
    if dataset_name not in h5_file.keys():
        print(f"Dataset '{dataset_name}' not found in the file.")
        return None

    dataset = h5_file[dataset_name]

    # Inspect the shape and properties of the dataset
    n_dims = len(dataset.shape)
    print(f"Dataset: {dataset_name}")
    print(f"Shape: {dataset.shape}")
    print(f"Number of dimensions: {n_dims}")

    if n_dims not in (2, 3):
        # Nothing was normalized, so there is nothing valid to save.
        print(f"Unexpected {kind} dataset structure.")
        return None

    print(f"This is a {n_dims}D {kind} dataset.")
    normalized = _normalize_by_max(dataset[:])  # Normalize by max value
    print(f"{kind.capitalize()} data normalized ({n_dims}D).")

    # Save the normalized data to a new file
    with h5py.File(save_path, 'w') as new_f:
        new_f.create_dataset(f'normalized_{kind}', data=normalized)
        print(f"Normalized {kind} data saved to: {save_path}")

    return normalized


# Open the HDF5 file and explore its structure
with h5py.File(h5_file_path, 'r') as f:
    # List all keys (datasets or groups) in the file
    print("Keys in the file:", list(f.keys()))

    image_dataset_name = 'image'
    mask_dataset_name = 'mask'

    image_data_normalized = _inspect_and_normalize(
        f,
        image_dataset_name,
        'image',
        "/kaggle/working/normalized_volume_100_slice_0_image.h5",
    )
    mask_data_normalized = _inspect_and_normalize(
        f,
        mask_dataset_name,
        'mask',
        "/kaggle/working/normalized_volume_100_slice_0_mask.h5",
    )


In [None]:
import random
import matplotlib.pyplot as plt

# Folder holding the .h5 files. Defined here so this cell also runs on a fresh
# kernel (Restart & Run All) instead of relying on a variable set by a later cell.
data_folder = '/kaggle/input/brats2020-training-data/BraTS2020_training_data/content/data/'

# List all .h5 files. os.listdir() returns entries in arbitrary order, so sort them
# first to give the seeded shuffle below a stable starting point.
all_files = sorted(file for file in os.listdir(data_folder) if file.endswith('.h5'))

# Shuffle the files for random splitting, using a dedicated seeded generator so the
# split is reproducible and the global `random` state is left untouched.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(all_files)

# NOTE(review): the split is per file; names such as volume_100_slice_0.h5 suggest
# slices of one volume can land in different sets -- consider splitting by volume
# if that leakage matters for evaluation.

# Split into training (70%), validation (15%), and test (15%)
train_size = int(0.7 * len(all_files))
val_size = int(0.15 * len(all_files))
test_size = len(all_files) - train_size - val_size  # test takes whatever rounding leaves over

# Define the splits
train_files = all_files[:train_size]
val_files = all_files[train_size:train_size + val_size]
test_files = all_files[train_size + val_size:]

# Plot the distribution in a pie chart
sizes = [len(train_files), len(val_files), len(test_files)]
labels = ['Training Set', 'Validation Set', 'Test Set']

plt.figure(figsize=(6, 6))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors=['#66b3ff', '#99ff99', '#ffcc99'])
plt.title("Dataset Splitting (Train-Val-Test)")
plt.axis('equal')
plt.show()

# Print the counts for verification
print(f"Training Set: {len(train_files)} files")
print(f"Validation Set: {len(val_files)} files")
print(f"Test Set: {len(test_files)} files")


In [None]:
import h5py
import numpy as np
import os
import cv2

# Define the path to the data folder and the output folder for processed files
data_folder = '/kaggle/input/brats2020-training-data/BraTS2020_training_data/content/data/'
processed_folder = '/kaggle/working/preprocessed_images/'

# Create the output folder if it doesn't exist
os.makedirs(processed_folder, exist_ok=True)

# Loop through all H5 files in the data folder
for file_name in os.listdir(data_folder):
    if not file_name.endswith('.h5'):  # Only process .h5 files
        continue
    file_path = os.path.join(data_folder, file_name)

    # Read the image into memory; the source file can be closed straight afterwards
    with h5py.File(file_path, 'r') as h5_file:
        if 'image' not in h5_file:  # Ensure 'image' dataset exists
            continue
        image_data = h5_file['image'][:]

    # Normalize by the maximum value. An all-zero (or empty) array has no usable
    # maximum, so fall back to a divisor of 1 rather than producing NaN/inf.
    peak_value = np.max(image_data) if image_data.size else 0
    image_data_normalized = image_data / (peak_value if peak_value != 0 else 1)

    # Resize to a fixed size (128x128)
    if image_data_normalized.ndim == 2:
        # A single 2D image: resize it directly instead of iterating over its rows
        resized_slices = cv2.resize(image_data_normalized, (128, 128))
    else:
        # Treat the first axis as the slice axis and resize each 2D plane.
        # NOTE(review): names like volume_100_slice_0.h5 suggest each file may already
        # be one (H, W, channels) slice; if so this loop resizes pixel rows rather
        # than slices -- confirm against the shape printed by the inspection cell.
        resized_slices = np.array(
            [cv2.resize(plane, (128, 128)) for plane in image_data_normalized]
        )

    # Add a trailing channel dimension
    resized_slices = np.expand_dims(resized_slices, axis=-1)

    # Save the processed image data into a new H5 file (swap only the extension)
    base_name = os.path.splitext(file_name)[0]
    save_image_path = os.path.join(processed_folder, base_name + '_processed_image.h5')
    with h5py.File(save_image_path, 'w') as save_image_file:
        save_image_file.create_dataset('image', data=resized_slices)

    print(f"Processed and saved: {file_name}")
