# Import configuration and setup

In [1]:
from medulloblastoma.config import PROJ_ROOT, RAW_DATA_DIR, INTERIM_DATA_DIR, PROCESSED_DATA_DIR, EXTERNAL_DATA_DIR, MODELS_DIR, REPORTS_DIR, FIGURES_DIR
import os
import numpy as np
import pandas as pd
os.chdir(PROJ_ROOT)
from medulloblastoma.dataset import download_data, prepare_data
from medulloblastoma.features import main as preprocess_pipeline
from medulloblastoma.features import load_data
# Execute R script for GSE85217 data download
# ! Rscript {os.path.join(PROJ_ROOT, 'medulloblastoma','get_data.R')}

[32m2025-12-12 11:38:08.753[0m | [1mINFO    [0m | [36mmedulloblastoma.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /mnt/d/Repos/bitsxlamarato-medulloblastoma[0m
2025-12-12 11:38:15.615482: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-12-12 11:38:15.621734: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-12-12 11:38:15.633517: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765535895.653792  912282 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765535895.659408  912282 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plug

# Prepare Data

In [2]:
# Download microarray gene expression data
download_data(save_path=RAW_DATA_DIR,remove_gz=True)

[32m2025-12-12 11:38:20.158[0m | [1mINFO    [0m | [36mmedulloblastoma.dataset[0m:[36mdownload_data[0m:[36m77[0m - [1mDownloading GSE85217 dataset from GEO...[0m
Downloading: 100.0%
File downloaded successfully: /mnt/d/Repos/bitsxlamarato-medulloblastoma/data/raw/GSE85217_M_exp_763_MB_SubtypeStudy_TaylorLab.txt.gz
[32m2025-12-12 11:38:43.395[0m | [1mINFO    [0m | [36mmedulloblastoma.dataset[0m:[36mdownload_data[0m:[36m96[0m - [1mExtracting compressed file...[0m
[32m2025-12-12 11:38:51.770[0m | [32m[1mSUCCESS [0m | [36mmedulloblastoma.dataset[0m:[36mdownload_data[0m:[36m101[0m - [32m[1mFile extracted to: /mnt/d/Repos/bitsxlamarato-medulloblastoma/data/raw/GSE85217_M_exp_763_MB_SubtypeStudy_TaylorLab.txt[0m
[32m2025-12-12 11:38:51.790[0m | [1mINFO    [0m | [36mmedulloblastoma.dataset[0m:[36mdownload_data[0m:[36m106[0m - [1mRemoved compressed .gz file[0m


'/mnt/d/Repos/bitsxlamarato-medulloblastoma/data/raw/GSE85217_M_exp_763_MB_SubtypeStudy_TaylorLab.txt'

In [3]:
# Structure data so that it is easier to handle
prepare_data(
    expression_file=os.path.join(RAW_DATA_DIR,'GSE85217_M_exp_763_MB_SubtypeStudy_TaylorLab.txt'),
    metadata_path = os.path.join(RAW_DATA_DIR,'GSE85217_metadata.csv'),
    save_path=RAW_DATA_DIR
)

[32m2025-12-12 11:38:51.803[0m | [1mINFO    [0m | [36mmedulloblastoma.dataset[0m:[36mprepare_data[0m:[36m178[0m - [1mLoading gene expression data from /mnt/d/Repos/bitsxlamarato-medulloblastoma/data/raw/GSE85217_M_exp_763_MB_SubtypeStudy_TaylorLab.txt[0m
[32m2025-12-12 11:38:56.555[0m | [1mINFO    [0m | [36mmedulloblastoma.dataset[0m:[36mprepare_data[0m:[36m183[0m - [1mLoaded expression data with shape: (21641, 767)[0m
[32m2025-12-12 11:38:56.557[0m | [1mINFO    [0m | [36mmedulloblastoma.dataset[0m:[36mprepare_data[0m:[36m188[0m - [1mExtracting gene correspondence information...[0m
[32m2025-12-12 11:38:56.631[0m | [1mINFO    [0m | [36mmedulloblastoma.dataset[0m:[36mprepare_data[0m:[36m192[0m - [1mExpression data after removing gene info: (21641, 763)[0m
[32m2025-12-12 11:38:56.631[0m | [1mINFO    [0m | [36mmedulloblastoma.dataset[0m:[36mprepare_data[0m:[36m195[0m - [1mLoading metadata from /mnt/d/Repos/bitsxlamarato-medulloblas

('/mnt/d/Repos/bitsxlamarato-medulloblastoma/data/raw/cavalli.csv',
 '/mnt/d/Repos/bitsxlamarato-medulloblastoma/data/raw/cavalli_subgroups.csv')

# Preprocessing

Selecting genes that are lowly expressed, lowly variant, and outlier genes. We also check that there are no missing data.

In [4]:
preprocess_pipeline(
    data_path=os.path.join(RAW_DATA_DIR,'cavalli.csv'),
    metadata_path=os.path.join(RAW_DATA_DIR,'cavalli_subgroups.csv'),
    save_path=PROCESSED_DATA_DIR,
    per=0.2,
    cutoff=0.1,
    alpha=0.05
)

Path to save data: /mnt/d/Repos/bitsxlamarato-medulloblastoma/data/processed
[32m2025-12-12 11:39:25.116[0m | [1mINFO    [0m | [36mmedulloblastoma.features[0m:[36mload_data[0m:[36m84[0m - [1mLoading data from /mnt/d/Repos/bitsxlamarato-medulloblastoma/data/raw/cavalli.csv[0m
[32m2025-12-12 11:39:29.511[0m | [1mINFO    [0m | [36mmedulloblastoma.features[0m:[36mload_data[0m:[36m87[0m - [1mLoading metadata from /mnt/d/Repos/bitsxlamarato-medulloblastoma/data/raw/cavalli_subgroups.csv[0m
[32m2025-12-12 11:39:29.518[0m | [1mINFO    [0m | [36mmedulloblastoma.features[0m:[36mload_data[0m:[36m91[0m - [1mInitial data shape: (21641, 763), metadata shape: (763,)[0m
[32m2025-12-12 11:39:29.518[0m | [1mINFO    [0m | [36mmedulloblastoma.features[0m:[36mload_data[0m:[36m94[0m - [1mTransposing data to match metadata dimensions[0m
[32m2025-12-12 11:39:29.518[0m | [32m[1mSUCCESS [0m | [36mmedulloblastoma.features[0m:[36mload_data[0m:[36m106[0m -

<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

In [5]:
# Load preprocessed gene expression data
data=pd.read_csv(os.path.join(PROCESSED_DATA_DIR,'cavalli_maha.csv'),index_col=0)

In [6]:
# Load metadata
metadata=pd.read_csv(os.path.join(RAW_DATA_DIR,'cavalli_subgroups.csv'),index_col=0).squeeze()
# Adapt names of groups of interests so they are shorter
metadata=metadata.map({'Group3':'G3','Group4':'G4'})
metadata.name = 'Subgroup'
# Select groups of interest in metedata
metadata_g3g4=metadata[metadata.isin(['G3','G4'])]
print(metadata.shape,metadata_g3g4.shape)

(763,) (470,)


In [7]:
# Select groups of interest in gene expression data
data_g3g4 = data.loc[metadata_g3g4.index]
print(data_g3g4.shape)

(470, 14347)


In [8]:
metadata.value_counts()

Subgroup
G4    326
G3    144
Name: count, dtype: int64

In [9]:
metadata_g3g4.value_counts()

Subgroup
G4    326
G3    144
Name: count, dtype: int64

In [10]:
# # Load final datasets
# data,metadata=load_data(
#     data_path=os.path.join(PROCESSED_DATA_DIR,'cavalli_maha.csv'),
#     metadata_path=os.path.join(PROCESSED_DATA_DIR,'g3g4_maha.csv')
# )
# print(data.shape,metadata.shape)

# UMAP Visualization of Preprocessed Data

In [11]:
from medulloblastoma.plots import plot_umap_binary

# Discrete color mapping for G3/G4 subtypes
dict_medulloblastoma = {
    'G3': 'red',  # Red for G3
    'G4': 'blue'   # Blue for G4
}

# Generate UMAP with discrete subtype coloring
plot_umap_binary(
    data=data_g3g4,
    clinical=metadata_g3g4,
    colors_dict=dict_medulloblastoma,
    n_components=2,
    save_fig=True,
    save_as="initial_medulloblastoma_umap",
    seed=2023,
    title="Medulloblastoma G3/G4 Gene Expression UMAP",
    marker_size=20
)

X_umap.shape (470, 2)
color_series.shape (470,)
len(all_patients) 470
color_series.loc[all_patients].values.shape (470,)
Saved UMAP plotly figure to: initial_medulloblastoma_umap.png
Saved UMAP plotly figure to: initial_medulloblastoma_umap.pdf
Saved UMAP plotly figure to: initial_medulloblastoma_umap.svg


# Model Training and Reconstruction

In [12]:
# BLANK SECTION FOR MODEL TRAINING
# This section will be implemented during the hackathon
# Plan for:
# - CVAE architecture definition
# - Training loop with G3/G4 labels
# - Hyperparameter optimization
# - Model evaluation and validation
# Clue: for architecture and hyperparameter optimization, you can use ax (see https://ax.dev/)

# Final UMAP with Continuous Scoring

In [13]:
from medulloblastoma.plots import plot_umap_spectrum

In [14]:
# For testing purposes, we assign a random score between 0 and 1 to each patient
score=pd.Series(np.random.rand(470),name='score',index=metadata_g3g4.index)

In [15]:
plot_umap_spectrum(
    data=data_g3g4,
    clinical=score,  # Continuous scores instead of discrete labels
    colormap='RdBu',
    n_components=2,
    save_fig=True,
    save_as="final_scored_medulloblastoma_umap",
    seed=2023,
    title="Medulloblastoma G3/G4 Scores",
    marker_size=20
)

X_umap.shape (470, 2)
color_series.shape (470,)
color_series range: [0.001, 1.000]
len(all_patients) 470
Saved UMAP spectrum figure to: final_scored_medulloblastoma_umap.png
Saved UMAP spectrum figure to: final_scored_medulloblastoma_umap.pdf
Saved UMAP spectrum figure to: final_scored_medulloblastoma_umap.svg
