From ee973b37ef92272968bd85fe1e4202ba97c4236d Mon Sep 17 00:00:00 2001 From: Yannick Mahlich Date: Thu, 30 Jan 2025 14:31:02 -0800 Subject: [PATCH 1/3] fixed bug where 'split_class' wouldn't be removed from `dataset.experiments` if mix-set splits were generated with stratification --- coderdata/dataset/dataset.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/coderdata/dataset/dataset.py b/coderdata/dataset/dataset.py index c00a8661..bf2cebf3 100644 --- a/coderdata/dataset/dataset.py +++ b/coderdata/dataset/dataset.py @@ -936,13 +936,16 @@ def train_test_validate( sss_1.split(X=df_full, y=df_full['split_class']) ) df_train = df_full.iloc[idx_train] + df_train = df_train.drop(labels=['split_class'], axis=1) df_other = df_full.iloc[idx_other] # Splitting 'other' further into test and validate idx_test, idx_val = next( sss_2.split(X=df_other, y=df_other['split_class']) ) df_test = df_other.iloc[idx_test] + df_test = df_test.drop(labels=['split_class'], axis=1) df_val = df_other.iloc[idx_val] + df_val = df_val.drop(labels=['split_class'], axis=1) # using StratifiedGroupKSplit for the stratified drug-/sample- # blind splits. 
From c3c15c7e5587459f702682817dbfb334c34b2fb2 Mon Sep 17 00:00:00 2001 From: Yannick Mahlich Date: Thu, 30 Jan 2025 14:36:41 -0800 Subject: [PATCH 2/3] fixed issue where genes.tsv would not be downloaded if only one specific dataset was retrieved --- coderdata/download/downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coderdata/download/downloader.py b/coderdata/download/downloader.py index a513963a..88587806 100644 --- a/coderdata/download/downloader.py +++ b/coderdata/download/downloader.py @@ -68,7 +68,7 @@ def download( file for file in data['files'] - if file['name'].startswith(name) + if file['name'].startswith(name) or 'genes' in file['name'] ] else: filtered_files = data['files'] From 2107ff624f5c5979b8fa00d01319a3fdeea99f4e Mon Sep 17 00:00:00 2001 From: Yannick Mahlich Date: Thu, 30 Jan 2025 16:04:04 -0800 Subject: [PATCH 3/3] changed orientation of wide format tables returned for omic types 'transcriptomics', 'mutations', 'copy_number' & 'proteomics' such that rows are 'improve_sample_id' and columns are 'entrez_id' --- coderdata/dataset/dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/coderdata/dataset/dataset.py b/coderdata/dataset/dataset.py index bf2cebf3..e3ca6377 100644 --- a/coderdata/dataset/dataset.py +++ b/coderdata/dataset/dataset.py @@ -494,7 +494,7 @@ def format( values='transcriptomics', index='entrez_id', columns='improve_sample_id' - ) + ).transpose() elif data_type == "mutations": if data.mutations is None: @@ -516,7 +516,7 @@ def format( columns='improve_sample_id', values='exists', fill_value=0, - ) + ).transpose() elif data_type == "copy_number": if data.copy_number is None: @@ -531,7 +531,7 @@ def format( columns='improve_sample_id', values='copy_number', aggfunc='mean', - ) + ).transpose() if copy_call: ret = ret.apply( pd.cut, @@ -552,7 +552,7 @@ def format( values='proteomics', index='entrez_id', columns='improve_sample_id' - ) + ).transpose() elif data_type == "experiments": if 
data.experiments is None: