From ceb330ad335a1cd57eddacfed3fc5a44f8fdd3ea Mon Sep 17 00:00:00 2001 From: RubyFore Date: Tue, 23 Sep 2025 15:32:25 -0700 Subject: [PATCH 1/2] adding complete docstrings For inclusion in coderdata docs site --- coderdata/dataset/dataset.py | 89 +++++++++++++++++++++++++++++++++++- coderdata/utils/stats.py | 21 +++++++++ 2 files changed, 108 insertions(+), 2 deletions(-) diff --git a/coderdata/dataset/dataset.py b/coderdata/dataset/dataset.py index a19ce465..b0a2dda5 100644 --- a/coderdata/dataset/dataset.py +++ b/coderdata/dataset/dataset.py @@ -64,8 +64,8 @@ def __init__( Parameters ---------- - name : str - The name of the dataset that is stored in the object + name : str, optional + The name of the dataset that is stored in the object, by default None transcriptomics : pd.DataFrame, optional _description_, by default None proteomics : pd.DataFrame, optional @@ -386,6 +386,14 @@ def train_test_validate( def types(self) -> list: + """ + Get the data types available in the dataset. + + Returns + ------- + list + A list of available data types (e.g., 'transcriptomics', 'proteomics'). + """ data_types = [ 'transcriptomics', 'proteomics', @@ -407,7 +415,18 @@ def types(self) -> list: return data_types_present def save(self, path: Path) -> None: + """ + Save the dataset to a file. + Parameters + ---------- + path : Path + The file path where the dataset will be saved. + + Returns + ------- + None + """ with open(path, 'wb') as f_path: pickle.dump(self, file=f_path) @@ -563,6 +582,22 @@ def format( remove_na: bool=False, **kwargs: dict, ): + """ + Format the dataset according to the specified type. + + Parameters + ---------- + data_type : str + The type of data to format (e.g., 'transcriptomics', 'mutations'). + remove_na : bool, optional + Whether to remove rows with missing values, by default False. + **kwargs : dict + Additional arguments for customization. + + Returns + ------- + Formatted data based on the requested type. 
+ """ if data_type == "transcriptomics": if data.transcriptomics is None: @@ -759,6 +794,33 @@ def split_train_other( random_state: Optional[Union[int,RandomState]]=None, **kwargs: dict, ): + + """ + Split the dataset into training and other subsets. + + Parameters + ---------- + data : Dataset + The dataset to split. + split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional + The type of splitting to perform, by default 'mixed-set'. + ratio : tuple[int, int], optional + Ratio of train to other split sizes, by default (8, 2). + stratify_by : str, optional + Column to use for stratification, if any, by default None. + balance : bool, optional + Whether to balance the split data, by default False. + random_state : int | RandomState | None, optional + Random seed for reproducibility, by default None. + **kwargs : dict + Additional arguments for customization. + + Returns + ------- + TwoWaySplit + The resulting datasets in training and other split. + """ + train, other = _split_two_way( data=data, split_type=split_type, @@ -785,6 +847,33 @@ def split_train_test_validate( random_state: Optional[Union[int,RandomState]]=None, **kwargs: dict, ) -> Split: + + """ + Split the dataset into training, testing, and validation subsets. + + Parameters + ---------- + data : Dataset + The dataset to split. + split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional + The type of splitting strategy to use, by default 'mixed-set'. + ratio : tuple[int, int, int], optional + Ratio for train, test, and validation sizes, by default (8,1,1). + stratify_by : str, optional + Column for stratification, if any, by default None. + balance : bool, optional + Whether to balance the splits, by default False. + random_state : int | RandomState | None, optional + Random seed for reproducible splits, by default None. + **kwargs : dict + Additional arguments for customization. + + Returns + ------- + Split + A Split object with train, test, and validation datasets. 
+ """ + # Type checking split_type if split_type not in [ 'mixed-set', 'drug-blind', 'cancer-blind' diff --git a/coderdata/utils/stats.py b/coderdata/utils/stats.py index 42f0c550..82781ea7 100644 --- a/coderdata/utils/stats.py +++ b/coderdata/utils/stats.py @@ -22,6 +22,27 @@ def plot_2d_respones_metric( metric2: str, **kwargs: dict ) -> None: + """ + Plot a 2D histogram of two response metrics from a dataset. + + Parameters + ---------- + data : cd.Dataset + The dataset containing experiment data. + metric1 : str + The first response metric to plot on the y-axis. + metric2 : str + The second response metric to plot on the x-axis. + **kwargs : dict + Additional keyword arguments for customizing the plot: + - `joint_bins` (int): Number of bins for the joint histogram. Default is 50. + - `marginal_bins` (int): Number of bins for the marginal histograms. Default is 50. + + Returns + ------- + None + Displays the 2D histogram plot. + data_plot = _prepare_2d_hist_data( data=data.experiments, From ff1f26dc68e5014cca30d35157ff3cdbfc4ba731 Mon Sep 17 00:00:00 2001 From: RubyFore Date: Wed, 24 Sep 2025 12:49:14 -0700 Subject: [PATCH 2/2] more docstrings and typo fix A few more changes to documentation for coderdata package --- coderdata/dataset/dataset.py | 156 +++++++++++++++++++++++++++++++++-- coderdata/utils/stats.py | 2 +- 2 files changed, 150 insertions(+), 8 deletions(-) diff --git a/coderdata/dataset/dataset.py b/coderdata/dataset/dataset.py index b0a2dda5..39c2a5ba 100644 --- a/coderdata/dataset/dataset.py +++ b/coderdata/dataset/dataset.py @@ -322,7 +322,43 @@ def split_train_other( random_state: Optional[Union[int,RandomState]]=None, **kwargs: dict, ) -> TwoWaySplit: + """ + Split the dataset into training and another subset (e.g., testing or validation). + Parameters + ---------- + split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional + The type of split to perform, by default 'mixed-set'. 
+ - `mixed-set`: A random split, disregarding drug or cancer associations. + - `drug-blind`: Ensures disjoint splits by drug ID. + - `cancer-blind`: Ensures disjoint splits by sample or cancer association. + ratio : tuple[int, int], optional + The ratio of train to other subset sizes, by default (8, 2). + For instance, (8, 2) translates to an 80%-20% split. + stratify_by : str, optional + The column used for stratification, if stratification is needed, by default None. + balance : bool, optional + Whether to adjust to balanced splits (equal representation of classes), by default False. + random_state : int | RandomState | None, optional + A seed for reproducibility of the random split, by default None. + **kwargs : dict + Additional arguments for advanced customization of the split. + + Returns + ------- + TwoWaySplit + An object containing the train and other subsets as separate datasets. + + Notes + ----- + This method is a wrapper around the `split_train_other` utility function and + ensures that the split configuration is applied to the dataset (self). + + Examples + -------- + >>> split = dataset.split_train_other(split_type='cancer-blind', ratio=(7,3)) + >>> print(split.train, split.other) + """ split = split_train_other( data=self, split_type=split_type, @@ -347,6 +383,47 @@ def split_train_test_validate( random_state: Optional[Union[int,RandomState]]=None, **kwargs: dict, ) -> Split: + """ + Split the dataset into training, testing, and validation subsets. + + Parameters + ---------- + split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional + Defines the type of splitting to perform, by default 'mixed-set'. + - `mixed-set`: Data is split randomly, disregarding drug or cancer associations. + - `drug-blind`: Ensures disjoint splits by drug association. + - `cancer-blind`: Ensures disjoint splits by sample or cancer association. 
+ ratio : tuple[int, int, int], optional + Defines the ratio of train, test, and validate sizes, e.g., (8,1,1) + means 80% train, 10% test, 10% validation. + stratify_by : str, optional + Column to use for stratification, if required, by default None. + balance : bool, optional + Whether to balance the splits (equal representation of classes), by default False. + random_state : int | RandomState | None, optional + A random seed for reproducibility, by default None. + **kwargs : dict + Additional arguments for customization of the split logic. + + Returns + ------- + Split + A Split object containing the training, testing, and validation subsets. + + Notes + ----- + - This method uses the `split_train_test_validate` utility function internally. + - Ensures disjoint subsets based on the specified splitting criteria, especially + for `drug-blind` and `cancer-blind` splits. + - Includes options for stratifying splits based on a drug response metric. + + Examples + -------- + >>> split = dataset.split_train_test_validate( + ... split_type='drug-blind', ratio=(7,2,1), stratify_by='auc' + ... ) + >>> print(split.train, split.test, split.validate) + """ split = split_train_test_validate( data=self, split_type=split_type, @@ -371,7 +448,46 @@ def train_test_validate( random_state: Optional[Union[int,RandomState]]=None, **kwargs: dict, ) -> Split: + """ + Split the dataset into training, testing, and validation subsets. + Parameters + ---------- + split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional + Defines the type of splitting, by default 'mixed-set'. + - `mixed-set`: Random splitting, disregarding drug or cancer associations. + - `drug-blind`: Ensures disjoint splits based on drug associations. + - `cancer-blind`: Ensures disjoint splits based on cancer or sample associations. 
+ ratio : tuple[int, int, int], optional + The proportion of data for train, test, and validation splits + (e.g., (8,1,1) means 80% train, 10% test, 10% validation), by default (8,1,1). + stratify_by : str, optional + The column used for stratification (e.g., a drug response metric), by default None. + balance : bool, optional + Whether to adjust splits to ensure balanced classes, by default False. + random_state : int | RandomState | None, optional + Random seed for reproducibility, by default None. + **kwargs : dict + Additional arguments for customization, passed to the stratification logic. + + Returns + ------- + Split + An object containing the training, testing, and validation subsets. + + Notes + ----- + - This method wraps around the `split_train_test_validate` utility function. + - Useful for creating disjoint and optionally stratified splits of the dataset. + - Supports reproducibility through `random_state`. + + Examples + -------- + >>> split = dataset.train_test_validate( + ... split_type='cancer-blind', ratio=(6,2,2), stratify_by='fit_auc' + ... ) + >>> print(split.train, split.test, split.validate) + """ split = split_train_test_validate( data=self, split_type=split_type, @@ -441,28 +557,54 @@ def load( local_path: Union[str,Path]=Path.cwd(), from_pickle:bool=False ) -> Dataset: + """ - _summary_ + Load a dataset from local files. + + This function allows loading either from raw data files (e.g., CSV, TSV) + or from a pickled file. The raw data is parsed and indexed into a `Dataset` + object based on predefined types. If pickled data is available, it can be + directly loaded for faster access. Parameters ---------- name : str - _description_ - directory : str | Path, optional - _description_, by default Path.cwd() + The name of the dataset to load (used as a filename prefix). + local_path : str | Path, optional + The local directory where the dataset files are located, by default the current working directory. 
+ from_pickle : bool, optional + If True, attempts to load the dataset from a pickled file, by default False. Returns ------- Dataset - _description_ + An object containing the loaded dataset with attributes for specific data types like 'transcriptomics', + 'proteomics', 'mutations', etc. Raises ------ OSError - _description_ + If the specified directory does not exist. TypeError - _description_ + If the provided path is not a valid path. + FileNotFoundError + If no suitable pickled file is found when `from_pickle=True`. + + Notes + ----- + - When loading from raw files, supported file formats are `.csv`, `.tsv`, `.csv.gz`, `.tsv.gz`. + - The `genes` dataset is subsetted to include only genes relevant to other subdatasets ('transcriptomics', 'proteomics', etc.). + - When loading from pickle, the function looks for files with extensions `.pkl` or `.pickle`. + + Examples + -------- + Load a dataset from raw files: + >>> dataset = load(name='my_dataset', local_path='/data/datasets') + + Load a dataset from a pickled file: + >>> dataset = load(name='my_dataset', local_path='/data/datasets', from_pickle=True) """ + data_types_to_load = ( 'transcriptomics', diff --git a/coderdata/utils/stats.py b/coderdata/utils/stats.py index 82781ea7..74f5886b 100644 --- a/coderdata/utils/stats.py +++ b/coderdata/utils/stats.py @@ -42,7 +42,7 @@ def plot_2d_respones_metric( ------- None Displays the 2D histogram plot. - + """ data_plot = _prepare_2d_hist_data( data=data.experiments,