Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
245 changes: 236 additions & 9 deletions coderdata/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ def __init__(

Parameters
----------
name : str
The name of the dataset that is stored in the object
name : str, optional
The name of the dataset that is stored in the object, by default None
transcriptomics : pd.DataFrame, optional
Transcriptomics data table of the dataset, by default None
proteomics : pd.DataFrame, optional
Expand Down Expand Up @@ -322,7 +322,43 @@ def split_train_other(
random_state: Optional[Union[int,RandomState]]=None,
**kwargs: dict,
) -> TwoWaySplit:
"""
Split the dataset into training and another subset (e.g., testing or validation).

Parameters
----------
split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional
The type of split to perform, by default 'mixed-set'.
- `mixed-set`: A random split, disregarding drug or cancer associations.
- `drug-blind`: Ensures disjoint splits by drug ID.
- `cancer-blind`: Ensures disjoint splits by sample or cancer association.
ratio : tuple[int, int], optional
The ratio of train to other subset sizes, by default (8, 2).
For instance, (8, 2) translates to an 80%-20% split.
stratify_by : str, optional
The column used for stratification, if stratification is needed, by default None.
balance : bool, optional
Whether to balance the splits (equal representation of classes), by default False.
random_state : int | RandomState | None, optional
A seed for reproducibility of the random split, by default None.
**kwargs : dict
Additional arguments for advanced customization of the split.

Returns
-------
TwoWaySplit
An object containing the train and other subsets as separate datasets.

Notes
-----
This method is a wrapper around the `split_train_other` utility function and
ensures that the split configuration is applied to the dataset (self).

Examples
--------
>>> split = dataset.split_train_other(split_type='cancer-blind', ratio=(7,3))
>>> print(split.train, split.other)
"""
split = split_train_other(
data=self,
split_type=split_type,
Expand All @@ -347,6 +383,47 @@ def split_train_test_validate(
random_state: Optional[Union[int,RandomState]]=None,
**kwargs: dict,
) -> Split:
"""
Split the dataset into training, testing, and validation subsets.

Parameters
----------
split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional
Defines the type of splitting to perform, by default 'mixed-set'.
- `mixed-set`: Data is split randomly, disregarding drug or cancer associations.
- `drug-blind`: Ensures disjoint splits by drug association.
- `cancer-blind`: Ensures disjoint splits by sample or cancer association.
ratio : tuple[int, int, int], optional
Defines the ratio of train, test, and validate sizes, e.g., (8,1,1)
means 80% train, 10% test, 10% validation.
stratify_by : str, optional
Column to use for stratification, if required, by default None.
balance : bool, optional
Whether to balance the splits (equal representation of classes), by default False.
random_state : int | RandomState | None, optional
A random seed for reproducibility, by default None.
**kwargs : dict
Additional arguments for customization of the split logic.

Returns
-------
Split
A Split object containing the training, testing, and validation subsets.

Notes
-----
- This method uses the `split_train_test_validate` utility function internally.
- Ensures disjoint subsets based on the specified splitting criteria, especially
for `drug-blind` and `cancer-blind` splits.
- Includes options for stratifying splits based on a drug response metric.

Examples
--------
>>> split = dataset.split_train_test_validate(
... split_type='drug-blind', ratio=(7,2,1), stratify_by='auc'
... )
>>> print(split.train, split.test, split.validate)
"""
split = split_train_test_validate(
data=self,
split_type=split_type,
Expand All @@ -371,7 +448,46 @@ def train_test_validate(
random_state: Optional[Union[int,RandomState]]=None,
**kwargs: dict,
) -> Split:
"""
Split the dataset into training, testing, and validation subsets.

Parameters
----------
split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional
Defines the type of splitting, by default 'mixed-set'.
- `mixed-set`: Random splitting, disregarding drug or cancer associations.
- `drug-blind`: Ensures disjoint splits based on drug associations.
- `cancer-blind`: Ensures disjoint splits based on cancer or sample associations.
ratio : tuple[int, int, int], optional
The proportion of data for train, test, and validation splits
(e.g., (8,1,1) means 80% train, 10% test, 10% validation), by default (8,1,1).
stratify_by : str, optional
The column used for stratification (e.g., a drug response metric), by default None.
balance : bool, optional
Whether to adjust splits to ensure balanced classes, by default False.
random_state : int | RandomState | None, optional
Random seed for reproducibility, by default None.
**kwargs : dict
Additional arguments for customization, passed to the stratification logic.

Returns
-------
Split
An object containing the training, testing, and validation subsets.

Notes
-----
- This method wraps around the `split_train_test_validate` utility function.
- Useful for creating disjoint and optionally stratified splits of the dataset.
- Supports reproducibility through `random_state`.

Examples
--------
>>> split = dataset.train_test_validate(
... split_type='cancer-blind', ratio=(6,2,2), stratify_by='fit_auc'
... )
>>> print(split.train, split.test, split.validate)
"""
split = split_train_test_validate(
data=self,
split_type=split_type,
Expand All @@ -386,6 +502,14 @@ def train_test_validate(


def types(self) -> list:
"""
Get the data types available in the dataset.

Returns
-------
list
A list of available data types (e.g., 'transcriptomics', 'proteomics').
"""
data_types = [
'transcriptomics',
'proteomics',
Expand All @@ -407,7 +531,18 @@ def types(self) -> list:
return data_types_present

def save(self, path: Path) -> None:
"""
Save the dataset to a file by pickling the whole object.

Parameters
----------
path : Path
The file path where the dataset will be saved.

Returns
-------
None
"""
with open(path, 'wb') as f_path:
pickle.dump(self, file=f_path)

Expand All @@ -422,28 +557,54 @@ def load(
local_path: Union[str,Path]=Path.cwd(),
from_pickle:bool=False
) -> Dataset:

"""
_summary_
Load a dataset from local files.

This function allows loading either from raw data files (e.g., CSV, TSV)
or from a pickled file. The raw data is parsed and indexed into a `Dataset`
object based on predefined types. If pickled data is available, it can be
directly loaded for faster access.

Parameters
----------
name : str
_description_
directory : str | Path, optional
_description_, by default Path.cwd()
The name of the dataset to load (used as a filename prefix).
local_path : str | Path, optional
The local directory where the dataset files are located, by default the current working directory.
from_pickle : bool, optional
If True, attempts to load the dataset from a pickled file, by default False.

Returns
-------
Dataset
_description_
An object containing the loaded dataset with attributes for specific data types like 'transcriptomics',
'proteomics', 'mutations', etc.

Raises
------
OSError
_description_
If the specified directory does not exist.
TypeError
_description_
If the provided path is not a valid path.
FileNotFoundError
If no suitable pickled file is found when `from_pickle=True`.

Notes
-----
- When loading from raw files, supported file formats are `.csv`, `.tsv`, `.csv.gz`, `.tsv.gz`.
- The `genes` dataset is reduced to only those genes that are relevant to the other sub-datasets ('transcriptomics', 'proteomics', etc.).
- When loading from pickle, the function looks for files with extensions `.pkl` or `.pickle`.

Examples
--------
Load a dataset from raw files:
>>> dataset = load(name='my_dataset', local_path='/data/datasets')

Load a dataset from a pickled file:
>>> dataset = load(name='my_dataset', local_path='/data/datasets', from_pickle=True)
"""


data_types_to_load = (
'transcriptomics',
Expand Down Expand Up @@ -563,6 +724,22 @@ def format(
remove_na: bool=False,
**kwargs: dict,
):
"""
Format the dataset according to the specified type.

Parameters
----------
data : Dataset
The dataset whose data should be formatted.
data_type : str
The type of data to format (e.g., 'transcriptomics', 'mutations').
remove_na : bool, optional
Whether to remove rows with missing values, by default False.
**kwargs : dict
Additional arguments for customization.

Returns
-------
Formatted data based on the requested type.
"""

if data_type == "transcriptomics":
if data.transcriptomics is None:
Expand Down Expand Up @@ -759,6 +936,31 @@ def split_train_other(
random_state: Optional[Union[int,RandomState]]=None,
**kwargs: dict,
):

"""
Split the dataset into training and other subsets.

Parameters
----------
data : Dataset
The dataset to split.
split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional
The type of splitting to perform, by default 'mixed-set'.
ratio : tuple[int, int], optional
Ratio of train to other split sizes, by default (8, 2).
stratify_by : str, optional
Column to use for stratification, if any, by default None.
balance : bool, optional
Whether to balance the split data, by default False.
random_state : int | RandomState | None, optional
Random seed for reproducibility, by default None.
**kwargs : dict
Additional arguments for customization.

Returns
-------
TwoWaySplit
The resulting datasets in training and other split.
"""

train, other = _split_two_way(
data=data,
split_type=split_type,
Expand All @@ -785,6 +987,31 @@ def split_train_test_validate(
random_state: Optional[Union[int,RandomState]]=None,
**kwargs: dict,
) -> Split:

"""
Split the dataset into training, testing, and validation subsets.

Parameters
----------
data : Dataset
The dataset to split.
split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional
The type of splitting strategy to use, by default 'mixed-set'.
ratio : tuple[int, int, int], optional
Ratio for train, test, and validation sizes, by default (8,1,1).
stratify_by : str, optional
Column for stratification, if any, by default None.
balance : bool, optional
Whether to balance the splits, by default False.
random_state : int | RandomState | None, optional
Random seed for reproducible splits, by default None.
**kwargs : dict
Additional arguments for customization.

Returns
-------
Split
A Split object with train, test, and validation datasets.
"""

# Type checking split_type
if split_type not in [
'mixed-set', 'drug-blind', 'cancer-blind'
Expand Down
21 changes: 21 additions & 0 deletions coderdata/utils/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,27 @@ def plot_2d_respones_metric(
metric2: str,
**kwargs: dict
) -> None:
"""
Plot a 2D histogram of two response metrics from a dataset.

Parameters
----------
data : cd.Dataset
The dataset containing experiment data.
metric1 : str
The first response metric to plot on the y-axis.
metric2 : str
The second response metric to plot on the x-axis.
**kwargs : dict
Additional keyword arguments for customizing the plot:
- `joint_bins` (int): Number of bins for the joint histogram. Default is 50.
- `marginal_bins` (int): Number of bins for the marginal histograms. Default is 50.

Returns
-------
None
Displays the 2D histogram plot.
"""

data_plot = _prepare_2d_hist_data(
data=data.experiments,
Expand Down