In [1]:
import scanpy as sc
# Read the full PsychAD .h5ad file into an AnnData object bound to `dt`.
# NOTE(review): the path is absolute and machine-specific; consider a
# configurable data directory so the notebook runs on other machines.
dt = sc.read_h5ad("/Users/kostisfortounas/Downloads/PsychAD_r0_Dec_28_2022.h5ad")
# Bare last expression: the notebook renders the object's summary as cell output.
dt

AnnData object with n_obs × n_vars = 1406788 × 34890
    obs: 'SubID', 'Channel', 'Brain_bank', 'round_num', 'poolID', 'Sex', 'Age', 'Dx_AD', 'Ethnicity', 'pH', 'PMI', 'class', 'subclass', 'subtype'
    obsm: 'X_pca', 'X_pca_regressed_harmony', 'X_umap'

In [3]:
import scanpy as sc
import anndata as ad
import numpy as np

def random_reduce_h5ad(input_file, output_file, max_obs, max_vars, seed=None):
    """
    Randomly reduce an H5ad matrix to at most ``max_obs`` observations and
    ``max_vars`` variables, then write the result to ``output_file``.

    Args:
        input_file: Path to the input H5ad file.
        output_file: Path to the output H5ad file.
        max_obs: Maximum number of observations to keep. If it is greater
            than or equal to the number available, all observations are kept.
        max_vars: Maximum number of variables to keep. If it is greater than
            or equal to the number available, all variables are kept.
        seed: Optional seed passed to ``np.random.default_rng``. Supplying
            the same seed selects the same observations and variables again;
            ``None`` (the default) draws a fresh random sample.

    Raises:
        ValueError: If ``max_obs`` or ``max_vars`` is negative.
    """
    # Validate before reading so a bad argument does not cost a full file load.
    if max_obs < 0 or max_vars < 0:
        raise ValueError("max_obs and max_vars must be non-negative.")

    adata = ad.read_h5ad(input_file)
    n_obs, n_vars = adata.shape
    rng = np.random.default_rng(seed)

    # Only subsample an axis when it actually exceeds its limit; this treats
    # the limits as maximums (no error when they exceed the data) and avoids
    # reshuffling an axis that is kept in full.
    if max_obs < n_obs:
        # Sorted indices keep the selected observations in their original order.
        obs_idx = np.sort(rng.choice(n_obs, size=max_obs, replace=False))
        adata = adata[obs_idx]

    if max_vars < n_vars:
        # Sorted indices keep the selected variables in their original order.
        var_idx = np.sort(rng.choice(n_vars, size=max_vars, replace=False))
        adata = adata[:, var_idx]

    adata.write(output_file)

# Example usage: draw a random sample of 5,000 observations from the full
# dataset and save it as a smaller H5ad file.
input_file = "/Users/kostisfortounas/Downloads/PsychAD_r0_Dec_28_2022.h5ad"
output_file = "reduced_data.h5ad"
max_obs = 5_000
max_vars = 34_890

random_reduce_h5ad(
    input_file=input_file,
    output_file=output_file,
    max_obs=max_obs,
    max_vars=max_vars,
)

In [4]:
# Load the reduced file back to check its dimensions.
# The result is bound to its own name: assigning it to `random_reduce_h5ad`
# would overwrite the function of that name and make it uncallable for the
# rest of the kernel session.
reduced_adata = sc.read_h5ad("reduced_data.h5ad")
# Bare last expression: the notebook renders the object's summary as cell output.
reduced_adata

AnnData object with n_obs × n_vars = 5000 × 34890
    obs: 'SubID', 'Channel', 'Brain_bank', 'round_num', 'poolID', 'Sex', 'Age', 'Dx_AD', 'Ethnicity', 'pH', 'PMI', 'class', 'subclass', 'subtype'
    obsm: 'X_pca', 'X_pca_regressed_harmony', 'X_umap'

In [None]:
import anndata as ad

def remove_subset_elements(original_adata, subset_adata):
    """
    Drop from the original data every observation whose name also occurs in the subset.

    Matching is done on ``obs_names``. Only observations are filtered; the
    variables of ``original_adata`` are not touched.

    Args:
        original_adata: The original AnnData object.
        subset_adata: The AnnData object whose observations should be excluded.

    Returns:
        The result of indexing ``original_adata`` with a boolean mask that is
        True for observations absent from ``subset_adata``.
    """
    # True where an observation of the original also appears in the subset.
    in_subset = original_adata.obs_names.isin(subset_adata.obs_names)

    # Invert the membership mask so that only non-subset observations remain.
    keep_mask = ~in_subset
    return original_adata[keep_mask]

# Example usage: load the full dataset and the previously saved sample, drop
# the sampled observations from the full dataset, and save what is left.
original_data = ad.read_h5ad("/Users/kostisfortounas/Downloads/PsychAD_r0_Dec_28_2022.h5ad")
subset_data = ad.read_h5ad("reduced_data.h5ad")

modified_original_data = remove_subset_elements(
    original_adata=original_data,
    subset_adata=subset_data,
)
modified_original_data.write("Original1")

In [1]:
import scanpy as sc
import anndata as ad
import numpy as np

def create_and_save_subsets_unique_obs(input_file, output_prefix, num_subsets, subset_size, max_vars, seed=None):
    """
    Creates and saves multiple subsets with unique observations from an H5ad dataset.

    No observation is written to more than one subset. Each subset is saved
    as ``{output_prefix}_subset_{i}.h5ad`` for ``i`` in ``range(num_subsets)``.

    Args:
        input_file: Path to the input H5ad file.
        output_prefix: Prefix for the output H5ad files.
        num_subsets: Number of subsets to create.
        subset_size: Number of observations per subset.
        max_vars: Maximum number of variables (if you want to limit). The
            first ``max_vars`` variables are kept; values above the number of
            variables in the file keep all of them.
        seed: Optional seed passed to ``np.random.default_rng``. Supplying
            the same seed reproduces the same subsets; ``None`` (the default)
            draws a fresh random partition.

    Returns:
        None

    Raises:
        ValueError: If ``subset_size`` is negative, or if the dataset does not
            contain ``num_subsets * subset_size`` observations. The check runs
            before any file is written, so a failure leaves no partial output.
    """
    if subset_size < 0:
        raise ValueError("subset_size must be non-negative.")

    adata = ad.read_h5ad(input_file)
    n_obs, n_vars = adata.shape

    # Ensure max_vars is not greater than the actual number of variables
    max_vars = min(max_vars, n_vars)

    # Fail before writing anything rather than part-way through the loop.
    if num_subsets * subset_size > n_obs:
        raise ValueError("Not enough remaining observations for subset creation.")

    # One shuffle of integer positions; consecutive slices of it are disjoint
    # by construction. Working with positions instead of observation names
    # avoids rebuilding a name list per subset and does not depend on
    # observation names being unique.
    rng = np.random.default_rng(seed)
    shuffled_positions = rng.permutation(n_obs)

    for i in range(num_subsets):
        start = i * subset_size
        # Sorted so each subset keeps its observations in their original order.
        obs_idx = np.sort(shuffled_positions[start:start + subset_size])

        # Create subset
        subset_adata = adata[obs_idx, :max_vars]

        # Save subset to file
        output_file = f"{output_prefix}_subset_{i}.h5ad"
        subset_adata.write(output_file)

# Example usage: write ten subsets of 45,000 observations each, with no
# observation shared between subsets.
input_file = "/Users/kostisfortounas/Downloads/PsychAD_r0_Dec_28_2022.h5ad"
output_prefix = "subset"
num_subsets = 10
subset_size = 45_000
max_vars = 34_890

create_and_save_subsets_unique_obs(
    input_file=input_file,
    output_prefix=output_prefix,
    num_subsets=num_subsets,
    subset_size=subset_size,
    max_vars=max_vars,
)

AnnData object with n_obs × n_vars = 45000 × 34890
    obs: 'SubID', 'Channel', 'Brain_bank', 'round_num', 'poolID', 'Sex', 'Age', 'Dx_AD', 'Ethnicity', 'pH', 'PMI', 'class', 'subclass', 'subtype'
    obsm: 'X_pca', 'X_pca_regressed_harmony', 'X_umap'