In [13]:
import os
from scipy.io import mmread
import pandas as pd
import anndata
import gc
import gzip
import shutil

In [7]:
# Directory holding the reference ATAC export. The cells that follow read the
# counts matrix, feature list and barcode list from here.
base_dir = "/nfs/lab/projects/mega_heart/Upload/reference/"


In [8]:
    # Build the reference ATAC AnnData object from a Matrix Market export.
    # Inputs (all under base_dir): the counts matrix, the feature names and
    # the cell barcodes.
    matrix_path = os.path.join(base_dir, "ATAC_counts.mtx")
    features_path = os.path.join(base_dir, "ATAC_features.tsv")
    barcodes_path = os.path.join(base_dir, "Multiome_Barcodes.tsv")

    # Load the sparse matrix and convert it to CSR. On disk the matrix is
    # features x cells; it is transposed when the AnnData object is built.
    matrix = mmread(matrix_path).tocsr()

    # Load features and barcodes. Each file is assumed to have one entry per
    # line without a header. `squeeze=True` was removed from `read_csv` in
    # pandas 2.0, so the first column is selected explicitly instead.
    features = pd.read_csv(features_path, sep="\t", header=None).iloc[:, 0].tolist()
    barcodes = pd.read_csv(barcodes_path, sep="\t", header=None).iloc[:, 0].tolist()

    # Fail early, with a readable message, if the three files do not describe
    # the same matrix (AnnData would otherwise raise a less specific error).
    if matrix.shape != (len(features), len(barcodes)):
        raise ValueError(
            f"Matrix shape {matrix.shape} does not match "
            f"{len(features)} features x {len(barcodes)} barcodes"
        )

    # Create the AnnData object: barcodes index the observations (rows) and
    # features index the variables (columns).
    adata = anndata.AnnData(
        X=matrix.T,                         # rows are cells, columns are features
        obs=pd.DataFrame(index=barcodes),   # cells
        var=pd.DataFrame(index=features),   # features
    )

In [11]:
# Destination of the reference AnnData object (.h5ad file inside output_dir).
output_dir = "/nfs/lab/projects/mega_heart/Upload/reference"
h5ad_path = os.path.join(output_dir, "ATAC_FNIH_Multiome.h5ad")

# Create the destination folder if it is missing, then write the object.
os.makedirs(output_dir, exist_ok=True)
adata.write_h5ad(h5ad_path)

In [14]:
# Source .h5ad file and the gzip archive that will be written next to it.
input_path = "/nfs/lab/projects/mega_heart/Upload/reference/ATAC_FNIH_Multiome.h5ad"
output_path = "/nfs/lab/projects/mega_heart/Upload/reference/ATAC_FNIH_Multiome.h5ad.gz"

# Stream the file through gzip so it is never loaded into memory as a whole.
with open(input_path, "rb") as f_in, gzip.open(output_path, "wb") as f_out:
    shutil.copyfileobj(f_in, f_out)

print(f"Compressed file saved to: {output_path}")

Compressed file saved to: /nfs/lab/projects/mega_heart/Upload/reference/ATAC_FNIH_Multiome.h5ad.gz


In [None]:
## CAREHF: merge the per-sample ATAC count matrices into a single AnnData object

In [None]:
import os
from scipy.io import mmread
import pandas as pd
import anndata

# Base directory containing sample folders (one sub-folder per sample).
# NOTE: this rebinds `base_dir`, which the reference section above also uses.
base_dir = "/nfs/lab/projects/mega_heart/CAREHF/multiome/Analysys/4_Doublet_cleanup/MM_counts/"


def _read_single_column(path):
    """Return the first column of a headerless, tab-separated file as a list.

    `squeeze=True` was removed from `pandas.read_csv` in pandas 2.0, so the
    first column is selected explicitly instead.
    """
    return pd.read_csv(path, sep="\t", header=None).iloc[:, 0].tolist()


def _load_sample_adata(sample_path, sample):
    """Build a cells x features AnnData object for one sample folder.

    Parameters
    ----------
    sample_path : str
        Folder containing "ATAC_counts.mtx", "ATAC_features.tsv" and
        "Barcodes.tsv".
    sample : str
        Sample name, stored in the ``sample`` column of ``obs``.

    Returns
    -------
    anndata.AnnData
        Observations are indexed by barcode, variables by feature.

    Raises
    ------
    ValueError
        If the matrix shape does not match the features/barcodes lengths.
    """
    # Load the sparse counts matrix (features x cells on disk) as CSR.
    matrix = mmread(os.path.join(sample_path, "ATAC_counts.mtx")).tocsr()

    # Each file is assumed to have one entry per line.
    features = _read_single_column(os.path.join(sample_path, "ATAC_features.tsv"))
    barcodes = _read_single_column(os.path.join(sample_path, "Barcodes.tsv"))

    if matrix.shape != (len(features), len(barcodes)):
        raise ValueError(
            f"Sample {sample}: matrix shape {matrix.shape} does not match "
            f"{len(features)} features x {len(barcodes)} barcodes"
        )

    # Transpose so that cells (barcodes) become rows (observations).
    sample_adata = anndata.AnnData(
        X=matrix.T,                         # shape: (n_cells, n_features)
        obs=pd.DataFrame(index=barcodes),   # observations: cells
        var=pd.DataFrame(index=features),   # variables: features
    )

    # Record the sample name as observation-level metadata.
    sample_adata.obs["sample"] = sample
    return sample_adata


# List all sample folders (each folder is a sample). `os.listdir` returns
# entries in arbitrary order, so sort them to keep the merged object's cell
# order reproducible between runs.
sample_dirs = sorted(
    d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))
)

adata_list = []
sample_names = []

for sample in sample_dirs:
    sample_adata = _load_sample_adata(os.path.join(base_dir, sample), sample)

    # Only add non-empty AnnData objects to the list
    if sample_adata.n_obs > 0:
        adata_list.append(sample_adata)
        sample_names.append(sample)
    else:
        print(f"Warning: Sample {sample} produced an empty AnnData object and will be skipped.")

# Concatenating an empty list fails with an unhelpful error; report the cause.
if not adata_list:
    raise ValueError(f"No non-empty samples found under {base_dir}")

# Merge using keys based on the folder (sample) names.
# NOTE(review): if barcodes repeat across samples the merged obs_names will
# not be unique; consider `index_unique` if that applies to this data.
merged_adata = anndata.concat(
    adata_list,         # List of AnnData objects to merge.
    join="outer",       # Use "outer" join to include all features from all samples.
    keys=sample_names,  # Use the validated sample_names list as keys.
    label="sample",     # The column in obs that will store the sample name.
    axis=0              # Concatenating along the observation (cell) axis.
)

print(merged_adata)

In [None]:
# Destination of the merged CAREHF AnnData object (.h5ad file inside output_dir).
output_dir = "/nfs/lab/projects/mega_heart/Upload/ATACpart_CAREHF"
h5ad_path = os.path.join(output_dir, "ATACpart_CAREHF.h5ad")

# Create the destination folder if it is missing, then write the object.
os.makedirs(output_dir, exist_ok=True)
merged_adata.write_h5ad(h5ad_path)

In [15]:
# Source .h5ad file and the gzip archive to produce from it.
input_path = "/nfs/lab/projects/mega_heart/Upload/ATACpart_CAREHF/ATACpart_CAREHF.h5ad"
output_path = "/nfs/lab/projects/mega_heart/Upload/ATACpart_CAREHF/ATAC_CAREHF_Multiome.h5ad.gz"

# Stream the file through gzip so it is never loaded into memory as a whole.
with open(input_path, "rb") as f_in, gzip.open(output_path, "wb") as f_out:
    shutil.copyfileobj(f_in, f_out)

print(f"Compressed file saved to: {output_path}")

Compressed file saved to: /nfs/lab/projects/mega_heart/Upload/ATACpart_CAREHF/ATAC_CAREHF_Multiome.h5ad.gz
