In [1]:
import pandas as pd
import h5py
from scipy.sparse import csc_matrix
import os
import numpy as np
def load_h5_data(file_path, annotation_file):
    # Load the HDF5 data
    with h5py.File(file_path, 'r') as h5_file:
        group = h5_file['GRCh38']

        # Load datasets from the HDF5 file
        barcodes = group['barcodes'][:].astype(str)  # Convert bytes to strings
        data = group['data'][:]
        gene_names = group['gene_names'][:].astype(str)  # Gene names
        indices = group['indices'][:]
        indptr = group['indptr'][:]
        shape = tuple(group['shape'][:])

    # Reconstruct the sparse matrix using CSC format
    sparse_matrix = csc_matrix((data, indices, indptr), shape=shape)

    # Convert the sparse matrix to a Pandas DataFrame
    df = pd.DataFrame.sparse.from_spmatrix(sparse_matrix)

    # Load the annotation data
    annotation_df = pd.read_csv(annotation_file, compression='gzip')

    # Extract the barcode part from the cell column
    annotation_df['barcode'] = annotation_df.apply(lambda row: row['cell'].replace(f"{row['sample']}_", ""), axis=1)

    # Create a dictionary to map barcodes to annotations
    annotation_dict = dict(zip(annotation_df['barcode'], annotation_df['anno']))
    print(len(annotation_dict))
    # Map the barcodes in HDF5 data to their annotations
    df.index = gene_names
    df.columns = [annotation_dict.get(b, "Unknown") for b in barcodes]   # Column labels
    return df





ModuleNotFoundError: No module named 'h5py'

In [80]:
main_folder="/data/sr933/scRCC validation/GSE159115_RAW"
label_file="/data/sr933/scRCC validation/GSE159115_RAW/sample alloc"
path_anno_ben = "/data/sr933/scRCC validation/GSE159115_RAW/GSE159115_normal_anno.csv.gz"
path_anno_tum = "/data/sr933/scRCC validation/GSE159115_RAW/GSE159115_ccRCC_anno.csv.gz"
data=[]
labels=[]
label_df=pd.read_csv(label_file)

for folder in os.listdir(main_folder):
    if ".h5" in folder:
        print(folder)
        file_path=os.path.join(main_folder, folder)
        for _, row in label_df.iterrows():
            sample_id = row['sample']  # Column name to match, adjust if different
            alloc=row['class']
            # Check if the sample ID is in the folder name
            if sample_id in folder:
                # Append the relevant row to the data list
                labels.append(alloc)
                break  # Exit loop once a match is found for this fol
        annotation_path=path_anno_ben if alloc=="Benign" else path_anno_tum
        print(annotation_path)
        df=load_h5_data(file_path, annotation_path)
        data.append(df)
        

print(labels)

GSM4819727_SI_18855_filtered_gene_bc_matrices_h5.h5
/data/sr933/scRCC validation/GSE159115_RAW/GSE159115_normal_anno.csv.gz
6135
GSM4819726_SI_18856_filtered_gene_bc_matrices_h5.h5
/data/sr933/scRCC validation/GSE159115_RAW/GSE159115_ccRCC_anno.csv.gz
20509
GSM4819728_SI_19704_filtered_gene_bc_matrices_h5.h5
/data/sr933/scRCC validation/GSE159115_RAW/GSE159115_ccRCC_anno.csv.gz
20509
GSM4819725_SI_18854_filtered_gene_bc_matrices_h5.h5
/data/sr933/scRCC validation/GSE159115_RAW/GSE159115_ccRCC_anno.csv.gz
20509
GSM4819729_SI_19703_filtered_gene_bc_matrices_h5.h5
/data/sr933/scRCC validation/GSE159115_RAW/GSE159115_normal_anno.csv.gz
6135
GSM4819730_SI_21255_filtered_gene_bc_matrices_h5.h5
/data/sr933/scRCC validation/GSE159115_RAW/GSE159115_normal_anno.csv.gz
6135
GSM4819731_SI_21256_filtered_gene_bc_matrices_h5.h5
/data/sr933/scRCC validation/GSE159115_RAW/GSE159115_normal_anno.csv.gz
6135
GSM4819733_SI_22369_filtered_gene_bc_matrices_h5.h5
/data/sr933/scRCC validation/GSE159115_RAW/GS

In [84]:
combined_df = pd.concat(data, axis=1)
# Count how many "Unknown" columns there are
num_unknowns = (combined_df.columns == "Unknown").sum()
print(f"Number of columns with Unknown annotation: {num_unknowns}")

cleaned_df = combined_df.loc[:, ~combined_df.columns.str.contains("Unknown")]

# Print the shape of the DataFrame after cleaning
print(cleaned_df.shape)

Number of columns with Unknown annotation: 14400
(33694, 15099)


In [87]:
print(list(set(cleaned_df)))

['Macro', 'tAL', 'DCT', 'Mono', 'GC', 'DL', 'vSMC', 'Mast', 'unknown', 'TAL', 'PC', 'IC-A', 'Endo_PLVAP', 'Endo_ACKR1', 'Macro_MKI67', 'Mesangial', 'IC-PC', 'PT-A', 'AVR', 'Bcell', 'Tumor', 'CNT', 'Tcell', 'Peri', 'ua', 'PT-B', 'Tcell_CD8', 'NKcell', 'IC-B', 'Plasma', 'AEA-DVR']


In [88]:
import pickle
validation_data={"X_data": cleaned_df}
# Path to save the pickle file
pickle_file_path = "/data/sr933/scRCC validation/processed data/scRCC_validation_data.pkl"  # Update with your desired path

# Save the validation_data dictionary as a pickle file
with open(pickle_file_path, "wb") as pkl_file:
    pickle.dump(validation_data, pkl_file)