In [None]:
# Data source: NCBI GEO (samples GSM8349496, GSM8349498, GSM8349499). Associated paper: "Glycosphingolipid synthesis mediates immune evasion in Kras-driven cancer".

In [2]:
import h5py
import numpy as np
import pandas as pd
from scipy.sparse import csc_matrix

In [None]:
def _decode_labels(values):
    """Convert an HDF5 string dataset (bytes or str entries) into a list of ``str``."""
    labels = []
    for value in np.asarray(values).tolist():
        # h5py can return fixed-length byte strings ('S' dtype) or
        # variable-length strings as ``bytes`` objects inside an object array.
        # ``astype(str)`` on the latter produces the literal text "b'...'",
        # so decode every bytes entry explicitly instead.
        labels.append(value.decode('utf-8') if isinstance(value, bytes) else str(value))
    return labels


def read_h5_sparse_matrix(file):
    """Load a feature-barcode count matrix from an HDF5 file as a dense DataFrame.

    The file is expected to hold the CSC components of the matrix under
    ``/matrix/data``, ``/matrix/indices`` and ``/matrix/indptr``, the column
    labels under ``/matrix/barcodes`` and the row labels under
    ``/matrix/features/id``.

    Parameters
    ----------
    file : str or path-like
        Path of the HDF5 file to read.

    Returns
    -------
    pandas.DataFrame
        Dense counts with gene IDs as the index (rows) and barcodes as the
        columns.

    Raises
    ------
    ValueError
        If ``indptr`` does not have one more entry than there are barcodes,
        i.e. the stored CSC arrays do not describe one column per barcode.

    Notes
    -----
    The matrix is fully densified, so memory use grows with
    ``n_genes * n_barcodes``.
    """
    with h5py.File(file, 'r') as f:
        # ``[()]`` copies each dataset into memory, so nothing below needs
        # the file handle to stay open.
        barcodes = _decode_labels(f['/matrix/barcodes'][()])
        data = f['/matrix/data'][()]
        indices = f['/matrix/indices'][()]
        indptr = f['/matrix/indptr'][()]
        gene_ids = _decode_labels(f['/matrix/features/id'][()])

    # The shape is derived from the label arrays rather than read from the file.
    shape = (len(gene_ids), len(barcodes))
    print(f"Manually set shape: {shape}")
    print(f"indptr Length: {len(indptr)}")

    # A CSC matrix needs exactly one index pointer per column plus a terminator.
    if len(indptr) != len(barcodes) + 1:
        raise ValueError(
            f"indptr has {len(indptr)} entries but {len(barcodes)} barcodes "
            f"require {len(barcodes) + 1}"
        )

    sparse_matrix = csc_matrix((data, indices, indptr), shape=shape)

    # Densify and attach the gene / barcode labels.
    return pd.DataFrame(sparse_matrix.toarray(), index=gene_ids, columns=barcodes)

# Input files: one filtered feature-barcode matrix per sample.
# NOTE(review): DATA_DIR is an absolute path on the original author's machine;
# point it at the directory where the .h5 files were downloaded.
DATA_DIR = "C:\\Users\\tobij\\Downloads"
SAMPLE_FILES = [
    "GSM8349496_AB_1_filtered_feature_bc_matrix.h5",
    "GSM8349498_AB_2_filtered_feature_bc_matrix.h5",
    "GSM8349499_PMXS_1_filtered_feature_bc_matrix.h5",
]
files = [f"{DATA_DIR}\\{name}" for name in SAMPLE_FILES]

# Read every sample into its own genes x barcodes DataFrame.
count_list = [read_h5_sparse_matrix(file) for file in files]

# Place the samples side by side; pd.concat aligns the rows on the gene-ID index.
# NOTE(review): barcodes are presumably only unique within a sample, so the
# combined frame may contain duplicate column labels -- verify before relying
# on column lookups by barcode.
combined_counts = pd.concat(count_list, axis=1)

# Persist the combined matrix so the following cell, which reads
# 'combined_counts.csv', works on a fresh kernel. The gene-ID index is written
# as the unnamed first column, which read_csv reports as 'Unnamed: 0'.
combined_counts.to_csv('combined_counts.csv')

# Show a small preview instead of printing the whole frame.
combined_counts.head()

In [None]:
# Load the combined count matrix from the CSV file in the working directory.
df = pd.read_csv('combined_counts.csv')

In [14]:
# Give the first column a meaningful name: pandas labels a column whose CSV
# header is empty 'Unnamed: 0'. The rename is done in place on df and is a
# no-op if no such column exists.
df.rename(columns={'Unnamed: 0': 'Gene_ID'}, inplace=True)

In [15]:
# Preview the renamed frame (pandas truncates the display of large frames).
df

Unnamed: 0,Gene_ID,AAACCCACACAAGCTT-1,AAACCCATCAATCTCT-1,AAACGAAAGACGCCCT-1,AAACGAAAGAGACAAG-1,AAACGAAAGATGCAGC-1,AAACGAAAGCTGAAAT-1,AAACGAAAGGAATCGC-1,AAACGAACAAACCATC-1,AAACGAACATCGTGGC-1,...,TTTGGAGAGTATGGCG-1,TTTGGAGCAATCTCGA-1,TTTGGAGGTGGGCTTC-1,TTTGGAGTCCTACACC-1,TTTGGTTTCCTTGACC-1,TTTGGTTTCTCCAATT-1,TTTGTTGAGAGGCGGA-1,TTTGTTGAGGGTTTCT-1,TTTGTTGCACTACAGT-1,TTTGTTGTCCGTCAAA-1
0,ENSMUSG00000051951,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ENSMUSG00000089699,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ENSMUSG00000102331,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ENSMUSG00000102343,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ENSMUSG00000025900,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32280,ENSMUSG00000095523,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32281,ENSMUSG00000095475,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32282,ENSMUSG00000094855,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32283,ENSMUSG00000095019,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# index=False: the gene IDs already live in the 'Gene_ID' column, so the
# integer row index must not be written as well. Otherwise it comes back as a
# new 'Unnamed: 0' column on the next read_csv and would be renamed to a
# second, bogus 'Gene_ID' column.
df.to_csv('combined_counts.csv', index=False)