In [1]:
import pysam
import pandas as pd
from scipy.sparse import coo_matrix
from scipy.io import mmwrite

In [16]:
# === Step 1: Read BAM and extract gene + cell tags ===
# Open the alignment file for binary (BAM) reading; the handle is consumed and
# closed by the tag-extraction loop further down.
bam = pysam.AlignmentFile("k562.allele.flt.M.bam", mode="rb")

In [17]:
# Parallel accumulators: entry i of `genes` and entry i of `cells` come from
# the same alignment. They must be two distinct list objects.
genes, cells = [], []

In [18]:
# Collect one (cell barcode, gene ID) pair for every alignment carrying both tags.
# pysam tag lookups take the two-character tag name; the ":Z" type suffix is
# not part of the name, so query "CB"/"GX" rather than "CB:Z"/"GX:Z".
# NOTE(review): every tagged alignment adds one count downstream; nothing here
# de-duplicates by UMI -- confirm read-level counting is intended.
try:
    for read in bam:
        if read.has_tag("CB") and read.has_tag("GX"):
            cells.append(read.get_tag("CB"))
            genes.append(read.get_tag("GX"))
finally:
    # Release the file handle even if iteration fails part-way through.
    bam.close()

In [19]:
# === Step 2: Build a count table ===
# One row per extracted alignment, pairing its cell barcode with its gene ID.
df = pd.DataFrame(dict(cell=cells, gene=genes))

# Number of rows for each observed (gene, cell) pair, flattened to a long table
# with columns gene / cell / count.
counts = (
    df.groupby(["gene", "cell"])
    .size()
    .rename("count")
    .reset_index()
)

In [20]:
# Quick sanity check: show the first five per-alignment records.
print(df.head(n=5))

                         cell             gene
0  SCG0074_CACGCTAAGGGCTAAA-1  ENSG00000230699
1  SCG0074_AGCAACAAGTTCCTCA-1  ENSG00000188976
2  SCG0074_TAGGTGCGTTAGCGTA-1  ENSG00000188976
3  SCG0074_CCTGGATCAAAGGCCA-1  ENSG00000187961
4  SCG0074_AACCTTGCAGCACGAA-1  ENSG00000187961


In [21]:
# Quick sanity check: show the first five (gene, cell, count) rows.
print(counts.head(n=5))

              gene                        cell  count
0  ENSG00000000457  SCG0074_AATACCGGTGATGGCT-1      1
1  ENSG00000000457  SCG0074_CAATCGCCAGGTTAAA-1      1
2  ENSG00000000457  SCG0074_CAGCTAAGTTGAGGTC-1      1
3  ENSG00000000457  SCG0074_CAGGAAGGTCATTAGG-1      1
4  ENSG00000000457  SCG0074_CCGTTGCGTCCTTCTC-1      1


In [22]:
# Map gene/cell to integer indices
# Distinct labels in order of first appearance in `counts`; a label's position
# in its Index is the integer index it will be assigned.
gene_ids = pd.Index(pd.unique(counts["gene"]), name="gene")
cell_ids = pd.Index(pd.unique(counts["cell"]), name="cell")

In [23]:
# Lookup tables from each label to its 0-based position in gene_ids / cell_ids.
gene_map = dict(zip(gene_ids, range(len(gene_ids))))
cell_map = dict(zip(cell_ids, range(len(cell_ids))))

In [24]:
# Coordinate-format triplets, aligned row-for-row with `counts`:
# gene position, cell position, and the count for that (gene, cell) pair.
rows = counts["gene"].map(gene_map).to_numpy()
cols = counts["cell"].map(cell_map).to_numpy()
vals = counts["count"].to_numpy()

In [25]:
# === Step 3: Create sparse matrix and save ===
# Genes index the rows and cells index the columns; the shape is given
# explicitly rather than inferred from the largest coordinate.
matrix_shape = (len(gene_ids), len(cell_ids))
mat = coo_matrix((vals, (rows, cols)), shape=matrix_shape)

# Write the matrix in Matrix Market format.
mmwrite("matrix.mtx", mat)

In [26]:
# Save cell barcodes
# One barcode per line, no header and no index column, in the same order as
# cell_ids.
barcodes = pd.Series(cell_ids)
barcodes.to_csv("barcodes.tsv", index=False, header=False)

In [27]:
# Save genes (3-column format required by Seurat)
# Columns: gene ID, gene name, feature type. No separate symbol table is
# available here, so the gene ID is reused as the gene name.
# NOTE(review): Seurat's Read10X appears to expect gzipped files
# (matrix.mtx.gz, barcodes.tsv.gz, features.tsv.gz) for the features.tsv
# layout -- confirm how the downstream loader reads these outputs.
features = pd.DataFrame({"gene_id": gene_ids, "gene_name": gene_ids})
features["feature_type"] = "Gene Expression"
features.to_csv("features.tsv", sep="\t", index=False, header=False)