In [1]:
import sys, os, platform, scanpy, anndata, polars as pl, pandas as pd

print("Python:", sys.version.split()[0])
print("OS:", platform.system(), platform.release())
print("scanpy:", scanpy.__version__)
print("anndata:", anndata.__version__)
print("polars:", pl.__version__)
print("pandas:", pd.__version__)

Python: 3.10.13
OS: Linux 4.18.0-305.10.2.el8_4.x86_64
scanpy: 1.11.0
anndata: 0.11.3
polars: 1.24.0
pandas: 2.2.3


In [2]:
import pandas as pd

## Read the GTF into a DataFrame
gtf = pd.read_csv(
    "Homo_sapiens.GRCh38.113.gtf",
    sep="\t", # GTFs are tab-separated
    comment="#", # Skip header/comment lines beginning with '#'
    header=None,
    names=[
        "seqname", "source", "feature", "start", "end",
        "score", "strand", "frame", "attribute"
    ],
    dtype={"attribute": str},
)

# Extract the fields we care about via regex on the 'attribute' column
gtf["transcript_id"] = gtf["attribute"].str.extract(r'transcript_id "([^"]+)"')
gtf["gene_id"] = gtf["attribute"].str.extract(r'gene_id "([^"]+)"')
gtf["gene_name"] = gtf["attribute"].str.extract(r'gene_name "([^"]+)"')

# Filter for transcript‐level entries with valid ENSG/ENST IDs
tx = (
    gtf[
        gtf["gene_id"].str.startswith("ENSG", na=False) &
        gtf["transcript_id"].str.startswith("ENST", na=False)
    ]
    .loc[:, ["transcript_id", "gene_id", "gene_name"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Write out the transcript‐level table and sav
tx.to_csv("transcript_annotation_key_info.tsv", sep="\t", index=False)

# --- If you also want the gene‐level table as in your earlier snippet: ---
genes = (
    gtf[
        gtf["gene_id"].str.startswith("ENSG", na=False)
    ]
    .loc[:, ["gene_id", "gene_name"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

genes.to_csv("gene_annotation_key_info.tsv", sep="\t", index=False)

  gtf = pd.read_csv(


In [3]:
import polars as pl

#Utilize file from above
gene_annot = pl.read_csv("gene_annotation_key_info.tsv", separator="\t")

# Extract gene IDs and gene names
gene_ids = gene_annot.get_column("gene_id").to_list()
gene_names = gene_annot.get_column("gene_name").to_list()

# Construct first row: header label + all gene IDs
row_1 = ["gene_ids"] + gene_ids

# Construct second row: header label + all gene names
row_2 = ["gene_name"] + gene_names

# Build dictionary: column index → [row_1_value, row_2_value]
row_dict = {f"col_{i}": [row_1[i], row_2[i]] for i in range(len(row_1))}

# Create DataFrame: 2 rows, many columns
transposed_df = pl.DataFrame(row_dict)

# Save as TSV
transposed_df.write_csv("gene_annotation_key_info_transposed.tsv", separator="\t")

In [2]:
import os
import polars as pl

def snag_and_write_gene_ids_row_with_label(txt_paths, output_dir):
    """
    For each .txt file:
    - Read using enforced schema (CellID as Utf8, all else as Float64)
    - Save full dataframe as .parquet
    - Create a single horizontal row:
        - First cell: 'GENEIDs'
        - Remaining cells: gene IDs from transposed data
    - Save this single-row labeled gene ID row as .parquet
    """
    for path in txt_paths:
        sample_name = os.path.basename(path).replace(".txt", "")
        print(f"🔍 Reading: {sample_name}")

        # STEP 1: Load header to get column names
        with open(path, 'r') as f:
            header_line = f.readline().strip()

        columns = header_line.split("\t")

        # STEP 2: Build schema override
        schema_overrides = {"CellID": pl.Utf8}
        schema_overrides.update({col: pl.Float64 for col in columns if col != "CellID"})

        # STEP 3: Load file using enforced schema
        df = pl.read_csv(
            path,
            separator="\t",
            schema_overrides=schema_overrides,
            infer_schema_length=0,  # Disable inference of datatypes by Polars
            try_parse_dates=False
        )

        # STEP 4: Save full dataframe as parquet
        parquet_out_path = os.path.join(output_dir, f"{sample_name}.parquet")
        df.write_parquet(parquet_out_path, compression="zstd", compression_level=4)
        print(f"✅ Saved full dataset to: {parquet_out_path}")

        # STEP 5: Create single-row DataFrame of GENEIDs
        gene_ids = [col for col in df.columns if col != "CellID"]
        row_dict = {f"col_{i}": [val] for i, val in enumerate(["GENEIDs"] + gene_ids)}
        id_df = pl.DataFrame(row_dict)

        # STEP 6: Save gene ID row as parquet
        ids_out_path = os.path.join(output_dir, f"gene_IDs_{sample_name}.parquet")
        id_df.write_parquet(ids_out_path, compression="zstd", compression_level=4)
        print(f"✅ Saved labeled gene IDs row to: {ids_out_path}")

    print(f"🎉 Finished processing {len(txt_paths)} files.")

In [3]:
# Use function on gene files
gene_files = [
    "InitialFiltering/PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC1_combined_counts_gene.filtered_transposed_expression_matrix.txt",
    "InitialFiltering. /PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC2_combined_counts_gene.filtered_transposed_expression_matrix.txt"
]

snag_and_write_gene_ids_row_with_label(gene_files, output_dir="Parquet_Files/RawData")

🔍 Reading: PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC1_combined_counts_gene.filtered_transposed_expression_matrix
✅ Saved full dataset to: Parquet_Files/RawData_v2/PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC1_combined_counts_gene.filtered_transposed_expression_matrix.parquet
✅ Saved labeled gene IDs row to: Parquet_Files/RawData_v2/gene_IDs_PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC1_combined_counts_gene.filtered_transposed_expression_matrix.parquet
🔍 Reading: PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC2_combined_counts_gene.filtered_transposed_expression_matrix
✅ Saved full dataset to: Parquet_Files/RawData_v2/PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC2_combined_counts_gene.filtered_transposed_expression_matrix.parquet
✅ Saved labeled gene IDs row to: Parquet_Files/RawData_v2/gene_IDs_PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC2_combined_counts_gene.filtered_transposed_expression_matrix.parquet
🎉 Finished processing 2 files.


In [4]:
gene_annot = pl.read_csv("gene_annotation_key_info_transposed.tsv", separator="\t")

split_folder = "Parquet_Files/RawData"
PBMC1_gene_id = pl.read_parquet(os.path.join(split_folder, "gene_IDs_PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC1_combined_counts_gene.filtered_transposed_expression_matrix.parquet"))
PBMC2_gene_id = pl.read_parquet(os.path.join(split_folder, "gene_IDs_PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC2_combined_counts_gene.filtered_transposed_expression_matrix.parquet"))

In [5]:
PBMC1_gene_id.head()

col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_20,col_21,col_22,col_23,col_24,col_25,col_26,col_27,col_28,col_29,col_30,col_31,col_32,col_33,col_34,col_35,col_36,…,col_26400,col_26401,col_26402,col_26403,col_26404,col_26405,col_26406,col_26407,col_26408,col_26409,col_26410,col_26411,col_26412,col_26413,col_26414,col_26415,col_26416,col_26417,col_26418,col_26419,col_26420,col_26421,col_26422,col_26423,col_26424,col_26425,col_26426,col_26427,col_26428,col_26429,col_26430,col_26431,col_26432,col_26433,col_26434,col_26435,col_26436
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""GENEIDs""","""BambuGene10388""","""BambuGene10462""","""BambuGene11222""","""BambuGene1258""","""BambuGene12607""","""BambuGene12713""","""BambuGene13044""","""BambuGene13426""","""BambuGene14746""","""BambuGene14750""","""BambuGene15704""","""BambuGene1572""","""BambuGene17029""","""BambuGene17301""","""BambuGene17766""","""BambuGene18030""","""BambuGene18782""","""BambuGene18815""","""BambuGene18817""","""BambuGene18824""","""BambuGene20479""","""BambuGene22232""","""BambuGene23915""","""BambuGene24769""","""BambuGene24783""","""BambuGene24863""","""BambuGene2496""","""BambuGene25276""","""BambuGene25440""","""BambuGene25620""","""BambuGene26677""","""BambuGene2841""","""BambuGene29""","""BambuGene29202""","""BambuGene29689""","""BambuGene30417""",…,"""ENSG00000310419""","""ENSG00000310421""","""ENSG00000310430""","""ENSG00000310431""","""ENSG00000310435""","""ENSG00000310436""","""ENSG00000310446""","""ENSG00000310455""","""ENSG00000310457""","""ENSG00000310463""","""ENSG00000310465""","""ENSG00000310467""","""ENSG00000310469""","""ENSG00000310471""","""ENSG00000310473""","""ENSG00000310475""","""ENSG00000310476""","""ENSG00000310481""","""ENSG00000310484""","""ENSG00000310485""","""ENSG00000310487""","""ENSG00000310492""","""ENSG00000310496""","""ENSG00000310500""","""ENSG00000310508""","""ENSG00000310517""","""ENSG00000310519""","""ENSG00000310521""","""ENSG00000310523""","""ENSG00000310525""","""ENSG00000310526""","""ENSG00000310527""","""ENSG00000310529""","""ENSG00000310533""","""ENSG00000310535""","""ENSG00000310537""","""ENSG00000310539"""


In [6]:
gene_annot.head()

col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_20,col_21,col_22,col_23,col_24,col_25,col_26,col_27,col_28,col_29,col_30,col_31,col_32,col_33,col_34,col_35,col_36,…,col_78896,col_78897,col_78898,col_78899,col_78900,col_78901,col_78902,col_78903,col_78904,col_78905,col_78906,col_78907,col_78908,col_78909,col_78910,col_78911,col_78912,col_78913,col_78914,col_78915,col_78916,col_78917,col_78918,col_78919,col_78920,col_78921,col_78922,col_78923,col_78924,col_78925,col_78926,col_78927,col_78928,col_78929,col_78930,col_78931,col_78932
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""gene_ids""","""ENSG00000142611""","""ENSG00000284616""","""ENSG00000157911""","""ENSG00000260972""","""ENSG00000224340""","""ENSG00000229280""","""ENSG00000142655""","""ENSG00000232596""","""ENSG00000231510""","""ENSG00000149527""","""ENSG00000171621""","""ENSG00000272235""","""ENSG00000284694""","""ENSG00000224387""","""ENSG00000142583""","""ENSG00000284674""","""ENSG00000224338""","""ENSG00000287727""","""ENSG00000173614""","""ENSG00000215720""","""ENSG00000241326""","""ENSG00000233623""","""ENSG00000204624""","""ENSG00000142606""","""ENSG00000225931""","""ENSG00000171729""","""ENSG00000229393""","""ENSG00000287586""","""ENSG00000227169""","""ENSG00000283356""","""ENSG00000157916""","""ENSG00000157881""","""ENSG00000225387""","""ENSG00000048707""","""ENSG00000225196""","""ENSG00000162444""",…,"""ENSG00000294171""","""ENSG00000294191""","""ENSG00000299852""","""ENSG00000299870""","""ENSG00000299885""","""ENSG00000309338""","""ENSG00000309357""","""ENSG00000297432""","""ENSG00000297449""","""ENSG00000299314""","""ENSG00000297844""","""ENSG00000275063""","""ENSG00000277856""","""ENSG00000298751""","""ENSG00000298769""","""ENSG00000301091""","""ENSG00000303154""","""ENSG00000271254""","""ENSG00000303750""","""ENSG00000301573""","""ENSG00000301587""","""ENSG00000300815""","""ENSG00000304057""","""ENSG00000275987""","""ENSG00000268674""","""ENSG00000277475""","""ENSG00000275405""","""ENSG00000298139""","""ENSG00000298153""","""ENSG00000298169""","""ENSG00000298181""","""ENSG00000309793""","""ENSG00000307722""","""ENSG00000310401""","""ENSG00000302039""","""ENSG00000309831""","""ENSG00000309258"""
"""gene_name""","""PRDM16""",,"""PEX10""",,"""RPL21P21""","""EEF1DP6""","""PEX14""","""LINC01646""","""LINC02782""","""PLCH2""","""SPSB1""",,,,"""SLC2A5""","""LINC02781""","""MTCYBP45""",,"""NMNAT1""","""MFFP1""",,"""PGAM1P11""","""DISP3""","""MMEL1""",,"""TMEM51""",,,,,"""RER1""","""PANK4""",,"""VPS13D""","""RPL10P17""","""RBP7""",…,,,,,,,,,,,,,,,,,,,,,,,,"""U1""",,,"""U1""",,,,,,,,,,


In [7]:
def annotate_gene_ids_from_row(row_df: pl.DataFrame, id_to_name: dict) -> pl.DataFrame:
    """
    Given a single-row DataFrame of gene IDs, return a two-column DataFrame:
    - GENEID
    - GENE_NAME
    """
    # Extract the first (and only) row from the DataFrame as a list of gene IDs
    gene_ids = row_df.row(0) 

    # Optionally skip the first entry if it's a label like 'GENEIDs'
    if gene_ids[0] == "GENEIDs": # Check if the first element is a label string, not a real ID
        gene_ids = gene_ids[1:] # Skip that label to keep only gene IDs

    # Map each gene ID to its corresponding gene name; use None if not found
    gene_names = [id_to_name.get(gid, None) for gid in gene_ids]

    # Create new DataFrame pairing each gene ID with its gene name
    annotated_df = pl.DataFrame({
        "GENEID": gene_ids,
        "GENE_NAME": gene_names
    })

    return annotated_df

In [8]:
##  Convert gene_annot to a Lookup table:
# Extract gene IDs and gene names from gene_annot
gene_ids = gene_annot.row(0)[1:]   # Skip the "gene_ids" label
gene_names = gene_annot.row(1)[1:] # Skip the "gene_name" label

# Create dictionary: gene_id → gene_name
id_to_name = dict(zip(gene_ids, gene_names))

In [9]:
# Annotate PBMC1 and PBMC2 single-row DataFrames using lookup

PBMC1_gene_annotated = annotate_gene_ids_from_row(PBMC1_gene_id, id_to_name)
PBMC2_gene_annotated = annotate_gene_ids_from_row(PBMC2_gene_id, id_to_name)

In [10]:
PBMC1_gene_annotated.tail()

GENEID,GENE_NAME
str,str
"""ENSG00000310529""",
"""ENSG00000310533""",
"""ENSG00000310535""",
"""ENSG00000310537""",
"""ENSG00000310539""","""DDX11L2"""


In [11]:
# Fill missing gene names with empty string to prevent NaN values in merged outputs
PBMC1_gene_annotated = PBMC1_gene_annotated.with_columns(
    pl.col("GENE_NAME").fill_null("") # Replace nulls with empty string
)

PBMC2_gene_annotated = PBMC2_gene_annotated.with_columns(
    pl.col("GENE_NAME").fill_null("") # Replace nulls with empty string
)

In [12]:
PBMC1_gene_annotated.head()

GENEID,GENE_NAME
str,str
"""BambuGene10388""",""""""
"""BambuGene10462""",""""""
"""BambuGene11222""",""""""
"""BambuGene1258""",""""""
"""BambuGene12607""",""""""


In [13]:
# Save annotated tables
split_folder = "Parquet_Files/IntermediateFiles"
PBMC1_gene_annotated.write_parquet(os.path.join(split_folder, "annotated_gene_IDs_PBMC1.parquet"), compression="zstd")
PBMC2_gene_annotated.write_parquet(os.path.join(split_folder, "annotated_gene_IDs_PBMC2.parquet"), compression="zstd")

In [7]:
import pandas as pd
split_folder = "Parquet_Files/IntermediateFiles"
# Load annotated Parquet files using Polars, then convert to Pandas DataFrames
PBMC1_gene_anno = pl.read_parquet(os.path.join(split_folder, "annotated_gene_IDs_PBMC1.parquet")).to_pandas()
PBMC2_gene_anno = pl.read_parquet(os.path.join(split_folder, "annotated_gene_IDs_PBMC2.parquet")).to_pandas()

In [9]:
def create_combined_gene_id(geneid: str, symbol: str) -> str:
    geneid = (geneid or "").strip() # Ensure non-null string and remove extra spaces
    symbol = (symbol or "").strip() # Ensure non-null string and remove extra spaces

    if geneid.startswith("ENSG"):
        return f"{symbol}:{geneid}" if symbol else geneid # Combine symbol:ID if symbol exists; else use ID only
    else:
        return f"{symbol}:{geneid}" if symbol else geneid

In [10]:
# Apply the combination function row-wise to both dataframes
PBMC1_gene_anno["combined_ID"] = PBMC1_gene_anno.apply(
    lambda row: create_combined_gene_id(row["GENEID"], row["GENE_NAME"]),
    axis=1
)

PBMC2_gene_anno["combined_ID"] = PBMC2_gene_anno.apply(
    lambda row: create_combined_gene_id(row["GENEID"], row["GENE_NAME"]),
    axis=1
)

In [11]:
#Check that combined_ID was created with GENE_NAME:GENEID, if GENE_NAME has contents
#If not, combined_ID will only contain GENEID
PBMC1_gene_anno.tail()

Unnamed: 0,GENEID,GENE_NAME,combined_ID
26431,ENSG00000310529,,ENSG00000310529
26432,ENSG00000310533,,ENSG00000310533
26433,ENSG00000310535,,ENSG00000310535
26434,ENSG00000310537,,ENSG00000310537
26435,ENSG00000310539,DDX11L2,DDX11L2:ENSG00000310539


In [18]:
# Save outputs
split_folder = "Parquet_Files/IntermediateFiles"
PBMC1_gene_anno.to_parquet(os.path.join(split_folder, "PBMC1_gene_IDs_with_combined.parquet"), compression="zstd")
PBMC2_gene_anno.to_parquet(os.path.join(split_folder, "PBMC2_gene_IDs_with_combined.parquet"), compression="zstd")
print("✅ Combined ID columns created and saved.")

✅ Combined ID columns created and saved.


In [3]:
split_folder = "Parquet_Files/RawData"

# Step 4: Load the updated annotated and sorted data
PBMC1_gene = pd.read_parquet(os.path.join(split_folder, "PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC1_combined_counts_gene.filtered_transposed_expression_matrix.parquet"))
PBMC2_gene = pd.read_parquet(os.path.join(split_folder, "PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC2_combined_counts_gene.filtered_transposed_expression_matrix.parquet"))

In [4]:
PBMC1_gene.head()

Unnamed: 0,CellID,BambuGene10388,BambuGene10462,BambuGene11222,BambuGene1258,BambuGene12607,BambuGene12713,BambuGene13044,BambuGene13426,BambuGene14746,...,ENSG00000310521,ENSG00000310523,ENSG00000310525,ENSG00000310526,ENSG00000310527,ENSG00000310529,ENSG00000310533,ENSG00000310535,ENSG00000310537,ENSG00000310539
0,PBMC1_ACGTAAACATTTATAC_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,PBMC1_CAGACAGAACGCAGGA_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PBMC1_AGAGCAACACCGAGGT_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,PBMC1_AACAATCACCGGCATC_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,PBMC1_AGTAAGTACCGTAGGT_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
PBMC1_gene.shape

(19772, 26437)

In [12]:
PBMC1_gene_anno.shape

(26436, 3)

In [13]:
# BEFORE INSERTING combined_IDs, validate row alignment
# Get the gene columns from PBMC1_gene, excluding CellID
pbmc1_columns = [col for col in PBMC1_gene.columns if col != "CellID"]

# Get GENEID list from PBMC1_gene_anno
pbmc1_geneids = PBMC1_gene_anno["GENEID"].tolist()

# Assert that column names match GENEID list
assert pbmc1_columns == pbmc1_geneids, "PBMC1_gene_id mismatch"

print("✅ PBMC1 gene columns aligned with annotation IDs.")

✅ PBMC1 gene columns aligned with annotation IDs.


In [14]:
# Get the gene columns from PBMC1_gene, excluding CellID
pbmc2_columns = [col for col in PBMC2_gene.columns if col != "CellID"]

# Get GENEID list from PBMC1_gene_anno
pbmc2_geneids = PBMC2_gene_anno["GENEID"].tolist()

# Assert that column names match GENEID list
assert pbmc2_columns == pbmc2_geneids, "PBMC2_gene_id mismatch"

print("✅ PBMC2 gene columns aligned with annotation IDs.")

✅ PBMC2 gene columns aligned with annotation IDs.


In [15]:
# Prepare first row: 'combined_IDs' followed by combined IDs
first_row = pd.Series(["combined_IDs"] + PBMC1_gene_anno["combined_ID"].tolist(), index=PBMC1_gene.columns)
first_row_2 = pd.Series(["combined_IDs"] + PBMC2_gene_anno["combined_ID"].tolist(), index=PBMC2_gene.columns)

# Prepend this row to PBMC1_gene or PBMC2_gene
PBMC1_gene_with_row = pd.concat([first_row.to_frame().T, PBMC1_gene], ignore_index=True)
PBMC2_gene_with_row = pd.concat([first_row_2.to_frame().T, PBMC2_gene], ignore_index=True)

In [16]:
PBMC1_gene_with_row.head()

Unnamed: 0,CellID,BambuGene10388,BambuGene10462,BambuGene11222,BambuGene1258,BambuGene12607,BambuGene12713,BambuGene13044,BambuGene13426,BambuGene14746,...,ENSG00000310521,ENSG00000310523,ENSG00000310525,ENSG00000310526,ENSG00000310527,ENSG00000310529,ENSG00000310533,ENSG00000310535,ENSG00000310537,ENSG00000310539
0,combined_IDs,BambuGene10388,BambuGene10462,BambuGene11222,BambuGene1258,BambuGene12607,BambuGene12713,BambuGene13044,BambuGene13426,BambuGene14746,...,ENSG00000310521,ENSG00000310523,ENSG00000310525,WASH7P:ENSG00000310526,WASH9P:ENSG00000310527,ENSG00000310529,ENSG00000310533,ENSG00000310535,ENSG00000310537,DDX11L2:ENSG00000310539
1,PBMC1_ACGTAAACATTTATAC_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PBMC1_CAGACAGAACGCAGGA_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,PBMC1_AGAGCAACACCGAGGT_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,PBMC1_AACAATCACCGGCATC_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
PBMC2_gene_with_row.head()

Unnamed: 0,CellID,BambuGene10388,BambuGene10462,BambuGene11222,BambuGene1258,BambuGene12607,BambuGene12713,BambuGene13044,BambuGene13426,BambuGene14746,...,ENSG00000310521,ENSG00000310523,ENSG00000310525,ENSG00000310526,ENSG00000310527,ENSG00000310529,ENSG00000310533,ENSG00000310535,ENSG00000310537,ENSG00000310539
0,combined_IDs,BambuGene10388,BambuGene10462,BambuGene11222,BambuGene1258,BambuGene12607,BambuGene12713,BambuGene13044,BambuGene13426,BambuGene14746,...,ENSG00000310521,ENSG00000310523,ENSG00000310525,WASH7P:ENSG00000310526,WASH9P:ENSG00000310527,ENSG00000310529,ENSG00000310533,ENSG00000310535,ENSG00000310537,DDX11L2:ENSG00000310539
1,PBMC2_CAGGATCGAGCGATCC_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PBMC2_CATACCAAATGGATAT_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,PBMC2_ATTGAAGTCCGAAGAA_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,PBMC2_CCTAACACACAAACAG_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Assign new headers
PBMC1_gene.columns = ["CellID"] + PBMC1_gene_anno["combined_ID"].tolist()
PBMC2_gene.columns = ["CellID"] + PBMC2_gene_anno["combined_ID"].tolist()

In [19]:
PBMC1_gene.head()

Unnamed: 0,CellID,BambuGene10388,BambuGene10462,BambuGene11222,BambuGene1258,BambuGene12607,BambuGene12713,BambuGene13044,BambuGene13426,BambuGene14746,...,ENSG00000310521,ENSG00000310523,ENSG00000310525,WASH7P:ENSG00000310526,WASH9P:ENSG00000310527,ENSG00000310529,ENSG00000310533,ENSG00000310535,ENSG00000310537,DDX11L2:ENSG00000310539
0,PBMC1_ACGTAAACATTTATAC_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,PBMC1_CAGACAGAACGCAGGA_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PBMC1_AGAGCAACACCGAGGT_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,PBMC1_AACAATCACCGGCATC_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,PBMC1_AGTAAGTACCGTAGGT_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
PBMC2_gene.head()

Unnamed: 0,CellID,BambuGene10388,BambuGene10462,BambuGene11222,BambuGene1258,BambuGene12607,BambuGene12713,BambuGene13044,BambuGene13426,BambuGene14746,...,ENSG00000310521,ENSG00000310523,ENSG00000310525,WASH7P:ENSG00000310526,WASH9P:ENSG00000310527,ENSG00000310529,ENSG00000310533,ENSG00000310535,ENSG00000310537,DDX11L2:ENSG00000310539
0,PBMC2_CAGGATCGAGCGATCC_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,PBMC2_CATACCAAATGGATAT_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PBMC2_ATTGAAGTCCGAAGAA_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,PBMC2_CCTAACACACAAACAG_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,PBMC2_AGATACCGCAAAAGAT_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Save annotated matrices
split_folder = "Parquet_Files/IntermediateFiles"

PBMC1_gene.to_parquet(os.path.join(split_folder, "PBMC1_gene_expr_annotated.parquet"), compression="zstd")
PBMC2_gene.to_parquet(os.path.join(split_folder, "PBMC2_gene_expr_annotated.parquet"), compression="zstd")

print("✅ Annotated gene expression matrices saved.")

✅ Annotated gene expression matrices saved.


In [2]:
import polars as pl

# Load raw isoform annotation table
iso_annot = pl.read_csv("transcript_annotation_key_info.tsv", separator="\t")

# Extract TXNames and GeneIDs
tx_names = iso_annot.get_column("transcript_id").to_list()
gene_ids = iso_annot.get_column("gene_id").to_list()
gene_names = iso_annot.get_column("gene_name").to_list()

# Row 1: header label + TXNames
row_1 = ["transcript_id"] + tx_names

# Row 2: header label + gene_ids
row_2 = ["gene_id"] + gene_ids

# Row 3: header label + gene_ids
row_3 = ["gene_name"] + gene_names

# Build transposed table: 2 rows, many columns
row_dict = {f"col_{i}": [row_1[i], row_2[i], row_3[i]] for i in range(len(row_1))}

# Create dataframe
transposed_iso_annot = pl.DataFrame(row_dict)

# Save transposed annotation
transposed_iso_annot.write_csv("transcript_annotation_key_info_transposed.tsv", separator="\t")
print("✅ Isoform annotation transposed and saved.")

✅ Isoform annotation transposed and saved.


In [22]:
import os
import polars as pl

def snag_and_write_isoform_ids_row_with_label(txt_paths, output_dir):
    """
    For each .txt file:
    - Read using enforced schema (CellID as Utf8, all else as Float64)
    - Save full dataframe as .parquet
    - Create a two-row DataFrame:
        - Row 1: 'TXID', followed by TXID from column names
        - Row 2: 'GENEID', followed by GENEID from column names
    - Save this two-row isoform ID annotation as .parquet
    """
    for path in txt_paths:
        sample_name = os.path.basename(path).replace(".txt", "")
        print(f"🔍 Reading: {sample_name}")

        # STEP 1: Load header to get column names
        with open(path, 'r') as f:
            header_line = f.readline().strip()

        columns = header_line.split("\t")

        # STEP 2: Build schema override
        schema_overrides = {"CellID": pl.Utf8}
        schema_overrides.update({col: pl.Float64 for col in columns if col != "CellID"})

        # STEP 3: Load file with enforced schema
        df = pl.read_csv(
            path,
            separator="\t",
            schema_overrides=schema_overrides,
            infer_schema_length=0,
            try_parse_dates=False
        )

        # STEP 4: Save full dataframe as parquet
        parquet_out_path = os.path.join(output_dir, f"{sample_name}.parquet")
        df.write_parquet(parquet_out_path, compression="zstd", compression_level=4)
        print(f"✅ Saved full dataset to: {parquet_out_path}")

        # STEP 5: Parse TXID and GENEID
        tx_columns = [col for col in df.columns if col != "CellID"]
        tx_ids = [col.split("|")[0] for col in tx_columns]
        gene_ids = [col.split("|")[1] if "|" in col else None for col in tx_columns]

        row_1 = ["TXID"] + tx_ids
        row_2 = ["GENEID"] + gene_ids

        # Build two-row DataFrame
        row_dict = {f"col_{i}": [row_1[i], row_2[i]] for i in range(len(row_1))}
        id_df = pl.DataFrame(row_dict)

        # STEP 6: Save isoform ID annotation as parquet
        ids_out_path = os.path.join(output_dir, f"isoform_IDs_{sample_name}.parquet")
        id_df.write_parquet(ids_out_path, compression="zstd", compression_level=4)
        print(f"✅ Saved TXID + GENEID row to: {ids_out_path}")

    print(f"🎉 Finished processing {len(txt_paths)} files.")

In [23]:
# Use function on gene files
iso_files = [
     "InitialFiltering/PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC1_combined_counts_transcript.filtered_transposed_expression_matrix.txt",
     "InitialFiltering/PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC2_combined_counts_transcript.filtered_transposed_expression_matrix.txt"
]

snag_and_write_isoform_ids_row_with_label(iso_files, output_dir="Parquet_Files/RawData_v2")

🔍 Reading: PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC1_combined_counts_transcript.filtered_transposed_expression_matrix
✅ Saved full dataset to: Parquet_Files/RawData_v2/PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC1_combined_counts_transcript.filtered_transposed_expression_matrix.parquet
✅ Saved TXID + GENEID row to: Parquet_Files/RawData_v2/isoform_IDs_PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC1_combined_counts_transcript.filtered_transposed_expression_matrix.parquet
🔍 Reading: PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC2_combined_counts_transcript.filtered_transposed_expression_matrix
✅ Saved full dataset to: Parquet_Files/RawData_v2/PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC2_combined_counts_transcript.filtered_transposed_expression_matrix.parquet
✅ Saved TXID + GENEID row to: Parquet_Files/RawData_v2/isoform_IDs_PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC2_combined_counts_transcript.filtered_transposed_expression_matrix.parquet
🎉 Finished processing 2 files.


In [24]:
split_folder = "Parquet_Files/RawData"
PBMC1_iso_id = pl.read_parquet(os.path.join(split_folder, "isoform_IDs_PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC1_combined_counts_transcript.filtered_transposed_expression_matrix.parquet"))
PBMC2_iso_id = pl.read_parquet(os.path.join(split_folder, "isoform_IDs_PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC2_combined_counts_transcript.filtered_transposed_expression_matrix.parquet"))

In [25]:
PBMC1_iso_id.head()

col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_20,col_21,col_22,col_23,col_24,col_25,col_26,col_27,col_28,col_29,col_30,col_31,col_32,col_33,col_34,col_35,col_36,…,col_100979,col_100980,col_100981,col_100982,col_100983,col_100984,col_100985,col_100986,col_100987,col_100988,col_100989,col_100990,col_100991,col_100992,col_100993,col_100994,col_100995,col_100996,col_100997,col_100998,col_100999,col_101000,col_101001,col_101002,col_101003,col_101004,col_101005,col_101006,col_101007,col_101008,col_101009,col_101010,col_101011,col_101012,col_101013,col_101014,col_101015
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""TXID""","""BambuTx1""","""BambuTx10""","""BambuTx100""","""BambuTx101""","""BambuTx102""","""BambuTx103""","""BambuTx104""","""BambuTx105""","""BambuTx106""","""BambuTx107""","""BambuTx108""","""BambuTx109""","""BambuTx11""","""BambuTx110""","""BambuTx111""","""BambuTx112""","""BambuTx113""","""BambuTx114""","""BambuTx115""","""BambuTx116""","""BambuTx118""","""BambuTx119""","""BambuTx12""","""BambuTx120""","""BambuTx121""","""BambuTx122""","""BambuTx123""","""BambuTx124""","""BambuTx125""","""BambuTx127""","""BambuTx128""","""BambuTx129""","""BambuTx13""","""BambuTx131""","""BambuTx132""","""BambuTx133""",…,"""ENST00000850579""","""ENST00000850580""","""ENST00000850582""","""ENST00000850583""","""ENST00000850584""","""ENST00000850585""","""ENST00000850587""","""ENST00000850588""","""ENST00000850589""","""ENST00000850590""","""ENST00000850591""","""ENST00000850593""","""ENST00000850595""","""ENST00000850596""","""ENST00000850598""","""ENST00000850599""","""ENST00000850601""","""ENST00000850602""","""ENST00000850603""","""ENST00000850604""","""ENST00000850605""","""ENST00000850607""","""ENST00000850613""","""ENST00000850614""","""ENST00000850615""","""ENST00000850616""","""ENST00000850617""","""ENST00000850621""","""ENST00000850622""","""ENST00000850623""","""ENST00000850624""","""ENST00000850641""","""ENST00000850642""","""ENST00000850646""","""ENST00000850647""","""ENST00000850667""","""ENST00000850670"""
"""GENEID""","""BambuGene29""","""ENSG00000287919""","""BambuGene54231""","""ENSG00000196284""","""ENSG00000196284""","""ENSG00000146453""","""ENSG00000308813""","""BambuGene55251""","""ENSG00000002822""","""ENSG00000003147""","""ENSG00000239789""","""BambuGene58472""","""BambuGene5968""","""BambuGene58523""","""BambuGene58808""","""ENSG00000253409""","""BambuGene60991""","""ENSG00000284048""","""BambuGene59055""","""ENSG00000283128""","""ENSG00000298363""","""BambuGene62234""","""BambuGene2841""","""BambuGene62255""","""ENSG00000164989""","""ENSG00000155875""","""ENSG00000198642""","""ENSG00000305069""","""ENSG00000309071""","""BambuGene64092""","""BambuGene64098""","""BambuGene66205""","""BambuGene6393""","""BambuGene67466""","""BambuGene68084""","""ENSG00000297760""",…,"""ENSG00000173762""","""ENSG00000160305""","""ENSG00000108423""","""ENSG00000277075""","""ENSG00000182318""","""ENSG00000154548""","""ENSG00000087301""","""ENSG00000087301""","""ENSG00000125735""","""ENSG00000113328""","""ENSG00000108349""","""ENSG00000135622""","""ENSG00000135622""","""ENSG00000135622""","""ENSG00000147854""","""ENSG00000130254""","""ENSG00000148803""","""ENSG00000008516""","""ENSG00000179889""","""ENSG00000179889""","""ENSG00000179889""","""ENSG00000137504""","""ENSG00000154124""","""ENSG00000173846""","""ENSG00000128699""","""ENSG00000268500""","""ENSG00000131558""","""ENSG00000163568""","""ENSG00000162695""","""ENSG00000149308""","""ENSG00000170265""","""ENSG00000144199""","""ENSG00000046647""","""ENSG00000176909""","""ENSG00000053501""","""ENSG00000244625""","""ENSG00000281903"""


In [26]:
import polars as pl

# Load the original, clean annotation file
iso_annot = pl.read_csv("transcript_annotation_key_info.tsv", separator="\t")

# Build dictionary: transcript_id → (gene_id, gene_name)
tx_to_gene = {
    tx: (gid, gname)
    for tx, gid, gname in zip(
        iso_annot["transcript_id"],
        iso_annot["gene_id"],
        iso_annot["gene_name"]
    )
}

In [27]:
# From your isoform annotation table
tx_to_gene_info = dict(zip(
    iso_annot["transcript_id"].to_list(),
    zip(iso_annot["gene_id"].to_list(), iso_annot["gene_name"].to_list())
))

In [28]:
def annotate_isoform_ids_from_row(row_df: pl.DataFrame, tx_to_gene_info: dict) -> pl.DataFrame:
    """
    Given a 2- or 3-row wide DataFrame with TXID, GENEID (and optionally GENE_NAME),
    convert to a long-format table with columns: TXID, GENEID, GENE_NAME
    """
    # Get each row as list
    tx_ids = row_df.row(0)[1:]   # skip 'TXID'
    gene_ids = row_df.row(1)[1:] # skip 'GENEID'

    # Look up gene names
    gene_names = []
    for tx in tx_ids:
        _, gene_name = tx_to_gene_info.get(tx, (None, None))
        gene_names.append(gene_name)

    # Build long-format DataFrame
    df = pl.DataFrame({
        "TXID": tx_ids,
        "GENEID": gene_ids,
        "GENE_NAME": gene_names
    })

    return df

In [29]:
PBMC1_iso_annotated = annotate_isoform_ids_from_row(PBMC1_iso_id, tx_to_gene)
PBMC2_iso_annotated = annotate_isoform_ids_from_row(PBMC2_iso_id, tx_to_gene)

In [30]:
PBMC1_iso_annotated.tail()

TXID,GENEID,GENE_NAME
str,str,str
"""ENST00000850642""","""ENSG00000046647""","""GEMIN8"""
"""ENST00000850646""","""ENSG00000176909""","""MAMSTR"""
"""ENST00000850647""","""ENSG00000053501""","""USE1"""
"""ENST00000850667""","""ENSG00000244625""","""MIATNB"""
"""ENST00000850670""","""ENSG00000281903""","""ASMER1"""


In [31]:
# Then fill in missing gene names with empty string (not NaN) to avoid NaNs in combined_ID
PBMC1_iso_annotated = PBMC1_iso_annotated.with_columns(
    pl.col("GENE_NAME").fill_null("")
)

PBMC2_iso_annotated = PBMC2_iso_annotated.with_columns(
    pl.col("GENE_NAME").fill_null("")
)

In [32]:
PBMC1_iso_annotated.head()

TXID,GENEID,GENE_NAME
str,str,str
"""BambuTx1""","""BambuGene29""",""""""
"""BambuTx10""","""ENSG00000287919""",""""""
"""BambuTx100""","""BambuGene54231""",""""""
"""BambuTx101""","""ENSG00000196284""",""""""
"""BambuTx102""","""ENSG00000196284""",""""""


In [33]:
# Save annotated tables
split_folder = "Parquet_Files/IntermediateFiles"
PBMC1_iso_annotated.write_parquet(os.path.join(split_folder, "annotated_iso_IDs_PBMC1.parquet"), compression="zstd")
PBMC2_iso_annotated.write_parquet(os.path.join(split_folder, "annotated_iso_IDs_PBMC2.parquet"), compression="zstd")

In [3]:
import pandas as pd
split_folder = "Parquet_Files/IntermediateFiles"
# Load annotated files as pandas
PBMC1_iso_anno = pl.read_parquet(os.path.join(split_folder, "annotated_iso_IDs_PBMC1.parquet")).to_pandas()
PBMC2_iso_anno = pl.read_parquet(os.path.join(split_folder, "annotated_iso_IDs_PBMC2.parquet")).to_pandas()

In [4]:
import pandas as pd

def create_combined_gene_id(geneid: str, symbol: str, txname: str) -> str:
    symbol = (symbol or "").strip()
    geneid = (geneid or "").strip()
    txname = (txname or "").strip()

    if geneid.startswith("ENSG"):
        return f"{symbol}:{geneid}:{txname}" if symbol else  f"{geneid}:{txname}"
    else:
        return f"{symbol}:{geneid}: {txname}" if symbol else f"{geneid}:{txname}"

In [5]:
# Apply row-wise in Pandas:
PBMC1_iso_anno["combined_ID"] = PBMC1_iso_anno.apply(
    lambda row: create_combined_gene_id(row["GENEID"], row["GENE_NAME"], row["TXID"]),
    axis=1
)

PBMC2_iso_anno["combined_ID"] = PBMC2_iso_anno.apply(
    lambda row: create_combined_gene_id(row["GENEID"], row["GENE_NAME"], row["TXID"]),
    axis=1
)

In [6]:
PBMC1_iso_anno.head()

Unnamed: 0,TXID,GENEID,GENE_NAME,combined_ID
0,BambuTx1,BambuGene29,,BambuGene29:BambuTx1
1,BambuTx10,ENSG00000287919,,ENSG00000287919:BambuTx10
2,BambuTx100,BambuGene54231,,BambuGene54231:BambuTx100
3,BambuTx101,ENSG00000196284,,ENSG00000196284:BambuTx101
4,BambuTx102,ENSG00000196284,,ENSG00000196284:BambuTx102


In [7]:
PBMC1_iso_anno.tail()

Unnamed: 0,TXID,GENEID,GENE_NAME,combined_ID
101010,ENST00000850642,ENSG00000046647,GEMIN8,GEMIN8:ENSG00000046647:ENST00000850642
101011,ENST00000850646,ENSG00000176909,MAMSTR,MAMSTR:ENSG00000176909:ENST00000850646
101012,ENST00000850647,ENSG00000053501,USE1,USE1:ENSG00000053501:ENST00000850647
101013,ENST00000850667,ENSG00000244625,MIATNB,MIATNB:ENSG00000244625:ENST00000850667
101014,ENST00000850670,ENSG00000281903,ASMER1,ASMER1:ENSG00000281903:ENST00000850670


In [39]:
# Save outputs
split_folder = "Parquet_Files/IntermediateFiles"
PBMC1_iso_anno.to_parquet(os.path.join(split_folder, "PBMC1_iso_IDs_with_combined.parquet"), compression="zstd")
PBMC2_iso_anno.to_parquet(os.path.join(split_folder, "PBMC2_iso_IDs_with_combined.parquet"), compression="zstd")
print("✅ Combined ID columns created and saved.")

✅ Combined ID columns created and saved.


In [3]:
split_folder = "Parquet_Files/RawData"  # Update this if your path is different
split_dir = "Parquet_Files/IntermediateFiles"  # Update this if your path is different

# Step 4: Load the data
PBMC1_iso = pd.read_parquet(os.path.join(split_folder, "PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC1_combined_counts_transcript.filtered_transposed_expression_matrix.parquet"))
PBMC2_iso = pd.read_parquet(os.path.join(split_folder, "PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC2_combined_counts_transcript.filtered_transposed_expression_matrix.parquet"))
PBMC1_iso_anno = pd.read_parquet(os.path.join(split_dir, "PBMC1_iso_IDs_with_combined.parquet"))
PBMC2_iso_anno = pd.read_parquet(os.path.join(split_dir, "PBMC2_iso_IDs_with_combined.parquet"))

In [4]:
print(PBMC1_iso.shape)
print(PBMC1_iso_anno.shape)

(19592, 101016)
(101015, 4)


In [42]:
PBMC1_iso.head

<bound method NDFrame.head of                                         CellID  BambuTx1|BambuGene29  \
0      PBMC1_ACTCAGCTAGGCCACA_filtered_mapq_10                   0.0   
1      PBMC1_ATTGATAAATTTAACT_filtered_mapq_10                   0.0   
2      PBMC1_CAAGACTCCCCTCACT_filtered_mapq_10                   0.0   
3      PBMC1_CACTAACAATAACAAG_filtered_mapq_10                   0.0   
4      PBMC1_CCCTAATCACCCATTT_filtered_mapq_10                   0.0   
...                                        ...                   ...   
19587  PBMC1_AGATATTCCACGACCT_filtered_mapq_10                   0.0   
19588  PBMC1_ACAGATGAATCACAGC_filtered_mapq_10                   0.0   
19589  PBMC1_ATGACCTGCCGTCCGC_filtered_mapq_10                   0.0   
19590  PBMC1_AATAATACAGCAAGGT_filtered_mapq_10                   0.0   
19591  PBMC1_ATACCCTGAAACCACT_filtered_mapq_10                   0.0   

       BambuTx10|ENSG00000287919  BambuTx100|BambuGene54231  \
0                            0.0          

In [43]:
PBMC1_iso_anno.head

<bound method NDFrame.head of                    TXID           GENEID GENE_NAME  \
0              BambuTx1      BambuGene29             
1             BambuTx10  ENSG00000287919             
2            BambuTx100   BambuGene54231             
3            BambuTx101  ENSG00000196284             
4            BambuTx102  ENSG00000196284             
...                 ...              ...       ...   
101010  ENST00000850642  ENSG00000046647    GEMIN8   
101011  ENST00000850646  ENSG00000176909    MAMSTR   
101012  ENST00000850647  ENSG00000053501      USE1   
101013  ENST00000850667  ENSG00000244625    MIATNB   
101014  ENST00000850670  ENSG00000281903    ASMER1   

                                   combined_ID  
0                         BambuGene29:BambuTx1  
1                    ENSG00000287919:BambuTx10  
2                    BambuGene54231:BambuTx100  
3                   ENSG00000196284:BambuTx101  
4                   ENSG00000196284:BambuTx102  
...                        

In [6]:
print(PBMC2_iso.shape)
print(PBMC2_iso_anno.shape)

(32596, 101016)
(101015, 4)


In [5]:
# BEFORE INSERTING combined_IDs, validate row alignment
# Get the gene columns from PBMC1_iso, excluding CellID
pbmc1_columns_full = [col for col in PBMC1_iso.columns if col != "CellID"]

# Extract only the isoform names before "|" for matching
pbmc1_columns = [col.split("|")[0] for col in pbmc1_columns_full]

# Get isoform IDs from annotation table
pbmc1_isoids = PBMC1_iso_anno["TXID"].tolist()

# Validate alignment
assert pbmc1_columns == pbmc1_isoids, "❌ PBMC1 isoform IDs do not align with matrix columns"

print("✅ PBMC1 isoform columns aligned with annotation IDs.")

✅ PBMC1 isoform columns aligned with annotation IDs.


In [7]:
# BEFORE INSERTING combined_IDs, validate row alignment
# Get the gene columns from PBMC1_iso, excluding CellID
pbmc2_columns_full = [col for col in PBMC2_iso.columns if col != "CellID"]

# Extract only the isoform names before "|" for matching
pbmc2_columns = [col.split("|")[0] for col in pbmc2_columns_full]

# Get isoform IDs from annotation table
pbmc2_isoids = PBMC2_iso_anno["TXID"].tolist()

# Validate alignment
assert pbmc2_columns == pbmc2_isoids, "❌ PBMC2 isoform IDs do not align with matrix columns"

print("✅ PBMC2 isoform columns aligned with annotation IDs.")

✅ PBMC2 isoform columns aligned with annotation IDs.


In [8]:
# Prepare first row: 'combined_IDs' followed by combined IDs
first_row = pd.Series(["combined_IDs"] + PBMC1_iso_anno["combined_ID"].tolist(), index=PBMC1_iso.columns)


# Prepend this row to PBMC1_iso or PBMC2_iso
PBMC1_iso_with_row = pd.concat([first_row.to_frame().T, PBMC1_iso], ignore_index=True)


In [9]:
first_row = pd.Series(["combined_IDs"] + PBMC2_iso_anno["combined_ID"].tolist(), index=PBMC2_iso.columns)
PBMC2_iso_with_row = pd.concat([first_row.to_frame().T, PBMC2_iso], ignore_index=True)

In [10]:
PBMC1_iso_with_row.head()

Unnamed: 0,CellID,BambuTx1|BambuGene29,BambuTx10|ENSG00000287919,BambuTx100|BambuGene54231,BambuTx101|ENSG00000196284,BambuTx102|ENSG00000196284,BambuTx103|ENSG00000146453,BambuTx104|ENSG00000308813,BambuTx105|BambuGene55251,BambuTx106|ENSG00000002822,...,ENST00000850621|ENSG00000163568,ENST00000850622|ENSG00000162695,ENST00000850623|ENSG00000149308,ENST00000850624|ENSG00000170265,ENST00000850641|ENSG00000144199,ENST00000850642|ENSG00000046647,ENST00000850646|ENSG00000176909,ENST00000850647|ENSG00000053501,ENST00000850667|ENSG00000244625,ENST00000850670|ENSG00000281903
0,combined_IDs,BambuGene29:BambuTx1,ENSG00000287919:BambuTx10,BambuGene54231:BambuTx100,ENSG00000196284:BambuTx101,ENSG00000196284:BambuTx102,ENSG00000146453:BambuTx103,ENSG00000308813:BambuTx104,BambuGene55251:BambuTx105,ENSG00000002822:BambuTx106,...,AIM2:ENSG00000163568:ENST00000850621,SLC30A7:ENSG00000162695:ENST00000850622,NPAT:ENSG00000149308:ENST00000850623,ZNF282:ENSG00000170265:ENST00000850624,FAHD2B:ENSG00000144199:ENST00000850641,GEMIN8:ENSG00000046647:ENST00000850642,MAMSTR:ENSG00000176909:ENST00000850646,USE1:ENSG00000053501:ENST00000850647,MIATNB:ENSG00000244625:ENST00000850667,ASMER1:ENSG00000281903:ENST00000850670
1,PBMC1_ACTCAGCTAGGCCACA_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PBMC1_ATTGATAAATTTAACT_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,PBMC1_CAAGACTCCCCTCACT_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,PBMC1_CACTAACAATAACAAG_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
PBMC2_iso_with_row.head()

Unnamed: 0,CellID,BambuTx1|BambuGene29,BambuTx10|ENSG00000287919,BambuTx100|BambuGene54231,BambuTx101|ENSG00000196284,BambuTx102|ENSG00000196284,BambuTx103|ENSG00000146453,BambuTx104|ENSG00000308813,BambuTx105|BambuGene55251,BambuTx106|ENSG00000002822,...,ENST00000850621|ENSG00000163568,ENST00000850622|ENSG00000162695,ENST00000850623|ENSG00000149308,ENST00000850624|ENSG00000170265,ENST00000850641|ENSG00000144199,ENST00000850642|ENSG00000046647,ENST00000850646|ENSG00000176909,ENST00000850647|ENSG00000053501,ENST00000850667|ENSG00000244625,ENST00000850670|ENSG00000281903
0,combined_IDs,BambuGene29:BambuTx1,ENSG00000287919:BambuTx10,BambuGene54231:BambuTx100,ENSG00000196284:BambuTx101,ENSG00000196284:BambuTx102,ENSG00000146453:BambuTx103,ENSG00000308813:BambuTx104,BambuGene55251:BambuTx105,ENSG00000002822:BambuTx106,...,AIM2:ENSG00000163568:ENST00000850621,SLC30A7:ENSG00000162695:ENST00000850622,NPAT:ENSG00000149308:ENST00000850623,ZNF282:ENSG00000170265:ENST00000850624,FAHD2B:ENSG00000144199:ENST00000850641,GEMIN8:ENSG00000046647:ENST00000850642,MAMSTR:ENSG00000176909:ENST00000850646,USE1:ENSG00000053501:ENST00000850647,MIATNB:ENSG00000244625:ENST00000850667,ASMER1:ENSG00000281903:ENST00000850670
1,PBMC2_AAACAAGTATTGCCAG_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PBMC2_CCAGATATATGTATAA_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,PBMC2_ATCTCCCGCACTACTG_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,PBMC2_CCGTCAGTAAACCCAT_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Assign new headers
PBMC1_iso.columns = ["CellID"] + PBMC1_iso_anno["combined_ID"].tolist()
PBMC2_iso.columns = ["CellID"] + PBMC2_iso_anno["combined_ID"].tolist()

In [13]:
PBMC1_iso.head()

Unnamed: 0,CellID,BambuGene29:BambuTx1,ENSG00000287919:BambuTx10,BambuGene54231:BambuTx100,ENSG00000196284:BambuTx101,ENSG00000196284:BambuTx102,ENSG00000146453:BambuTx103,ENSG00000308813:BambuTx104,BambuGene55251:BambuTx105,ENSG00000002822:BambuTx106,...,AIM2:ENSG00000163568:ENST00000850621,SLC30A7:ENSG00000162695:ENST00000850622,NPAT:ENSG00000149308:ENST00000850623,ZNF282:ENSG00000170265:ENST00000850624,FAHD2B:ENSG00000144199:ENST00000850641,GEMIN8:ENSG00000046647:ENST00000850642,MAMSTR:ENSG00000176909:ENST00000850646,USE1:ENSG00000053501:ENST00000850647,MIATNB:ENSG00000244625:ENST00000850667,ASMER1:ENSG00000281903:ENST00000850670
0,PBMC1_ACTCAGCTAGGCCACA_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,PBMC1_ATTGATAAATTTAACT_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PBMC1_CAAGACTCCCCTCACT_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,PBMC1_CACTAACAATAACAAG_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,PBMC1_CCCTAATCACCCATTT_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
PBMC2_iso.head()

Unnamed: 0,CellID,BambuGene29:BambuTx1,ENSG00000287919:BambuTx10,BambuGene54231:BambuTx100,ENSG00000196284:BambuTx101,ENSG00000196284:BambuTx102,ENSG00000146453:BambuTx103,ENSG00000308813:BambuTx104,BambuGene55251:BambuTx105,ENSG00000002822:BambuTx106,...,AIM2:ENSG00000163568:ENST00000850621,SLC30A7:ENSG00000162695:ENST00000850622,NPAT:ENSG00000149308:ENST00000850623,ZNF282:ENSG00000170265:ENST00000850624,FAHD2B:ENSG00000144199:ENST00000850641,GEMIN8:ENSG00000046647:ENST00000850642,MAMSTR:ENSG00000176909:ENST00000850646,USE1:ENSG00000053501:ENST00000850647,MIATNB:ENSG00000244625:ENST00000850667,ASMER1:ENSG00000281903:ENST00000850670
0,PBMC2_AAACAAGTATTGCCAG_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,PBMC2_CCAGATATATGTATAA_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PBMC2_ATCTCCCGCACTACTG_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,PBMC2_CCGTCAGTAAACCCAT_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,PBMC2_AATTAGCGACTAACTA_filtered_mapq_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Save annotated matrices
split_folder = "Parquet_Files/IntermediateFiles"

PBMC1_iso.to_parquet(os.path.join(split_folder, "PBMC1_iso_expr_annotated.parquet"), compression="zstd")
PBMC2_iso.to_parquet(os.path.join(split_folder, "PBMC2_iso_expr_annotated.parquet"), compression="zstd")

print("✅ Annotated gene expression matrices saved.")

✅ Annotated gene expression matrices saved.


In [16]:
import polars as pl
import scanpy as sc
import anndata as ad

# Load your annotated expression matrix
df = pl.read_parquet("Parquet_Files/IntermediateFiles/PBMC1_iso_expr_annotated.parquet")

# Extract cell IDs (obs) and expression matrix (X)
obs = pl.DataFrame({"cell_id": df["CellID"]}).to_pandas().set_index("cell_id")
X = df.drop("CellID").to_numpy()

# Extract var: the column names *are* the combined IDs
combined_ids = df.columns[1:]  # skip "CellID"
var = pl.DataFrame({"combined_id": combined_ids}).to_pandas().set_index("combined_id")

# Create AnnData
adata = ad.AnnData(X=X, obs=obs, var=var)

In [17]:
# Basic summary
print(adata)

# View cell metadata
print(adata.obs.head())

# View feature (gene/transcript) metadata
print(adata.var.head())

# View expression matrix shape or a slice
print(adata.X.shape)
print(adata.X[:5, :5])  # first 5 cells × first 5 features

AnnData object with n_obs × n_vars = 19592 × 101015
Empty DataFrame
Columns: []
Index: [PBMC1_ACTCAGCTAGGCCACA_filtered_mapq_10, PBMC1_ATTGATAAATTTAACT_filtered_mapq_10, PBMC1_CAAGACTCCCCTCACT_filtered_mapq_10, PBMC1_CACTAACAATAACAAG_filtered_mapq_10, PBMC1_CCCTAATCACCCATTT_filtered_mapq_10]
Empty DataFrame
Columns: []
Index: [BambuGene29:BambuTx1, ENSG00000287919:BambuTx10, BambuGene54231:BambuTx100, ENSG00000196284:BambuTx101, ENSG00000196284:BambuTx102]
(19592, 101015)
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]


In [18]:
# === Save (optional) ===
adata.write_h5ad("Intermediate_Files/QC_07232025/PBMC1_iso_AnnData_v2.h5ad", compression="gzip")

In [22]:
import polars as pl
import scanpy as sc
import anndata as ad

# Load your annotated expression matrix
df = pl.read_parquet("Parquet_Files/IntermediateFiles/PBMC2_iso_expr_annotated.parquet")

# Extract cell IDs (obs) and expression matrix (X)
obs = pl.DataFrame({"cell_id": df["CellID"]}).to_pandas().set_index("cell_id")
X = df.drop("CellID").to_numpy()

# Extract var: the column names *are* the combined IDs
combined_ids = df.columns[1:]  # skip "CellID"
var = pl.DataFrame({"combined_id": combined_ids}).to_pandas().set_index("combined_id")

# Create AnnData
adata = ad.AnnData(X=X, obs=obs, var=var)

In [23]:
# Basic summary
print(adata)

# View cell metadata
print(adata.obs.head())

# View feature (gene/transcript) metadata
print(adata.var.head())

# View expression matrix shape or a slice
print(adata.X.shape)
print(adata.X[:5, :5])  # first 5 cells × first 5 features

AnnData object with n_obs × n_vars = 32596 × 101015
Empty DataFrame
Columns: []
Index: [PBMC2_AAACAAGTATTGCCAG_filtered_mapq_10, PBMC2_CCAGATATATGTATAA_filtered_mapq_10, PBMC2_ATCTCCCGCACTACTG_filtered_mapq_10, PBMC2_CCGTCAGTAAACCCAT_filtered_mapq_10, PBMC2_AATTAGCGACTAACTA_filtered_mapq_10]
Empty DataFrame
Columns: []
Index: [BambuGene29:BambuTx1, ENSG00000287919:BambuTx10, BambuGene54231:BambuTx100, ENSG00000196284:BambuTx101, ENSG00000196284:BambuTx102]
(32596, 101015)
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]


In [24]:
# === Save (optional) ===
adata.write_h5ad("Intermediate_Files/QC_07232025/PBMC2_iso_AnnData_v2.h5ad", compression="gzip")

In [25]:
import polars as pl
import scanpy as sc
import anndata as ad

# Load your annotated expression matrix
df = pl.read_parquet("Parquet_Files/IntermediateFiles/PBMC1_gene_expr_annotated.parquet")

# Extract cell IDs (obs) and expression matrix (X)
obs = pl.DataFrame({"cell_id": df["CellID"]}).to_pandas().set_index("cell_id")
X = df.drop("CellID").to_numpy()

# Extract var: the column names *are* the combined IDs
combined_ids = df.columns[1:]  # skip "CellID"
var = pl.DataFrame({"combined_id": combined_ids}).to_pandas().set_index("combined_id")

# Create AnnData
adata = ad.AnnData(X=X, obs=obs, var=var)

In [26]:
# Basic summary
print(adata)

# View cell metadata
print(adata.obs.head())

# View feature (gene/transcript) metadata
print(adata.var.head())

# View expression matrix shape or a slice
print(adata.X.shape)
print(adata.X[:5, :5])  # first 5 cells × first 5 features

AnnData object with n_obs × n_vars = 19772 × 26436
Empty DataFrame
Columns: []
Index: [PBMC1_ACGTAAACATTTATAC_filtered_mapq_10, PBMC1_CAGACAGAACGCAGGA_filtered_mapq_10, PBMC1_AGAGCAACACCGAGGT_filtered_mapq_10, PBMC1_AACAATCACCGGCATC_filtered_mapq_10, PBMC1_AGTAAGTACCGTAGGT_filtered_mapq_10]
Empty DataFrame
Columns: []
Index: [BambuGene10388, BambuGene10462, BambuGene11222, BambuGene1258, BambuGene12607]
(19772, 26436)
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]


In [27]:
# === Save (optional) ===
adata.write_h5ad("Intermediate_Files/QC_07232025/PBMC1_gene_AnnData_v2.h5ad", compression="gzip")

In [29]:
import polars as pl
import scanpy as sc
import anndata as ad

# Load your annotated expression matrix
df = pl.read_parquet("Parquet_Files/IntermediateFiles/PBMC2_gene_expr_annotated.parquet")

# Extract cell IDs (obs) and expression matrix (X)
obs = pl.DataFrame({"cell_id": df["CellID"]}).to_pandas().set_index("cell_id")
X = df.drop("CellID").to_numpy()

# Extract var: the column names *are* the combined IDs
combined_ids = df.columns[1:]  # skip "CellID"
var = pl.DataFrame({"combined_id": combined_ids}).to_pandas().set_index("combined_id")

# Create AnnData
adata = ad.AnnData(X=X, obs=obs, var=var)

In [30]:
# Basic summary
print(adata)

# View cell metadata
print(adata.obs.head())

# View feature (gene/transcript) metadata
print(adata.var.head())

# View expression matrix shape or a slice
print(adata.X.shape)
print(adata.X[:5, :5])  # first 5 cells × first 5 features

AnnData object with n_obs × n_vars = 32988 × 26436
Empty DataFrame
Columns: []
Index: [PBMC2_CAGGATCGAGCGATCC_filtered_mapq_10, PBMC2_CATACCAAATGGATAT_filtered_mapq_10, PBMC2_ATTGAAGTCCGAAGAA_filtered_mapq_10, PBMC2_CCTAACACACAAACAG_filtered_mapq_10, PBMC2_AGATACCGCAAAAGAT_filtered_mapq_10]
Empty DataFrame
Columns: []
Index: [BambuGene10388, BambuGene10462, BambuGene11222, BambuGene1258, BambuGene12607]
(32988, 26436)
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]


In [31]:
# === Save (optional) ===
adata.write_h5ad("Intermediate_Files/QC_07232025/PBMC2_gene_AnnData_v2.h5ad", compression="gzip")