In [None]:
import sys, os, platform, scanpy, anndata, polars as pl, pandas as pd

print("Python:", sys.version.split()[0])
print("OS:", platform.system(), platform.release())
print("scanpy:", scanpy.__version__)
print("anndata:", anndata.__version__)
print("polars:", pl.__version__)
print("pandas:", pd.__version__)

In [None]:
import pandas as pd

## Read the GTF into a DataFrame
gtf = pd.read_csv(
    "Homo_sapiens.GRCh38.113.gtf",
    sep="\t",     # GTFs are tab-separated
    comment="#",  # Skip header/comment lines beginning with '#'
    header=None,
    names=[
        "seqname", "source", "feature", "start", "end",
        "score", "strand", "frame", "attribute"
    ],
    # Pin the text columns to str. Without this, `seqname` is left to chunked
    # type inference, which (presumably, for chromosome labels such as "1" and
    # "X") yields a mixed int/str column and a DtypeWarning.
    dtype={"seqname": str, "attribute": str},
)

# Extract the fields we care about via regex on the 'attribute' column.
# expand=False returns a Series (one capture group) rather than a one-column
# DataFrame, which is what a single-column assignment wants.
gtf["transcript_id"] = gtf["attribute"].str.extract(r'transcript_id "([^"]+)"', expand=False)
gtf["gene_id"] = gtf["attribute"].str.extract(r'gene_id "([^"]+)"', expand=False)
gtf["gene_name"] = gtf["attribute"].str.extract(r'gene_name "([^"]+)"', expand=False)

# Keep rows carrying both an ENSG gene ID and an ENST transcript ID.
# No filter on the `feature` column is applied: every feature type that has a
# transcript_id contributes, and drop_duplicates() collapses them to one row
# per distinct (transcript_id, gene_id, gene_name) combination.
tx = (
    gtf[
        gtf["gene_id"].str.startswith("ENSG", na=False) &
        gtf["transcript_id"].str.startswith("ENST", na=False)
    ]
    .loc[:, ["transcript_id", "gene_id", "gene_name"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Write out the transcript-level table
tx.to_csv("transcript_annotation_key_info.tsv", sep="\t", index=False)

# Gene-level table: one row per distinct (gene_id, gene_name) pair.
# Rows without a gene_name attribute keep a missing value in that column.
genes = (
    gtf[
        gtf["gene_id"].str.startswith("ENSG", na=False)
    ]
    .loc[:, ["gene_id", "gene_name"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

genes.to_csv("gene_annotation_key_info.tsv", sep="\t", index=False)

In [None]:
import polars as pl

# Reuse the gene-level table written by the GTF-parsing cell
gene_annot = pl.read_csv("gene_annotation_key_info.tsv", separator="\t")

# Pull both annotation columns out as plain Python lists
gene_ids = gene_annot["gene_id"].to_list()
gene_names = gene_annot["gene_name"].to_list()

# Each row starts with a label cell, followed by one value per gene
row_1 = ["gene_ids", *gene_ids]
row_2 = ["gene_name", *gene_names]

# One output column per position: col_<i> -> [gene ID cell, gene name cell]
row_dict = {
    f"col_{idx}": [id_cell, name_cell]
    for idx, (id_cell, name_cell) in enumerate(zip(row_1, row_2))
}

# Wide layout: 2 rows, one column per gene (plus the label column col_0)
transposed_df = pl.DataFrame(row_dict)

# Persist the wide table as TSV
transposed_df.write_csv("gene_annotation_key_info_transposed.tsv", separator="\t")

In [None]:
import os
import polars as pl

def snag_and_write_gene_ids_row_with_label(txt_paths, output_dir):
    """
    Convert tab-separated expression matrices to Parquet and record their gene IDs.

    For each path in ``txt_paths``:
    - Read the file with an enforced schema (``CellID`` as Utf8, every other
      header column as Float64).
    - Write the full table to ``<output_dir>/<sample_name>.parquet``.
    - Write a single-row table to ``<output_dir>/gene_IDs_<sample_name>.parquet``
      whose first cell is the label ``'GENEIDs'`` and whose remaining cells are
      the non-``CellID`` column names, in file order.

    ``sample_name`` is the file's base name with a trailing ``.txt`` removed.

    Args:
        txt_paths: Sized iterable of paths to tab-separated text files whose
            first line is the header.
        output_dir: Directory that receives the Parquet files; it is created
            (including parents) if it does not exist yet.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # Writing Parquet into a missing directory fails, so create it up front.
    os.makedirs(output_dir, exist_ok=True)

    for path in txt_paths:
        # Strip only a trailing ".txt"; a blanket replace() would also eat
        # ".txt" occurring in the middle of the file name.
        file_name = os.path.basename(path)
        sample_name = file_name[:-len(".txt")] if file_name.endswith(".txt") else file_name
        print(f"🔍 Reading: {sample_name}")

        # STEP 1: Load header to get column names
        with open(path, 'r') as f:
            header_line = f.readline().strip()

        columns = header_line.split("\t")

        # STEP 2: Build schema override (CellID stays text, everything else numeric)
        schema_overrides = {"CellID": pl.Utf8}
        schema_overrides.update({col: pl.Float64 for col in columns if col != "CellID"})

        # STEP 3: Load file using enforced schema
        df = pl.read_csv(
            path,
            separator="\t",
            schema_overrides=schema_overrides,
            infer_schema_length=0,  # Disable inference of datatypes by Polars
            try_parse_dates=False
        )

        # STEP 4: Save full dataframe as parquet
        parquet_out_path = os.path.join(output_dir, f"{sample_name}.parquet")
        df.write_parquet(parquet_out_path, compression="zstd", compression_level=4)
        print(f"✅ Saved full dataset to: {parquet_out_path}")

        # STEP 5: Create single-row DataFrame of GENEIDs (label cell first)
        gene_ids = [col for col in df.columns if col != "CellID"]
        row_dict = {f"col_{i}": [val] for i, val in enumerate(["GENEIDs"] + gene_ids)}
        id_df = pl.DataFrame(row_dict)

        # STEP 6: Save gene ID row as parquet
        ids_out_path = os.path.join(output_dir, f"gene_IDs_{sample_name}.parquet")
        id_df.write_parquet(ids_out_path, compression="zstd", compression_level=4)
        print(f"✅ Saved labeled gene IDs row to: {ids_out_path}")

    print(f"🎉 Finished processing {len(txt_paths)} files.")

In [None]:
# Convert the gene-level count matrices of both samples to Parquet.
# Both inputs live in the same "InitialFiltering/" directory (the PBMC2 path
# previously contained a stray ". " after the directory name).
gene_files = [
    "InitialFiltering/PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC1_combined_counts_gene.filtered_transposed_expression_matrix.txt",
    "InitialFiltering/PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC2_combined_counts_gene.filtered_transposed_expression_matrix.txt"
]

snag_and_write_gene_ids_row_with_label(gene_files, output_dir="Parquet_Files/RawData")

In [None]:
gene_annot = pl.read_csv("gene_annotation_key_info_transposed.tsv", separator="\t")

split_folder = "Parquet_Files/RawData"
PBMC1_gene_id = pl.read_parquet(os.path.join(split_folder, "gene_IDs_PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC1_combined_counts_gene.filtered_transposed_expression_matrix.parquet"))
PBMC2_gene_id = pl.read_parquet(os.path.join(split_folder, "gene_IDs_PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC2_combined_counts_gene.filtered_transposed_expression_matrix.parquet"))

In [None]:
PBMC1_gene_id.head()

In [None]:
gene_annot.head()

In [None]:
def annotate_gene_ids_from_row(row_df: pl.DataFrame, id_to_name: dict) -> pl.DataFrame:
    """
    Turn a single-row table of gene IDs into a long two-column table.

    Args:
        row_df: DataFrame whose first row holds the gene IDs, optionally
            preceded by the label cell 'GENEIDs'.
        id_to_name: Mapping from gene ID to gene name.

    Returns:
        DataFrame with columns GENEID and GENE_NAME, one row per gene ID in
        input order. GENE_NAME is None for IDs missing from ``id_to_name``.
    """
    ids = row_df.row(0)

    # Drop the leading label cell when present so only real IDs remain
    if ids[0] == "GENEIDs":
        ids = ids[1:]

    # dict.get yields None for IDs without an entry in the lookup
    names = list(map(id_to_name.get, ids))

    return pl.DataFrame({
        "GENEID": ids,
        "GENE_NAME": names
    })

In [None]:
##  Convert gene_annot to a Lookup table:
# Extract gene IDs and gene names from gene_annot
gene_ids = gene_annot.row(0)[1:]   # Skip the "gene_ids" label
gene_names = gene_annot.row(1)[1:] # Skip the "gene_name" label

# Create dictionary: gene_id → gene_name
id_to_name = dict(zip(gene_ids, gene_names))

In [None]:
# Annotate PBMC1 and PBMC2 single-row DataFrames using lookup

PBMC1_gene_annotated = annotate_gene_ids_from_row(PBMC1_gene_id, id_to_name)
PBMC2_gene_annotated = annotate_gene_ids_from_row(PBMC2_gene_id, id_to_name)

In [None]:
PBMC1_gene_annotated.tail()

In [None]:
# Fill missing gene names with empty string to prevent NaN values in merged outputs
PBMC1_gene_annotated = PBMC1_gene_annotated.with_columns(
    pl.col("GENE_NAME").fill_null("") # Replace nulls with empty string
)

PBMC2_gene_annotated = PBMC2_gene_annotated.with_columns(
    pl.col("GENE_NAME").fill_null("") # Replace nulls with empty string
)

In [None]:
PBMC1_gene_annotated.head()

In [None]:
# Save annotated tables
split_folder = "Parquet_Files/IntermediateFiles"
PBMC1_gene_annotated.write_parquet(os.path.join(split_folder, "annotated_gene_IDs_PBMC1.parquet"), compression="zstd")
PBMC2_gene_annotated.write_parquet(os.path.join(split_folder, "annotated_gene_IDs_PBMC2.parquet"), compression="zstd")

In [None]:
import pandas as pd
split_folder = "Parquet_Files/IntermediateFiles"
# Load annotated Parquet files using Polars, then convert to Pandas DataFrames
PBMC1_gene_anno = pl.read_parquet(os.path.join(split_folder, "annotated_gene_IDs_PBMC1.parquet")).to_pandas()
PBMC2_gene_anno = pl.read_parquet(os.path.join(split_folder, "annotated_gene_IDs_PBMC2.parquet")).to_pandas()

In [None]:
def create_combined_gene_id(geneid: str, symbol: str) -> str:
    """
    Build the combined gene label used as a matrix column header.

    Args:
        geneid: Gene identifier. Surrounding whitespace is removed.
        symbol: Gene symbol/name, may be missing. Surrounding whitespace is removed.

    Returns:
        "<symbol>:<geneid>" when a symbol is available, otherwise just
        "<geneid>". The same rule applies whether or not the ID starts with
        "ENSG".
    """
    # Anything that is not a string (None, or the float NaN pandas uses for
    # missing values) counts as empty. NaN is truthy, so `(value or "")` would
    # pass it through and crash on .strip().
    geneid = geneid.strip() if isinstance(geneid, str) else ""
    symbol = symbol.strip() if isinstance(symbol, str) else ""

    return f"{symbol}:{geneid}" if symbol else geneid

In [None]:
# Apply the combination function row-wise to both dataframes
PBMC1_gene_anno["combined_ID"] = PBMC1_gene_anno.apply(
    lambda row: create_combined_gene_id(row["GENEID"], row["GENE_NAME"]),
    axis=1
)

PBMC2_gene_anno["combined_ID"] = PBMC2_gene_anno.apply(
    lambda row: create_combined_gene_id(row["GENEID"], row["GENE_NAME"]),
    axis=1
)

In [None]:
#Check that combined_ID was created with GENE_NAME:GENEID, if GENE_NAME has contents
#If not, combined_ID will only contain GENEID
PBMC1_gene_anno.tail()

In [None]:
# Save outputs
split_folder = "Parquet_Files/IntermediateFiles"
PBMC1_gene_anno.to_parquet(os.path.join(split_folder, "PBMC1_gene_IDs_with_combined.parquet"), compression="zstd")
PBMC2_gene_anno.to_parquet(os.path.join(split_folder, "PBMC2_gene_IDs_with_combined.parquet"), compression="zstd")
print("✅ Combined ID columns created and saved.")

In [None]:
split_folder = "Parquet_Files/RawData"

# Step 4: Load the updated annotated and sorted data
PBMC1_gene = pd.read_parquet(os.path.join(split_folder, "PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC1_combined_counts_gene.filtered_transposed_expression_matrix.parquet"))
PBMC2_gene = pd.read_parquet(os.path.join(split_folder, "PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC2_combined_counts_gene.filtered_transposed_expression_matrix.parquet"))

In [None]:
PBMC1_gene.head()

In [None]:
PBMC1_gene.shape

In [None]:
PBMC1_gene_anno.shape

In [None]:
# BEFORE INSERTING combined_IDs, validate row alignment
# Get the gene columns from PBMC1_gene, excluding CellID
pbmc1_columns = [col for col in PBMC1_gene.columns if col != "CellID"]

# Get GENEID list from PBMC1_gene_anno
pbmc1_geneids = PBMC1_gene_anno["GENEID"].tolist()

# Assert that column names match GENEID list
assert pbmc1_columns == pbmc1_geneids, "PBMC1_gene_id mismatch"

print("✅ PBMC1 gene columns aligned with annotation IDs.")

In [None]:
# Validate PBMC2 alignment before renaming columns:
# collect the gene columns from PBMC2_gene, excluding CellID
pbmc2_columns = [col for col in PBMC2_gene.columns if col != "CellID"]

# Get GENEID list from PBMC2_gene_anno
pbmc2_geneids = PBMC2_gene_anno["GENEID"].tolist()

# Assert that column names match the GENEID list, element by element and in order
assert pbmc2_columns == pbmc2_geneids, "PBMC2_gene_id mismatch"

print("✅ PBMC2 gene columns aligned with annotation IDs.")

In [None]:
# Prepare first row: 'combined_IDs' followed by combined IDs
first_row = pd.Series(["combined_IDs"] + PBMC1_gene_anno["combined_ID"].tolist(), index=PBMC1_gene.columns)
first_row_2 = pd.Series(["combined_IDs"] + PBMC2_gene_anno["combined_ID"].tolist(), index=PBMC2_gene.columns)

# Prepend this row to PBMC1_gene or PBMC2_gene
PBMC1_gene_with_row = pd.concat([first_row.to_frame().T, PBMC1_gene], ignore_index=True)
PBMC2_gene_with_row = pd.concat([first_row_2.to_frame().T, PBMC2_gene], ignore_index=True)

In [None]:
PBMC1_gene_with_row.head()

In [None]:
PBMC2_gene_with_row.head()

In [None]:
# Assign new headers
PBMC1_gene.columns = ["CellID"] + PBMC1_gene_anno["combined_ID"].tolist()
PBMC2_gene.columns = ["CellID"] + PBMC2_gene_anno["combined_ID"].tolist()

In [None]:
PBMC1_gene.head()

In [None]:
PBMC2_gene.head()

In [None]:
# Save annotated matrices
split_folder = "Parquet_Files/IntermediateFiles"

PBMC1_gene.to_parquet(os.path.join(split_folder, "PBMC1_gene_expr_annotated.parquet"), compression="zstd")
PBMC2_gene.to_parquet(os.path.join(split_folder, "PBMC2_gene_expr_annotated.parquet"), compression="zstd")

print("✅ Annotated gene expression matrices saved.")

In [None]:
import polars as pl

# Read the transcript-level annotation table (long format)
iso_annot = pl.read_csv("transcript_annotation_key_info.tsv", separator="\t")

# Pull the three annotation columns out as plain Python lists
tx_names = iso_annot["transcript_id"].to_list()
gene_ids = iso_annot["gene_id"].to_list()
gene_names = iso_annot["gene_name"].to_list()

# Each row starts with a label cell, followed by one value per transcript
row_1 = ["transcript_id", *tx_names]   # transcript IDs
row_2 = ["gene_id", *gene_ids]         # gene IDs
row_3 = ["gene_name", *gene_names]     # gene names

# One output column per position: col_<i> -> [transcript, gene ID, gene name]
row_dict = {
    f"col_{idx}": [tx_cell, gid_cell, gname_cell]
    for idx, (tx_cell, gid_cell, gname_cell) in enumerate(zip(row_1, row_2, row_3))
}

# Wide layout: 3 rows, one column per transcript (plus the label column col_0)
transposed_iso_annot = pl.DataFrame(row_dict)

# Persist the wide table as TSV
transposed_iso_annot.write_csv("transcript_annotation_key_info_transposed.tsv", separator="\t")
print("✅ Isoform annotation transposed and saved.")

In [None]:
import os
import polars as pl

def snag_and_write_isoform_ids_row_with_label(txt_paths, output_dir):
    """
    Convert tab-separated isoform matrices to Parquet and record their IDs.

    For each path in ``txt_paths``:
    - Read the file with an enforced schema (``CellID`` as Utf8, every other
      header column as Float64).
    - Write the full table to ``<output_dir>/<sample_name>.parquet``.
    - Write a two-row table to ``<output_dir>/isoform_IDs_<sample_name>.parquet``:
        - Row 1: 'TXID', followed by the part of each column name before the
          first "|".
        - Row 2: 'GENEID', followed by the second "|"-separated field of each
          column name, or None when the name contains no "|".

    ``sample_name`` is the file's base name with a trailing ``.txt`` removed.

    Args:
        txt_paths: Sized iterable of paths to tab-separated text files whose
            first line is the header.
        output_dir: Directory that receives the Parquet files; it is created
            (including parents) if it does not exist yet.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # Writing Parquet into a missing directory fails, so create it up front.
    os.makedirs(output_dir, exist_ok=True)

    for path in txt_paths:
        # Strip only a trailing ".txt"; a blanket replace() would also eat
        # ".txt" occurring in the middle of the file name.
        file_name = os.path.basename(path)
        sample_name = file_name[:-len(".txt")] if file_name.endswith(".txt") else file_name
        print(f"🔍 Reading: {sample_name}")

        # STEP 1: Load header to get column names
        with open(path, 'r') as f:
            header_line = f.readline().strip()

        columns = header_line.split("\t")

        # STEP 2: Build schema override (CellID stays text, everything else numeric)
        schema_overrides = {"CellID": pl.Utf8}
        schema_overrides.update({col: pl.Float64 for col in columns if col != "CellID"})

        # STEP 3: Load file with enforced schema
        df = pl.read_csv(
            path,
            separator="\t",
            schema_overrides=schema_overrides,
            infer_schema_length=0,  # Disable inference of datatypes by Polars
            try_parse_dates=False
        )

        # STEP 4: Save full dataframe as parquet
        parquet_out_path = os.path.join(output_dir, f"{sample_name}.parquet")
        df.write_parquet(parquet_out_path, compression="zstd", compression_level=4)
        print(f"✅ Saved full dataset to: {parquet_out_path}")

        # STEP 5: Parse TXID and GENEID out of the "TXID|GENEID..." column names
        tx_columns = [col for col in df.columns if col != "CellID"]
        tx_ids = [col.split("|")[0] for col in tx_columns]
        gene_ids = [col.split("|")[1] if "|" in col else None for col in tx_columns]

        row_1 = ["TXID"] + tx_ids
        row_2 = ["GENEID"] + gene_ids

        # Build two-row DataFrame: col_<i> -> [TXID cell, GENEID cell]
        row_dict = {f"col_{i}": [row_1[i], row_2[i]] for i in range(len(row_1))}
        id_df = pl.DataFrame(row_dict)

        # STEP 6: Save isoform ID annotation as parquet
        ids_out_path = os.path.join(output_dir, f"isoform_IDs_{sample_name}.parquet")
        id_df.write_parquet(ids_out_path, compression="zstd", compression_level=4)
        print(f"✅ Saved TXID + GENEID row to: {ids_out_path}")

    print(f"🎉 Finished processing {len(txt_paths)} files.")

In [None]:
# Run the isoform converter on the transcript-level count matrices of both samples;
# outputs go to the same RawData folder as the gene-level Parquet files
iso_files = [
     "InitialFiltering/PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC1_combined_counts_transcript.filtered_transposed_expression_matrix.txt",
     "InitialFiltering/PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC2_combined_counts_transcript.filtered_transposed_expression_matrix.txt"
]

snag_and_write_isoform_ids_row_with_label(iso_files, output_dir="Parquet_Files/RawData")

In [None]:
split_folder = "Parquet_Files/RawData"
PBMC1_iso_id = pl.read_parquet(os.path.join(split_folder, "isoform_IDs_PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC1_combined_counts_transcript.filtered_transposed_expression_matrix.parquet"))
PBMC2_iso_id = pl.read_parquet(os.path.join(split_folder, "isoform_IDs_PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC2_combined_counts_transcript.filtered_transposed_expression_matrix.parquet"))

In [None]:
PBMC1_iso_id.head()

In [None]:
import polars as pl

# Load the original, clean annotation file
iso_annot = pl.read_csv("transcript_annotation_key_info.tsv", separator="\t")

# Build dictionary: transcript_id → (gene_id, gene_name)
tx_to_gene = {
    tx: (gid, gname)
    for tx, gid, gname in zip(
        iso_annot["transcript_id"],
        iso_annot["gene_id"],
        iso_annot["gene_name"]
    )
}

In [None]:
# Build transcript_id -> (gene_id, gene_name) from the isoform annotation table.
# NOTE(review): this holds the same mapping as `tx_to_gene` built in the cell
# above, and the annotation call further down passes `tx_to_gene` — one of the
# two dictionaries looks redundant; confirm before removing either.
tx_to_gene_info = dict(zip(
    iso_annot["transcript_id"].to_list(),
    zip(iso_annot["gene_id"].to_list(), iso_annot["gene_name"].to_list())
))

In [None]:
def annotate_isoform_ids_from_row(row_df: pl.DataFrame, tx_to_gene_info: dict) -> pl.DataFrame:
    """
    Reshape the wide isoform ID table into a long annotation table.

    Args:
        row_df: Wide DataFrame whose row 0 is 'TXID' followed by transcript IDs
            and whose row 1 is 'GENEID' followed by gene IDs. Any further rows
            are ignored.
        tx_to_gene_info: Mapping transcript ID -> (gene_id, gene_name). Only
            the gene name is taken from it.

    Returns:
        DataFrame with columns TXID, GENEID, GENE_NAME, one row per transcript
        in input order. GENE_NAME is None for transcripts missing from the
        mapping.
    """
    # Drop the leading label cell of each row
    tx_ids = row_df.row(0)[1:]
    gene_ids = row_df.row(1)[1:]

    # The gene name is the second element of the mapped pair; unknown
    # transcripts fall back to a (None, None) pair
    missing = (None, None)
    gene_names = [tx_to_gene_info.get(tx, missing)[1] for tx in tx_ids]

    return pl.DataFrame({
        "TXID": tx_ids,
        "GENEID": gene_ids,
        "GENE_NAME": gene_names
    })

In [None]:
PBMC1_iso_annotated = annotate_isoform_ids_from_row(PBMC1_iso_id, tx_to_gene)
PBMC2_iso_annotated = annotate_isoform_ids_from_row(PBMC2_iso_id, tx_to_gene)

In [None]:
PBMC1_iso_annotated.tail()

In [None]:
# Then fill in missing gene names with empty string (not NaN) to avoid NaNs in combined_ID
PBMC1_iso_annotated = PBMC1_iso_annotated.with_columns(
    pl.col("GENE_NAME").fill_null("")
)

PBMC2_iso_annotated = PBMC2_iso_annotated.with_columns(
    pl.col("GENE_NAME").fill_null("")
)

In [None]:
PBMC1_iso_annotated.head()

In [None]:
# Save annotated tables
split_folder = "Parquet_Files/IntermediateFiles"
PBMC1_iso_annotated.write_parquet(os.path.join(split_folder, "annotated_iso_IDs_PBMC1.parquet"), compression="zstd")
PBMC2_iso_annotated.write_parquet(os.path.join(split_folder, "annotated_iso_IDs_PBMC2.parquet"), compression="zstd")

In [None]:
import pandas as pd
split_folder = "Parquet_Files/IntermediateFiles"
# Load annotated files as pandas
PBMC1_iso_anno = pl.read_parquet(os.path.join(split_folder, "annotated_iso_IDs_PBMC1.parquet")).to_pandas()
PBMC2_iso_anno = pl.read_parquet(os.path.join(split_folder, "annotated_iso_IDs_PBMC2.parquet")).to_pandas()

In [None]:
import pandas as pd

def create_combined_gene_id(geneid: str, symbol: str, txname: "str | None" = None) -> str:
    """
    Build the combined label used as a matrix column header.

    This definition replaces the earlier two-argument function of the same
    name in the notebook namespace. ``txname`` is therefore optional, so that
    gene-level calls ``create_combined_gene_id(geneid, symbol)`` keep working
    after this cell has run.

    Args:
        geneid: Gene identifier. Surrounding whitespace is removed.
        symbol: Gene symbol/name, may be missing. Surrounding whitespace is removed.
        txname: Transcript identifier. When omitted (None) the gene-level
            label is returned.

    Returns:
        Isoform level: "<symbol>:<geneid>:<txname>", or "<geneid>:<txname>"
        when no symbol is available.
        Gene level (txname is None): "<symbol>:<geneid>", or "<geneid>".
        The format is the same whether or not the ID starts with "ENSG".
    """
    def _clean(value) -> str:
        # Non-strings (None, pandas NaN) count as empty; NaN is truthy, so
        # `(value or "")` would pass it through and crash on .strip().
        return value.strip() if isinstance(value, str) else ""

    parts = [_clean(geneid)]

    symbol = _clean(symbol)
    if symbol:
        parts.insert(0, symbol)

    if txname is not None:
        parts.append(_clean(txname))

    # Joining with a bare ":" removes the stray space the non-ENSG branch used
    # to emit before the transcript name ("SYM:GENE: TX").
    return ":".join(parts)

In [None]:
# Apply row-wise in Pandas:
PBMC1_iso_anno["combined_ID"] = PBMC1_iso_anno.apply(
    lambda row: create_combined_gene_id(row["GENEID"], row["GENE_NAME"], row["TXID"]),
    axis=1
)

PBMC2_iso_anno["combined_ID"] = PBMC2_iso_anno.apply(
    lambda row: create_combined_gene_id(row["GENEID"], row["GENE_NAME"], row["TXID"]),
    axis=1
)

In [None]:
PBMC1_iso_anno.head()

In [None]:
PBMC1_iso_anno.tail()

In [None]:
# Save outputs
split_folder = "Parquet_Files/IntermediateFiles"
PBMC1_iso_anno.to_parquet(os.path.join(split_folder, "PBMC1_iso_IDs_with_combined.parquet"), compression="zstd")
PBMC2_iso_anno.to_parquet(os.path.join(split_folder, "PBMC2_iso_IDs_with_combined.parquet"), compression="zstd")
print("✅ Combined ID columns created and saved.")

In [None]:
split_folder = "Parquet_Files/RawData"  # Update this if your path is different
split_dir = "Parquet_Files/IntermediateFiles"  # Update this if your path is different

# Step 4: Load the data
PBMC1_iso = pd.read_parquet(os.path.join(split_folder, "PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC1_combined_counts_transcript.filtered_transposed_expression_matrix.parquet"))
PBMC2_iso = pd.read_parquet(os.path.join(split_folder, "PBMC_patient0_JUNE_16_2025_bambu_quant_PBMC2_combined_counts_transcript.filtered_transposed_expression_matrix.parquet"))
PBMC1_iso_anno = pd.read_parquet(os.path.join(split_dir, "PBMC1_iso_IDs_with_combined.parquet"))
PBMC2_iso_anno = pd.read_parquet(os.path.join(split_dir, "PBMC2_iso_IDs_with_combined.parquet"))

In [None]:
print(PBMC1_iso.shape)
print(PBMC1_iso_anno.shape)

In [None]:
# Preview the first rows. head must be called; the bare attribute only
# displays the bound method instead of a five-row preview.
PBMC1_iso.head()

In [None]:
# Preview the first rows. head must be called; the bare attribute only
# displays the bound method instead of a five-row preview.
PBMC1_iso_anno.head()

In [None]:
print(PBMC2_iso.shape)
print(PBMC2_iso_anno.shape)

In [None]:
# BEFORE INSERTING combined_IDs, validate row alignment
# Get the gene columns from PBMC1_iso, excluding CellID
pbmc1_columns_full = [col for col in PBMC1_iso.columns if col != "CellID"]

# Extract only the isoform names before "|" for matching
pbmc1_columns = [col.split("|")[0] for col in pbmc1_columns_full]

# Get isoform IDs from annotation table
pbmc1_isoids = PBMC1_iso_anno["TXID"].tolist()

# Validate alignment
assert pbmc1_columns == pbmc1_isoids, "❌ PBMC1 isoform IDs do not align with matrix columns"

print("✅ PBMC1 isoform columns aligned with annotation IDs.")

In [None]:
# BEFORE INSERTING combined_IDs, validate alignment for PBMC2
# Get the isoform columns from PBMC2_iso, excluding CellID
pbmc2_columns_full = [col for col in PBMC2_iso.columns if col != "CellID"]

# Extract only the isoform names before "|" for matching
pbmc2_columns = [col.split("|")[0] for col in pbmc2_columns_full]

# Get isoform IDs from the PBMC2 annotation table
pbmc2_isoids = PBMC2_iso_anno["TXID"].tolist()

# Validate alignment: same IDs, element by element and in the same order
assert pbmc2_columns == pbmc2_isoids, "❌ PBMC2 isoform IDs do not align with matrix columns"

print("✅ PBMC2 isoform columns aligned with annotation IDs.")

In [None]:
# Prepare first row: 'combined_IDs' followed by combined IDs
first_row = pd.Series(["combined_IDs"] + PBMC1_iso_anno["combined_ID"].tolist(), index=PBMC1_iso.columns)


# Prepend this row to PBMC1_iso or PBMC2_iso
PBMC1_iso_with_row = pd.concat([first_row.to_frame().T, PBMC1_iso], ignore_index=True)


In [None]:
first_row = pd.Series(["combined_IDs"] + PBMC2_iso_anno["combined_ID"].tolist(), index=PBMC2_iso.columns)
PBMC2_iso_with_row = pd.concat([first_row.to_frame().T, PBMC2_iso], ignore_index=True)

In [None]:
PBMC1_iso_with_row.head()

In [None]:
PBMC2_iso_with_row.head()

In [None]:
# Assign new headers
PBMC1_iso.columns = ["CellID"] + PBMC1_iso_anno["combined_ID"].tolist()
PBMC2_iso.columns = ["CellID"] + PBMC2_iso_anno["combined_ID"].tolist()

In [None]:
PBMC1_iso.head()

In [None]:
PBMC2_iso.head()

In [None]:
# Save the isoform-level matrices with their combined-ID column headers
split_folder = "Parquet_Files/IntermediateFiles"

PBMC1_iso.to_parquet(os.path.join(split_folder, "PBMC1_iso_expr_annotated.parquet"), compression="zstd")
PBMC2_iso.to_parquet(os.path.join(split_folder, "PBMC2_iso_expr_annotated.parquet"), compression="zstd")

# The message names the isoform matrices; it previously said "gene", which
# duplicated the log line of the gene-level save cell.
print("✅ Annotated isoform expression matrices saved.")

In [None]:
import polars as pl
import scanpy as sc
import anndata as ad

# Load the PBMC1 isoform-level expression matrix (cells in rows, combined IDs as columns)
df = pl.read_parquet("Parquet_Files/IntermediateFiles/PBMC1_iso_expr_annotated.parquet")

# obs: one row per cell, indexed by the CellID values
obs = pl.DataFrame({"cell_id": df["CellID"]}).to_pandas().set_index("cell_id")
# X: every column except CellID, converted to a NumPy array
X = df.drop("CellID").to_numpy()

# var: the column names *are* the combined IDs
combined_ids = df.columns[1:]  # positional skip: relies on "CellID" being the first column
var = pl.DataFrame({"combined_id": combined_ids}).to_pandas().set_index("combined_id")

# Create AnnData (cells x features)
adata = ad.AnnData(X=X, obs=obs, var=var)

In [None]:
# Basic summary
print(adata)

# View cell metadata
print(adata.obs.head())

# View feature (gene/transcript) metadata
print(adata.var.head())

# View expression matrix shape or a slice
print(adata.X.shape)
print(adata.X[:5, :5])  # first 5 cells × first 5 features

In [None]:
# === Save (optional) ===
adata.write_h5ad("Intermediate_Files/QC_07232025/PBMC1_iso_AnnData.h5ad", compression="gzip")

In [None]:
import polars as pl
import scanpy as sc
import anndata as ad

# Load the PBMC2 isoform-level expression matrix (cells in rows, combined IDs as columns)
df = pl.read_parquet("Parquet_Files/IntermediateFiles/PBMC2_iso_expr_annotated.parquet")

# obs: one row per cell, indexed by the CellID values
obs = pl.DataFrame({"cell_id": df["CellID"]}).to_pandas().set_index("cell_id")
# X: every column except CellID, converted to a NumPy array
X = df.drop("CellID").to_numpy()

# var: the column names *are* the combined IDs
combined_ids = df.columns[1:]  # positional skip: relies on "CellID" being the first column
var = pl.DataFrame({"combined_id": combined_ids}).to_pandas().set_index("combined_id")

# Create AnnData (cells x features); note this rebinds the notebook-level `adata`
adata = ad.AnnData(X=X, obs=obs, var=var)

In [None]:
# Basic summary
print(adata)

# View cell metadata
print(adata.obs.head())

# View feature (gene/transcript) metadata
print(adata.var.head())

# View expression matrix shape or a slice
print(adata.X.shape)
print(adata.X[:5, :5])  # first 5 cells × first 5 features

In [None]:
# === Save (optional) ===
adata.write_h5ad("Intermediate_Files/QC_07232025/PBMC2_iso_AnnData.h5ad", compression="gzip")

In [None]:
import polars as pl
import scanpy as sc
import anndata as ad

# Load the PBMC1 gene-level expression matrix (cells in rows, combined IDs as columns)
df = pl.read_parquet("Parquet_Files/IntermediateFiles/PBMC1_gene_expr_annotated.parquet")

# obs: one row per cell, indexed by the CellID values
obs = pl.DataFrame({"cell_id": df["CellID"]}).to_pandas().set_index("cell_id")
# X: every column except CellID, converted to a NumPy array
X = df.drop("CellID").to_numpy()

# var: the column names *are* the combined IDs
combined_ids = df.columns[1:]  # positional skip: relies on "CellID" being the first column
var = pl.DataFrame({"combined_id": combined_ids}).to_pandas().set_index("combined_id")

# Create AnnData (cells x features); note this rebinds the notebook-level `adata`
adata = ad.AnnData(X=X, obs=obs, var=var)

In [None]:
# Basic summary
print(adata)

# View cell metadata
print(adata.obs.head())

# View feature (gene/transcript) metadata
print(adata.var.head())

# View expression matrix shape or a slice
print(adata.X.shape)
print(adata.X[:5, :5])  # first 5 cells × first 5 features

In [None]:
# === Save (optional) ===
adata.write_h5ad("Intermediate_Files/QC_07232025/PBMC1_gene_AnnData.h5ad", compression="gzip")

In [None]:
import polars as pl
import scanpy as sc
import anndata as ad

# Load the PBMC2 gene-level expression matrix (cells in rows, combined IDs as columns)
df = pl.read_parquet("Parquet_Files/IntermediateFiles/PBMC2_gene_expr_annotated.parquet")

# obs: one row per cell, indexed by the CellID values
obs = pl.DataFrame({"cell_id": df["CellID"]}).to_pandas().set_index("cell_id")
# X: every column except CellID, converted to a NumPy array
X = df.drop("CellID").to_numpy()

# var: the column names *are* the combined IDs
combined_ids = df.columns[1:]  # positional skip: relies on "CellID" being the first column
var = pl.DataFrame({"combined_id": combined_ids}).to_pandas().set_index("combined_id")

# Create AnnData (cells x features); note this rebinds the notebook-level `adata`
adata = ad.AnnData(X=X, obs=obs, var=var)

In [None]:
# Basic summary
print(adata)

# View cell metadata
print(adata.obs.head())

# View feature (gene/transcript) metadata
print(adata.var.head())

# View expression matrix shape or a slice
print(adata.X.shape)
print(adata.X[:5, :5])  # first 5 cells × first 5 features

In [None]:
# === Save (optional) ===
adata.write_h5ad("Intermediate_Files/QC_07232025/PBMC2_gene_AnnData.h5ad", compression="gzip")