In [None]:
import sys
import pandas as pd
import numpy as np
import polars as pl
import RNApysoforms as RNApy
import scanpy as sc
from scanpy import read_h5ad
import os
import matplotlib.pyplot as plt
import os
import pandas as pd
import scanpy as sc
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import re
import gzip
from pathlib import Path

# Optional (e.g., interactive or advanced visuals)
import seaborn as sns  # If you want fancy plots later
import muon as mu

# Optional: interactive plotting (Jupyter)
%matplotlib inline

In [None]:
import pandas as pd
import gzip

import re

def parse_gtf_attributes(attr_str):
    """
    Turn the GTF 'attributes' column text into a {key: value} dictionary.

    Only `key "value"` pairs whose value is double-quoted and non-empty are
    captured. When a key occurs more than once, the last occurrence wins.
    """
    pair_pattern = r'(\S+) "([^"]+)"'
    return {
        match.group(1): match.group(2)
        for match in re.finditer(pair_pattern, attr_str)
    }

def rebuild_attributes(attrs):
    """
    Serialise an attribute dictionary back into GTF attribute-column text.

    Every entry is rendered as `key "value"`, entries are joined with '; ',
    and a single ';' closes the string (an empty dict therefore yields ';').
    """
    rendered_pairs = []
    for key, value in attrs.items():
        rendered_pairs.append(f'{key} "{value}"')
    return '; '.join(rendered_pairs) + ';'

## Extract transcript biotypes from the reference GTF
ref_gtf = "Homo_sapiens.GRCh38.113.gtf"
ref_biotype_map = {}  # transcript_id -> transcript biotype

with open(ref_gtf, 'r') as f:
    for line in f:
        # Header / comment lines carry no features.
        if line.startswith("#"):
            continue
        fields = line.strip().split('\t')
        # Blank or truncated lines have fewer than the 9 GTF columns; skip them
        # instead of failing with an IndexError on fields[2] / fields[8].
        if len(fields) < 9:
            continue
        # Only transcript rows are needed to map transcript_id -> biotype.
        if fields[2] != "transcript":
            continue
        attrs = parse_gtf_attributes(fields[8])
        tx_id = attrs.get("transcript_id")
        # Prefer transcript_biotype; fall back to transcript_type for GTFs that use that key instead.
        biotype = attrs.get("transcript_biotype") or attrs.get("transcript_type")
        if tx_id and biotype:
            ref_biotype_map[tx_id] = biotype

# Annotate extended GTF with missing biotypes and tag BambuTx as novel
input_gtf = "extended_annotations.gtf"
output_gtf = "PBMC_JUNE_with_biotypes.gtf"

with open(input_gtf, 'r') as fin, open(output_gtf, 'w') as fout:
    for line in fin:
        # Header / comment lines are copied through untouched.
        if line.startswith("#"):
            fout.write(line)
            continue
        fields = line.strip().split('\t')
        # Blank or truncated lines have no attributes column; copy them through
        # unchanged instead of failing with an IndexError on fields[8].
        if len(fields) < 9:
            fout.write(line)
            continue
        # NOTE: attributes are round-tripped through a dict, so a key that is
        # repeated on one line keeps only its last value in the output.
        attrs = parse_gtf_attributes(fields[8])
        tx_id = attrs.get("transcript_id", "")

        #If it is a Bambu transcript, mark as "novel" for biotype
        if tx_id.startswith("BambuTx"):
            attrs["transcript_biotype"] = "novel"
        else:
            #Otherwise, use biotype from Ensembl reference
            matched_biotype = ref_biotype_map.get(tx_id)
            if matched_biotype:
                attrs["transcript_biotype"] = matched_biotype
        fields[8] = rebuild_attributes(attrs)
        fout.write('\t'.join(fields) + '\n')

print(f"Wrote annotated GTF with biotypes → {output_gtf}")

In [None]:
# Quick sanity check for BambuTx that didn't get assigned "novel"
with open(output_gtf, 'r') as f:
    for line in f:
        if line.startswith("#"):
            continue
        fields = line.strip().split('\t')
        # Skip blank or truncated lines that have no attributes column.
        if len(fields) < 9:
            continue
        # Use the same attribute parser as the annotation step so both agree.
        attrs = parse_gtf_attributes(fields[8])
        tx_id = attrs.get("transcript_id", "")
        biotype = attrs.get("transcript_biotype", "")
        # Report every Bambu transcript whose biotype is anything other than "novel".
        if tx_id.startswith("BambuTx") and biotype != "novel":
            print(f"❌ Mismatch: {tx_id} → {biotype}")

In [None]:
import re
import gzip
from pathlib import Path

def open_maybe_gzip(path):
    """Open `path` for text reading, going through gzip when its name ends in '.gz'."""
    if str(path).endswith('.gz'):
        return gzip.open(path, 'rt')
    return open(path, 'r')

# input files
gtf_path = "PBMC_JUNE_with_biotypes.gtf"   # annotated GTF written by the biotype-annotation step

# gene_id -> gene_name for every gene carrying at least one novel transcript
genes_with_novel = {}

with open_maybe_gzip(gtf_path) as f:
    for line in f:
        if line.startswith("#") or not line:
            continue
        fields = line.rstrip("\n").split("\t")
        # Keep only complete 9-column transcript rows (fast & avoids double counting)
        if len(fields) < 9 or fields[2] != "transcript":
            continue

        attrs = parse_gtf_attributes(fields[8])
        gene_id = attrs.get("gene_id", "")
        if not gene_id:
            continue

        tx_id = attrs.get("transcript_id", "")
        biotype = attrs.get("transcript_biotype") or attrs.get("transcript_type")

        # "novel" biotypes, or any BambuTx (paranoia check)
        if biotype == "novel" or tx_id.startswith("BambuTx"):
            # fall back to the gene_id when the row has no gene_name
            genes_with_novel[gene_id] = attrs.get("gene_name", gene_id)

print("Genes with ≥1 novel transcript:\n")
for gid in sorted(genes_with_novel):
    print(f"{gid}\t{genes_with_novel[gid]}")

# Sorted gene IDs only, printed as a paste-ready Python list
ensg_ids_novel_isos = sorted(genes_with_novel)
print("\nensg_ids_novel_isos = [")
print("".join(f'    "{gid}",\n' for gid in ensg_ids_novel_isos), end="")
print("]")

In [None]:
import os
import pandas as pd
import scanpy as sc
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# --- Load AnnData object ---
output_dir = 'Intermediate_Files/Clustering'
clustered_file_name = "PBMC_iso_AutoZI_clustered_celltypes_reannotated_AutoZILatent.h5mu"
# NOTE(review): the file has a .h5mu extension but is opened with the AnnData
# reader — confirm it was written as a single AnnData object.
adata = sc.read_h5ad(os.path.join(output_dir, clustered_file_name))

## Ensembl gene IDs of all genes with at least 1 novel isoform (hard-coded set)
ensg_ids = {
    "ENSG00000002822", "ENSG00000003147", "ENSG00000062370", "ENSG00000093217", "ENSG00000112763", "ENSG00000124588", "ENSG00000127249",
    "ENSG00000133328", "ENSG00000135951", "ENSG00000138030", "ENSG00000140993", "ENSG00000145217", "ENSG00000145220", "ENSG00000146453",
    "ENSG00000155875", "ENSG00000161132", "ENSG00000164989", "ENSG00000175773", "ENSG00000185614", "ENSG00000186230", "ENSG00000187118",
    "ENSG00000196236", "ENSG00000196260", "ENSG00000196284", "ENSG00000196431", "ENSG00000198642", "ENSG00000203876", "ENSG00000211685",
    "ENSG00000215375", "ENSG00000228150", "ENSG00000230647", "ENSG00000239789", "ENSG00000242736", "ENSG00000243449", "ENSG00000247595",
    "ENSG00000250999", "ENSG00000253409", "ENSG00000257553", "ENSG00000259212", "ENSG00000259630", "ENSG00000260342", "ENSG00000262165",
    "ENSG00000267320", "ENSG00000275413", "ENSG00000280987", "ENSG00000283128", "ENSG00000284048", "ENSG00000287919", "ENSG00000289582",
    "ENSG00000289740", "ENSG00000290073", "ENSG00000290104", "ENSG00000294609", "ENSG00000295857", "ENSG00000297760", "ENSG00000298363",
    "ENSG00000303581", "ENSG00000304072", "ENSG00000304758", "ENSG00000305069", "ENSG00000305139", "ENSG00000306802", "ENSG00000308813",
    "ENSG00000309071", "ENSG00000309098", "ENSG00000310508",
}

In [None]:
import os
import pandas as pd

# Parse transcript/gene IDs out of the var names.
# Two layouts are handled: "gene_name:gene_id:transcript_id" and "gene_id:transcript_id".
THREE_PART_PATTERN = r'(?P<gene_name>.*?):(?P<gene_id>[^:]+):(?P<transcript_id>.+)'
TWO_PART_PATTERN = r'(?P<gene_id>[^:]+):(?P<transcript_id>.+)'

parsed_ids_3part = adata.var_names.str.extract(THREE_PART_PATTERN)
parsed_ids_2part = adata.var_names.str.extract(TWO_PART_PATTERN)
parsed_ids_2part["gene_name"] = None   # the two-part layout carries no gene name

# The three-part parse takes precedence; the two-part parse only fills its gaps.
parsed_ids = parsed_ids_3part.combine_first(parsed_ids_2part)
parsed_ids.index = adata.var_names

# Replaces adata.var with the parsed gene_name / gene_id / transcript_id table.
adata.var = parsed_ids

# Subset to genes/transcripts of interest: ENSGs with novel isoforms and all Bambu loci.
# na=False: var names that matched neither ID pattern have missing gene_id /
# transcript_id, and must count as "not Bambu" rather than put NaN into the mask.
genes_to_keep = adata.var_names[
    adata.var["gene_id"].isin(ensg_ids)
    | adata.var["gene_id"].str.startswith("BambuGene", na=False)
    | adata.var["transcript_id"].str.startswith("BambuTx", na=False)
]
# Independent copy, so later work on the subset cannot touch `adata`.
adata_sub = adata[:, genes_to_keep].copy()

# Ensure required layers exist  for export to RNAPysoforms (fail on the first one missing)
missing_layer = next(
    (name for name in ("counts", "denoised", "log_denoised") if name not in adata_sub.layers),
    None,
)
if missing_layer is not None:
    raise KeyError(f"Required layer '{missing_layer}' not found in adata_sub.layers.")

# Function to build a matrix df from a layer (rows=transcripts, cols=samples)
def layer_to_df(layer_name: str) -> pd.DataFrame:
    """
    Export one layer of the module-level `adata_sub` as a wide DataFrame.

    Parameters
    ----------
    layer_name : str
        Key into `adata_sub.layers`.

    Returns
    -------
    pd.DataFrame
        One row per entry of `adata_sub.var`, with leading `transcript_id`
        and `gene_id` columns followed by one column per `adata_sub.obs_names`
        entry (as strings). The frame's index is also the transcript_id.
    """
    matrix = adata_sub.layers[layer_name]
    # A scipy sparse layer cannot be handed to the DataFrame constructor
    # directly; densify it first. Dense arrays have no `toarray` and pass through.
    if hasattr(matrix, "toarray"):
        matrix = matrix.toarray()
    df = pd.DataFrame(
        matrix.T,                                       # transcripts x cells
        index=adata_sub.var["transcript_id"],
        columns=adata_sub.obs_names.astype(str),
    )
    # Insert at position 0 twice so the final order is transcript_id, gene_id, <samples...>
    df.insert(0, "gene_id", adata_sub.var["gene_id"].to_numpy())
    df.insert(0, "transcript_id", adata_sub.var["transcript_id"].to_numpy())
    return df

# Build matrices to assign to layers for RNAPysoforms:
#   counts_df     <- "counts"
#   autozi_df     <- "denoised"      (AutoZI denoised, treated as CPM for RNAPy visualization)
#   autozi_log_df <- "log_denoised"  (AutoZI log-denoised, treated as "relative")
counts_df, autozi_df, autozi_log_df = (
    layer_to_df(layer) for layer in ("counts", "denoised", "log_denoised")
)

# Build metadata dataframe (sample_id must match matrix column headers)
metadata_df = adata_sub.obs.copy()
metadata_df["sample_id"] = adata_sub.obs_names.astype(str)

# Use gen_cell_type as the condition when present, otherwise cell_type.
if "gen_cell_type" in metadata_df.columns:
    metadata_df["condition"] = metadata_df["gen_cell_type"].astype(str)
elif "cell_type" in metadata_df.columns:
    metadata_df["condition"] = metadata_df["cell_type"].astype(str)
else:
    raise KeyError("Could not find a condition column (e.g., 'gen_cell_type' or 'cell_type') in adata_sub.obs")

# .copy() makes the two-column selection an independent frame, so the
# assignment below does not raise pandas' SettingWithCopyWarning.
metadata_df = metadata_df[["sample_id", "condition"]].copy()
metadata_df["sample_id"] = metadata_df["sample_id"].str.strip()

# --- Sanity check overlap ---
def cols_to_samples(df_):
    """Return the sample column labels of a matrix frame as whitespace-stripped strings."""
    sample_columns = df_.columns[2:]   # the first two columns are transcript_id and gene_id
    return sample_columns.astype(str).str.strip()

md_ids = set(metadata_df["sample_id"])
counts_ids = set(cols_to_samples(counts_df))
autozi_ids = set(cols_to_samples(autozi_df))
autozi_log_ids = set(cols_to_samples(autozi_log_df))

# Report, per matrix, how many of its sample columns are found in the metadata.
labelled_ids = (
    ("Counts", counts_ids),
    ("AutoZI (denoised)", autozi_ids),
    ("AutoZI (log_denoised)", autozi_log_ids),
)
for label, matrix_ids in labelled_ids:
    print(f"{label}: {len(matrix_ids)} samples | Metadata: {len(md_ids)} | Overlap: {len(matrix_ids & md_ids)}")

# A matrix that shares no sample ID with the metadata is treated as fatal.
if any(md_ids.isdisjoint(matrix_ids) for _, matrix_ids in labelled_ids):
    raise ValueError("No overlapping sample IDs between one or more matrices and metadata. "
                     "Ensure metadata_df['sample_id'] comes from adata_sub.obs_names.")

In [None]:
# --- Export ---
out_dir = "RNAPysoforms/LongRead_FullCounts_PerCell"
os.makedirs(out_dir, exist_ok=True)

# file name -> frame; all are written tab-separated without the index
exports = {
    "counts_matrix_longread_percell.tsv": counts_df,
    "autozi_matrix_longread_percell.tsv": autozi_df,
    "autozi_log_matrix_longread_percell.tsv": autozi_log_df,
    "sample_metadata_longread_percell.tsv": metadata_df,
}
for file_name, frame in exports.items():
    frame.to_csv(f"{out_dir}/{file_name}", sep="\t", index=False)

In [None]:
import pandas as pd

# Re-read the exported counts matrix and list the Bambu transcripts it contains.
df = pd.read_csv("RNAPysoforms/LongRead_FullCounts_PerCell/counts_matrix_longread_percell.tsv", sep="\t")
is_bambu = df["transcript_id"].str.startswith("BambuTx", na=False)   # missing IDs count as non-Bambu
bambu_transcripts = df.loc[is_bambu]

print(bambu_transcripts["transcript_id"].unique())
print(f"Total BambuTx transcripts: {bambu_transcripts.shape[0]}")

In [None]:
# --- Build metadata dataframe (make sample_id == counts column headers) ---
# This rebuilds the metadata table from adata_sub.obs and rewrites the metadata TSV.
metadata_df = adata_sub.obs.copy()

# cell barcodes (obs_names) become the sample identifiers
metadata_df["sample"] = adata_sub.obs_names.astype(str)

# condition comes from gen_cell_type when present, otherwise from cell_type
condition_source = next(
    (col for col in ("gen_cell_type", "cell_type") if col in metadata_df.columns),
    None,
)
if condition_source is None:
    raise KeyError("Could not find a condition column (e.g., 'gen_cell_type' or 'cell_type') in adata_sub.obs")
metadata_df["condition"] = metadata_df[condition_source].astype(str)

# keep only the two needed columns and rename 'sample' -> 'sample_id'
metadata_df = metadata_df[["sample", "condition"]].rename(columns={"sample": "sample_id"})

# strip whitespace on both sides of the comparison
metadata_df["sample_id"] = metadata_df["sample_id"].str.strip()
counts_sample_ids = counts_df.columns[2:].astype(str).str.strip()  # skip transcript_id, gene_id

# quick sanity check: the counts columns and the metadata must share sample IDs
overlap = set(counts_sample_ids) & set(metadata_df["sample_id"])
print(f"Counts samples: {len(counts_sample_ids)} | Metadata samples: {metadata_df.shape[0]} | Overlap: {len(overlap)}")

if not overlap:
    raise ValueError("No overlapping sample IDs between counts columns and metadata sample_id. "
                     "Check that metadata_df['sample_id'] is built from adata_sub.obs_names.")

# --- Export ---
out_dir = "RNAPysoforms/LongRead_FullCounts_PerCell"
os.makedirs(out_dir, exist_ok=True)
metadata_df.to_csv(f"{out_dir}/sample_metadata_longread_percell.tsv", sep="\t", index=False)

In [None]:
# Preview the first rows; `.head` without parentheses only shows the bound method.
counts_df.head()

In [None]:
# Preview the first rows; `.head` without parentheses only shows the bound method.
metadata_df.head()

In [None]:
## Paths to the annotated GTF, the per-cell matrices, and the sample metadata
ensembl_gtf_path       = "PBMC_JUNE_with_biotypes.gtf"
expression_matrix_path = "RNAPysoforms/LongRead_FullCounts_PerCell/counts_matrix_longread_percell.tsv"
metadata_path          = "RNAPysoforms/LongRead_FullCounts_PerCell/sample_metadata_longread_percell.tsv"
autozi_matrix_path     = "RNAPysoforms/LongRead_FullCounts_PerCell/autozi_matrix_longread_percell.tsv"          # denoised
autozi_log_matrix_path = "RNAPysoforms/LongRead_FullCounts_PerCell/autozi_log_matrix_longread_percell.tsv"     # log_denoised

# Read ENSEMBL gtf and counts matrix with metadata.
# RNAPy is NOT asked to compute CPM or relative abundance; our own values are injected later:
#   - CPM column          <- denoised (AutoZI)
#   - relative_abundance  <- log_denoised (AutoZI log)
annotation = RNApy.read_ensembl_gtf(ensembl_gtf_path)

normalization_flags = dict(cpm_normalization=False, relative_abundance=False)
expression_matrix = RNApy.read_expression_matrix(
    expression_matrix_path=expression_matrix_path,
    metadata_path=metadata_path,
    **normalization_flags,
)

In [None]:
import polars as pl

# Read denoised (AutoZI) and map it to CPM layer
autozi_wide = pl.read_csv(autozi_matrix_path, separator="\t")
autozi_long_cpm = (
    autozi_wide.unpivot(
        on=autozi_wide.columns[2:],                 # all sample columns
        index=["transcript_id", "gene_id"],
        variable_name="sample_id",
        value_name="CPM"                            # treat denoised as CPM
    )
    .with_columns(pl.col("sample_id").cast(pl.Utf8).str.strip_chars())
)

# Read log_denoised and map it to relative_abundance
autozi_log_wide = pl.read_csv(autozi_log_matrix_path, separator="\t")
autozi_long_ra = (
    autozi_log_wide.unpivot(
        on=autozi_log_wide.columns[2:],             # all sample columns
        index=["transcript_id", "gene_id"],
        variable_name="sample_id",
        value_name="relative_abundance"             # treat log_denoised as RA
    )
    .with_columns(pl.col("sample_id").cast(pl.Utf8).str.strip_chars())
)

# Join both onto the RNAPy expression matrix.
# Columns injected by an earlier run of this cell (including "_right" duplicates
# from a repeated join) are dropped first, so re-running the cell replaces the
# values instead of stacking suffixed copies or failing on a duplicate name.
injected_columns = {"CPM", "relative_abundance", "CPM_right", "relative_abundance_right"}
base_columns = [name for name in expression_matrix.columns if name not in injected_columns]
expression_matrix = (
    expression_matrix
    .select(base_columns)
    .join(autozi_long_cpm, on=["transcript_id", "gene_id", "sample_id"], how="left")
    .join(autozi_long_ra,  on=["transcript_id", "gene_id", "sample_id"], how="left")
)

In [None]:
# List the columns available in the annotation read from the GTF.
print(annotation.columns)

In [None]:
# Preview the first rows; `.head` without parentheses only shows the bound method.
expression_matrix.head()

In [None]:
# --- Filter to gene of interest; order by the Relative Abundance (log_denoised) we just added ---
cmc1_gene_id = "ENSG00000187118"
CMC1_annotation, CMC1_expression_matrix = RNApy.gene_filtering(
    annotation=annotation,
    expression_matrix=expression_matrix,
    target_gene=cmc1_gene_id,
    gene_id_column="gene_id",
    order_by_expression=True,
    order_by_expression_column="relative_abundance",
)

# --- Optional: rescale introns (compact view) ---
CMC1_annotation = RNApy.shorten_gaps(CMC1_annotation)


# Filter only the desired transcripts and keep them in the annotation.
# `make_traces()` only plots transcripts present in both the annotation and the
# expression matrix when both are passed (it does give a warning when that happens),
# so only the transcripts you kept in the annotation will be plotted.
#transcript_to_keep = ["ENST00000348990", "ENST00000707133"]
#CMC1_annotation = CMC1_annotation.filter(pl.col("transcript_id").is_in(transcript_to_keep))


## Order transcripts based on `transcripts_to_keep` order
#CMC1_annotation =CMC1_annotation.with_columns(
#    pl.col("transcript_id").cast(pl.Categorical).cast(pl.Utf8).replace(
##        {k: i for i, k in enumerate(transcript_to_keep)}).alias("sort_key")
#        ).sort("sort_key", descending=True).drop("sort_key")


# Create traces for plotting. `order_transcripts_by_expression_matrix` is set
# to False so that the order of the annotation is the one that determines the
# order in which the transcripts are plotted.
# --- Build traces: show CPM (denoised) + RA (log_denoised) ---
traces = RNApy.make_traces(
    annotation=CMC1_annotation,
    expression_matrix=CMC1_expression_matrix,
    x_start="rescaled_start",
    x_end="rescaled_end",
    y="transcript_id",
    annotation_hue="transcript_biotype",
    hover_start="start",
    hover_end="end",
    expression_columns=["CPM", "relative_abundance"],
    expression_hue="condition",
    marker_size=3,
    arrow_size=7,
    order_transcripts_by_expression_matrix=False,
)

# Hide individual dots on boxplots (every subplot after the first one)
for panel_traces in traces[1:]:
    for panel_trace in panel_traces:
        if hasattr(panel_trace, "boxpoints"):
            panel_trace.boxpoints = False

In [None]:
# Assemble the structure panel plus the two AutoZI expression panels and show the figure.
panel_titles = ["Transcript Structure", "AutoZI denoised", "AutoZI log_denoised"]
fig = RNApy.make_plot(
    traces=traces,
    subplot_titles=panel_titles,
    width=900,
    height=1500,
    boxgap=0.1,
    boxgroupgap=0.5,
)
fig.show()

In [None]:
### Transcript structure model for CMC1 (Figure 6c)

import polars as pl
import pandas as pd

TARGET_GENE = "ENSG00000187118" # CMC1 Ensembl Gene ID

# --- Subset annotation to the target gene WITHOUT touching expression ---
# Both branches filter to TARGET_GENE, sort by transcript and coordinates, and
# rescale gaps; they differ only in the DataFrame API used.
if isinstance(annotation, pl.DataFrame):
    CMC1_annotation = (
        annotation
        .filter(pl.col("gene_id") == TARGET_GENE)
        .sort(["transcript_id", "start", "end"])
    )
    CMC1_annotation = RNApy.shorten_gaps(CMC1_annotation)
    x_start_col, x_end_col = "rescaled_start", "rescaled_end"

elif isinstance(annotation, pd.DataFrame):
    CMC1_annotation = (
        annotation.loc[annotation["gene_id"] == TARGET_GENE]
        .sort_values(["transcript_id", "start", "end"])
        .copy()
    )
    CMC1_annotation = RNApy.shorten_gaps(CMC1_annotation)
    x_start_col, x_end_col = "rescaled_start", "rescaled_end"

else:
    raise TypeError("annotation must be a Polars or Pandas DataFrame")

# --- Define desired colors for transcript biotypes (hex codes) ---
# The GTF annotation step labels Bambu transcripts with biotype "novel", so
# that label is mapped to the same colour as "novel_isoform".
NOVEL_BIOTYPE_LABELS = ("novel", "novel_isoform")
biotype_colors = {
    "protein_coding": "#1B75BB",                   # light blue
    "protein_coding_CDS_not_defined": "#2E3191",   # dark blue / navy
    "retained_intron": "#00A551",                  # green
    "nonsense_mediated_decay": "#FAAF40",          # orange
    "novel_isoform": "#90278E",                    # purple
    "novel": "#90278E",                            # purple (label used in the annotated GTF)
}

# --- Build traces for transcript structure ONLY ---
traces = RNApy.make_traces(
    annotation=CMC1_annotation,
    x_start=x_start_col,
    x_end=x_end_col,
    y="transcript_id",
    annotation_hue="transcript_biotype",
    hover_start="start",
    hover_end="end",
    order_transcripts_by_expression_matrix=False
)

# --- Create figure ---
fig = RNApy.make_plot(
    traces=traces,
    subplot_titles=["CMC1 Transcript Structure"],
    width=900,
    height=900,
    boxgap=0.01,
    boxgroupgap=0.1
)

# --- Recolor traces by transcript_biotype ---
# Traces are matched on their name; traces with any other name keep their colour.
for trace in fig.data:
    biotype = getattr(trace, "name", None)
    if biotype in biotype_colors:
        color = biotype_colors[biotype]
        if hasattr(trace, "marker"):
            trace.marker.color = color
        if hasattr(trace, "line"):
            trace.line.color = color
        if hasattr(trace, "fillcolor"):
            trace.fillcolor = color

# --- Optional: move novel-isoform traces to the start of fig.data ---
# sorted() is stable, so all other traces keep their relative order.
fig.data = tuple(sorted(fig.data, key=lambda t: t.name not in NOVEL_BIOTYPE_LABELS))

# --- Adjust legend and layout ---
fig.update_layout(
    legend=dict(
        x=0.65,
        y=0.98,
        xanchor="left",
        yanchor="top",
        orientation="v"
    )
)
fig.show()

In [None]:
### Transcript structure model for LYAR (Figure 6f)

import polars as pl
import pandas as pd

TARGET_GENE = "ENSG00000145220" # LYAR Ensembl Gene ID

# --- Subset annotation to the target gene WITHOUT touching expression ---
# Both branches filter to TARGET_GENE, sort by transcript and coordinates, and
# rescale gaps; they differ only in the DataFrame API used.
if isinstance(annotation, pl.DataFrame):
    LYAR_annotation = (
        annotation
        .filter(pl.col("gene_id") == TARGET_GENE)
        .sort(["transcript_id", "start", "end"])
    )
    LYAR_annotation = RNApy.shorten_gaps(LYAR_annotation)
    x_start_col, x_end_col = "rescaled_start", "rescaled_end"

elif isinstance(annotation, pd.DataFrame):
    LYAR_annotation = (
        annotation.loc[annotation["gene_id"] == TARGET_GENE]
        .sort_values(["transcript_id", "start", "end"])
        .copy()
    )
    LYAR_annotation = RNApy.shorten_gaps(LYAR_annotation)
    x_start_col, x_end_col = "rescaled_start", "rescaled_end"

else:
    raise TypeError("annotation must be a Polars or Pandas DataFrame")

# --- Define color scheme for transcript biotypes (consistent palette) ---
# The GTF annotation step labels Bambu transcripts with biotype "novel", so
# that label is mapped to the same colour as "novel_isoform".
NOVEL_BIOTYPE_LABELS = ("novel", "novel_isoform")
biotype_colors = {
    "protein_coding": "#1B75BB",                   # blue
    "protein_coding_CDS_not_defined": "#2E3191",   # dark blue / navy
    "retained_intron": "#00A551",                  # green
    "nonsense_mediated_decay": "#FAAF40",          # orange
    "novel_isoform": "#90278E",                    # purple
    "novel": "#90278E",                            # purple (label used in the annotated GTF)
}

# --- Build traces for transcript structure ONLY ---
traces = RNApy.make_traces(
    annotation=LYAR_annotation,
    x_start=x_start_col,
    x_end=x_end_col,
    y="transcript_id",
    annotation_hue="transcript_biotype",
    hover_start="start",
    hover_end="end",
    order_transcripts_by_expression_matrix=False
)

# --- Plot just the structure track ---
fig = RNApy.make_plot(
    traces=traces,
    subplot_titles=["LYAR Transcript Structure"],
    width=900,
    height=450,
    boxgap=0.01,
    boxgroupgap=0.1
)

# --- Apply custom colors to each biotype ---
# Traces are matched on their name; traces with any other name keep their colour.
for trace in fig.data:
    biotype = getattr(trace, "name", None)
    if biotype in biotype_colors:
        color = biotype_colors[biotype]
        if hasattr(trace, "marker"):
            trace.marker.color = color
        if hasattr(trace, "line"):
            trace.line.color = color
        if hasattr(trace, "fillcolor"):
            trace.fillcolor = color

# --- Optionally move novel-isoform traces to the start of fig.data ---
# sorted() is stable, so all other traces keep their relative order.
fig.data = tuple(sorted(fig.data, key=lambda t: t.name not in NOVEL_BIOTYPE_LABELS))

# --- Legend placement ---
fig.update_layout(
    legend=dict(
        x=1.0,
        y=0.98,
        xanchor="left",
        yanchor="top",
        orientation="v"
    )
)

fig.show()