#### This particular notebook includes an FFPE block age QC comparison within Dataset 3 Xenium 480 (slides 1 and 2 combined).

#### Required input files:
* Filtered transcript file (for each dataset/rep/slide)
* Annotated cell-based data object (for each dataset)

Note: the `_r` suffix in file and variable names denotes a filtered transcript file (i.e., after filtering/removal).

Environment: Please create and activate the conda environment provided in `default_env.yaml` before running this notebook.

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from adjustText import adjust_text

import seaborn as sns

import scanpy as sc
import squidpy as sq

import gzip
import anndata

import sys

from matplotlib.ticker import FuncFormatter

import matplotlib.transforms as mtransforms
from scipy import stats
import statsmodels.api as sm

In [None]:
!pip list

# Load in data files

Transcript Files

In [None]:
##### Slide1

# Read the Slide 1 filtered ("_r") transcript table from CSV.
Slide1_XeniumICI480_transcripts_r = pd.read_csv(
    filepath_or_buffer='/path/Slide1_XeniumICI480_transcripts_r.csv'
)

# Last expression of the cell, so the notebook displays the frame.
Slide1_XeniumICI480_transcripts_r

In [None]:
## Quantify number of transcripts per cell
## (answers the question: how many transcripts does each cell have?)

# Count the distinct transcript_id values inside every cell_id group and
# return the result as a two-column frame: cell_id, transcript_count.
Slide1_XeniumICI480_transcript_counts_per_cell_r = (
    Slide1_XeniumICI480_transcripts_r
    .groupby('cell_id')['transcript_id']
    .nunique()
    .reset_index(name='transcript_count')
)

# Display
display(Slide1_XeniumICI480_transcript_counts_per_cell_r)

In [None]:
## Table with cell id and feature name values

# Keep only the two columns needed for the per-cell feature summary
Slide1_XeniumICI480_transcripts_r_featurespercell = Slide1_XeniumICI480_transcripts_r.loc[
    :, ['cell_id', 'feature_name']
]

## Unique features per cell
## (answers the question: what unique features does each cell express?)

# The frame holds exactly cell_id and feature_name, so de-duplicating whole
# rows leaves one row per (cell_id, feature_name) combination.
Slide1_XeniumICI480_transcripts_r_uniquefeaturespercell = (
    Slide1_XeniumICI480_transcripts_r_featurespercell.drop_duplicates()
)

## Number of unique feature_name values per cell
## (answers the question: how many unique features does each cell express?)

# Count distinct feature_name values per cell_id; the count column is
# named unique_features.
Slide1_XeniumICI480_transcripts_r_uniquefeaturespercell_count = (
    Slide1_XeniumICI480_transcripts_r_uniquefeaturespercell
    .groupby('cell_id')['feature_name']
    .nunique()
    .reset_index(name='unique_features')
)

# Display
display(Slide1_XeniumICI480_transcripts_r_uniquefeaturespercell_count)

In [None]:
##### Slide2

# Read the Slide 2 filtered ("_r") transcript table from CSV.
Slide2_XeniumICI480_transcripts_r = pd.read_csv(
    filepath_or_buffer='/path/Slide2_XeniumICI480_transcripts_r.csv'
)

# Last expression of the cell, so the notebook displays the frame.
Slide2_XeniumICI480_transcripts_r

In [None]:
## Quantify number of transcripts per cell
## (answers the question: how many transcripts does each cell have?)

# Count the distinct transcript_id values inside every cell_id group and
# return the result as a two-column frame: cell_id, transcript_count.
Slide2_XeniumICI480_transcript_counts_per_cell_r = (
    Slide2_XeniumICI480_transcripts_r
    .groupby('cell_id')['transcript_id']
    .nunique()
    .reset_index(name='transcript_count')
)

# Display
display(Slide2_XeniumICI480_transcript_counts_per_cell_r)

In [None]:
## Table with cell id and feature name values

# Keep only the two columns needed for the per-cell feature summary
Slide2_XeniumICI480_transcripts_r_featurespercell = Slide2_XeniumICI480_transcripts_r.loc[
    :, ['cell_id', 'feature_name']
]

## Unique features per cell
## (answers the question: what unique features does each cell express?)

# The frame holds exactly cell_id and feature_name, so de-duplicating whole
# rows leaves one row per (cell_id, feature_name) combination.
Slide2_XeniumICI480_transcripts_r_uniquefeaturespercell = (
    Slide2_XeniumICI480_transcripts_r_featurespercell.drop_duplicates()
)

## Number of unique feature_name values per cell
## (answers the question: how many unique features does each cell express?)

# Count distinct feature_name values per cell_id; the count column is
# named unique_features.
Slide2_XeniumICI480_transcripts_r_uniquefeaturespercell_count = (
    Slide2_XeniumICI480_transcripts_r_uniquefeaturespercell
    .groupby('cell_id')['feature_name']
    .nunique()
    .reset_index(name='unique_features')
)

# Display
display(Slide2_XeniumICI480_transcripts_r_uniquefeaturespercell_count)

Cell data info

In [None]:
# Read the annotated data object (slides 1 and 2) from its .h5ad file
Xenium_Dataset3_480_IntSlides1and2_Annotated = sc.read_h5ad("/path/25_11_12_Xenium_Dataset3_480_IntSlides1and2_Annotated.h5ad")

# Show the object summary first, then its .obs metadata table
display(
    Xenium_Dataset3_480_IntSlides1and2_Annotated,
    Xenium_Dataset3_480_IntSlides1and2_Annotated.obs,
)

In [None]:
## Format

# Work on a copy of .obs, discard its index, and rename the existing
# "cell_id" column to "cell_id_WSlideName".
Xenium_Metadata = (
    Xenium_Dataset3_480_IntSlides1and2_Annotated.obs
    .copy()
    .reset_index(drop=True)
    .rename(columns={"cell_id": "cell_id_WSlideName"})
)

# Add a new "cell_id" column: the slide-qualified id with a leading
# "Slide1-" or "Slide2-" prefix stripped.
Xenium_Metadata["cell_id"] = Xenium_Metadata["cell_id_WSlideName"].str.replace(
    r"^Slide1-|^Slide2-", "", regex=True
)

Xenium_Metadata

In [None]:
## Print unique values

# One line per distinct block-age label, in order of first appearance
for val in Xenium_Metadata['25_06_10_Block_age'].drop_duplicates():
    print(val)

In [None]:
## Remove cells without block age info

# Drop every row whose block-age field is 'slide_1_TMA2' or 'slide_2_TMA1'
# (treated here as "no block age available").
Xenium_Metadata = Xenium_Metadata.loc[
    ~Xenium_Metadata['25_06_10_Block_age'].isin(['slide_1_TMA2', 'slide_2_TMA1'])
]

# Check: remaining row count, then the block-age labels still present
display(len(Xenium_Metadata))

for val in Xenium_Metadata['25_06_10_Block_age'].drop_duplicates():
    print(val)

In [None]:
## Print unique values

# One line per distinct core/ROI label, in order of first appearance
for val in Xenium_Metadata['25_06_13_core_ROIs_removed_ileum'].drop_duplicates():
    print(val)

In [None]:
## Remove cells where the core_ROIs value is Ileum_cores
# Per the original analysis note, these cells/samples come from a different
# location than the rest of the data.

Xenium_Metadata = Xenium_Metadata.loc[
    Xenium_Metadata['25_06_13_core_ROIs_removed_ileum'].ne("Ileum_cores")
]

# Check filtering: remaining row count, then the core/ROI labels still present
display(len(Xenium_Metadata))

for val in Xenium_Metadata['25_06_13_core_ROIs_removed_ileum'].drop_duplicates():
    print(val)

In [None]:
# NaN check: number of missing values in each metadata column
Xenium_Metadata.isnull().sum()

In [None]:
## Add the block name as a new column
# Block_Name is the core/ROI label with a trailing "_<digit>" removed, so that
# multiple cores taken from the same block (e.g. suffixes _2 and _3) share one name.
# NOTE(review): the pattern strips ANY single trailing digit suffix (_0 to _9),
# not only _2/_3 -- confirm no block name legitimately ends in "_<digit>".

# Xenium_Metadata is the result of boolean filtering, so the column is added
# with .assign (which returns a new frame) rather than by item assignment on
# a slice; this avoids pandas' SettingWithCopyWarning.
Xenium_Metadata = Xenium_Metadata.assign(
    Block_Name=Xenium_Metadata['25_06_13_core_ROIs_removed_ileum'].str.replace(
        r'_\d$', '', regex=True
    )
)

display(Xenium_Metadata)

In [None]:
# Distinct block names present in the metadata

pd.unique(Xenium_Metadata['Block_Name'])

In [None]:
# Number of distinct cores/ROIs, followed by the number of distinct blocks

display(
    Xenium_Metadata['25_06_13_core_ROIs_removed_ileum'].nunique(),
    Xenium_Metadata['Block_Name'].nunique(),
)

# Move to formatting

### Combine number of transcripts and unique features per cell data across slides

In [None]:
## Number of transcripts per cell

# Give each slide's per-cell table a slide-qualified id column
# ("Slide1-<cell_id>" / "Slide2-<cell_id>") and view each table after tagging.
for slide_prefix, slide_counts in (
    ("Slide1-", Slide1_XeniumICI480_transcript_counts_per_cell_r),
    ("Slide2-", Slide2_XeniumICI480_transcript_counts_per_cell_r),
):
    slide_counts["cell_id_WSlideName"] = slide_prefix + slide_counts["cell_id"]
    display(slide_counts)

# Stack the two slides row-wise into one frame with a fresh integer index
CombinedSlides_XeniumICI480_transcript_counts_per_cell_r = pd.concat(
    objs=[
        Slide1_XeniumICI480_transcript_counts_per_cell_r,
        Slide2_XeniumICI480_transcript_counts_per_cell_r,
    ],
    axis=0,
    ignore_index=True,
)

# View
display(CombinedSlides_XeniumICI480_transcript_counts_per_cell_r)

In [None]:
## Number of unique features per cell

# Give each slide's per-cell table a slide-qualified id column
# ("Slide1-<cell_id>" / "Slide2-<cell_id>") and view each table after tagging.
for slide_prefix, slide_features in (
    ("Slide1-", Slide1_XeniumICI480_transcripts_r_uniquefeaturespercell_count),
    ("Slide2-", Slide2_XeniumICI480_transcripts_r_uniquefeaturespercell_count),
):
    slide_features["cell_id_WSlideName"] = slide_prefix + slide_features["cell_id"]
    display(slide_features)

# Stack the two slides row-wise into one frame with a fresh integer index
CombinedSlides_XeniumICI480_transcripts_r_uniquefeaturespercell_count = pd.concat(
    objs=[
        Slide1_XeniumICI480_transcripts_r_uniquefeaturespercell_count,
        Slide2_XeniumICI480_transcripts_r_uniquefeaturespercell_count,
    ],
    axis=0,
    ignore_index=True,
)

# View
display(CombinedSlides_XeniumICI480_transcripts_r_uniquefeaturespercell_count)

### Merge Xenium metadata with block info, number of transcripts per cell, and number of unique features per cell info into one combined df (includes both slides)

In [None]:
## Merge on the slide-qualified cell id (cell_id_WSlideName)

# Left-join the per-cell transcript counts and unique-feature counts onto the
# filtered metadata, so every metadata cell is kept and cells without a match
# get NaN.
# - Only the key and the value column are taken from each right-hand table;
#   merging the full tables would also bring along their own "cell_id"
#   columns and produce ambiguous cell_id_x / cell_id_y / cell_id columns.
# - validate="many_to_one" makes pandas raise a MergeError if a key is
#   duplicated in a right-hand table, instead of silently duplicating cells
#   (which would bias the per-block medians computed later).
BlockAgeExamination_CombinedSlides = (
    Xenium_Metadata
    .merge(
        CombinedSlides_XeniumICI480_transcript_counts_per_cell_r[
            ["cell_id_WSlideName", "transcript_count"]
        ],
        on="cell_id_WSlideName",
        how="left",
        validate="many_to_one",
    )
    .merge(
        CombinedSlides_XeniumICI480_transcripts_r_uniquefeaturespercell_count[
            ["cell_id_WSlideName", "unique_features"]
        ],
        on="cell_id_WSlideName",
        how="left",
        validate="many_to_one",
    )
)

# View
display(BlockAgeExamination_CombinedSlides)

In [None]:
# NaN check: number of missing values in each column of the merged frame

BlockAgeExamination_CombinedSlides.isnull().sum()

### Format combined df

In [None]:
# Step 1: Keep only the columns used in the block-age comparison
BlockAgeExamination_CombinedSlides = BlockAgeExamination_CombinedSlides.loc[
    :,
    ['25_06_10_Block_age', '25_06_13_core_ROIs_removed_ileum', 'Block_Name', 'transcript_count', 'unique_features'],
]

display(BlockAgeExamination_CombinedSlides)

In [None]:
# Step 2: Collapse the per-cell table to one row per block (grouped by Block_Name).
# Numeric columns are summarised by their median; the block-age label is taken
# from the first row of each group (treated as constant within a block).

df = BlockAgeExamination_CombinedSlides.copy()

# Names of all numeric columns
numeric_cols = list(df.select_dtypes(include='number').columns)

# Aggregation spec: 'first' for the block-age label, 'median' for every numeric column
agg_dict = {
    '25_06_10_Block_age': 'first',
    **dict.fromkeys(numeric_cols, 'median'),
}

BlockAgeExamination_CombinedSlides_MedianPerBlock = df.groupby('Block_Name', as_index=False).agg(agg_dict)

# Display the result
display(BlockAgeExamination_CombinedSlides_MedianPerBlock)

In [None]:
# Sort blocks from lowest to highest median transcript_count
BlockAgeExamination_CombinedSlides_MedianPerBlock = (
    BlockAgeExamination_CombinedSlides_MedianPerBlock.sort_values(by='transcript_count')
)

# Display the result
display(BlockAgeExamination_CombinedSlides_MedianPerBlock)

In [None]:
# Distinct block names in the per-block summary

pd.unique(BlockAgeExamination_CombinedSlides_MedianPerBlock['Block_Name'])

In [None]:
## Quantify the number of blocks per each block age category

# One row per block-age label with the number of blocks carrying it
age_counts = (
    BlockAgeExamination_CombinedSlides_MedianPerBlock['25_06_10_Block_age']
    .value_counts()
    .rename_axis('Block Age')
    .reset_index(name='Number of Blocks')
)

# Keep only real age labels like "2y", "3y", etc. (labels ending in "y")
age_counts = age_counts.loc[age_counts['Block Age'].str.endswith('y')].copy()

# Numeric helper column ("2y" -> 2), used only for ordering the rows
age_counts['Block Age (Numeric)'] = (
    age_counts['Block Age'].str.replace('y', '', regex=False).astype(int)
)

# Order by numeric age, then drop the helper column from the output
age_counts = age_counts.sort_values(by='Block Age (Numeric)').loc[
    :, ['Block Age', 'Number of Blocks']
]

# Print result
print(age_counts.to_string(index=False))

# Output plots

In [None]:
## Number of transcripts per cell (median per block) vs. block age

# Work on a copy so the per-block summary table is left untouched
df = BlockAgeExamination_CombinedSlides_MedianPerBlock.copy()

# Convert labels such as "2y" to numeric years. Labels that are not of that
# form become NaN (errors='coerce') instead of raising, mirroring the
# "keep only real age labels" filter used when counting blocks per age.
df['Block_age_clean'] = pd.to_numeric(
    df['25_06_10_Block_age'].astype(str).str.replace('y', '', regex=False),
    errors='coerce',
)

# Spearman and OLS both need complete pairs: drop blocks lacking a numeric
# age or a median transcript count, and report how many were excluded.
n_blocks_total = len(df)
df = df.dropna(subset=['Block_age_clean', 'transcript_count'])
n_blocks_excluded = n_blocks_total - len(df)
if n_blocks_excluded:
    print(f"Excluded {n_blocks_excluded} block(s) without a numeric block age or median transcript count")

# Spearman correlation (rank-based)
spearman_r, spearman_p = stats.spearmanr(df['Block_age_clean'], df['transcript_count'])

# Format p-value for display
spearman_p_text = "p < 0.001" if spearman_p < 0.001 else f"p = {spearman_p:.3f}"

# ---- Linear regression on RAW counts (no log transform) ----
X = sm.add_constant(df['Block_age_clean'])
y = df['transcript_count']
ols_model = sm.OLS(y, X).fit()
slope = ols_model.params['Block_age_clean']  # transcripts per cell per year
intercept = ols_model.params['const']

# Plot (explicit figure/axes; figure size comes from the rc set here)
sns.set(style="ticks", rc={"figure.figsize": (6, 6)})
fig, ax = plt.subplots()

# Scatter: one point per block
sns.scatterplot(
    data=df, x='Block_age_clean', y='transcript_count',
    color='black', s=40, alpha=0.6, ax=ax
)

# Regression line over full axis range
x_pred = np.linspace(0, 12, 100)
y_pred = intercept + slope * x_pred
ax.plot(x_pred, y_pred, color='red', linewidth=2)

# NOTE(review): axis limits are fixed (0-12 years, 0-300 transcripts); blocks
# outside these ranges would not be drawn -- confirm the limits cover the data.
# X-axis range and ticks
ax.set_xlim(0, 12)
ax.set_xticks(range(0, 13, 2))

# Y-axis range and ticks
ax.set_ylim(0, 300)
ax.set_yticks(range(0, 305, 50))

# Axis labels
ax.set_xlabel('Block Age (Years)', fontsize=14)
ax.set_ylabel('Number of Transcripts per Cell, Median per Block', fontsize=14)
ax.tick_params(axis='x', labelsize=13)
ax.tick_params(axis='y', labelsize=13)

# Annotation: Spearman + slope, positioned in axes-fraction coordinates
ax.text(
    0.50, 0.99,
    f"Spearman ρ = {spearman_r:.2f}  ({spearman_p_text})\n"
    f"Slope = {slope:.2f} transcripts/year",
    transform=ax.transAxes, ha='left', va='top', fontsize=12,
    bbox=dict(boxstyle="round", facecolor="white", alpha=0.5)
)

# Despine after limits/ticks are set, since trim=True depends on the ticks
sns.despine(ax=ax, offset=5, trim=True)

# Titles
fig.suptitle('Transcript Counts by Block Age', fontsize=16, y=0.99)
ax.set_title('Xenium Dataset 3; Combined Slides', fontsize=12, pad=20)

# Save plot (uncomment the savefig line to write the PDF)
fig.tight_layout()
#plt.savefig('/path/XeniumICI480Int_SpearmanPlot_TranscriptCountsByBlockAge_MedianPerBlock_Linear.pdf', bbox_inches='tight', pad_inches=0.1)

plt.show()

In [None]:
## Number of unique features per cell (median per block) vs. block age

# Work on a copy so the per-block summary table is left untouched
df = BlockAgeExamination_CombinedSlides_MedianPerBlock.copy()

# Convert labels such as "2y" to numeric years. Labels that are not of that
# form become NaN (errors='coerce') instead of raising, mirroring the
# "keep only real age labels" filter used when counting blocks per age.
df['Block_age_clean'] = pd.to_numeric(
    df['25_06_10_Block_age'].astype(str).str.replace('y', '', regex=False),
    errors='coerce',
)

# Spearman and OLS both need complete pairs: drop blocks lacking a numeric
# age or a median unique-feature count, and report how many were excluded.
n_blocks_total = len(df)
df = df.dropna(subset=['Block_age_clean', 'unique_features'])
n_blocks_excluded = n_blocks_total - len(df)
if n_blocks_excluded:
    print(f"Excluded {n_blocks_excluded} block(s) without a numeric block age or median unique-feature count")

# Spearman correlation (rank-based)
spearman_r, spearman_p = stats.spearmanr(df['Block_age_clean'], df['unique_features'])

# Format p-value for display
spearman_p_text = "p < 0.001" if spearman_p < 0.001 else f"p = {spearman_p:.3f}"

# ---- Linear regression on RAW unique features (no log transform) ----
X = sm.add_constant(df['Block_age_clean'])
y = df['unique_features']
ols_model = sm.OLS(y, X).fit()
slope = ols_model.params['Block_age_clean']   # features per year
intercept = ols_model.params['const']

# Plot (explicit figure/axes; figure size comes from the rc set here)
sns.set(style="ticks", rc={"figure.figsize": (6, 6)})
fig, ax = plt.subplots()

# Scatter: one point per block
sns.scatterplot(
    data=df, x='Block_age_clean', y='unique_features',
    color='black', s=40, alpha=0.6, ax=ax
)

# Regression line over full axis range
x_pred = np.linspace(0, 12, 100)
y_pred = intercept + slope * x_pred
ax.plot(x_pred, y_pred, color='red', linewidth=2)

# NOTE(review): axis limits are fixed (0-12 years, 0-100 features); blocks
# outside these ranges would not be drawn -- confirm the limits cover the data.
# X-axis range and ticks (0-12 years, same as the transcript-count plot)
ax.set_xlim(0, 12)
ax.set_xticks(range(0, 13, 2))

# Y-axis range and ticks (0-100; this differs from the transcript-count plot's 0-300)
ax.set_ylim(0, 100)
ax.set_yticks(range(0, 105, 25))

# Labels
ax.set_xlabel('Block Age (Years)', fontsize=14)
ax.set_ylabel('Number of Unique Features per Cell, Median per Block', fontsize=14)
ax.tick_params(axis='x', labelsize=13)
ax.tick_params(axis='y', labelsize=13)

# Annotation: Spearman + slope, positioned in axes-fraction coordinates
ax.text(
    0.50, 0.99,
    f"Spearman ρ = {spearman_r:.2f}  ({spearman_p_text})\n"
    f"Slope = {slope:.2f} unique features/year",
    transform=ax.transAxes, ha='left', va='top', fontsize=12,
    bbox=dict(boxstyle='round', facecolor='white', alpha=0.5)
)

# Despine after limits/ticks are set, since trim=True depends on the ticks
sns.despine(ax=ax, offset=5, trim=True)

# Titles
fig.suptitle('Unique Features by Block Age', fontsize=16, y=0.99)
ax.set_title('Xenium Dataset 3; Combined Slides', fontsize=12, pad=20)

# Save plot (uncomment the savefig line to write the PDF)
fig.tight_layout()
#plt.savefig('/path/XeniumICI480Int_SpearmanPlot_UniqueFeaturesByBlockAge_MedianPerBlock_Linear.pdf', bbox_inches='tight', pad_inches=0.1)

plt.show()