#### This particular notebook includes QC metric calculations and comparisons between: 
* Dataset 1: Xenium (rep 1 only) vs CosMx
* Dataset 1: Xenium rep 1 vs rep 2 (occasionally referred to as 'Sept' and 'Nov' due to their run dates)

It can be adapted to compare any 2 datasets or reps/slides within a dataset. 


#### Required input files:
* Overlapping gene list between datasets (if applicable)
* Filtered transcript file (for each dataset/rep/slide)
* Filtered cell-based data object (processed or unprocessed) (for each dataset)

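Note: the `_r` suffix in variable and file names (e.g. `Xenium_transcripts_r`) denotes a filtered transcript file, i.e. one from which transcripts have been removed by filtering.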

Environment: Please create and activate the conda environment provided in default_env.yaml before running this notebook

In [None]:
## Load in packages

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.transforms as mtransforms
import seaborn as sns

import scanpy as sc
import squidpy as sq

import gzip
import anndata

import os

import scipy.stats as stats
from scipy.stats import pearsonr
from scipy.stats import kruskal

from adjustText import adjust_text

In [None]:
# Set output directory
# Points scanpy's figure-directory setting at the output location.
# '/path/' is a placeholder: replace it with a real directory before running.

sc.settings.figdir = '/path/'

## Transcript-Related Plots

#### Generate data objects for plots

In [None]:
# Load the table listing the genes shared between the two datasets
overlappinggenes_df = pd.read_csv(
    '/path/CosMx1000Xenium290_overlappinggenelist_231112.csv'
)

# Pull the 'unique_genes' column out as a plain Python list (used later with .isin)
overlappinggenes_list = overlappinggenes_df.loc[:, 'unique_genes'].tolist()

Rep1_Xenium

In [None]:
# Read the filtered Xenium (rep 1) transcript table
Xenium_transcripts_r = pd.read_csv(
    '/path/Xenium_transcripts_r.csv'
)

# Left as the cell's last expression so the notebook renders a preview of the table
Xenium_transcripts_r

In [None]:
## Quantifying cellular location classification for transcripts

# Share of filtered Xenium transcripts that do not / do overlap a nucleus,
# expressed as a percentage of all rows in Xenium_transcripts_r.
# The denominator is read from the table itself instead of being hard-coded
# (it used to be the literal 46486849), so the percentages stay correct when
# the input file or the upstream filtering changes.
Xenium_r_total_transcripts = len(Xenium_transcripts_r)

Xenium_r_overlaps_nucleus_0no = (((Xenium_transcripts_r['overlaps_nucleus'] == 0).sum())/Xenium_r_total_transcripts) * 100
Xenium_r_overlaps_nucleus_1yes = (((Xenium_transcripts_r['overlaps_nucleus'] == 1).sum())/Xenium_r_total_transcripts) * 100

In [None]:
## Transcripts per cell
## Question answered: how many transcripts does each cell have?

# Count distinct transcript ids within each cell and return a two-column frame
# ('cell_id', 'transcript_count').
Xenium_transcript_counts_per_cell_r = (
    Xenium_transcripts_r
    .groupby('cell_id')['transcript_id']
    .nunique()
    .reset_index(name='transcript_count')
)

In [None]:
# Summary statistics of the per-cell transcript counts (all genes)
xenium_tx_counts = Xenium_transcript_counts_per_cell_r['transcript_count']

print("Number of transcripts per cell -- All genes")
print(f"Xenium, Mean: {xenium_tx_counts.mean()}")
print(f"Xenium, Median: {xenium_tx_counts.median()}")
print(f"Xenium, Q1: {xenium_tx_counts.quantile(0.25)}")
print(f"Xenium, Q3: {xenium_tx_counts.quantile(0.75)}")

In [None]:
## Same question, restricted to the overlapping genes:
## how many transcripts (of overlapping genes) does each cell have?


### Part 1: keep only transcripts whose gene is in the overlapping-gene list

# Boolean mask over the transcript rows, then the filtered table
Xenium_mask_r = Xenium_transcripts_r['feature_name'].isin(overlappinggenes_list)
Xenium_transcripts_r_overlappinggenes = Xenium_transcripts_r.loc[Xenium_mask_r]

### Part 2: count distinct transcript ids per cell within that subset

# Two-column result: 'cell_id', 'transcript_count'
Xenium_transcript_counts_per_cell_overlappinggenes_r = (
    Xenium_transcripts_r_overlappinggenes
    .groupby('cell_id')['transcript_id']
    .nunique()
    .reset_index(name='transcript_count')
)

In [None]:
# Summary statistics of the per-cell transcript counts (overlapping genes only)
xenium_tx_counts_overlap = Xenium_transcript_counts_per_cell_overlappinggenes_r['transcript_count']

print("Number of transcripts per cell -- Overlapping genes")
print(f"Xenium, Mean: {xenium_tx_counts_overlap.mean()}")
print(f"Xenium, Median: {xenium_tx_counts_overlap.median()}")
print(f"Xenium, Q1: {xenium_tx_counts_overlap.quantile(0.25)}")
print(f"Xenium, Q3: {xenium_tx_counts_overlap.quantile(0.75)}")

In [None]:
## (cell_id, feature_name) table

# Two-column view of the transcript table: one row per transcript
Xenium_transcripts_r_featurespercell = Xenium_transcripts_r.loc[:, ['cell_id', 'feature_name']]

## Unique features per cell
## Question answered: what unique features does each cell express?

# One row per distinct (cell_id, feature_name) pair. The frame holds only these
# two columns, so de-duplicating on all columns is the same as on this pair.
Xenium_transcripts_r_uniquefeaturespercell = Xenium_transcripts_r_featurespercell.drop_duplicates()

In [None]:
## Number of unique feature_name values per cell
## Question answered: how many unique features does each cell express?

# Count distinct features within each cell; result columns are
# 'cell_id' and 'unique_features'.
Xenium_transcripts_r_uniquefeaturespercell_count = (
    Xenium_transcripts_r_uniquefeaturespercell
    .groupby('cell_id')['feature_name']
    .nunique()
    .reset_index(name='unique_features')
)

In [None]:
# Summary statistics of the per-cell unique-feature counts (all genes)
xenium_unique_features = Xenium_transcripts_r_uniquefeaturespercell_count['unique_features']

print("Number of unique features per cell -- All genes")
print(f"Xenium, Mean: {xenium_unique_features.mean()}")
print(f"Xenium, Median: {xenium_unique_features.median()}")
print(f"Xenium, Q1: {xenium_unique_features.quantile(0.25)}")
print(f"Xenium, Q3: {xenium_unique_features.quantile(0.75)}")

In [None]:
## Same question, restricted to the overlapping genes:
## how many unique features (of overlapping genes) does each cell have?


### Part 1: collapse repeated (cell_id, feature_name) rows

# Keeps the first transcript row of every (cell_id, feature_name) pair
Xenium_transcripts_r_uniquefeaturespercell_overlappinggenes = (
    Xenium_transcripts_r_overlappinggenes
    .drop_duplicates(subset=['cell_id', 'feature_name'])
)

### Part 2: count the distinct overlapping features per cell

# Two-column result: 'cell_id', 'unique_features'
Xenium_transcripts_r_uniquefeaturespercell_overlappinggenes_count = (
    Xenium_transcripts_r_uniquefeaturespercell_overlappinggenes
    .groupby('cell_id')['feature_name']
    .nunique()
    .reset_index(name='unique_features')
)

In [None]:
# Summary statistics of the per-cell unique-feature counts (overlapping genes only)
xenium_unique_features_overlap = Xenium_transcripts_r_uniquefeaturespercell_overlappinggenes_count['unique_features']

print("Number of unique features per cell -- Overlapping genes")
print(f"Xenium, Mean: {xenium_unique_features_overlap.mean()}")
print(f"Xenium, Median: {xenium_unique_features_overlap.median()}")
print(f"Xenium, Q1: {xenium_unique_features_overlap.quantile(0.25)}")
print(f"Xenium, Q3: {xenium_unique_features_overlap.quantile(0.75)}")

In [None]:
## Per-gene, per-cell expression level (how often each feature appears in each cell)
## Question answered: what is the expression level of each feature in each cell?

# For every transcript row, the number of rows sharing its (cell_id, feature_name) pair
Xenium_grouped_counts = (
    Xenium_transcripts_r_featurespercell
    .groupby(['cell_id', 'feature_name'])['feature_name']
    .transform('count')
)

# Attach the counts as 'feature_quantity' on a new frame (the source table is left
# untouched), keep one row per (cell_id, feature_name) pair, and renumber the index.
Xenium_transcripts_r_quantityofeachfeaturepercell = (
    Xenium_transcripts_r_featurespercell
    .assign(feature_quantity=Xenium_grouped_counts)
    .drop_duplicates(subset=['cell_id', 'feature_name'])
    .reset_index(drop=True)
)

In [None]:
## Same table, restricted to the overlapping genes
## Question answered: what is the expression level of each overlapping feature in each cell?

# Rows of the per-(cell, gene) table whose gene is in the overlapping-gene list.
# Note: this reassigns Xenium_mask_r, which now indexes the per-(cell, gene) table.
Xenium_mask_r = Xenium_transcripts_r_quantityofeachfeaturepercell['feature_name'].isin(overlappinggenes_list)

Xenium_transcripts_r_quantityofeachfeaturepercell_overlappinggenes = (
    Xenium_transcripts_r_quantityofeachfeaturepercell.loc[Xenium_mask_r]
)

Rep2_Xenium (sometimes referred to as 'Nov' for 'November run')

In [None]:
# Read the filtered Xenium rep 2 ('Nov') transcript table
Rep2_Xenium_transcripts_r = pd.read_csv(
    '/path/Nov_Xenium_transcripts_r.csv'
)

In [None]:
## (cell_id, feature_name) table

# Two-column view of the rep 2 transcript table: one row per transcript
Rep2_Xenium_transcripts_r_featurespercell = Rep2_Xenium_transcripts_r.loc[:, ['cell_id', 'feature_name']]

## Unique features per cell
## Question answered: what unique features does each cell express?

# One row per distinct (cell_id, feature_name) pair. The frame holds only these
# two columns, so de-duplicating on all columns is the same as on this pair.
Rep2_Xenium_transcripts_r_uniquefeaturespercell = Rep2_Xenium_transcripts_r_featurespercell.drop_duplicates()

In [None]:
## Per-gene, per-cell expression level (how often each feature appears in each cell)
## Question answered: what is the expression level of each feature in each cell?

# For every transcript row, the number of rows sharing its (cell_id, feature_name) pair
Rep2_Xenium_grouped_counts = (
    Rep2_Xenium_transcripts_r_featurespercell
    .groupby(['cell_id', 'feature_name'])['feature_name']
    .transform('count')
)

# Attach the counts as 'feature_quantity' on a new frame, keep one row per
# (cell_id, feature_name) pair, and renumber the index.
Rep2_Xenium_transcripts_r_quantityofeachfeaturepercell = (
    Rep2_Xenium_transcripts_r_featurespercell
    .assign(feature_quantity=Rep2_Xenium_grouped_counts)
    .drop_duplicates(subset=['cell_id', 'feature_name'])
    .reset_index(drop=True)
)

CosMx

In [None]:
# Read the filtered CosMx transcript table
CosMx_transcripts_r = pd.read_csv(
    '/CosMx_transcripts_r.csv'
)

# Left as the cell's last expression so the notebook renders a preview of the table
CosMx_transcripts_r

In [None]:
## Quantifying cellular location classification for transcripts

# Share of filtered CosMx transcripts in each 'CellComp' class (Nuclear, Membrane,
# Cytoplasm), expressed as a percentage of all rows in CosMx_transcripts_r.
# The denominator is read from the table itself instead of being hard-coded
# (it used to be the literal 27695490), so the percentages stay correct when
# the input file or the upstream filtering changes.
CosMx_r_total_transcripts = len(CosMx_transcripts_r)

CosMx_r_pct_nuc = (((CosMx_transcripts_r['CellComp'] == 'Nuclear').sum())/CosMx_r_total_transcripts) * 100
CosMx_r_pct_mem = (((CosMx_transcripts_r['CellComp'] == 'Membrane').sum())/CosMx_r_total_transcripts) * 100
CosMx_r_pct_cyt = (((CosMx_transcripts_r['CellComp'] == 'Cytoplasm').sum())/CosMx_r_total_transcripts) * 100

In [None]:
## Transcripts per cell
## Question answered: how many transcripts does each cell have?

# Count distinct transcript ids within each cell and return a two-column frame
# ('cell_id', 'transcript_count').
CosMx_transcript_counts_per_cell_r = (
    CosMx_transcripts_r
    .groupby('cell_id')['transcript_id']
    .nunique()
    .reset_index(name='transcript_count')
)

In [None]:
# Summary statistics of the per-cell transcript counts (all genes)
cosmx_tx_counts = CosMx_transcript_counts_per_cell_r['transcript_count']

print("Number of transcripts per cell -- All genes")
print(f"CosMx, Mean: {cosmx_tx_counts.mean()}")
print(f"CosMx, Median: {cosmx_tx_counts.median()}")
print(f"CosMx, Q1: {cosmx_tx_counts.quantile(0.25)}")
print(f"CosMx, Q3: {cosmx_tx_counts.quantile(0.75)}")

In [None]:
## Same question, restricted to the overlapping genes:
## how many transcripts (of overlapping genes) does each cell have?


### Part 1: keep only transcripts whose gene is in the overlapping-gene list

# Boolean mask over the transcript rows ('target' is the gene column here), then the filtered table
CosMx_mask_r = CosMx_transcripts_r['target'].isin(overlappinggenes_list)
CosMx_transcripts_r_overlappinggenes = CosMx_transcripts_r.loc[CosMx_mask_r]

### Part 2: count distinct transcript ids per cell within that subset

# Two-column result: 'cell_id', 'transcript_count'
CosMx_transcript_counts_per_cell_overlappinggenes_r = (
    CosMx_transcripts_r_overlappinggenes
    .groupby('cell_id')['transcript_id']
    .nunique()
    .reset_index(name='transcript_count')
)

In [None]:
# Summary statistics of the per-cell transcript counts (overlapping genes only)
cosmx_tx_counts_overlap = CosMx_transcript_counts_per_cell_overlappinggenes_r['transcript_count']

print("Number of transcripts per cell -- Overlapping genes")
print(f"CosMx, Mean: {cosmx_tx_counts_overlap.mean()}")
print(f"CosMx, Median: {cosmx_tx_counts_overlap.median()}")
print(f"CosMx, Q1: {cosmx_tx_counts_overlap.quantile(0.25)}")
print(f"CosMx, Q3: {cosmx_tx_counts_overlap.quantile(0.75)}")

In [None]:
## (cell_id, target) table

# Two-column view of the transcript table: one row per transcript
# ('target' plays the role that 'feature_name' plays in the Xenium tables)
CosMx_transcripts_r_featurespercell = CosMx_transcripts_r.loc[:, ['cell_id', 'target']]

## Unique features per cell
## Question answered: what unique features does each cell express?

# One row per distinct (cell_id, target) pair. The frame holds only these
# two columns, so de-duplicating on all columns is the same as on this pair.
CosMx_transcripts_r_uniquefeaturespercell = CosMx_transcripts_r_featurespercell.drop_duplicates()

In [None]:
## Number of unique target values per cell
## Question answered: how many unique features does each cell express?

# Count distinct targets within each cell; result columns are
# 'cell_id' and 'unique_features'.
CosMx_transcripts_r_uniquefeaturespercell_count = (
    CosMx_transcripts_r_uniquefeaturespercell
    .groupby('cell_id')['target']
    .nunique()
    .reset_index(name='unique_features')
)

In [None]:
# Summary statistics of the per-cell unique-feature counts (all genes)
cosmx_unique_features = CosMx_transcripts_r_uniquefeaturespercell_count['unique_features']

print("Number of unique features per cell -- All genes")
print(f"CosMx, Mean: {cosmx_unique_features.mean()}")
print(f"CosMx, Median: {cosmx_unique_features.median()}")
print(f"CosMx, Q1: {cosmx_unique_features.quantile(0.25)}")
print(f"CosMx, Q3: {cosmx_unique_features.quantile(0.75)}")

In [None]:
## Same question, restricted to the overlapping genes:
## how many unique features (of overlapping genes) does each cell have?


### Part 1: collapse repeated (cell_id, target) rows

# Keeps the first transcript row of every (cell_id, target) pair
CosMx_transcripts_r_uniquefeaturespercell_overlappinggenes = (
    CosMx_transcripts_r_overlappinggenes
    .drop_duplicates(subset=['cell_id', 'target'])
)

### Part 2: count the distinct overlapping features per cell

# Two-column result: 'cell_id', 'unique_features'
CosMx_transcripts_r_uniquefeaturespercell_overlappinggenes_count = (
    CosMx_transcripts_r_uniquefeaturespercell_overlappinggenes
    .groupby('cell_id')['target']
    .nunique()
    .reset_index(name='unique_features')
)

In [None]:
# Summary statistics of the per-cell unique-feature counts (overlapping genes only)
cosmx_unique_features_overlap = CosMx_transcripts_r_uniquefeaturespercell_overlappinggenes_count['unique_features']

print("Number of unique features per cell -- Overlapping genes")
print(f"CosMx, Mean: {cosmx_unique_features_overlap.mean()}")
print(f"CosMx, Median: {cosmx_unique_features_overlap.median()}")
print(f"CosMx, Q1: {cosmx_unique_features_overlap.quantile(0.25)}")
print(f"CosMx, Q3: {cosmx_unique_features_overlap.quantile(0.75)}")

In [None]:
## Per-gene, per-cell expression level (how often each target appears in each cell)
## Question answered: what is the expression level of each feature in each cell?

# For every transcript row, the number of rows sharing its (cell_id, target) pair
CosMx_grouped_counts = (
    CosMx_transcripts_r_featurespercell
    .groupby(['cell_id', 'target'])['target']
    .transform('count')
)

# Attach the counts as 'feature_quantity' on a new frame, keep one row per
# (cell_id, target) pair, and renumber the index.
CosMx_transcripts_r_quantityofeachfeaturepercell = (
    CosMx_transcripts_r_featurespercell
    .assign(feature_quantity=CosMx_grouped_counts)
    .drop_duplicates(subset=['cell_id', 'target'])
    .reset_index(drop=True)
)

In [None]:
## Same table, restricted to the overlapping genes
## Question answered: what is the expression level of each overlapping feature in each cell?

# Rows of the per-(cell, gene) table whose gene is in the overlapping-gene list.
# Note: this reassigns CosMx_mask_r, which now indexes the per-(cell, gene) table.
CosMx_mask_r = CosMx_transcripts_r_quantityofeachfeaturepercell['target'].isin(overlappinggenes_list)

CosMx_transcripts_r_quantityofeachfeaturepercell_overlappinggenes = (
    CosMx_transcripts_r_quantityofeachfeaturepercell.loc[CosMx_mask_r]
)

#### Final set up for plots

CosMx vs Xenium

In [None]:
### CosMx: mean count per gene

# The per-(cell, gene) table has one row for every gene observed in a cell, so the
# mean below averages 'feature_quantity' over the cells in which the gene appears.
# 'target' is renamed to 'feature_name' (the Xenium column name) so both platforms
# share a merge key, and the value column gets a platform-specific name.
CosMx_r_ntrancriptsperfeature = (
    CosMx_transcripts_r_quantityofeachfeaturepercell
    .groupby('target')['feature_quantity']
    .mean()
    .reset_index()
    .rename(columns={'target': 'feature_name', 'feature_quantity': 'CosMx_r_meancounts'})
)

# Display df
display(CosMx_r_ntrancriptsperfeature)


### Xenium: mean count per gene

# Same aggregation for Xenium; the gene column is already called 'feature_name',
# so only the value column is renamed.
Xenium_r_ntrancriptsperfeature = (
    Xenium_transcripts_r_quantityofeachfeaturepercell
    .groupby('feature_name')['feature_quantity']
    .mean()
    .reset_index()
    .rename(columns={'feature_quantity': 'Xenium_r_meancounts'})
)

# Display df
display(Xenium_r_ntrancriptsperfeature)

In [None]:
### All genes

# Outer merge on 'feature_name': keeps every gene from either platform; a gene
# missing from one platform gets NaN in that platform's mean-count column.
merged_meancounts_allgenes = pd.merge(
    left=CosMx_r_ntrancriptsperfeature,
    right=Xenium_r_ntrancriptsperfeature,
    how='outer',
    on='feature_name',
)

# Display
display(merged_meancounts_allgenes)
# 1131 rows (which includes all genes from CosMx and Xenium)


### Overlapping genes

# Inner merge on 'feature_name': keeps only genes present in both tables
merged_meancounts_overlappinggenes = pd.merge(
    left=CosMx_r_ntrancriptsperfeature,
    right=Xenium_r_ntrancriptsperfeature,
    how='inner',
    on='feature_name',
)

# Display
display(merged_meancounts_overlappinggenes)
# 159 overlapping genes

In [None]:
# Summary statistics of the per-gene mean counts, per platform, for the
# all-genes (outer merge) and overlapping-genes (inner merge) tables.
# NaN entries in the all-genes columns are skipped by these pandas reductions.
xenium_all = merged_meancounts_allgenes['Xenium_r_meancounts']
cosmx_all = merged_meancounts_allgenes['CosMx_r_meancounts']
xenium_overlap = merged_meancounts_overlappinggenes['Xenium_r_meancounts']
cosmx_overlap = merged_meancounts_overlappinggenes['CosMx_r_meancounts']

print("Expression of each gene within cells \n")

print("Mean")
print(f"Xenium, all genes: {xenium_all.mean(skipna=True)}")
print(f"CosMx, all genes: {cosmx_all.mean(skipna=True)}")
print(f"Xenium, overlapping genes: {xenium_overlap.mean()}")
print(f"CosMx, overlapping genes: {cosmx_overlap.mean()}")

print(" ")
print("Median")
print(f"Xenium, all genes: {xenium_all.median(skipna=True)}")
print(f"CosMx, all genes: {cosmx_all.median(skipna=True)}")
print(f"Xenium, overlapping genes: {xenium_overlap.median()}")
print(f"CosMx, overlapping genes: {cosmx_overlap.median()}")

print(" ")
print("Q1")
print(f"Xenium, all genes: {xenium_all.quantile(0.25)}")
print(f"CosMx, all genes: {cosmx_all.quantile(0.25)}")
print(f"Xenium, overlapping genes: {xenium_overlap.quantile(0.25)}")
print(f"CosMx, overlapping genes: {cosmx_overlap.quantile(0.25)}")

print(" ")
print("Q3")
print(f"Xenium, all genes: {xenium_all.quantile(0.75)}")
print(f"CosMx, all genes: {cosmx_all.quantile(0.75)}")
print(f"Xenium, overlapping genes: {xenium_overlap.quantile(0.75)}")
print(f"CosMx, overlapping genes: {cosmx_overlap.quantile(0.75)}")

# NA values are ignored by default

Rep1 vs Rep2 Xenium

In [None]:
### Xenium (Rep1): mean count per gene

# Rep 1 is the Xenium table built earlier; keep an explicitly named copy
Rep1_Xenium_transcripts_r_quantityofeachfeaturepercell = Xenium_transcripts_r_quantityofeachfeaturepercell.copy()

# Mean of 'feature_quantity' per gene over the (cell, gene) rows of the table,
# with the value column renamed to a replicate-specific name.
Rep1_Xenium_r_ntrancriptsperfeature = (
    Rep1_Xenium_transcripts_r_quantityofeachfeaturepercell
    .groupby('feature_name')['feature_quantity']
    .mean()
    .reset_index()
    .rename(columns={'feature_quantity': 'Rep1_Xenium_r_meancounts'})
)

# Display df
display(Rep1_Xenium_r_ntrancriptsperfeature)

### Xenium (Rep2): mean count per gene

# Same aggregation for rep 2
Rep2_Xenium_r_ntrancriptsperfeature = (
    Rep2_Xenium_transcripts_r_quantityofeachfeaturepercell
    .groupby('feature_name')['feature_quantity']
    .mean()
    .reset_index()
    .rename(columns={'feature_quantity': 'Rep2_Xenium_r_meancounts'})
)

# Display df
display(Rep2_Xenium_r_ntrancriptsperfeature)

In [None]:
# Outer merge on 'feature_name': keeps every gene from either replicate; a gene
# missing from one replicate gets NaN in that replicate's mean-count column.
XeniumReps_merged_meancounts = pd.merge(
    left=Rep1_Xenium_r_ntrancriptsperfeature,
    right=Rep2_Xenium_r_ntrancriptsperfeature,
    how='outer',
    on='feature_name',
)

# Display
display(XeniumReps_merged_meancounts)
# 290 rows (all Xenium genes)

#### Output plots

In [None]:
# Scatterplot for overlapping genes
# Plots per-gene mean counts, Xenium (x) against CosMx (y), for the genes present
# on both platforms, annotated with Pearson's R, its p-value and the median fold change.

# Extract X and Y values from the DataFrame
x_values = merged_meancounts_overlappinggenes['Xenium_r_meancounts']
y_values = merged_meancounts_overlappinggenes['CosMx_r_meancounts']

# Set the size of the plot
plt.figure(figsize=(7, 7))  # Width=7 inches, Height=7 inches

# Calculate Pearson's correlation
corr, p_value = pearsonr(x_values, y_values)

# Calculate medians
median_x = np.median(x_values)
median_y = np.median(y_values)

# Calculate median fold change -- using x / y to emphasize how the x values are larger.
# The guard is on the denominator (median_y): the previous check tested median_x,
# which did not protect the division. An undefined ratio is reported as NaN.
median_fold_change = median_x / median_y if median_y != 0 else np.nan

# Create plot 
plt.scatter(x_values, y_values)

# Set axis limits (specify the range for the x and y axes)
plt.xlim(0, 11)
plt.ylim(0, 11) 

# Set custom tick locations and labels for the x and y axes
# Customize the tick locations (as integers)
plt.xticks([0, 2, 4, 6, 8, 10], fontsize=11)
plt.yticks([0, 2, 4, 6, 8, 10], fontsize=11)  


# Label every gene whose mean count exceeds the threshold on either platform
value_threshold = 3.0  # Modify this threshold as needed

annotations = []

feature_names = merged_meancounts_overlappinggenes['feature_name']

# Walk the three columns in lockstep. Pairing by position (rather than looking up
# x_values[i] by index label) keeps this correct even if the frame's index is not 0..n-1.
for feature_name, x_val, y_val in zip(feature_names, x_values, y_values):
    if x_val > value_threshold or y_val > value_threshold:
        label = plt.annotate(feature_name, (x_val, y_val), ha='left', fontsize=11)
        annotations.append(label)

# Use adjust_text to automatically adjust label positions
adjust_text(annotations, arrowprops=dict(arrowstyle='->', color='black'))

# Draw a dashed diagonal line through the plot
plt.plot([0, 11], [0, 11], 'k--', linewidth=1) # 'k--' specifies a black, dashed line

# Display Pearson's correlation coefficient, p-value, and median fold on the plot
plt.text(0.5, 10.5, f'Pearson R = {corr:.2f}\np value = {p_value:.2e}\nMedian fold = {median_fold_change:.2f}', fontsize=12, ha='left', va='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.5))

# Add labels and a title
plt.xlabel('Xenium', fontsize=12)
plt.ylabel('CosMx', fontsize=12)
plt.title('Mean Transcript Counts (Overlapping Genes)', fontsize=12)

# Disable the grid
plt.grid(False)

# Save plot
#plt.savefig('/path/Scatterplot_MeanCounts_OverlappingGenesOnly.pdf')

# Display the plot
plt.show()

In [None]:
# Scatterplot for all genes

# Replace NaN values with -0.90
# NaN marks a gene that is missing from one platform after the outer merge; the
# plotting cell that follows identifies those genes by this -0.90 placeholder.
# NOTE: this modifies merged_meancounts_allgenes in place, so any statistics
# computed from this frame after this point will include the placeholder values.
merged_meancounts_allgenes.fillna(-0.90, inplace=True)

# Axis tick code
def format_tick_labels(axis_range):
    """Build tick labels that are shown only at even tick positions.

    Parameters
    ----------
    axis_range : iterable of numbers
        Tick positions, e.g. ``range(-1, 12)``.

    Returns
    -------
    list of str
        One label per tick: ``str(tick)`` for even values (0 included),
        an empty string for every other value.
    """
    labels = []
    for tick in axis_range:
        # 0 is even as well, so it needs no separate check
        labels.append(str(tick) if tick % 2 == 0 else '')
    return labels

In [None]:
## Continued from previous code block
# Relies on the preceding cell: NaN values in merged_meancounts_allgenes have been
# replaced by -0.90 there, and format_tick_labels is defined there.

# Per-gene mean counts for each platform
x_values = merged_meancounts_allgenes['Xenium_r_meancounts']
y_values = merged_meancounts_allgenes['CosMx_r_meancounts']

# Genes that were NA on one platform, i.e. carry the -0.90 placeholder on either axis
is_placeholder = (x_values == -0.90) | (y_values == -0.90)
na_points = merged_meancounts_allgenes[is_placeholder]
na_x_values = na_points['Xenium_r_meancounts']
na_y_values = na_points['CosMx_r_meancounts']

# Set the size of the plot
plt.figure(figsize=(9, 13))  # Width=9 inches, Height=13 inches

# darkorange where the Xenium value is the placeholder (Xenium NA),
# cornflowerblue otherwise (CosMx NA)
colors = ['darkorange' if xenium_val == -0.90 else 'cornflowerblue' for xenium_val in na_x_values]

# Create plot
plt.scatter(na_x_values, na_y_values, c=colors)

# Axis limits start at -1 so the placeholder points at -0.90 stay visible
plt.ylim(-1, 23)
plt.xlim(-1, 12)

# One tick per integer on each axis ...
x_ticks = range(-1, 12)
y_ticks = range(-1, 23)

# ... but only 0 and multiples of 2 carry a label
plt.xticks(x_ticks, format_tick_labels(x_ticks), fontsize=11)
plt.yticks(y_ticks, format_tick_labels(y_ticks), fontsize=11)

# Label every gene whose mean count exceeds the threshold on either platform
value_threshold = 3.0  # Modify this threshold as needed

annotations = []

feature_names = na_points['feature_name']

# Walk the three columns in lockstep and annotate the points above the threshold
for feature_name, x_val, y_val in zip(feature_names, na_x_values, na_y_values):
    if x_val > value_threshold or y_val > value_threshold:
        annotations.append(
            plt.annotate(feature_name, (x_val, y_val), ha='left', fontsize=11)
        )

# Use adjust_text to automatically adjust label positions
adjust_text(annotations, arrowprops=dict(arrowstyle='->', color='black'))

# Add lines at 1, since 10x Genomics states that a high expression gene has >1 mean count per cell
plt.axvline(x=1, color='gray', linestyle='--')
plt.axhline(y=1, color='gray', linestyle='--')

# Disable the grid
plt.grid(False)

# Add labels and a title
plt.ylabel('CosMx', fontsize=12)
plt.xlabel('Xenium', fontsize=12)
plt.title('Mean Transcript Counts (Dataset-Specific Genes)', fontsize=12)

# Save plot
#plt.savefig('/path/Scatterplot_MeanCounts_DatasetSpecificGenes.pdf')

# Display the plot
plt.show()

In [None]:
# Scatterplot for comparing Xenium reps
# Plots per-gene mean counts of rep 1 (x) against rep 2 (y) with Pearson's R and p-value.

# XeniumReps_merged_meancounts comes from an outer merge, so a gene present in only
# one replicate has NaN in the other replicate's column. Such rows cannot be drawn
# and would put NaN into the correlation inputs, so only complete pairs are used.
# When every gene is present in both replicates this selects all rows.
XeniumReps_paired_meancounts = XeniumReps_merged_meancounts.dropna(
    subset=['Rep1_Xenium_r_meancounts', 'Rep2_Xenium_r_meancounts']
)

# Extract X and Y values from the DataFrame
x_values = XeniumReps_paired_meancounts['Rep1_Xenium_r_meancounts']
y_values = XeniumReps_paired_meancounts['Rep2_Xenium_r_meancounts']

# Set the size of the plot
plt.figure(figsize=(7, 7))  # Width=7 inches, Height=7 inches

# Calculate Pearson's correlation
corr, p_value = pearsonr(x_values, y_values)

# Create plot 
plt.scatter(x_values, y_values)

# Set axis limits (specify the range for the x and y axes)
plt.xlim(0, 11)
plt.ylim(0, 11) 

# Set custom tick locations and labels for the x and y axes
# Customize the tick locations (as integers)
plt.xticks([0, 2, 4, 6, 8, 10], fontsize=11)
plt.yticks([0, 2, 4, 6, 8, 10], fontsize=11)  

# Note: Removed the code to label the dots

# Draw a dashed diagonal line through the plot
plt.plot([0, 11], [0, 11], 'k--', linewidth=1) # 'k--' specifies a black, dashed line

# Display Pearson's correlation coefficient and p-value on the plot
plt.text(0.5, 10.5, f'Pearson R = {corr:.2f}\np value = {p_value:.2e}', fontsize=12, ha='left', va='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.5))

# Add labels and a title
plt.xlabel('Xenium - Rep 1', fontsize=12)
plt.ylabel('Xenium - Rep 2', fontsize=12)
plt.title('Mean Transcript Counts (All Genes)', fontsize=12)

# Disable the grid
plt.grid(False)

# Save plot
#plt.savefig('/path/Scatterplot_XeniumReps_MeanCounts.pdf')

# Display the plot
plt.show()

#### Run Mann-Whitney U tests (Wilcoxon rank-sum)

In [None]:
### Mann-Whitney U (Wilcoxon rank-sum) tests, Xenium vs CosMx:
### per-cell transcript counts and per-cell unique features


def _report_mannwhitney(label, xenium_values, cosmx_values):
    """Run a two-sided Mann-Whitney U test on two samples and print the result.

    Parameters
    ----------
    label : str
        Heading printed above the test output.
    xenium_values, cosmx_values : array-like
        The two independent samples being compared.

    Returns
    -------
    The result object returned by ``scipy.stats.mannwhitneyu``
    (exposes ``.statistic`` and ``.pvalue``).
    """
    result = stats.mannwhitneyu(xenium_values,
                                cosmx_values,
                                use_continuity=True,
                                alternative='two-sided', axis=0, method='auto')
    print(label)
    print(result)
    # Scientific notation: a fixed-point format such as .50f prints only zeros
    # for any p-value below 1e-50, so tiny p-values become indistinguishable from 0.
    print(f'Statistic: {result.statistic}, p-value: {result.pvalue:.3e}')
    return result


### Transcript counts

## All Genes

# Per-cell transcript counts (pandas Series)
Xenium_transcript_counts_per_cell_stats = Xenium_transcript_counts_per_cell_r['transcript_count']
CosMx_transcript_counts_per_cell_stats = CosMx_transcript_counts_per_cell_r['transcript_count']

res = _report_mannwhitney("Transcript counts_All genes",
                          Xenium_transcript_counts_per_cell_stats,
                          CosMx_transcript_counts_per_cell_stats)

## Overlapping Genes

# Per-cell transcript counts for the overlapping-gene tables (pandas Series)
Xenium_transcript_counts_per_cell_overlappinggenes_stats = Xenium_transcript_counts_per_cell_overlappinggenes_r['transcript_count']
CosMx_transcript_counts_per_cell_overlappinggenes_stats = CosMx_transcript_counts_per_cell_overlappinggenes_r['transcript_count']

print(" ")
res = _report_mannwhitney("Transcript counts_Overlapping genes",
                          Xenium_transcript_counts_per_cell_overlappinggenes_stats,
                          CosMx_transcript_counts_per_cell_overlappinggenes_stats)



### Unique features

## All Genes

# Per-cell unique-feature counts (plain lists)
Xenium_uniquefeatures_stats = Xenium_transcripts_r_uniquefeaturespercell_count['unique_features'].tolist()
CosMx_uniquefeatures_stats = CosMx_transcripts_r_uniquefeaturespercell_count['unique_features'].tolist()

print(" ")
res = _report_mannwhitney("Unique Features_All genes",
                          Xenium_uniquefeatures_stats,
                          CosMx_uniquefeatures_stats)

## Overlapping Genes

# Per-cell unique-feature counts for the overlapping-gene tables (plain lists)
Xenium_uniquefeatures_overlappinggenes_stats = Xenium_transcripts_r_uniquefeaturespercell_overlappinggenes_count['unique_features'].tolist()
CosMx_uniquefeatures_overlappinggenes_stats = CosMx_transcripts_r_uniquefeaturespercell_overlappinggenes_count['unique_features'].tolist()

print(" ")
res = _report_mannwhitney("Unique Features_Overlapping genes",
                          Xenium_uniquefeatures_overlappinggenes_stats,
                          CosMx_uniquefeatures_overlappinggenes_stats)

In [None]:
## Box and Whiskers Log scale transcript count plot for all and overlapping genes (with p-values)

# Per-cell transcript counts in plotting order:
# all genes (Xenium, CosMx), then overlapping genes (Xenium, CosMx)
data = [
    Xenium_transcript_counts_per_cell_r['transcript_count'],
    CosMx_transcript_counts_per_cell_r['transcript_count'],
    Xenium_transcript_counts_per_cell_overlappinggenes_r['transcript_count'],
    CosMx_transcript_counts_per_cell_overlappinggenes_r['transcript_count']
]

# Custom labels. The '\n ' suffix makes the first two labels distinct strings
# from the last two, so all four groups get their own box and palette entry.
labels = ['Xenium\n ', 'CosMx\n ', 'Xenium', 'CosMx']

# Build the long-form table (one row per cell) directly. This avoids the wide
# 4 x n_cells intermediate frame, its NaN padding for unequal group sizes,
# and the subsequent transpose + melt.
df_long = pd.concat(
    [
        pd.DataFrame({
            'Technology': label,
            'Number of Transcripts per Cell': np.asarray(values),
        })
        for label, values in zip(labels, data)
    ],
    ignore_index=True,
)

# Now plotting with seaborn
sns.set(style="ticks", rc={"figure.figsize": (6, 6)})

# Create a color palette (one colour per platform, reused for both gene sets)
palette = {
    'Xenium\n ': 'cornflowerblue',
    'CosMx\n ': 'darkorange',
    'Xenium': 'cornflowerblue',
    'CosMx': 'darkorange'
}

# Create the boxplot with seaborn
sns.boxplot(data=df_long, x='Technology', y='Number of Transcripts per Cell', palette=palette, flierprops=dict(marker='o', markersize=4, color='black'))

# Use log scale for the y-axis
plt.yscale('log')

# Extend y-axis to 10000
plt.ylim(1, 10000)

# Manually adjusting the x-axis to add custom labels between certain ticks
# There are 4 main categories plotted at positions 0, 1, 2, 3 on the x-axis
tick_positions = [0, 1, 2, 3]

# Calculate midpoints for text placement (centre of each Xenium/CosMx pair)
midpoint_1 = (tick_positions[0] + tick_positions[1]) / 2
midpoint_2 = (tick_positions[2] + tick_positions[3]) / 2

# Adjust layout to make space for text below the x-axis
plt.subplots_adjust(bottom=0.2)  # Increase bottom margin; adjust as needed

# Get current axes
ax = plt.gca()

# Create a blended transformation
# x in data coords, y in axes fraction coords
transform = mtransforms.blended_transform_factory(ax.transData, ax.transAxes)

# Place custom text below the plot's x-axis using axes coordinates
ax.text(midpoint_1, -0.1, 'All Genes', ha='center', va='top', fontsize=12, transform=transform)
ax.text(midpoint_2, -0.1, 'Overlapping Genes', ha='center', va='top', fontsize=12, transform=transform)

# Add stats: compute each annotation from the plotted data (two-sided
# Mann-Whitney U test) so the label cannot go stale when the inputs change.
for midpoint, xenium_counts, cosmx_counts in (
    (midpoint_1, data[0], data[1]),
    (midpoint_2, data[2], data[3]),
):
    pvalue = stats.mannwhitneyu(xenium_counts, cosmx_counts,
                                use_continuity=True, alternative='two-sided').pvalue
    pvalue_label = 'p<0.001' if pvalue < 0.001 else f'p={pvalue:.3f}'
    ax.text(midpoint, 0.92, pvalue_label, ha='center', va='top', fontsize=12, transform=transform)

plt.xlabel('', fontsize=12)
plt.ylabel('Number of Transcripts per Cell', fontsize=12)
plt.tick_params(axis='x', labelsize=12)
plt.tick_params(axis='y', labelsize=11)
sns.despine(offset=5, trim=True)

# Adjust layout
plt.tight_layout()

# Save plot
#plt.savefig('/path/BoxWhiskers_NumTranscripts_AllGenes_LogScale_WithPvalues_.pdf')

# Show the plot
plt.show()

In [None]:
## Box and Whiskers Log scale unique features plot for all and overlapping genes (with p-values)

# Per-cell unique-feature counts in plotting order:
# all genes (Xenium, CosMx), then overlapping genes (Xenium, CosMx)
data = [
    Xenium_transcripts_r_uniquefeaturespercell_count['unique_features'],
    CosMx_transcripts_r_uniquefeaturespercell_count['unique_features'],
    Xenium_transcripts_r_uniquefeaturespercell_overlappinggenes_count['unique_features'],
    CosMx_transcripts_r_uniquefeaturespercell_overlappinggenes_count['unique_features']
]

# Custom labels. The '\n ' suffix makes the first two labels distinct strings
# from the last two, so all four groups get their own box and palette entry.
labels = ['Xenium\n ', 'CosMx\n ', 'Xenium', 'CosMx']

# Build the long-form table (one row per cell) directly. This avoids the wide
# 4 x n_cells intermediate frame, its NaN padding for unequal group sizes,
# and the subsequent transpose + melt.
df_long = pd.concat(
    [
        pd.DataFrame({
            'Technology': label,
            'Number of Unique Features per Cell': np.asarray(values),
        })
        for label, values in zip(labels, data)
    ],
    ignore_index=True,
)

# Now plotting with seaborn
sns.set(style="ticks", rc={"figure.figsize": (6, 6)})

# Create a color palette (one colour per platform, reused for both gene sets)
palette = {
    'Xenium\n ': 'cornflowerblue',
    'CosMx\n ': 'darkorange',
    'Xenium': 'cornflowerblue',
    'CosMx': 'darkorange'
}

# Create the boxplot with seaborn
sns.boxplot(data=df_long, x='Technology', y='Number of Unique Features per Cell', palette=palette, flierprops=dict(marker='o', markersize=4, color='black'))

# Use log scale for y-axis
plt.yscale('log')

# Extend y-axis to 1000
plt.ylim(1, 1000)

# Manually adjusting the x-axis to add custom labels between certain ticks
# There are 4 main categories plotted at positions 0, 1, 2, 3 on the x-axis
tick_positions = [0, 1, 2, 3]

# Calculate midpoints for text placement (centre of each Xenium/CosMx pair)
midpoint_1 = (tick_positions[0] + tick_positions[1]) / 2
midpoint_2 = (tick_positions[2] + tick_positions[3]) / 2

# Adjust layout to make space for text below the x-axis
plt.subplots_adjust(bottom=0.2)  # Increase bottom margin; adjust as needed

# Get current axes
ax = plt.gca()

# Create a blended transformation
# x in data coords, y in axes fraction coords
transform = mtransforms.blended_transform_factory(ax.transData, ax.transAxes)

# Place custom text below the plot's x-axis using axes coordinates
ax.text(midpoint_1, -0.1, 'All Genes', ha='center', va='top', fontsize=12, transform=transform)
ax.text(midpoint_2, -0.1, 'Overlapping Genes', ha='center', va='top', fontsize=12, transform=transform)

# Add stats: compute each annotation from the plotted data (two-sided
# Mann-Whitney U test) so the label cannot go stale when the inputs change.
for midpoint, xenium_features, cosmx_features in (
    (midpoint_1, data[0], data[1]),
    (midpoint_2, data[2], data[3]),
):
    pvalue = stats.mannwhitneyu(xenium_features, cosmx_features,
                                use_continuity=True, alternative='two-sided').pvalue
    pvalue_label = 'p<0.001' if pvalue < 0.001 else f'p={pvalue:.3f}'
    ax.text(midpoint, 0.92, pvalue_label, ha='center', va='top', fontsize=12, transform=transform)

plt.xlabel('', fontsize=12)
plt.ylabel('Number of Unique Features per Cell', fontsize=12)
plt.tick_params(axis='x', labelsize=12)
plt.tick_params(axis='y', labelsize=11)
sns.despine(offset=5, trim=True)

# Adjust layout
plt.tight_layout()

# Save plot
#plt.savefig('/path/BoxWhiskers_NumUniqueFeatures_AllGenes_LogScale_WithPvalues_.pdf')

# Show the plot
plt.show()

## Cell-Related Plots

In [None]:
# Read the annotated cell-level AnnData objects from disk:
# the Xenium file (named as integrated reps 1 and 2) and the CosMx file

Int_XeniumData = sc.read_h5ad(
    filename='/path/25_11_22_Xenium_Dataset1_290_IntReps1and2_Annotated.h5ad'
)
CosMxData = sc.read_h5ad(
    filename='/path/25_11_22_CosMx_Annotated.h5ad'
)

In [None]:
## Rep1-only copy of the integrated Xenium object: keep cells whose batch label is 'Sept'

Rep1_XeniumData = Int_XeniumData[Int_XeniumData.obs['batch'].eq('Sept')].copy()

# Show the per-cell metadata of the subset
Rep1_XeniumData.obs

#### Cell area

In [None]:
# Per-cell areas (um2) for each platform, taken from the cell metadata
Xenium_cell_area_stats = Rep1_XeniumData.obs['cell_area_um2']
CosMx_cell_area_stats = CosMxData.obs['cell_area_um2']

## Two-sided Mann-Whitney U test (Wilcoxon rank-sum), Xenium vs CosMx
res = stats.mannwhitneyu(
    Xenium_cell_area_stats,
    CosMx_cell_area_stats,
    alternative='two-sided',
    use_continuity=True,
    method='auto',
    axis=0,
)

# Heading, full result object, then a compact statistic / p-value line
print(
    "Cell area",
    res,
    f'Statistic: {res.statistic}, p-value: {res.pvalue:.5f}',
    sep="\n",
)

In [None]:
## Box and Whiskers plot for cell area

# Per-cell areas (um2) in plotting order: Xenium, then CosMx
data = [
    Rep1_XeniumData.obs["cell_area_um2"],
    CosMxData.obs["cell_area_um2"]
]

# Custom labels
labels = ['Xenium', 'CosMx']

# Build the long-form table (one row per cell) directly. This avoids the wide
# 2 x n_cells intermediate frame, its NaN padding for unequal group sizes,
# and the subsequent transpose + melt.
df_long = pd.concat(
    [
        pd.DataFrame({
            'Technology': label,
            'Cell Area (um2)': np.asarray(values),
        })
        for label, values in zip(labels, data)
    ],
    ignore_index=True,
)

# Now plotting with seaborn
sns.set(style="ticks", rc={"figure.figsize": (4.5, 6)})

# Create a color palette
palette = {
    'Xenium': 'cornflowerblue',
    'CosMx': 'darkorange'
}

# Create the boxplot with seaborn
sns.boxplot(data=df_long, x='Technology', y='Cell Area (um2)', palette=palette, flierprops=dict(marker='o', markersize=4, color='black'))

# Extend y-axis to 2000
plt.ylim(0, 2000)

# Specify custom y-axis ticks
plt.yticks([0, 500, 1000, 1500, 2000])

# The 2 categories are plotted at positions 0 and 1 on the x-axis
tick_positions = [0, 1]

# Midpoint between the two boxes, used to centre the p-value text
midpoint_1 = (tick_positions[0] + tick_positions[1]) / 2

# Adjust layout to make space for text below the x-axis
plt.subplots_adjust(bottom=0.2)  # Increase bottom margin; adjust as needed

# Get current axes
ax = plt.gca()

# Create a blended transformation
# x in data coords, y in axes fraction coords
transform = mtransforms.blended_transform_factory(ax.transData, ax.transAxes)

# Add stats: compute the annotation from the plotted data (two-sided
# Mann-Whitney U test) so the label cannot go stale when the inputs change.
pvalue = stats.mannwhitneyu(data[0], data[1],
                            use_continuity=True, alternative='two-sided').pvalue
pvalue_label = 'p<0.001' if pvalue < 0.001 else f'p={pvalue:.3f}'
ax.text(midpoint_1, 0.92, pvalue_label, ha='center', va='top', fontsize=12, transform=transform)

# Customize the plot
plt.ylabel("Cell Area (um2)", fontsize=12)
plt.xlabel("Technology", fontsize=12)
#plt.title("Cell Area (um2)", fontsize=12)
sns.despine(offset=5, trim=True)  # Removes the top and right border spines

plt.tick_params(axis='x', labelsize=12)
plt.tick_params(axis='y', labelsize=11)

# Adjust the plot
plt.tight_layout()

# Save plot
#plt.savefig('/path/BoxWhiskers_CellArea.pdf')

# Show the figure
plt.show()

In [None]:
# Summary statistics of per-cell area (um2) for each platform
print("Cell area \n")

for section_index, (platform, cell_areas) in enumerate([
    ("Xenium", Rep1_XeniumData.obs['cell_area_um2']),
    ("CosMx", CosMxData.obs['cell_area_um2']),
]):
    # A single-space line separates the platforms
    if section_index > 0:
        print(" ")
    print(platform)
    print(f"Mean: {cell_areas.mean()}")
    print(f"Median: {cell_areas.median()}")
    print(f"Q1: {cell_areas.quantile(0.25)}")
    print(f"Q3: {cell_areas.quantile(0.75)}")
    print(f"Maximum: {cell_areas.max()}")

#### Transcript cellular location

In [None]:
## Stacked barplot: percent of transcripts that are nuclear vs non-nuclear, per platform

## Note: CosMx is further broken down by Membrane and Cytoplasm, which isn't being specified here

# One bar per platform
categories = ['Xenium', 'CosMx']

# Percentages in the same order as `categories` (plain lists);
# the CosMx non-nuclear share is the sum of its membrane and cytoplasm percentages
Nuclear = [Xenium_r_overlaps_nucleus_1yes, CosMx_r_pct_nuc]
NonNuclear = [Xenium_r_overlaps_nucleus_0no, (CosMx_r_pct_mem + CosMx_r_pct_cyt)]

# Bar width and x positions (0, 1, ...)
bar_width = 0.35
bar_positions = np.arange(len(categories))

# Compact 5 x 6 inch figure
fig, ax = plt.subplots(figsize=(5, 6))

# Nuclear segment at the bottom, non-nuclear segment stacked on top of it
p1 = ax.bar(bar_positions, Nuclear, bar_width, label='Nuclear', color='darkviolet')
p2 = ax.bar(bar_positions, NonNuclear, bar_width, bottom=Nuclear, label='Non-Nuclear', color='forestgreen')

# Write the rounded percentage at the centre of every bar segment
for bar in ax.patches:
    segment_height = bar.get_height()
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        bar.get_y() + segment_height / 2,
        f'{round(segment_height)}%',
        ha='center',
        color='white',
        size=13,
        fontweight='bold',
    )

# Legend for the two segment types
ax.legend(fontsize=11)

# y-axis label (no title)
ax.set_ylabel('Percent of Transcripts in Dataset', fontsize=12)
# ax.set_title('Transcript Cellular Composition', fontsize=11)

# y tick label size
ax.tick_params(axis='y', labelsize=11)

# Despine
sns.despine(offset=5, trim=True)  # Removes the top and right border spines

# Adjust layout
plt.tight_layout()

# Platform names as x tick labels (set after despine and tight_layout)
plt.xticks(ticks=bar_positions, labels=categories, fontsize=12)

# Save plot
#plt.savefig('/path/StackedBarPlot_CellComp_Updated_.pdf')

# Show the plot
plt.show()

#### Negative probes per cell

In [None]:
# Summary statistics of negative/control probe counts per cell for each platform
print("Negative probes per cell \n")

for section_index, (platform, probe_counts) in enumerate([
    ("Xenium", Rep1_XeniumData.obs['control_probe_counts']),
    ("CosMx", CosMxData.obs['nCount_negprobes']),
]):
    # A single-space line separates the platforms
    if section_index > 0:
        print(" ")
    print(platform)
    print(f"Mean: {probe_counts.mean()}")
    print(f"Median: {probe_counts.median()}")
    print(f"Q1: {probe_counts.quantile(0.25)}")
    print(f"Q3: {probe_counts.quantile(0.75)}")
    print(f"Maximum: {probe_counts.max()}")

In [None]:
# CosMx cell metadata ordered from the highest to the lowest negative-probe count
sorted_df = CosMxData.obs.sort_values('nCount_negprobes', ascending=False)

display(sorted_df)

In [None]:
# Per-cell negative/control probe counts for each platform, taken from the cell metadata
Xenium_neg_probes_stats = Rep1_XeniumData.obs['control_probe_counts']
CosMx_neg_probes_stats = CosMxData.obs['nCount_negprobes']

## Run two-sided Mann-Whitney U test (Wilcoxon rank-sum), Xenium vs CosMx
res = stats.mannwhitneyu(Xenium_neg_probes_stats,
                   CosMx_neg_probes_stats,
                   use_continuity=True,
                   alternative='two-sided', axis=0, method='auto')

# The heading names the metric actually tested here (it previously read "Cell area",
# a leftover from the cell-area test).
print("Negative probes per cell")
print(res)
# Scientific notation: a fixed-point format such as .50f prints only zeros
# for any p-value below 1e-50.
print(f'Statistic: {res.statistic}, p-value: {res.pvalue:.3e}')

In [None]:
## Box and Whiskers plot for negative probes per cell

# Per-cell negative/control probe counts in plotting order: Xenium, then CosMx
data = [
    Rep1_XeniumData.obs["control_probe_counts"],
    CosMxData.obs["nCount_negprobes"]
]

# Custom labels
labels = ['Xenium', 'CosMx']

# Build the long-form table (one row per cell) directly. This avoids the wide
# 2 x n_cells intermediate frame, its NaN padding for unequal group sizes,
# and the subsequent transpose + melt.
df_long = pd.concat(
    [
        pd.DataFrame({
            'Technology': label,
            'Number of Negative Probes per Cell': np.asarray(values),
        })
        for label, values in zip(labels, data)
    ],
    ignore_index=True,
)

# Now plotting with seaborn
sns.set(style="ticks", rc={"figure.figsize": (4.5, 6)})

# Create a color palette
palette = {
    'Xenium': 'cornflowerblue',
    'CosMx': 'darkorange'
}

# Create the boxplot with seaborn
sns.boxplot(data=df_long, x='Technology', y='Number of Negative Probes per Cell', palette=palette, flierprops=dict(marker='o', markersize=4, color='black'))

# Extend y-axis to 25
plt.ylim(0, 25)

# Specify custom y-axis ticks
plt.yticks([0, 5, 10, 15, 20, 25])

# Manually set the y-tick labels to only show particular values
#plt.gca().set_yticklabels(['0', '', '10', '', '20', ''])

# The 2 categories are plotted at positions 0 and 1 on the x-axis
tick_positions = [0, 1]

# Midpoint between the two boxes, used to centre the p-value text
midpoint_1 = (tick_positions[0] + tick_positions[1]) / 2

# Adjust layout to make space for text below the x-axis
plt.subplots_adjust(bottom=0.2)  # Increase bottom margin; adjust as needed

# Get current axes
ax = plt.gca()

# Create a blended transformation
# x in data coords, y in axes fraction coords
transform = mtransforms.blended_transform_factory(ax.transData, ax.transAxes)

# Add stats: compute the annotation from the plotted data (two-sided
# Mann-Whitney U test) so the label cannot go stale when the inputs change.
pvalue = stats.mannwhitneyu(data[0], data[1],
                            use_continuity=True, alternative='two-sided').pvalue
pvalue_label = 'p<0.001' if pvalue < 0.001 else f'p={pvalue:.3f}'
ax.text(midpoint_1, 0.92, pvalue_label, ha='center', va='top', fontsize=12, transform=transform)

# Customize the plot
plt.ylabel("Number of Negative Probes per Cell", fontsize=12)
plt.xlabel("Technology", fontsize=12)
sns.despine(offset=5, trim=True)  # Removes the top and right border spines

plt.tick_params(axis='x', labelsize=12)
plt.tick_params(axis='y', labelsize=11)

# Adjust the plot
plt.tight_layout()

# Save plot
#plt.savefig('/path/BoxWhiskers_NegProbesPerCell.pdf')

# Show the figure
plt.show()