# Genome-Transcriptome Integration using SIMBA

In [None]:
import os
import simba as si
from scipy import sparse
import pandas as pd
import seaborn as sns
print(si.__version__)
print(sns.__version__)

In [None]:
si.settings.set_figure_params(dpi=80,
                              style='white',
                              fig_size=[5,5],
                              rc={'image.cmap': 'viridis'})

# make plots prettier
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('retina')

In [None]:
project_name = "07-03-2024" 
workdir = f'/mnt/d/JorritvU/SIMBA/full_integration/{project_name}/unfiltered'
si.settings.set_workdir(workdir)

In [None]:
# Annotation files with Omic-specific cluster labels
seurat_clusters = "/mnt/d/JorritvU/Tripolar/scRNA-seq/s143/processed/Seurat/s143_Seuratdata.csv"
chaotic_sample_names_file = "/mnt/d/JorritvU/Tripolar/scDNA-seq/chaotic_cnv_patterns.lst" 

# File with barcode - well combination
well_barcode_file = "/mnt/d/JorritvU/Tripolar/scRNA-seq/SORTseq_cellbarcodes.tsv" 


In [None]:
# Read the RNA plates

adata_s143 = si.read_h5ad('/mnt/d/JorritvU/Tripolar/scRNA-seq/s143/old/SNV/s143.germline.updated.h5ad')
adata_s145 = si.read_h5ad('/mnt/d/JorritvU/Tripolar/scRNA-seq/s145/old/SNV/s145.germline.updated.h5ad')

In [None]:
# Read the DNA plates

adata_chi006 = si.read_h5ad('/mnt/d/JorritvU/Tripolar/scDNA-seq/CHI-006/processed/SNV/CHI-006.germline_v2.h5ad')
adata_chi007 = si.read_h5ad('/mnt/d/JorritvU/Tripolar/scDNA-seq/CHI-007/processed/SNV/CHI-007.germline_v2.h5ad')

In [None]:
# Attach Seurat clusters to RNA cells
import re

def create_well_annotations(seurat_clusters_mapping, well_barcode_file, adata):
    """Attach Seurat cluster labels to the cells of ``adata`` via their barcodes.

    The well/barcode file is read as tab-separated lines with three fields
    (well, an ignored middle field, barcode); blank lines are skipped. The
    Seurat barcode is the part of the ``barcode`` column between the first and
    second underscore (or after the first underscore when there is only one).
    A cell's barcode is the 8-letter A/C/G/T token enclosed by underscores in
    its obs name. Cells without such a token, or whose barcode has no cluster,
    receive NaN.

    NOTE(review): the lookup is keyed on the barcode alone, so cells from
    different plates that share a barcode receive the same cluster label.
    Confirm this is intended when one Seurat table is applied to several plates.

    Parameters
    ----------
    seurat_clusters_mapping : pandas.DataFrame
        Table with the columns ``barcode`` and ``type_scluster``. It is read
        only; no column is added to the caller's frame.
    well_barcode_file : str
        Path to the tab-separated well/barcode file.
    adata : AnnData-like
        Object exposing ``obs_names`` and ``obs``; ``obs['cluster']`` is
        written in place.

    Returns
    -------
    The ``adata`` object that was passed in.

    Raises
    ------
    ValueError
        If a non-blank line of the barcode file does not have three fields.
    """
    # Barcode -> well lookup from the plate layout file
    barcode_to_well = {}
    with open(well_barcode_file, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            well, _, barcode = line.split('\t')
            barcode_to_well[barcode] = well

    barcode_df = pd.DataFrame(list(barcode_to_well.items()), columns=['Barcode', 'Well'])

    # Barcode -> Seurat cluster table, built without touching the input frame
    cluster_df = pd.DataFrame({
        'Barcode': seurat_clusters_mapping['barcode'].str.split('_', expand=False).str[1],
        'type_scluster': seurat_clusters_mapping['type_scluster'],
    })

    # Keep only barcodes that are present in the plate layout
    combined_df = pd.merge(cluster_df, barcode_df, on='Barcode')
    print(combined_df)

    # Extract the barcode token from every obs name (None when absent)
    barcode_pattern = re.compile(r'_([AGCT]{8})_')
    barcode_info = []
    for name in adata.obs_names:
        match = barcode_pattern.search(name)
        barcode_info.append(match.group(1) if match else None)

    adata_obs_df = pd.DataFrame({'obs_names': adata.obs_names, 'Barcode': barcode_info})

    barcode_to_cluster = combined_df.set_index('Barcode')['type_scluster'].to_dict()
    bc_map = adata_obs_df['Barcode'].map(barcode_to_cluster)
    print(bc_map)
    # Assign positionally: bc_map follows the order of adata.obs_names
    adata.obs['cluster'] = list(bc_map)

    return adata


seurat_clusters_mapping = pd.read_csv(seurat_clusters, header=0)
adata_s143 = create_well_annotations(seurat_clusters_mapping, well_barcode_file, adata_s143)

adata_s145 = create_well_annotations(seurat_clusters_mapping, well_barcode_file, adata_s145)

In [None]:
# Collect the names of the samples listed as having a chaotic CNV pattern.
# Each line contributes the text before its first '.', and any Unicode minus
# sign is replaced by an ASCII hyphen so names compare equal to obs names.
with open(chaotic_sample_names_file) as handle:
    sample_names = {
        raw_line.strip().split('.')[0].replace('−', '-')
        for raw_line in handle
    }

In [None]:
# Assign the CNV profile type (chaotic or normal) to the DNA cells.

# Label a cell "chaotic_cnv" when its obs_name is in sample_names, otherwise "normal_cnv"
adata_chi006.obs['cluster'] = ["chaotic_cnv" if name in sample_names else "normal_cnv" for name in adata_chi006.obs_names]

# Cast the label column to str
adata_chi006.obs['cluster'] = adata_chi006.obs['cluster'].astype(str)

# Now the same for CHI-007
adata_chi007.obs['cluster'] = ["chaotic_cnv" if name in sample_names else "normal_cnv" for name in adata_chi007.obs_names]
adata_chi007.obs['cluster'] = adata_chi007.obs['cluster'].astype(str)

In [None]:
adata_chi007.obs

Next we flag SNVs by Allele Frequency (AF): the `pass` column is `True` for SNVs with AF > 0.1. <br/>
The 0.1 cutoff is an arbitrary choice for now.

In [None]:
adata_s143.var['pass'] = adata_s143.var['AF'] > 0.1

In [None]:
adata_s145.var['pass'] = adata_s145.var['AF'] > 0.1

In [None]:
adata_chi006.var['pass'] = adata_chi006.var['AF'] > 0.1

In [None]:
adata_chi007.var['pass'] = adata_chi007.var['AF'] > 0.1

## Merge the RNA runs into 1, and merge the DNA runs into 1

In [None]:
import anndata as ad

def merge_datasets(adata1, adata2):
    """Concatenate two AnnData objects over their shared variables.

    Both inputs are subset to the variable names they have in common and then
    concatenated with ``ad.concat(..., merge='first', join='inner')``. The
    shared names are kept in the order they appear in ``adata1`` so the result
    has the same column order on every run (a plain ``set`` intersection
    yields an order that depends on string hashing).

    NOTE(review): with ``merge='first'`` the var annotations of the result are
    presumably taken from the first object that has a value. Confirm that
    columns such as ``AF`` and ``pass`` are meant to come from ``adata1``.

    Parameters
    ----------
    adata1, adata2 : AnnData
        Datasets to merge; neither is modified.

    Returns
    -------
    AnnData
        The concatenated object restricted to the shared variables.
    """
    names_in_second = set(adata2.var_names)
    # dict.fromkeys keeps first-seen order while dropping repeated names
    common_vars = list(dict.fromkeys(
        name for name in adata1.var_names if name in names_in_second
    ))
    print(f"Number of intersecting SNVs: {len(common_vars)}")
    adata = ad.concat(
        [adata1[:, common_vars], adata2[:, common_vars]],
        merge='first',
        join='inner',
    )
    return adata

In [None]:
adata_dna = merge_datasets(adata_chi006, adata_chi007)  
print(adata_dna)
adata_rna = merge_datasets(adata_s143, adata_s145)  
print(adata_rna)

# DNA and RNA

In [None]:
"""
Drop cells whose obs name contains 'nan' or 'Control' and keep only SNVs with AF > 0.05.
If other sample types should be excluded, change the pattern here.
"""

# Merged datasets keyed by modality; the filtered copies are added under '<modality>_filtered'
data = {'rna': adata_rna, 'dna': adata_dna}

for k in ['rna', 'dna']:
    # Rows: obs names not matching 'nan|Control'; columns: var['AF'] > 0.05. The subset is copied.
    data[f"{k}_filtered"] = data[k][~data[k].obs_names.str.contains('nan|Control', na=False), data[k].var['AF'] > 0.05].copy()
    print(f"{k}_filtered: {data[f'{k}_filtered'].shape}")

data

In [None]:
filtered_datasets = [d for d in data.keys() if 'filtered' in d]
filtered_datasets

## SIMBA
Infer edges between cells

In [None]:
"""
Infer edges between every pair of datasets listed in 'filtered_datasets'.
With two datasets there is a single pair:
rna_filtered - dna_filtered
"""

import itertools
# Maps 'CC-<dataset1>-<dataset2>' to the object returned by si.tl.infer_edges
CC_embeddings = {}

# Iterate over all pairs of datasets
for dataset1, dataset2 in itertools.combinations(filtered_datasets, 2):
    print(f"Inferring edges between 1) {dataset1}, and 2) {dataset2}")

    print(data[dataset1].var_names)
    print(data[dataset2].var_names)
    
    # Infer edges for this pair, using the boolean var column 'pass' as the feature selector
    CC_embeddings[f'CC-{dataset1}-{dataset2}'] = si.tl.infer_edges(data[dataset1], data[dataset2], feature='pass', n_components=10, k=10)  # n_components & k need to be optimized

CC_embeddings

In [None]:
'''
Prune both modalities to the SNVs they share (the DNA set is too big otherwise).
Then create a new dictionary that includes the filtered datasets + the cell-cell embedding.
'''

all_datasets = {}

# filtered_datasets[0] = RNA, filtered_datasets[1] = DNA
rna_key, dna_key = filtered_datasets[0], filtered_datasets[1]

# Shared SNVs in RNA order: a set intersection would give a column order that
# changes between interpreter runs, which makes the graph non-reproducible.
dna_var_names = set(data[dna_key].var_names)
common_vars = [name for name in data[rna_key].var_names if name in dna_var_names]

data[rna_key] = data[rna_key][:, common_vars]
data[dna_key] = data[dna_key][:, common_vars]


for k in filtered_datasets:
    print(k)
    all_datasets[k] = data[k].copy()

# Add each cell-cell edge object under its own key
for key, value in CC_embeddings.items():
    all_datasets[key] = value

all_datasets

In [None]:
'''
Sanity check: every dataset must have unique, non-null obs and var names.
'''

for dataset_name, dataset in all_datasets.items():
    print('Checking', dataset_name)
    # Each assert stops the cell at the first dataset that violates the condition
    assert dataset.var_names.is_unique, f"var_names in {dataset_name} are not unique."
    assert dataset.obs_names.is_unique, f"obs_names in {dataset_name} are not unique."
    assert dataset.var_names.notnull().all(), f"var_names in {dataset_name} contain null values."
    assert dataset.obs_names.notnull().all(), f"obs_names in {dataset_name} contain null values."

## Generate the graph and train the NN

In [None]:
si.tl.gen_graph(list_adata=[x.copy() for x in all_datasets.values()], copy=False, layer=None, dirname="graph")

In [None]:
si.settings.pbg_params

In [None]:
# Start from a copy of SIMBA's PBG parameters and override the worker count
dict_config = si.settings.pbg_params.copy()
dict_config['workers'] = 12

# MKL_THREADING_LAYER defaults to INTEL.
# NumPy then tries to use Intel's OpenMP implementation while PyTorch is linked against GNU OpenMP,
# and that mismatch appears to trigger an error during training.
# Setting the environment variable MKL_THREADING_LAYER to GNU avoids it.
os.environ["MKL_THREADING_LAYER"]="GNU"

## start training
si.tl.pbg_train(pbg_params = dict_config, auto_wd=True, save_wd=True, output='model', use_edge_weights=True)

In [None]:
si.pl.pbg_metrics(fig_ncol=3)

## Show results

In [None]:
'''
Read the trained embeddings and give each entity type its own variable.
'''

# NOTE(review): the E0/E1/E2 keys are assumed to map to RNA cells / SNVs / DNA cells;
# confirm against dict_adata.keys() and the entity order used in gen_graph.
dict_adata = si.read_embedding()
adata_C = dict_adata['E0']  # embeddings of cells from RNA
adata_C2 = dict_adata['E2']  # embeddings for cells from DNA
adata_S = dict_adata['E1']  # embeddings for SNVs

In [None]:
data['rna_filtered'].obs

In [None]:
# Copy phenotype / cluster / batch from the filtered datasets onto the cell
# embeddings (rows matched by obs name), then compute a 2-D UMAP for each.
obs_columns = (('phenotype', 'Phenotype'), ('cluster', 'cluster'), ('batch', 'Batch'))

for embedding, source_key in ((adata_C, 'rna_filtered'), (adata_C2, 'dna_filtered')):
    source_obs = data[source_key][embedding.obs_names, :].obs
    for target_column, source_column in obs_columns:
        embedding.obs[target_column] = source_obs[source_column].copy()
    si.tl.umap(embedding, n_neighbors=15, n_components=2)

In [None]:
'''
Embed the DNA cells (query) into the space of the RNA cells (reference).
'''

adata_all = si.tl.embed(adata_ref=adata_C,list_adata_query=[adata_C2])

## label every row with the modality it came from
adata_all.obs['entity_group'] = ""
adata_all.obs.loc[adata_C.obs_names, 'entity_group'] = "rna"
adata_all.obs.loc[adata_C2.obs_names, 'entity_group'] = "dna"

# Add entity group to the SNVs if the SNV embedding is included in the query.
# adata_all.obs.loc[adata_S.obs_names, 'entity_group'] = "SNV"

In [None]:
import math

# Replace missing cluster labels (float NaN) by the string 'Unclustered'
clusters = [
    'Unclustered' if isinstance(label, float) and math.isnan(label) else label
    for label in adata_all.obs['cluster']
]

adata_all.obs['cluster'] = clusters
adata_all.obs['cluster']

In [None]:
adata_all.X[1,:]  ## This is what 1 embedding looks like

In [None]:
# UMAP of the combined RNA + DNA embedding
si.tl.umap(adata_all,n_neighbors=20,n_components=2)  # n_components & n_neighbors need to be optimized

"""
    
UMAPs to save to file
    
"""
# RNA cells only
si.pl.umap(adata_C,
               color=['phenotype', 'batch'],
               fig_size=(5.5, 4),
               drawing_order='random',
               save_fig=True, fig_path=f"{workdir}", fig_name=f"UMAP_RNA.pdf")
    
# DNA cells only
si.pl.umap(adata_C2,
               color=['phenotype', 'batch'],
               fig_size=(5.5, 4),
               drawing_order='random',
               save_fig=True, fig_path=f"{workdir}", fig_name=f"UMAP_DNA.pdf")
    
# RNA and DNA cells together
si.pl.umap(adata_all,color=['entity_group', 'batch','phenotype','cluster'],
               drawing_order='random',
               fig_size=(5.5,4),
               save_fig=True, fig_path=f"{workdir}", fig_name=f"UMAP_integrated.pdf")
    


# The same three plots again without save_fig=True
# (presumably so that they are shown in the cell output; TODO confirm)
si.pl.umap(adata_C,
           color=['phenotype', 'batch'],
           fig_size=(5.5, 4),
           drawing_order='random')

si.pl.umap(adata_C2,
           color=['phenotype', 'batch'],
           fig_size=(5.5, 4),
           drawing_order='random')

# save_fig=False here, so fig_path / fig_name are presumably not used
si.pl.umap(adata_all,color=['entity_group', 'batch','phenotype','cluster'],
           drawing_order='random',
           fig_size=(5.5,4),
           save_fig=False, fig_path=f"{workdir}", fig_name="UMAP_noControl.pdf")

# SIMBA integration with filtered SNVs based on proportions

In [None]:

workdir = f'/mnt/d/JorritvU/SIMBA/full_integration/{project_name}/filtered'
si.settings.set_workdir(workdir)

In [None]:
"""
Function to add proportions per SNV
"""

import numpy as np

def calc_props(adata):
    """Annotate ``adata`` with the proportions of genotype codes 1, 2 and 3.

    For every cell (row of ``adata.X``) and every SNV (column), the entries
    equal to 1, 2 and 3 are counted and divided by the total number of such
    entries in that row or column. Other values (e.g. 0) are ignored. Rows or
    columns without any 1/2/3 entry get NaN proportions.

    The results are written to ``adata.obs`` (per cell) and ``adata.var``
    (per SNV) under the columns ``variant_1_proportion_0/0``,
    ``variant_2_proportion_0/1`` and ``variant_3_proportion_1/1``.
    NOTE(review): the column names suggest 1 = 0/0, 2 = 0/1, 3 = 1/1;
    confirm against how the matrices were encoded.

    Parameters
    ----------
    adata : AnnData-like
        Object with a sparse or dense ``X`` and ``obs`` / ``var`` frames.
        Modified in place.

    Returns
    -------
    The ``adata`` object that was passed in.
    """
    X = adata.X
    # Accept both sparse matrices and plain arrays
    X_dense = X.toarray() if hasattr(X, "toarray") else np.asarray(X)

    codes = (1, 2, 3)
    columns = [
        'variant_1_proportion_0/0',
        'variant_2_proportion_0/1',
        'variant_3_proportion_1/1',
    ]

    # counts[i, j] = number of entries equal to codes[j] in row (cell) / column (SNV) i.
    # Equality tests work for integer and float matrices alike.
    cell_counts = np.stack([(X_dense == code).sum(axis=1) for code in codes], axis=1)
    snv_counts = np.stack([(X_dense == code).sum(axis=0) for code in codes], axis=1)

    # 0/0 yields NaN for rows/columns without calls; silence the warning only
    with np.errstate(divide='ignore', invalid='ignore'):
        cell_proportions = cell_counts / cell_counts.sum(axis=1, keepdims=True)
        snv_proportions = snv_counts / snv_counts.sum(axis=1, keepdims=True)

    # Write to the object that was passed in, not to a notebook global
    for position, column in enumerate(columns):
        adata.obs[column] = cell_proportions[:, position]
        adata.var[column] = snv_proportions[:, position]

    return adata

"""
Function to filter the data
"""

def _proportion_ratio(numerator, denominator):
    """Return numerator / denominator, giving inf or NaN (without a warning) when the denominator is 0."""
    if denominator == 0:
        return float('inf') if numerator > 0 else float('nan')
    return numerator / denominator


def compare_proportions(adata, adata1, var_name, t=0.01, window=0.2, debug=False):
    """Decide whether one SNV has similar genotype proportions in two datasets.

    The three proportion columns of ``var_name`` are read from ``adata.var``
    (p1) and ``adata1.var`` (p2) and the ratios p2 / p1 are formed.

    The SNV is rejected when
    * either dataset has a single genotype with proportion >= 0.999, or
    * for some genotype the ratio lies outside ``[1 - window, 1 + window]``
      while both proportions exceed ``t``.

    Parameters
    ----------
    adata, adata1 : AnnData-like
        Objects whose ``var`` holds the ``variant_*_proportion_*`` columns.
    var_name : str
        Name of the SNV (index label of ``var``).
    t : float
        Minimum proportion in both datasets for a ratio violation to count.
    window : float
        Allowed relative deviation of the ratio from 1.
    debug : bool
        Print the intermediate values and the reason for a rejection.

    Returns
    -------
    dict or None
        ``{var_name: [ratio_0/0, ratio_0/1, ratio_1/1]}`` when the SNV passes,
        otherwise ``None``.
    """
    if debug:
        print("\nChecking", var_name)
    variants = ['variant_1_proportion_0/0', 'variant_2_proportion_0/1', 'variant_3_proportion_1/1']
    p1 = [adata.var[k][var_name] for k in variants]
    p2 = [adata1.var[k][var_name] for k in variants]
    ratios = [_proportion_ratio(second, first) for first, second in zip(p1, p2)]

    if debug:
        print(f"p1: {p1}")
        print(f"p2: {p2}")
        print(f"ratio: {ratios}")

    if max(p1) >= 0.999 or max(p2) >= 0.999:
        if debug:
            print(f"BAD: {var_name} solely 1 variant")
        return None

    for i, r in enumerate(ratios):
        # NaN ratios compare False on both sides and therefore never reject
        outside_window = r < 1-window or r > 1+window
        if outside_window and p1[i] > t and p2[i] > t:
            if debug:
                print("BAD RATIO")
            return None

    return {var_name: ratios}

def strict_filtering(adata, adata1, var_name, t=0.01, window=0.05, debug=False):
    """Check that one SNV has near-identical genotype proportions in two datasets.

    For each of the three proportion columns, the value in ``adata.var``
    (called RNA in the debug output) must lie strictly within ``window`` of
    the value in ``adata1.var`` (called DNA).

    Parameters
    ----------
    adata, adata1 : AnnData-like
        Objects whose ``var`` holds the ``variant_*_proportion_*`` columns.
    var_name : str
        Name of the SNV (index label of ``var``).
    t : float
        Not used by this function; kept so the signature matches
        ``compare_proportions``.
    window : float
        Maximum absolute difference allowed per genotype.
    debug : bool
        Print progress messages.

    Returns
    -------
    int or None
        ``1`` when all three proportions agree, otherwise ``None``.
    """
    if debug:
        print("\nChecking", var_name)
    variants = ['variant_1_proportion_0/0', 'variant_2_proportion_0/1', 'variant_3_proportion_1/1']
    p1 = [adata.var[k][var_name] for k in variants]
    p2 = [adata1.var[k][var_name] for k in variants]

    for rna_p, dna_p in zip(p1, p2):
        # NaN proportions fail this comparison and therefore reject the SNV
        if not (dna_p - window < rna_p < dna_p + window):
            return None
        if debug:
            print(f"{dna_p} is very close to being equal to {rna_p}")

    if debug:
        print("All three are good")

    return 1

In [None]:
for key in filtered_datasets:
    data[key] = calc_props(data[key])

In [None]:
"""
STRICT filtering, to make the profiles across modalities more similar
"""

# snvs = list(data['rna_filtered'].var_names)

# len_before = len(snvs)
# good_snvs = []

# for s in snvs:
#     keep = strict_filtering(data['rna_filtered'], data['dna_filtered'], s, window=0.03, debug=False)
#     if keep:
#         good_snvs.append(s)

# len_after = len(good_snvs)

# print(f"No. SNVs before: {len_before}")
# print(f"No. SNVs after: {len_after}")
# print(f"Percentage thrown out: {100-round(len_after/len_before*100, 2)}%")

# ## Filter the datasets based on these SNVs

# # Filtered datasets 0 = RNA
# data[filtered_datasets[0]] = data[filtered_datasets[0]][:, list(good_snvs)]

# # Filtered datasets 1 = DNA
# data[filtered_datasets[1]] = data[filtered_datasets[1]][:, list(good_snvs)]


In [None]:
"""

Lenient filtering with ratios and a bigger window

"""

snvs = list(data['rna_filtered'].var_names)


len_before = len(snvs)

# compare_proportions returns a non-empty dict for SNVs that pass and None otherwise
good_snvs = [
    s for s in snvs
    if compare_proportions(data['rna_filtered'], data['dna_filtered'], s, window=0.3, t=0.1, debug=False)
]

len_after = len(good_snvs)

print(f"No. SNVs before: {len_before}")
print(f"No. SNVs after: {len_after}")
if len_before:
    # Round after subtracting so the output has no floating-point tail
    print(f"Percentage thrown out: {round(100 - len_after/len_before*100, 2)}%")
else:
    print("Percentage thrown out: n/a (no SNVs to filter)")

## Filter the datasets based on these SNVs

# Filtered datasets 0 = RNA
data[filtered_datasets[0]] = data[filtered_datasets[0]][:, good_snvs]

# Filtered datasets 1 = DNA
data[filtered_datasets[1]] = data[filtered_datasets[1]][:, good_snvs]


In [None]:
print("RNA")
data[filtered_datasets[0]].var

In [None]:
print("DNA")
data[filtered_datasets[1]].var

## SIMBA: infer edges

In [None]:
"""
Infer edges between every pair of datasets listed in 'filtered_datasets'.
With two datasets there is a single pair:
rna_filtered - dna_filtered
"""

import itertools
# Maps 'CC-<dataset1>-<dataset2>' to the object returned by si.tl.infer_edges
CC_embeddings = {}

# Iterate over all pairs of datasets
for dataset1, dataset2 in itertools.combinations(filtered_datasets, 2):
    print(f"Inferring edges between 1) {dataset1}, and 2) {dataset2}")

    print(data[dataset1].var_names)
    print(data[dataset2].var_names)
    
    # Infer edges for this pair, using the boolean var column 'pass' as the feature selector
    CC_embeddings[f'CC-{dataset1}-{dataset2}'] = si.tl.infer_edges(data[dataset1], data[dataset2], feature='pass', n_components=10, k=10)  # n_components & k need to be optimized

CC_embeddings

In [None]:
# Collect the filtered datasets first, then each cell-cell edge object under its own key
all_datasets = {name: data[name] for name in filtered_datasets}
all_datasets.update(CC_embeddings)

all_datasets

In [None]:
'''
Sanity check: every dataset must have unique, non-null obs and var names.
'''

for dataset_name, dataset in all_datasets.items():
    print('Checking', dataset_name)
    # Each assert stops the cell at the first dataset that violates the condition
    assert dataset.var_names.is_unique, f"var_names in {dataset_name} are not unique."
    assert dataset.obs_names.is_unique, f"obs_names in {dataset_name} are not unique."
    assert dataset.var_names.notnull().all(), f"var_names in {dataset_name} contain null values."
    assert dataset.obs_names.notnull().all(), f"obs_names in {dataset_name} contain null values."

In [None]:
workdir

In [None]:
si.tl.gen_graph(list_adata=[x.copy() for x in all_datasets.values()], copy=False, layer=None, dirname="graph2")

In [None]:
# Start from a copy of SIMBA's PBG parameters and override the worker count
dict_config = si.settings.pbg_params.copy()
dict_config['workers'] = 12



# MKL_THREADING_LAYER defaults to INTEL.
# NumPy then tries to use Intel's OpenMP implementation while PyTorch is linked against GNU OpenMP,
# and that mismatch appears to trigger an error during training.
# Setting the environment variable MKL_THREADING_LAYER to GNU avoids it.
os.environ["MKL_THREADING_LAYER"]="GNU"

## start training (output goes to 'model2' so the first run's 'model' is kept)
si.tl.pbg_train(pbg_params = dict_config, auto_wd=True, save_wd=True, output='model2', use_edge_weights=True)

In [None]:
si.pl.pbg_metrics(fig_ncol=3)

## Show results

In [None]:
'''
Read the trained embeddings and give each entity type its own variable.
'''

# NOTE(review): the E0/E1/E2 keys are assumed to map to RNA cells / SNVs / DNA cells;
# confirm against dict_adata.keys() and the entity order used in gen_graph.
dict_adata = si.read_embedding()
adata_C = dict_adata['E0']  # embeddings of cells from RNA
adata_C2 = dict_adata['E2']  # embeddings for cells from DNA
adata_S = dict_adata['E1']  # embeddings for SNVs

In [None]:
# Copy phenotype / cluster / batch from the filtered datasets onto the cell
# embeddings (rows matched by obs name); SNV embeddings get fixed labels.
obs_columns = (('phenotype', 'Phenotype'), ('cluster', 'cluster'), ('batch', 'Batch'))

rna_obs = data['rna_filtered'][adata_C.obs_names, :].obs
for target_column, source_column in obs_columns:
    adata_C.obs[target_column] = rna_obs[source_column].copy()
si.tl.umap(adata_C, n_neighbors=15, n_components=2)

dna_obs = data['dna_filtered'][adata_C2.obs_names, :].obs
for target_column, source_column in obs_columns:
    adata_C2.obs[target_column] = dna_obs[source_column].copy()

for column, label in (('phenotype', "SNV"), ('cluster', "NA"), ('batch', "SNV")):
    adata_S.obs[column] = label

si.tl.umap(adata_C2, n_neighbors=15, n_components=2)
si.tl.umap(adata_S, n_neighbors=15, n_components=2)

In [None]:
'''
Embed the DNA cells (query) into the space of the RNA cells (reference).
'''
import math

adata_all = si.tl.embed(adata_ref=adata_C,list_adata_query=[adata_C2])

## label every row with the modality it came from
adata_all.obs['entity_group'] = ""
adata_all.obs.loc[adata_C.obs_names, 'entity_group'] = "rna"
adata_all.obs.loc[adata_C2.obs_names, 'entity_group'] = "dna"
# adata_all.obs.loc[adata_S.obs_names, 'entity_group'] = "SNV"
# adata_all.obs.loc[adata_S.obs_names, 'cluster'] = "SNV"
# adata_all.obs.loc[adata_S.obs_names, 'phenotype'] = "SNV"
# adata_all.obs.loc[adata_S.obs_names, 'batch'] = "SNV"

# Replace missing cluster labels (float NaN) by the string 'Unclustered'
clusters = list(adata_all.obs['cluster'])
for i, c in enumerate(clusters):
    if c is not None and isinstance(c, float) and math.isnan(c):
        clusters[i] = 'Unclustered'

adata_all.obs['cluster'] = clusters
adata_all.obs['cluster']

In [None]:
print(workdir)

In [None]:
# Sweep n_neighbors and write one set of UMAP figures per value.
# Every embedding is re-projected with the current n_neighbors, so each saved
# figure matches the parameters encoded in its file name. Previously only
# adata_all was recomputed, so the RNA/DNA/SNV files were identical plots
# saved under different nNB names.
neighbor = [5, 10, 15, 20, 30, 50]
n_components = 2

# (embedding, output sub-directory, label used in the file name, colour columns)
panels = [
    (adata_C, "RNA", "RNA", ['phenotype', 'batch']),
    (adata_C2, "DNA", "DNA", ['phenotype', 'batch']),
    (adata_S, "SNV", "SNVs", None),
    (adata_all, "integrated", "integrated", ['entity_group', 'batch', 'phenotype', 'cluster']),
]

for n in neighbor:
    print(n)
    n_neighbors = n

    for panel_adata, _, _, _ in panels:
        si.tl.umap(panel_adata, n_neighbors=n_neighbors, n_components=n_components)  # n_components & n_neighbors need to be optimized

    # First pass saves the figures to file, second pass shows them in the output cell
    for save_to_file in (True, False):
        for panel_adata, subdir, label, colors in panels:
            plot_kwargs = {'fig_size': (5.5, 4), 'alpha': 0.7, 'drawing_order': 'random'}
            if colors is not None:
                plot_kwargs['color'] = colors
            if save_to_file:
                plot_kwargs.update(
                    save_fig=True,
                    fig_path=f"{workdir}/{subdir}",
                    fig_name=f"UMAP_{label}_param-nNB{n_neighbors}-nC{n_components}.pdf",
                )
            si.pl.umap(panel_adata, **plot_kwargs)

In [None]:
# Embed RNA and DNA cells (queries) into the space of the SNV embedding (reference)
adata_all = si.tl.embed(adata_ref=adata_S,list_adata_query=[adata_C, adata_C2])

## label every row with its entity type; SNV rows get "SNV" in all annotation columns
adata_all.obs['entity_group'] = ""
adata_all.obs.loc[adata_C.obs_names, 'entity_group'] = "rna"
adata_all.obs.loc[adata_C2.obs_names, 'entity_group'] = "dna"
adata_all.obs.loc[adata_S.obs_names, 'entity_group'] = "SNV"
adata_all.obs.loc[adata_S.obs_names, 'cluster'] = "SNV"
adata_all.obs.loc[adata_S.obs_names, 'phenotype'] = "SNV"
adata_all.obs.loc[adata_S.obs_names, 'batch'] = "SNV"

# Replace missing cluster labels (float NaN) by the string 'Unclustered'
clusters = list(adata_all.obs['cluster'])
for i, c in enumerate(clusters):
    if c is not None and isinstance(c, float) and math.isnan(c):
        clusters[i] = 'Unclustered'

adata_all.obs['cluster'] = clusters
adata_all.obs['cluster'] 

In [None]:
# Change CNV clusters to include phenotype:
# DNA cells labelled "normal_cnv" become "normal_cnv_<phenotype in lower case>".
# The update is done with one boolean mask on adata_all.obs, which avoids
# assigning into a slice and skips cells whose phenotype is missing
# (a row-wise .lower() would fail on those).
is_normal_dna = (
    (adata_all.obs["entity_group"] == "dna")
    & (adata_all.obs["cluster"] == "normal_cnv")
    & adata_all.obs["phenotype"].notna()
)
adata_all.obs.loc[is_normal_dna, 'cluster'] = (
    "normal_cnv_" + adata_all.obs.loc[is_normal_dna, 'phenotype'].astype(str).str.lower()
)

adata_all.obs.loc[:, ['cluster', 'phenotype']]


In [None]:
NNeighbors=15
NComp=2

si.tl.umap(adata_all,n_neighbors=NNeighbors,n_components=NComp)  
si.pp.pca(adata_all, n_components=50)

In [None]:
# Fixed colour per category for every annotation column used in the UMAP plots
dict_palette = {
    'entity_group': {
        'dna': '#f46806',
        'rna': '#47aa26',
        'SNV': '#123456',
    },
    'phenotype': {
        'Tripolar': '#f46806',
        'Bipolar': '#47aa26',
        'SNV': '#123456',
    },
    'batch': {
        's143': '#aa2647',
        's145': '#2647aa',
        'CHI-006': '#f46806',
        'CHI-007': '#40e0d0',
        'SNV': '#123456',
    },
    'cluster': {
        'SNV': '#123456',
        'Seurat_0': '#f46806',
        "Seurat_1": "#47aa26",
        "Seurat_2": "#516cbb",
        "chaotic_cnv": "#e2068c",
        "normal_cnv_bipolar": "#aaaaaa",
        "normal_cnv_tripolar": "#10001d",
        "Unclustered": "#eccdb8",
    },
}

# Distinct cluster labels present, to compare against the 'cluster' palette keys
ps = set(adata_all.obs['cluster'])
ps

In [None]:
si.pl.umap(adata_all,color=['cluster', 'phenotype', 'entity_group'],
               dict_palette=dict_palette,
               drawing_order='random',
               alpha=0.7, 
               fig_size=(5.5,4))

In [None]:
si.pl.umap(adata_all,color=['batch', 'cluster', 'phenotype'],
               drawing_order='random',
               dict_palette=dict_palette,
               alpha=0.9, 
               fig_size=(7,7),
          save_fig=True, fig_path=f"{workdir}/SNV_projected", fig_name=f"umap_cluster_with_SNVs.pdf")

In [None]:
# Keep rows whose id_dataset is not "ref", then drop the rows labelled "Unclustered"
not_reference = (adata_all.obs["id_dataset"] != "ref").to_numpy()
adata_all = adata_all[not_reference]

has_cluster = (adata_all.obs["cluster"] != "Unclustered").to_numpy()
adata_all = adata_all[has_cluster]

In [None]:
si.pl.umap(adata_all,color=['entity_group', 'batch','phenotype','cluster'],
               drawing_order='random',
               dict_palette=dict_palette,
               alpha=0.7, 
               fig_size=(5.5,4))

In [None]:
# Split the remaining cells by phenotype
is_bipolar = (adata_all.obs["phenotype"] == "Bipolar").to_numpy()
is_tripolar = (adata_all.obs["phenotype"] == "Tripolar").to_numpy()
adata_all_bp = adata_all[is_bipolar]
adata_all_tp = adata_all[is_tripolar]
adata_all_bp

In [None]:
si.pl.umap(adata_all,color=['batch', 'cluster', 'phenotype'],
               drawing_order='random',
               dict_palette=dict_palette,
               alpha=0.9, 
               fig_size=(7,7),
          save_fig=True, fig_path=f"{workdir}/SNV_projected", fig_name=f"umap_cluster_noSNVs.pdf")
# si.pl.umap(adata_all_tp,color=['cluster'],
#                drawing_order='random',
#                alpha=0.9, 
#                fig_size=(5,5),
#           save_fig=True, fig_path=f"{workdir}/integrated2", fig_name=f"umap_TPcluster.pdf")

In [None]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.colors as mc
import numpy as np

def _label_colors(labels, palette_name):
    """Map a Series of labels to per-row colours and a ``{label: hex colour}`` dict."""
    unique_labels = labels.unique()
    palette = sns.color_palette(palette_name, len(unique_labels))
    color_map = dict(zip(unique_labels, palette))
    row_colors = [color_map[label] for label in labels]
    hex_colors = {label: mc.to_hex(color) for label, color in color_map.items()}
    return row_colors, hex_colors


def heatmap_similarity(adata, phenotype=True, cluster=True, layer="X_umap", d="euclidean", c=True, name="Heatmap"):
    """Plot and save a cell-by-cell similarity/distance clustermap.

    Cells are ordered by their ``cluster`` label, and the feature matrix is
    reordered the same way *before* the pairwise matrix is computed, so the
    heatmap rows/columns and the colour annotation bars describe the same
    cells. ``adata`` itself is not modified.

    Parameters
    ----------
    adata
        Object exposing ``.X``, ``.obsm`` and ``.obs``; ``.obs`` must contain a
        ``'cluster'`` column and, when ``phenotype`` is true, an
        ``'entity_group'`` column.
    phenotype : bool
        Add an annotation bar coloured by ``obs['entity_group']``.
    cluster : bool
        Add an annotation bar coloured by ``obs['cluster']``.
    layer : str
        ``"X"`` to use ``adata.X``; any other value is used as a key into
        ``adata.obsm``.
    d : {"euclidean", "cosine"}
        Pairwise measure: Euclidean distance or cosine similarity.
    c : bool
        Hierarchically cluster both rows and columns.
    name : str
        Prefix of the output file name.

    Raises
    ------
    ValueError
        If ``d`` is not one of the supported measures.

    Notes
    -----
    The figure is written to ``f"{workdir}/{name}_[Clustered_]{d}.{layer}.pdf"``
    using the notebook-level ``workdir`` variable.
    """
    if d == "euclidean":
        pairwise = euclidean_distances
    elif d == "cosine":
        pairwise = cosine_similarity
    else:
        raise ValueError(f"Unsupported measure d={d!r}; expected 'euclidean' or 'cosine'")

    name = f"{name}_{'Clustered_' if c else ''}{d}.{layer}.pdf"

    # Work on a positional copy of .obs: after the sort, the index holds each
    # cell's original row position, which is the permutation for the features.
    obs_data = adata.obs.reset_index(drop=True).sort_values(by='cluster', kind='stable')
    order = obs_data.index.to_numpy()
    obs_data = obs_data.reset_index(drop=True)

    # assumes the selected layer is row-indexable by an integer array
    # (numpy array or scipy sparse matrix) — TODO confirm for DataFrame-valued obsm
    features = adata.X if layer == "X" else adata.obsm[layer]
    similarity_matrix = pairwise(features[order])

    # One colour bar per requested annotation, plus the matching legend entries.
    annotation_bars = []
    legend_colors = {}
    if phenotype:
        bar, hex_colors = _label_colors(obs_data['entity_group'], "hls")
        annotation_bars.append(bar)
        legend_colors.update(hex_colors)
    if cluster:
        bar, hex_colors = _label_colors(obs_data['cluster'], "bright")
        annotation_bars.append(bar)
        legend_colors.update(hex_colors)

    # clustermap creates its own figure; pass None when no bars were requested.
    bars = annotation_bars or None
    g = sns.clustermap(similarity_matrix, col_colors=bars, row_colors=bars, cmap="viridis", yticklabels=False, col_cluster=c, row_cluster=c, figsize=(10, 10))

    # Add legend for phenotypes and clusters
    legend_patches = [
        mpatches.Patch(color=color, label=label) for label, color in legend_colors.items()
    ]

    plt.legend(handles=legend_patches, title="Metadata", bbox_to_anchor=(1.06, 0.0), loc='upper right')
    plt.savefig(f"{workdir}/{name}")
    plt.show()


In [None]:
# Cosine-similarity clustermap of adata_all computed on the "X_pca" entry of .obsm,
# with rows and columns hierarchically clustered (heatmap_similarity's default c=True).

heatmap_similarity(adata_all, layer="X_pca", d="cosine")

In [None]:
# Echo the directory that the figures in this notebook are saved under.
workdir

In [None]:
# Cosine-similarity heatmap on "X_pca" without hierarchical clustering of rows/columns.
heatmap_similarity(
    adata_all,
    layer="X_pca",
    d="cosine",
    c=False,
)

In [None]:
# Euclidean-distance heatmaps on "X_pca": first clustered, then unclustered.
for clustered in (True, False):
    heatmap_similarity(adata_all, layer="X_pca", d="euclidean", c=clustered)

In [None]:
# Euclidean-distance heatmaps on adata_all.X: first clustered, then unclustered.
for clustered in (True, False):
    heatmap_similarity(adata_all, layer="X", d="euclidean", c=clustered)


In [None]:
# Pairwise Euclidean distances between all rows of adata_all.X.
# NOTE: despite its name, similarity_matrix holds distances (larger = less similar).
similarity_matrix = euclidean_distances(adata_all.X)
# Echo the shape of the matrix the distances were computed from.
adata_all.X.shape

In [None]:
import itertools


def _distance_summary(distances):
    """Summarise an array of distances (mean, std, median) and keep the raw values."""
    return {
        "mean": np.mean(distances),
        "std": np.std(distances),
        "median": np.median(distances),
        "distances": distances,
    }


# Cluster label of every cell, in the same row order as similarity_matrix.
cluster_assignments = adata_all.obs["cluster"]
labels = cluster_assignments.to_numpy()

# Row positions of the cells belonging to each cluster (sorted cluster names).
cluster_names = np.unique(labels).tolist()
cluster_rows = {name: np.flatnonzero(labels == name) for name in cluster_names}

# Within-cluster statistics.
# NOTE(review): each block is the full square sub-matrix, so it contains the
# zero self-distances on the diagonal and every pair twice; the mean/median are
# therefore pulled towards zero. Confirm whether the diagonal should be excluded.
within_cluster_stats = {
    name: _distance_summary(similarity_matrix[np.ix_(rows, rows)])
    for name, rows in cluster_rows.items()
}

# Between-cluster statistics: one entry per unordered pair of *clusters*.
# Iterating over the unique names (not over every pair of cells) computes each
# pair exactly once and gives deterministic, sorted (a, b) keys.
between_cluster_stats = {}
for cluster1, cluster2 in itertools.combinations(cluster_names, 2):
    between_distances = similarity_matrix[np.ix_(cluster_rows[cluster1], cluster_rows[cluster2])].ravel()
    between_cluster_stats[(cluster1, cluster2)] = _distance_summary(between_distances)

In [None]:
# Cluster names, recovered from the (cluster, cluster) keys of between_cluster_stats.
clusters = sorted({cluster for pair in between_cluster_stats for cluster in pair})

# Mean-distance matrix: within-cluster mean on the diagonal, between-cluster
# mean elsewhere. Entries without statistics stay NaN instead of raising.
distance_matrix = np.full((len(clusters), len(clusters)), np.nan)
for i, cluster1 in enumerate(clusters):
    for j, cluster2 in enumerate(clusters):
        if cluster1 == cluster2:
            entry = within_cluster_stats.get(cluster1)
        else:
            # Each pair is stored under a single key order, so try both.
            entry = between_cluster_stats.get(
                (cluster1, cluster2),
                between_cluster_stats.get((cluster2, cluster1)),
            )
        if entry is not None:
            distance_matrix[i, j] = entry['mean']

# Convert the matrix into a DataFrame for better labeling in seaborn
distance_df = pd.DataFrame(distance_matrix, index=clusters, columns=clusters)

# Plot the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(distance_df, annot=True, cmap="coolwarm", fmt=".4f")
plt.title("Cluster Distance Visualization")
plt.tight_layout()
plt.savefig(f"{workdir}/Cluster_distance_euclidean.pdf")
plt.show()

In [None]:
# One (pair label, distance) record per between-cluster distance value.
# The per-pair dict is called pair_stats rather than `stats`, because `stats` is
# also the alias used for scipy.stats elsewhere in this notebook.
boxplot_data = [
    (f"{cluster1} vs {cluster2}", distance)
    for (cluster1, cluster2), pair_stats in between_cluster_stats.items()
    for distance in pair_stats['distances']
]

# Convert to DataFrame for easier plotting
boxplot_df = pd.DataFrame(boxplot_data, columns=['Cluster Pair', 'Distance'])

# Quick check on the DataFrame
print(boxplot_df.head())

plt.figure(figsize=(12, 8))
sns.violinplot(x='Cluster Pair', y='Distance', data=boxplot_df, palette='pastel')
plt.xticks(rotation=45, ha='right')  # Rotate labels for better readability
plt.title('Between-Cluster Distances')
plt.tight_layout()
plt.savefig(f"{workdir}/Cluster_distance_violin.pdf")
plt.show()

In [None]:
import itertools


def _distance_summary(distances):
    """Summarise an array of distances (mean, std, median) and keep the raw values."""
    return {
        "mean": np.mean(distances),
        "std": np.std(distances),
        "median": np.median(distances),
        "distances": distances,
    }


# Restrict the analysis to the cells of four selected clusters.
wcluster = ["Seurat_1", "Seurat_2", "chaotic_cnv", "normal_cnv_tripolar"]
mask = adata_all.obs['cluster'].isin(wcluster)
adata_extracted = adata_all[mask, :]

# Pairwise Euclidean distances between the selected cells (distances, not similarities).
similarity_matrix = euclidean_distances(adata_extracted.X)

# Cluster label of every selected cell, in the same row order as similarity_matrix.
cluster_assignments = adata_extracted.obs["cluster"]
labels = cluster_assignments.to_numpy()

# Row positions of the cells belonging to each cluster (sorted cluster names).
cluster_names = np.unique(labels).tolist()
cluster_rows = {name: np.flatnonzero(labels == name) for name in cluster_names}

# Within-cluster statistics.
# NOTE(review): each block is the full square sub-matrix, so it contains the
# zero self-distances on the diagonal and every pair twice; the mean/median are
# therefore pulled towards zero. Confirm whether the diagonal should be excluded.
within_cluster_stats = {
    name: _distance_summary(similarity_matrix[np.ix_(rows, rows)])
    for name, rows in cluster_rows.items()
}

# Between-cluster statistics: one entry per unordered pair of *clusters*.
# Iterating over the unique names (not over every pair of cells) computes each
# pair exactly once and gives deterministic, sorted (a, b) keys.
between_cluster_stats = {}
for cluster1, cluster2 in itertools.combinations(cluster_names, 2):
    between_distances = similarity_matrix[np.ix_(cluster_rows[cluster1], cluster_rows[cluster2])].ravel()
    between_cluster_stats[(cluster1, cluster2)] = _distance_summary(between_distances)

# Cluster names, recovered from the (cluster, cluster) keys of between_cluster_stats.
clusters = sorted({cluster for pair in between_cluster_stats for cluster in pair})

# Mean-distance matrix: within-cluster mean on the diagonal, between-cluster
# mean elsewhere. Entries without statistics stay NaN instead of raising.
distance_matrix = np.full((len(clusters), len(clusters)), np.nan)
for i, cluster1 in enumerate(clusters):
    for j, cluster2 in enumerate(clusters):
        if cluster1 == cluster2:
            entry = within_cluster_stats.get(cluster1)
        else:
            # Each pair is stored under a single key order, so try both.
            entry = between_cluster_stats.get(
                (cluster1, cluster2),
                between_cluster_stats.get((cluster2, cluster1)),
            )
        if entry is not None:
            distance_matrix[i, j] = entry['mean']

# Convert the matrix into a DataFrame for better labeling in seaborn
distance_df = pd.DataFrame(distance_matrix, index=clusters, columns=clusters)

In [None]:
# Annotated heatmap of the mean distances in distance_df, saved as a PDF under workdir.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(distance_df, annot=True, cmap="coolwarm", fmt=".4f", ax=ax)
ax.set_title("Cluster Distance Visualization")
plt.savefig(f"{workdir}/Cluster_subset_distance_euclidean.pdf")
plt.show()

In [None]:
# One (pair label, distance) record per between-cluster distance value.
# The per-pair dict is called pair_stats rather than `stats`, because `stats` is
# also the alias used for scipy.stats elsewhere in this notebook.
boxplot_data = [
    (f"{cluster1} vs {cluster2}", distance)
    for (cluster1, cluster2), pair_stats in between_cluster_stats.items()
    for distance in pair_stats['distances']
]

# Convert to DataFrame for easier plotting
boxplot_df = pd.DataFrame(boxplot_data, columns=['Cluster Pair', 'Distance'])

# Quick check on the DataFrame
print(boxplot_df.head())

plt.figure(figsize=(12, 8))
sns.violinplot(x='Cluster Pair', y='Distance', data=boxplot_df, palette='pastel')
plt.xticks(rotation=45, ha='right')  # Rotate labels for better readability
plt.title('Between-Cluster Distances')
plt.tight_layout()
# pad_inches only takes effect together with bbox_inches="tight".
plt.savefig(f"{workdir}/Cluster_subset_distance_violin.pdf", bbox_inches="tight", pad_inches=1)
plt.show()

In [None]:
import itertools

import scipy.stats as stats
from statsmodels.stats.multitest import multipletests

# Between-cluster distance samples, keyed by the (cluster, cluster) pair exactly
# as stored in between_cluster_stats. Taking the keys from the dict (instead of
# spelling the pairs out by hand) avoids a KeyError when a pair is stored in the
# opposite order or a cluster is absent.
sample_distances = {
    cluster_pair: pair_stats['distances']
    for cluster_pair, pair_stats in between_cluster_stats.items()
}

# Perform Shapiro-Wilk test for normality
# NOTE: scipy warns that the p-value may be inaccurate for samples larger than 5000.
normality_test_results = {cluster_pair: stats.shapiro(distances) for cluster_pair, distances in sample_distances.items()}
print(normality_test_results)

# Step 2: Pairwise comparisons between the distance distributions of every two
# cluster pairs (two-sided Mann-Whitney U test).
# NOTE(review): pairwise distances that share cells are not independent
# observations, so these p-values should be read with care — confirm the test
# is appropriate for the intended claim.
p_values = []
comparisons = []
cluster_pairs = list(sample_distances)
for cluster_pair_1, cluster_pair_2 in itertools.combinations(cluster_pairs, 2):
    stat, p = stats.mannwhitneyu(
        sample_distances[cluster_pair_1],
        sample_distances[cluster_pair_2],
        alternative='two-sided',
    )
    p_values.append(p)
    comparisons.append(f"{cluster_pair_1} vs {cluster_pair_2}")

# Step 3: Adjust for multiple comparisons (Benjamini-Hochberg FDR).
if p_values:
    rejections, corrected_p_values, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')
else:
    # Fewer than two cluster pairs: nothing to compare or correct.
    rejections, corrected_p_values = [], []

# Display results (the printed p is the corrected value)
for comparison, p_value, reject in zip(comparisons, corrected_p_values, rejections):
    print(f"{comparison}: p={p_value:.4e}, significant={reject}")


In [None]:
# Show the raw Mann-Whitney p-values next to their multipletests-corrected counterparts.
p_values, corrected_p_values

In [None]:
# Box plot of the between-cluster distances in boxplot_df, one box per cluster pair.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='Cluster Pair', y='Distance', data=boxplot_df, palette='pastel', ax=ax)
plt.xticks(rotation=90)  # vertical labels so the pair names stay readable
ax.set_title('Between-Cluster Distances')
plt.show()

In [None]:
# Tabulate the within-cluster statistics, one row per cluster.
pd.DataFrame.from_dict(within_cluster_stats, orient='index')

In [None]:
# Tabulate the between-cluster statistics, one row per key of between_cluster_stats.
pd.DataFrame.from_dict(between_cluster_stats, orient='index')

In [None]:

# Tally how many observations carry each 'cluster' label, in order of first appearance.
ps = list(adata_all.obs['cluster'])

phenotype_count = {}
for p in ps:
    phenotype_count[p] = phenotype_count.get(p, 0) + 1

pd.DataFrame.from_dict(phenotype_count, orient='index', columns=["Count"])


In [None]:
# Report the session's package information, dependencies included.
import session_info
session_info.show(dependencies=True)

In [None]:
# Scratch cell: builds an output-name string with the same
# "{name}_{'Clustered_' if c else ''}{d}.{layer}" pattern that heatmap_similarity
# uses (without the ".pdf" suffix) and echoes it.
# NOTE(review): "Heatmmap" looks like a misspelling of "Heatmap"; within this
# cell the value is only displayed.
name = "Heatmmap"
c = False
d = 'euclidean'
layer = 'X'
name = f"{name}_{'Clustered_' if c else ''}{d}.{layer}"
name

## Debug code

In [None]:
"""
This is part of https://github.com/pinellolab/simba/blob/dev/simba/tools/_pbg.py
To try and identify the error
"""

import pandas as pd



id_ent = pd.Index([])  # ids of all entities
dict_ent_type = dict()
ctr_ent = 0  # counter for entity types
entity_alias = pd.DataFrame(columns=['alias'])
dict_graph_stats = dict()
prefix = ''

col_names = ["source", "relation", "destination", "weight"]

df_edges = pd.DataFrame(columns=col_names)

data_for_graph = list(all_datasets.values())

# Adding a debug statement to check how many iterations in the loop
print(f"Number of elements in data_for_graph: {len(data_for_graph)}")

for ctr_rel, adata_ori in enumerate(data_for_graph):
    print(f"Processing data_for_graph element #{ctr_rel}")

    obs_names = adata_ori.obs_names
    var_names = adata_ori.var_names

    # Debug statement to check the current obs_names and var_names
    print(f"Current obs_names: {obs_names}")
    print(f"Current var_names: {var_names}")

    if len(set(obs_names).intersection(id_ent)) == 0:
        prefix_i = f'{prefix}{ctr_ent}'
        id_ent = id_ent.union(adata_ori.obs_names)
        entity_alias_obs = pd.DataFrame(
            index=obs_names,
            columns=['alias'],
            data=[f'{prefix_i}.{x}'
                  for x in range(len(obs_names))])

        dict_ent_type[prefix_i] = obs_names
        entity_alias = pd.concat(
            [entity_alias, entity_alias_obs],
            ignore_index=False)
        obs_type = prefix_i
        ctr_ent += 1

        # Debug statement to confirm addition of new entity type
        print(f"Added new entity type: {prefix_i}")

    else:
        for k, item in dict_ent_type.items():
            if len(set(obs_names).intersection(item)) > 0:
                obs_type = k
                break
        if not set(obs_names).issubset(id_ent):
            id_ent = id_ent.union(adata_ori.obs_names)
            adt_obs_names = list(set(obs_names)-set(item))
            entity_alias_obs = pd.DataFrame(
                index=adt_obs_names,
                columns=['alias'],
                data=[f'{prefix_i}.{len(item)+x}'
                      for x in range(len(adt_obs_names))])
            dict_ent_type[obs_type] = obs_names.union(adt_obs_names)
            entity_alias = pd.concat(
                [entity_alias, entity_alias_obs],
                ignore_index=False)

            # Debug statement for updated entity type
            print(f"Updated entity type: {obs_type}")

    if len(set(var_names).intersection(id_ent)) == 0:
        prefix_i = f'{prefix}{ctr_ent}'
        id_ent = id_ent.union(adata_ori.var_names)
        entity_alias_var = pd.DataFrame(
            index=var_names,
            columns=['alias'],
            data=[f'{prefix_i}.{x}'
                  for x in range(len(var_names))])

        dict_ent_type[prefix_i] = var_names
        entity_alias = pd.concat(
            [entity_alias, entity_alias_var],
            ignore_index=False)
        var_type = prefix_i
        ctr_ent += 1

        # Debug statement to confirm addition of new variable type
        print(f"Added new variable type: {prefix_i}")

    else:
        for k, item in dict_ent_type.items():
            if len(set(var_names).intersection(item)) > 0:
                var_type = k
                break
        if not set(var_names).issubset(id_ent):
            id_ent = id_ent.union(adata_ori.var_names)
            adt_var_names = list(set(var_names)-set(item))
            entity_alias_var = pd.DataFrame(
                index=adt_var_names,
                columns=['alias'],
                data=[f'{prefix_i}.{len(item)+x}'
                      for x in range(len(adt_var_names))])
            dict_ent_type[var_type] = var_names.union(adt_var_names)
            entity_alias = pd.concat(
                [entity_alias, entity_alias_var],
                ignore_index=False)

            # Debug statement for updated variable type
            print(f"Updated variable type: {var_type}")

    arr_simba = adata_ori.X
    _row, _col = arr_simba.nonzero()
    df_edges_x = pd.DataFrame(columns=col_names)

    # Issue is here! >>>>

    print(_row, _col)
    
    print('Entity_alias:')
    print(entity_alias)
    print("OBS")

    print(entity_alias.loc[obs_names[_row], 'alias'])
    print('VAR')

    print(entity_alias.loc[var_names[_col], 'alias'])
    
    var_alias = entity_alias.loc[var_names[_col], 'alias'].values
    df_edges_x['source'] = entity_alias.loc[obs_names[_row], 'alias'].values
    df_edges_x['relation'] = f'r{ctr_rel}'
    df_edges_x['destination'] = entity_alias.loc[var_names[_col], 'alias'].values
    df_edges_x['weight'] = arr_simba[_row, _col].A.flatten()

    # Debug statements for edges data frame
    print(f"Relation {ctr_rel}: source: {obs_type}, destination: {var_type}")
    print(f"#edges: {df_edges_x.shape[0]}")

    df_edges = pd.concat([df_edges, df_edges_x], ignore_index=True)

    # Debug statements for updated df_edges
    print(f"Updated df_edges after relation {ctr_rel}: {df_edges.shape}")

    adata_ori.obs['pbg_id'] = ""
    adata_ori.var['pbg_id'] = ""
    adata_ori.obs.loc[obs_names, 'pbg_id'] = entity_alias.loc[obs_names, 'alias'].copy()
    adata_ori.var.loc[var_names, 'pbg_id'] = entity_alias.loc[var_names, 'alias'].copy()

    # Debug statement after updating pbg_id
    print("Updated pbg_id in adata_ori")
# Show the final alias table (a bare expression in the middle of a cell displays nothing).
print(entity_alias)

# Number of index labels that repeat an earlier one; 0 means every name has
# exactly one alias row. (pandas has no `Index.isunique`; the boolean property
# is `is_unique`, and `duplicated()` gives the per-label mask to count.)
entity_alias.index.duplicated().sum()