In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
# Read the curated cluster annotation table and derive the subset of rows
# whose top-level label (Tier_1) is 'Epithelial cells'.
annotated_clusters_FINAL = pd.read_csv(
    './use_data/VICREG_clustering_highinputdropout_5000epochs_rev_annotation.csv'
)
all_epithelial_cells = annotated_clusters_FINAL.loc[
    annotated_clusters_FINAL['Tier_1'].eq('Epithelial cells')
]
annotated_clusters_FINAL



In [None]:
# Per-cell cluster assignment: only the cell id and its cluster label are kept.
cluster_result = pd.read_parquet(
    '../scGPT_embeddings/clusterings/VICREG_clustering_highinputdropout_5000epochs.parquet'
).loc[:, ['id', 'cluster']]
cluster_result



In [None]:
# Build the working table in one chain:
#   1. load the embedding table and discard the Tier_* labels stored with it,
#   2. attach each cell's cluster via the shared 'id' column,
#   3. attach the annotation of that cluster (cluster -> Cluster_ID).
# Both merges use the pandas default (inner) join.
embeddings = (
    pd.read_parquet('../scGPT_embeddings/embeddings/VICREG_embedding_highinputdropout_5000epochs.parquet')
    .drop(columns=['Tier_1', 'Tier_2', 'Tier_3', 'Tier_4'])
    .merge(cluster_result, on='id')
    .merge(annotated_clusters_FINAL, left_on='cluster', right_on='Cluster_ID')
)
embeddings



In [None]:
# Distinct values of the 'Pseudo' column, in order of first appearance.
patients = (
    embeddings.loc[:, 'Pseudo']
    .drop_duplicates()
    .to_numpy()
)
patients


In [None]:
# Per-patient diffusion pseudotime (DPT).
#
# For every patient a small AnnData object is built whose first row is the
# mean of that patient's normal epithelial cells and whose remaining rows are
# the patient's tumor cells. The averaged normal row is used as the DPT root
# (adata.uns['iroot'] = 0). Patients lacking either group are skipped.
#
# Output: `results`, a list with one DataFrame per processed patient holding
# dpt_pseudotime, cell metadata and 2-D UMAP coordinates.

# Positional columns holding the embedding values.
# NOTE(review): assumes the embedding dimensions sit at positions 2..65 of
# `embeddings` after the merges above -- confirm against the column layout.
embedding_columns = slice(2, 66)

results = []

for current_patient in tqdm(patients):
    current_embeddings = embeddings[np.isin(embeddings['Pseudo'], current_patient)]

    current_embeddings_tumor = current_embeddings[current_embeddings['Tier_2'] == 'Tumor']
    is_normal = (
        (current_embeddings['Tier_2'] == 'Normal epithelial cells')
        & (current_embeddings['Tier_3'] != 'Tumor')
    )
    current_embeddings_normal = current_embeddings[is_normal]

    # Guard BEFORE touching the normal cells: averaging / assigning into an
    # empty frame is at best meaningless and may raise depending on pandas.
    if current_embeddings_normal.shape[0] == 0:
        print('NO NORMAL')
        continue

    # Without tumor cells the AnnData would contain only the root row, on
    # which PCA / neighbors cannot run; skip instead of aborting the loop.
    if current_embeddings_tumor.shape[0] == 0:
        print('NO TUMOR')
        continue

    # Collapse all normal cells into a single reference row. The row keeps the
    # metadata (id, cluster, Tier_*) of the first normal cell; only the
    # embedding values are replaced by the per-column mean.
    average_expression = current_embeddings_normal.iloc[:, embedding_columns].mean(axis=0)
    # .copy() so the assignment below writes to an independent frame rather
    # than to a slice of `embeddings` (avoids SettingWithCopy behaviour).
    current_embeddings_normal = current_embeddings_normal.iloc[:1, :].copy()
    current_embeddings_normal.iloc[0, embedding_columns] = average_expression.to_numpy()

    # Root row first, so that it ends up at position 0.
    current_embeddings = pd.concat((current_embeddings_normal, current_embeddings_tumor))

    adata = sc.AnnData(current_embeddings.iloc[:, embedding_columns])
    adata.uns['iroot'] = 0  # DPT root = averaged normal row
    adata.obs['cell_id'] = np.array(current_embeddings.id)
    adata.obs['Pseudo'] = np.array(current_embeddings.Pseudo)
    adata.obs['Tier_3'] = np.array(current_embeddings.Tier_3)
    adata.obs['cluster'] = np.array(current_embeddings.cluster).astype(str)

    sc.pp.pca(adata)
    sc.pp.neighbors(adata)  # build the neighborhood graph

    sc.tl.diffmap(adata)  # diffusion map, required by dpt
    sc.tl.dpt(adata)  # diffusion pseudotime -> adata.obs['dpt_pseudotime']
    sc.tl.umap(adata)  # for visualization

    umap_coordinates = pd.DataFrame(adata.obsm['X_umap'], columns=['UMAP1', 'UMAP2'])

    # reset_index() gives obs a 0..n-1 index so it lines up row-by-row with
    # the freshly created coordinate frame in the column-wise concat.
    pseudotime = pd.concat(
        (
            adata.obs[['dpt_pseudotime', 'cell_id', 'Pseudo', 'Tier_3', 'cluster']].reset_index(),
            umap_coordinates,
        ),
        axis=1,
    )
    results.append(pseudotime)


        

In [None]:
# Stack the per-patient pseudotime tables row-wise and write them to disk.
all_results = pd.concat(results)
all_results.to_csv(
    './use_data/tumor_dpt_new.csv'
)

In [None]:
# Display the combined per-patient pseudotime table.
all_results