In [1]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.spatial import KDTree


# Channel labels; presumably the imaging channels of the run (not referenced
# by any other cell visible in this notebook) -- TODO confirm usage.
CHANNELS = ['cy5', 'TxRed', 'cy3', 'FAM']
# Root folder of processed datasets. NOTE(review): absolute, machine-specific
# path; adjust before running on another machine.
BASE_DIR = Path('E:/TMC/PRISM_pipeline/dataset/processed')
# Identifier of the run; it selects the '<RUN_ID>_processed' folder below.
RUN_ID = '20230227_test'
src_dir = BASE_DIR / f'{RUN_ID}_processed'
# Sub-folders of the run directory used by the cells below.
stc_dir = src_dir / 'stitched'
read_dir = src_dir / 'readout'
seg_dir = src_dir / 'segmented'
# Only the readout folder is created here; the other folders are assumed to exist.
os.makedirs(read_dir, exist_ok=True)

c:\Users\Mingchuan\anaconda3\envs\cell-typing\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
c:\Users\Mingchuan\anaconda3\envs\cell-typing\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


## load files and preprocess

In [3]:
# helper functions: remove near-duplicate points, load RNA spots, build the expression matrix
def remove_duplicates(coordinates, radius=2):
    """Thin out clusters of near-duplicate points.

    Every pair of points whose distance is at most ``radius`` is treated as
    a duplicate pair. Points are then visited in ascending row order: a point
    that has not been discarded yet is kept, and all of its neighbours are
    discarded. Points without any neighbour are always kept.

    Parameters
    ----------
    coordinates : array-like of shape (n_points, n_dims)
        Point coordinates, one row per point.
    radius : float, default 2
        Maximum distance at which two points count as duplicates
        (same units as ``coordinates``).

    Returns
    -------
    numpy.ndarray
        A new array holding the surviving rows in their original order.
    """
    coordinates = np.asarray(coordinates)
    # Fewer than two points cannot form a pair; skip building the tree.
    if len(coordinates) < 2:
        return coordinates.copy()

    tree = KDTree(coordinates)

    # Adjacency map: point index -> indices of all points within `radius`.
    neighbors = {}
    for i, j in tree.query_pairs(radius):
        neighbors.setdefault(i, set()).add(j)
        neighbors.setdefault(j, set()).add(i)

    discard = set()  # set for fast membership tests
    # Ascending order makes the choice of survivor well-defined instead of
    # depending on set iteration order.
    for node in sorted(neighbors):
        if node not in discard:
            # `node` survives; everything close to it is a duplicate.
            discard.update(neighbors[node])

    return np.delete(coordinates, sorted(discard), axis=0)


def load_rnas(centroids, input_dir=read_dir/'mapped_genes.csv', preprocess=True,
              z_scale=3.36, max_distance=100):
    """Load decoded RNA spots and optionally assign them to cells.

    Parameters
    ----------
    centroids : numpy.ndarray of shape (n_cells, 3)
        Cell centroids. Columns are matched against the spot columns
        (z_in_pix, x_in_pix, y_in_pix) in that order, so the z column is
        expected to be scaled already (the notebook multiplies centroid
        column 0 by the same factor when loading the centroids).
    input_dir : str or pathlib.Path
        Path of the CSV file to read (despite the name, a file, not a folder).
    preprocess : bool, default True
        If False, the CSV is returned as-is apart from dropping columns whose
        name starts with 'Unnamed'. If True, spots are deduplicated per gene,
        assigned to the nearest centroid, filtered, and written next to the
        input as '<name>_preprocessed<suffix>'.
    z_scale : float, default 3.36
        Factor applied to 'z_in_pix' before the nearest-centroid search.
    max_distance : float, default 100
        Spots with no centroid within this distance are dropped.

    Returns
    -------
    pandas.DataFrame
        The loaded (and, if requested, preprocessed) spot table.

    Raises
    ------
    ValueError
        If preprocessing is requested but the file has no spot with a gene.
    """
    rna_raw = pd.read_csv(input_dir)
    if not preprocess:
        rna_df = rna_raw.loc[:, ~rna_raw.columns.str.contains('^Unnamed')].copy()
        print(rna_df)
        return rna_df

    coord_cols = ['x_in_pix', 'y_in_pix', 'z_in_pix']
    spots = rna_raw[coord_cols + ['Gene']]
    print(f'ori_rna_num:\t{len(spots)}')

    # Deduplicate each gene separately. Genes are visited in order of first
    # appearance so the output row order is reproducible; rows without a gene
    # name are skipped.
    per_gene = []
    for gene in tqdm(spots['Gene'].dropna().unique(), desc='deduplicating'):
        coordinates = spots.loc[spots['Gene'] == gene, coord_cols].values
        gene_df = pd.DataFrame(remove_duplicates(coordinates), columns=coord_cols)
        gene_df['Gene'] = gene
        per_gene.append(gene_df)
    if not per_gene:
        raise ValueError(f'no RNA spots with a gene name found in {input_dir}')
    # Concatenate once (not inside the loop) and rebuild the index so labels
    # are unique.
    rna_df = pd.concat(per_gene, axis=0, ignore_index=True)
    print(f'dedu_rna_num:\t{rna_df.shape[0]}')

    # assign rna to nearest centroid
    rna_df['z_in_pix'] *= z_scale  # scale z to xy
    rna_pos = rna_df[['z_in_pix', 'x_in_pix', 'y_in_pix']].to_numpy()
    tree = KDTree(centroids)
    _, indices = tree.query(rna_pos, k=1, distance_upper_bound=max_distance)
    rna_df['Cell Index'] = indices
    # KDTree.query reports "no neighbour within the bound" as index n_cells.
    rna_df = rna_df[rna_df['Cell Index'] < centroids.shape[0]]

    # rm other
    rna_df = rna_df[rna_df['Gene'] != 'Other']
    num = rna_df['Cell Index'].nunique()
    print(f'rm_oth_rna_num:\t{rna_df.shape[0]}')
    print(f'cell_num:\t{num}')
    print(rna_df.head(6))

    # Build the output name with pathlib so both str and Path inputs work
    # (Path.replace is a file rename, not a string substitution).
    src_path = Path(input_dir)
    out_path = src_path.with_name(f'{src_path.stem}_preprocessed{src_path.suffix}')
    rna_df.to_csv(out_path, index=False)

    return rna_df


def create_exp_matrix(rna_df):
    """Pivot a per-spot table into a cell-by-gene count table.

    ``rna_df`` must provide the columns 'Cell Index' and 'Gene'; it is not
    modified. The result has one row per 'Cell Index', one column per 'Gene',
    and each entry is the number of spots of that gene in that cell (0 where
    the combination does not occur).
    """
    # Tag every spot with a unit marker so it can be counted per group.
    tagged = rna_df.assign(Count=np.ones(len(rna_df)))
    spots_per_pair = tagged.groupby(['Cell Index', 'Gene'])['Count'].count()
    # Spread genes across columns; combinations that never occur become 0.
    wide = spots_per_pair.unstack().fillna(0)
    return wide.rename_axis(index='Cell Index', columns='Gene')

In [5]:
# Read the cell centroids and rescale them column-wise: column 0 gets the
# same 3.36 factor that load_rnas applies to z_in_pix; columns 1 and 2 keep
# their scale (factor 1).
centroids = pd.read_csv(seg_dir / 'dapi_predict.csv', index_col=0).to_numpy(dtype=np.float64)
centroids[:, :3] *= (3.36, 1, 1)
print(f'centroid_num:\t{len(centroids)}')

centroid_num:	8556


In [11]:
# Read RNA information.
# preprocess=False: the CSV is loaded as-is (only 'Unnamed' columns are
# dropped); no deduplication or cell assignment is run in this call.
# Note the file is read from seg_dir, not from load_rnas' default read_dir.
rna_df = load_rnas(centroids=centroids, input_dir=seg_dir/'mapped_genes.csv', preprocess=False)

        x_in_pix  y_in_pix  z_in_pix   Gene  Cell Index  in_nu
0           1753      3756     53.76  Nr4a2        7023  False
1            346       655    107.52  Nr4a2         213   True
2           1444      5474    137.76  Nr4a2        6034   True
3            965       438    191.52  Nr4a2        3185  False
4            704       192    117.60  Nr4a2        1724   True
...          ...       ...       ...    ...         ...    ...
235027      1113      3002    248.64    Nov        4005  False
235028      1351      5115    104.16    Nov        5976  False
235029      1701      3348    201.60    Nov        6933  False
235030      1166      2349     90.72    Nov        3866  False
235031       770      4685    110.88    Nov        2845  False

[235032 rows x 6 columns]


In [12]:
## replace gene names
# Placeholder labels PRISM_1 .. PRISM_30.
PRISM_list = ['PRISM_' + str(number) for number in range(1, 31)]
# Category order for the 'Gene' column; values outside this list become NaN
# when the column is converted to a Categorical below.
gene_order_list = [
    'Gapdh', 'Slc1a3', 'Slc17a7', 'Snap25',
    'Rasgrf2', 'Rgs4', 'Prox1', 'Plcxd2', 'Vxn', 'Pcp4', 'Nr4a2', 'Ctgf',
    'Gad1', 'Gad2', 'Pvalb', 'Sst', 'Vip', 'Lamp5',
    'Aqp4', 'Apod', 'Plp1', 'Cx3cr1', 'Pmch', 'Gfap',
    'Cck', 'Mbp', 'Rprm', 'Enpp2', 'Nov', 'Rorb',
]

# The i-th PRISM label maps to the i-th entry of var.csv's 'gene_names' column.
gene_list = list(pd.read_csv(r'E:\TMC\cell_typing\dataset_spatial\PRISM_mousebrain\var.csv')['gene_names'])
replace = {label: gene_list[position] for position, label in enumerate(PRISM_list)}
# Two sequential passes: the alias is applied after the PRISM mapping so a
# mapped name can itself be renamed.
rna_df['Gene'] = (
    rna_df['Gene']
    .replace(replace)
    .replace({'3110035E14Rik': 'Vxn'})
)
rna_df['Gene'] = pd.Categorical(rna_df['Gene'], categories=gene_order_list, ordered=True)

In [13]:
# Pivot the spot table into a 'Cell Index' x 'Gene' count table and save it
# as expression_matrix.csv in the segmentation folder.
matrix = create_exp_matrix(rna_df)
matrix.to_csv(seg_dir/'expression_matrix.csv')