# spaOTsc Analysis: 2D Coordinates Recovery of Single Cells

## Xenium Breast Cancer, InSitu Replicate 1

https://www.10xgenomics.com/products/xenium-in-situ/human-breast-dataset-explorer


### 1. Import packages

In [1]:
# imports
import novosparc as ns

import os
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import altair as alt
from scipy.spatial.distance import cdist, squareform, pdist
from scipy.stats import ks_2samp
from scipy.stats import pearsonr
from scanpy import read_10x_h5
import random


In [2]:
import cv2 as cv
from skimage import io, color
import torch
from torch.nn import functional as F
import json

In [3]:
import os,csv,re
import math
from skimage import io, color

from scipy.sparse import issparse
import random, torch
import warnings
warnings.filterwarnings("ignore")
import pickle
from sklearn.model_selection import train_test_split
from anndata import AnnData, read_h5ad
import seaborn as sns

import json

import dit
from dit import pid
from spaotsc import SpaOTsc

In [None]:
import sys
sys.path.append('/Users/ameliaschroeder/CeLEry/CeLEry_package')

import CeLEry as cel

### 2. Load data

In [7]:
# cells to be filtered out based on total UMI and number of genes expressed
os.chdir("")
lowUMI_cellID = pd.read_csv("cell_ID_toRemove_filtered_75_25.csv",sep=",",na_filter=False,index_col=0)

In [8]:
#Read in gene expression and spatial location
os.chdir("")
adata = read_10x_h5("Xenium_FFPE_Human_Breast_Cancer_Rep1_cell_feature_matrix.h5")
spatial_full = pd.read_csv("Xenium_FFPE_Human_Breast_Cancer_Rep1_cells.csv",sep=",",na_filter=False,index_col=0)

In [9]:
os.chdir("") #scheme 4

train_index = pd.read_csv("trainCell_index.csv",sep=",")
test_index = pd.read_csv("testCell_index.csv",sep=",")

train_index = list(train_index.iloc[:,1])
test_index = list(test_index.iloc[:,1])

In [12]:
spatial_full

Unnamed: 0_level_0,x_centroid,y_centroid,transcript_counts,control_probe_counts,control_codeword_counts,total_counts,cell_area,nucleus_area
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,377.663005,843.541888,154,0,0,154,110.361875,45.562656
2,382.078658,858.944818,64,0,0,64,87.919219,24.248906
3,319.839529,869.196542,57,0,0,57,52.561875,23.526406
4,259.304707,851.797949,120,0,0,120,75.230312,35.176719
5,370.576291,865.193024,120,0,0,120,180.218594,34.499375
...,...,...,...,...,...,...,...,...
167778,7455.404785,5115.021094,238,1,0,239,219.956094,61.412500
167779,7483.771045,5111.720703,80,0,0,80,38.427969,25.964844
167780,7470.119580,5119.350366,406,0,0,406,287.690469,86.158125
167781,7477.704004,5128.963086,120,0,0,120,235.670469,25.016563


In [13]:
# The pixel size of Xenium 0.2125 microns. 
# Coordinates in microns from cells.csv.gz can be converted to pixel coordinates 
# by dividing by the pixel size. The origin of the coordinate system is the upper left of the TIFF image.


pixel_size = 0.2125

spatial = pd.DataFrame()
spatial['0'] = spatial_full.x_centroid
spatial['1'] = [1] * 167782
spatial['2'] = spatial_full.x_centroid
spatial['3'] = spatial_full.y_centroid 
spatial['4'] = spatial_full.x_centroid / pixel_size
spatial['5'] = spatial_full.y_centroid / pixel_size

b = ['CellID '] * 167782
a = list(range(1,167783))

#spatial.index = [m+str(n) for m,n in zip(b,a)]

spatial.index = spatial.index.astype('str')


spatial = spatial.drop(['0'], axis = 1)
spatial.index.name = '0'
spatial.columns = spatial.columns.astype('int64')
spatial = spatial.astype('int64')

adata.obs_names = spatial.index.astype('str')

In [20]:
TrainDatafull = cel.make_annData_spatial(adata.copy(), spatial, filtered = True)
TrainDatafull.var['genename'] = TrainDatafull.var.gene_ids
TrainDatafull

AnnData object with n_obs × n_vars = 166313 × 313
    obs: 'select', 'x_cord', 'y_cord', 'x_pixel', 'y_pixel', 'n_counts'
    var: 'gene_ids', 'feature_types', 'genome', 'genename'
    uns: 'log1p'

### 3. Filter out cells with low UMI and low number of genes expressed


In [22]:
os.chdir("")
TrainDatafull.obs_names = TrainDatafull.obs_names.astype(np.int64) 

TrainDatafull_filtered = TrainDatafull.copy()[~TrainDatafull.obs_names.isin(lowUMI_cellID.x), :]
TrainDatafull_filtered

View of AnnData object with n_obs × n_vars = 42228 × 313
    obs: 'select', 'x_cord', 'y_cord', 'x_pixel', 'y_pixel', 'n_counts'
    var: 'gene_ids', 'feature_types', 'genome', 'genename'
    uns: 'log1p'

In [24]:
cel.get_zscore(TrainDatafull_filtered)

### 4. Split data into train and test data 

In [26]:
### Spliting into training and testing data for prediction/evalutaiton:

# split the data into training and testing data (90% train, 10% test)
# making sure to use same sets across all methods (see CelERY script for generating random splits of the test/train data)a

train_index = np.array(train_index).astype('str')
test_index = np.array(test_index).astype('str')

# 90% train
DataSubtrain90_coor = TrainDatafull_filtered[train_index,]

# 10% holdoff
DataSubtest10_coor = TrainDatafull_filtered[test_index,]


In [27]:
DataSubtrain90_coor

View of AnnData object with n_obs × n_vars = 38006 × 313
    obs: 'select', 'x_cord', 'y_cord', 'x_pixel', 'y_pixel', 'n_counts'
    var: 'gene_ids', 'feature_types', 'genome', 'genename'
    uns: 'log1p'

In [28]:
DataSubtest10_coor

View of AnnData object with n_obs × n_vars = 4222 × 313
    obs: 'select', 'x_cord', 'y_cord', 'x_pixel', 'y_pixel', 'n_counts'
    var: 'gene_ids', 'feature_types', 'genome', 'genename'
    uns: 'log1p'

In [29]:
holdoff = 10
dataSection1 = DataSubtrain90_coor 
dataSection2 = DataSubtest10_coor

In [30]:
dataSection1.obs

Unnamed: 0_level_0,select,x_cord,y_cord,x_pixel,y_pixel,n_counts
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7,1,296,856,1394,4031,473.0
32,1,277,856,1306,4029,298.0
54,1,182,852,856,4013,294.0
55,1,132,864,623,4069,488.0
59,1,148,843,696,3968,332.0
...,...,...,...,...,...,...
167761,1,7486,5212,35231,24531,267.0
167771,1,7458,5095,35099,23977,305.0
167772,1,7469,5093,35148,23970,391.0
167776,1,7446,5111,35041,24053,334.0


In [31]:
dataSection1.obs['x'] = dataSection1.obs['x_cord']
dataSection1.obs['y'] = dataSection1.obs['y_cord']

dataSection2.obs['x'] = dataSection2.obs['x_cord']
dataSection2.obs['y'] = dataSection2.obs['y_cord']

In [32]:
dataSection1.X = dataSection1.X.toarray()
dataSection2.X = dataSection2.X.toarray()

In [33]:
dataSection1

AnnData object with n_obs × n_vars = 38006 × 313
    obs: 'select', 'x_cord', 'y_cord', 'x_pixel', 'y_pixel', 'n_counts', 'x', 'y'
    var: 'gene_ids', 'feature_types', 'genome', 'genename'
    uns: 'log1p'

In [34]:
dataSection2

AnnData object with n_obs × n_vars = 4222 × 313
    obs: 'select', 'x_cord', 'y_cord', 'x_pixel', 'y_pixel', 'n_counts', 'x', 'y'
    var: 'gene_ids', 'feature_types', 'genome', 'genename'
    uns: 'log1p'

In [35]:
# remove extra data stored 

del adata
del TrainDatafull_filtered
del TrainDatafull

In [None]:
datatrain = dataSection1
datatest = dataSection2
genes = dataSection1.var.index

### 5. Run spaOTsc

In [41]:
random.seed(2021)
torch.manual_seed(2021)
np.random.seed(2021)

os.chdir("")


## Running spaOTsc
df_sc = pd.DataFrame(datatest.X)
is_dmat = cdist(datatrain.obs.to_numpy()[:,0:2], datatrain.obs.to_numpy()[:,0:2], 'minkowski', p=2.)
sc_dmat = cdist(datatest.X, datatest.X, 'minkowski', p=2.)
is_dmat = cdist(np.array(datatrain.obs.iloc[:,0:2]), np.array(datatrain.obs.iloc[:,0:2]), 'euclidean')
sc_dmat = cdist(datatest.X, datatest.X, 'euclidean')


In [42]:
spsc = SpaOTsc.spatial_sc(sc_data=df_sc, is_dmat=is_dmat, sc_dmat=sc_dmat)


In [43]:
cost_matrix = cdist(datatest.X, datatrain.X, 'euclidean')
        

In [44]:
location_pred = spsc.transport_plan(cost_matrix)
location_pred_df = pd.DataFrame(location_pred)
location_pred_df.to_csv("location_pred_probs.csv", sep=',')


In [69]:
location_pred = np.array(location_pred_df)
location_pred.shape

(4222, 38006)

## Expected position:

In [70]:
location_sum = np.sum(location_pred, axis=1)
location_pred_copy = location_pred / location_sum.reshape(len(location_sum), 1)
pred_cord_transform = location_pred_copy.dot(np.array(datatrain.obs[['x_cord', 'y_cord']]))

In [71]:
pred_cord_transform

array([[2223.35288385, 2251.8120195 ],
       [2098.93669228, 2331.00031686],
       [2794.47220162, 2332.61304812],
       ...,
       [2494.38986343, 2382.04782971],
       [2042.55707426, 2251.00206154],
       [2181.70260197, 2285.28090654]])

In [72]:
#pd.cor(pred_cord_transform[0:,], datatest.obs['x_cord'])

np.corrcoef(pred_cord_transform[:,0],datatest.obs['x_cord'])

array([[1.        , 0.73321875],
       [0.73321875, 1.        ]])

In [73]:
np.corrcoef(pred_cord_transform[:,1],datatest.obs['y_cord'])

array([[1.        , 0.47914602],
       [0.47914602, 1.        ]])