# Preprocessing Nanostring Lung cancer data

### 1. Download dataset

Download the dataset from [here](https://nanostring.com/products/cosmx-spatial-molecular-imager/ffpe-dataset/).

You will get:
1. gene expression matrix "Lung13_Rep1_exprMat_file.csv"
2. gene expression annotation "matched_annotation_all_lung13.csv"
3. Histology images for 20 fields of view (fovs) (CellComposite_F001.jpg, ...)

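The data folder structure should be as follows. Note that the code in section 4 reads `Lung13_exprMat_file.csv` and `matched_annotation_lung13.csv` from `dataset/nanostring/lung13/`, so rename or move the downloaded CSV files (or adjust the paths in the code) if your layout differs: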
```
├── dataset  
│   └── nanostring 
│        └── Lung13_Rep1_exprMat_file.csv  
│        └── matched_annotation_all_lung13.csv  
│        └── lung13 
│              └── fov1  
│                   ├── CellComposite_F001.jpg  
│              └── fov2  
│                   ├── CellComposite_F002.jpg  
│              └── ...  
│              └── fov20  
│                    ├── ...  
```

### 2. Import required modules

In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
import anndata as AD
import cv2
import os
import warnings
warnings.filterwarnings('ignore')

### 3. Define fovs

In [2]:
# The 20 fields of view: folder names 'fov1'..'fov20' and the matching
# image tags 'F001'..'F020' (zero-padded to three digits).
ids = ['fov%d' % n for n in range(1, 21)]
img_names = ['F%03d' % n for n in range(1, 21)]

### 4. Step by step processing for one fov

### 4.1. Define parameters

In [3]:
# Directory that holds the per-fov folders and the two CSV files read below.
root = '../dataset/nanostring/lung13'

# The single field of view processed step by step in section 4; the folder
# name and the image tag are both derived from its number.
fov = 1
id = 'fov%d' % fov
img_id = 'F%03d' % fov

### 4.2. Read gene expression and image

In [4]:
# Load the composite image of the selected fov; only its size is used later on.
img_root = os.path.join(root, id, 'CellComposite_%s.jpg'%(img_id))
img = cv2.imread(img_root)
# cv2.imread does not raise on a missing or unreadable file, it returns None.
# Fail here with a clear message instead of an AttributeError on `.shape`.
if img is None:
    raise FileNotFoundError('Could not read image: %s' % img_root)
height, width, c = img.shape

# Expression matrix for all fovs; it is filtered down to one fov in the next step.
# NOTE(review): this path expects the CSV inside `root` under this exact name,
# which differs from the file name/location given in section 1 - confirm.
gene_expression = os.path.join(root, 'Lung13_exprMat_file.csv')
ge = pd.read_csv(gene_expression, delimiter=',')

### 4.3. Filter gene expression for the current fov (fov 1)

In [5]:
# Keep only the rows of the selected fov, drop the now-constant 'fov' column
# and index the remaining rows by cell ID.
in_fov = ge['fov'] == int(fov)
gene_f1 = ge.loc[in_fov].drop(columns=['fov']).set_index('cell_ID')

# Cell IDs of this fov, used later to align the annotation table.
idx = gene_f1.index

### 4.4. Get annotation for fov 1

In [6]:
# Annotation table for all fovs, then the subset that belongs to the selected fov.
annor = os.path.join(root, 'matched_annotation_lung13.csv')
anno = pd.read_csv(annor)
anno_f1 = anno.loc[anno['fov'] == int(fov)]


### 4.5. Match cell centres to the image and discard cells too close to the border

In [7]:
# Margin, in pixels, that must be available on each side of a cell centre.
w, h = 60, 60

# Work on an independent copy: anno_f1 was obtained by boolean-filtering `anno`,
# and writing into such a slice through chained indexing (df[col][i] = value)
# is not guaranteed to modify the frame (it is a no-op under copy-on-write).
anno_f1 = anno_f1.copy()

cx = anno_f1['CenterX_local_px'].astype(float)
cy = anno_f1['CenterY_local_px'].astype(float)

# The border test uses the coordinates as stored in the annotation file,
# i.e. before the y axis is flipped below.
outside = (cx - w < 0) | (cx + w > width) | (cy - h < 0) | (cy + h > height)

# Flip y against the image height.
# NOTE(review): presumably converts a bottom-left origin to the top-left origin
# of the image array - confirm against the data description.
anno_f1['CenterY_local_px'] = height - cy

# Cells whose window would leave the image lose their label; the next step
# drops every cell without a label.
anno_f1.loc[outside, 'cell_type'] = np.nan

# Align the annotation rows with the cells of the expression table.
anno_f1 = anno_f1.set_index('cell_ID').reindex(idx)

### 4.6. Drop corresponding rows in gene expression if the annotation is nan

In [8]:
# Temporarily attach the annotation label (aligned on cell ID), drop every row
# that contains a missing value, then remove the helper column again.
# (The 'niche' annotation could be attached the same way: anno_f1['niche'].)
labelled = gene_f1.assign(cell_type=anno_f1['cell_type'])
gene_f1 = labelled.dropna(axis=0, how='any').drop(columns=['cell_type'])

### 4.7. Build anndata

In [9]:
# Expression table -> AnnData. The annotation index is converted to strings so
# that it can be looked up with adata.obs_names.
adata = AD.AnnData(gene_f1)
anno_f1.index = anno_f1.index.map(str)

# Copy the per-cell annotation columns into adata.obs (obs name <- annotation name).
# The 'niche' column could be added the same way.
cells = adata.obs_names
for obs_key, anno_key in (
    ('cell_type', 'cell_type'),
    ('cx', 'CenterX_local_px'),
    ('cy', 'CenterY_local_px'),
    ('cx_g', 'CenterX_global_px'),
    ('cy_g', 'CenterY_global_px'),
):
    adata.obs[obs_key] = anno_f1.loc[cells, anno_key]

# Two-column coordinate arrays: local (within the fov image) and global.
adata.obsm['spatial'] = adata.obs[['cx', 'cy']].to_numpy()
adata.obsm['spatial_global'] = adata.obs[['cx_g', 'cy_g']].to_numpy()


### 4.8. Merge cell types if required

In [10]:
# Fine-grained annotation label -> merged label.
dicts = {
    'T CD8 memory': 'lymphocyte',
    'T CD8 naive': 'lymphocyte',
    'T CD4 naive': 'lymphocyte',
    'T CD4 memory': 'lymphocyte',
    'Treg': 'lymphocyte',
    'B-cell': 'lymphocyte',
    'plasmablast': 'lymphocyte',
    'NK': 'lymphocyte',
    'monocyte': 'Mcell',
    'macrophage': 'Mcell',
    'mDC': 'Mcell',
    'pDC': 'Mcell',
    'tumors': 'tumors',
    'myeloid': 'myeloid',
    'lymphocyte': 'lymphocyte',
    'epithelial': 'epithelial',
    'mast': 'mast',
    'endothelial': 'endothelial',
    'fibroblast': 'fibroblast',
    'neutrophil': 'neutrophil',
}

# Translate every label in one step. This replaces a float column of zeros that
# was filled with strings through chained indexing, which is not guaranteed to
# write into adata.obs. Labels that are not in `dicts` become NaN (missing)
# rather than the numeric placeholder 0.0.
adata.obs['merge_cell_type'] = adata.obs['cell_type'].map(dicts)

### 4.9. Save anndata

In [11]:
# Alternative kept from the original notebook: use the unmerged labels as a categorical.
# adata.obs['merge_cell_type'] = adata.obs['cell_type'].astype('category')

# Column labels of both annotation tables are cast to str before writing.
for table in (adata.obs, adata.var):
    table.columns = table.columns.astype(str)

# The result is stored next to the image of the processed fov.
out_path = os.path.join(root, id, 'sampledata.h5ad')
adata.write(out_path)


### 4.10. Define function for preprocessing

In [12]:
# Fine-grained annotation label -> merged label used for 'merge_cell_type'.
_MERGED_CELL_TYPES = {
    'T CD8 memory': 'lymphocyte',
    'T CD8 naive': 'lymphocyte',
    'T CD4 naive': 'lymphocyte',
    'T CD4 memory': 'lymphocyte',
    'Treg': 'lymphocyte',
    'B-cell': 'lymphocyte',
    'plasmablast': 'lymphocyte',
    'NK': 'lymphocyte',
    'monocyte': 'Mcell',
    'macrophage': 'Mcell',
    'mDC': 'Mcell',
    'pDC': 'Mcell',
    'tumors': 'tumors',
    'myeloid': 'myeloid',
    'lymphocyte': 'lymphocyte',
    'epithelial': 'epithelial',
    'mast': 'mast',
    'endothelial': 'endothelial',
    'fibroblast': 'fibroblast',
    'neutrophil': 'neutrophil',
}


def gen_h5ad(id, img_id, fov, root='../dataset/nanostring/lung13/', margin=60):
    """Build and save the AnnData file of one field of view.

    Reads the composite image of the fov (for its size only), the expression
    matrix and the annotation table, keeps the cells of this fov whose centre
    is at least ``margin`` pixels away from every image border, and writes the
    result to ``<root>/<id>/sampledata.h5ad``.

    Args:
        id: Name of the fov folder inside ``root`` (e.g. ``'fov1'``).
        img_id: Image tag used in the file name ``CellComposite_<img_id>.jpg``.
        fov: Value of the ``fov`` column that selects the rows of this fov in
            both CSV files; converted with ``int``.
        root: Directory containing the fov folders, ``Lung13_exprMat_file.csv``
            and ``matched_annotation_lung13.csv``.
        margin: Minimum distance in pixels between a cell centre and each image
            border; cells closer than that are discarded.

    Raises:
        FileNotFoundError: If the composite image does not exist.
        ValueError: If the composite image exists but cannot be decoded.
    """
    img_root = os.path.join(root, id, 'CellComposite_%s.jpg' % (img_id))
    print(img_root)

    # cv2.imread returns None instead of raising, so check explicitly rather
    # than failing later with an AttributeError on `.shape`.
    if not os.path.isfile(img_root):
        raise FileNotFoundError('Composite image not found: %s' % img_root)
    img = cv2.imread(img_root)
    if img is None:
        raise ValueError('Could not decode image: %s' % img_root)
    height, width = img.shape[:2]

    fov = int(fov)

    # Expression rows of this fov, indexed by cell ID.
    # Note: the full CSV is re-read on every call.
    ge = pd.read_csv(os.path.join(root, 'Lung13_exprMat_file.csv'), delimiter=',')
    gene_f1 = ge[ge['fov'] == fov].drop(columns=['fov']).set_index('cell_ID')

    # Annotation rows of this fov. An explicit copy makes the writes below
    # well-defined; chained indexing into a filtered slice (df[col][i] = value)
    # is not guaranteed to modify the frame.
    anno = pd.read_csv(os.path.join(root, 'matched_annotation_lung13.csv'))
    anno_f1 = anno[anno['fov'] == fov].copy()

    cx = anno_f1['CenterX_local_px'].astype(float)
    cy = anno_f1['CenterY_local_px'].astype(float)

    # The border test uses the coordinates as stored in the file, i.e. before
    # the y axis is flipped.
    near_border = (
        (cx - margin < 0) | (cx + margin > width)
        | (cy - margin < 0) | (cy + margin > height)
    )

    # Flip y against the image height.
    # NOTE(review): presumably converts between bottom-left and top-left
    # origins - confirm against the data description.
    anno_f1['CenterY_local_px'] = height - cy
    anno_f1.loc[near_border, 'cell_type'] = np.nan
    anno_f1 = anno_f1.set_index('cell_ID').reindex(gene_f1.index)

    # Drop every cell without a label (border cells and cells missing from the
    # annotation); the helper column is removed again afterwards.
    gene_f1['cell_type'] = anno_f1['cell_type']
    gene_f1 = gene_f1.dropna(axis=0, how='any').drop(columns=['cell_type'])

    adata = AD.AnnData(gene_f1)
    # String index so that the annotation can be looked up with obs_names.
    anno_f1.index = anno_f1.index.map(str)

    # obs column name <- annotation column name ('niche' could be added here).
    obs_columns = {
        'cell_type': 'cell_type',
        'cx': 'CenterX_local_px',
        'cy': 'CenterY_local_px',
        'cx_g': 'CenterX_global_px',
        'cy_g': 'CenterY_global_px',
    }
    for obs_key, anno_key in obs_columns.items():
        adata.obs[obs_key] = anno_f1.loc[adata.obs_names, anno_key]

    # Two-column coordinate arrays: local (within the fov image) and global.
    adata.obsm['spatial'] = adata.obs[['cx', 'cy']].to_numpy()
    adata.obsm['spatial_global'] = adata.obs[['cx_g', 'cy_g']].to_numpy()

    # Merged labels in one step; labels missing from the table become NaN
    # instead of being left as the numeric placeholder 0.0.
    adata.obs['merge_cell_type'] = adata.obs['cell_type'].map(_MERGED_CELL_TYPES)

    # Column labels are cast to str before writing.
    adata.obs.columns = adata.obs.columns.astype(str)
    adata.var.columns = adata.var.columns.astype(str)

    adata.write(os.path.join(root, id, 'sampledata.h5ad'))


### 4.11. Preprocessing for each fov for lung13

In [13]:
# Process every field of view. The fov number passed to gen_h5ad starts at 1
# and follows the order of `ids` / `img_names`.
# Dedicated loop names are used so that the notebook-level `id` and `fov` set in
# section 4.1 are not overwritten: re-running the step-by-step cells afterwards
# would otherwise mix fov 1 data with the last fov's folder.
for fov_number, (fov_id, imname) in enumerate(zip(ids, img_names), start=1):
    gen_h5ad(fov_id, imname, fov_number)

../dataset/nanostring/lung13/fov1/CellComposite_F001.jpg
../dataset/nanostring/lung13/fov2/CellComposite_F002.jpg
../dataset/nanostring/lung13/fov3/CellComposite_F003.jpg
../dataset/nanostring/lung13/fov4/CellComposite_F004.jpg
../dataset/nanostring/lung13/fov5/CellComposite_F005.jpg
../dataset/nanostring/lung13/fov6/CellComposite_F006.jpg
../dataset/nanostring/lung13/fov7/CellComposite_F007.jpg
../dataset/nanostring/lung13/fov8/CellComposite_F008.jpg
../dataset/nanostring/lung13/fov9/CellComposite_F009.jpg
../dataset/nanostring/lung13/fov10/CellComposite_F010.jpg
../dataset/nanostring/lung13/fov11/CellComposite_F011.jpg
../dataset/nanostring/lung13/fov12/CellComposite_F012.jpg
../dataset/nanostring/lung13/fov13/CellComposite_F013.jpg
../dataset/nanostring/lung13/fov14/CellComposite_F014.jpg
../dataset/nanostring/lung13/fov15/CellComposite_F015.jpg
../dataset/nanostring/lung13/fov16/CellComposite_F016.jpg
../dataset/nanostring/lung13/fov17/CellComposite_F017.jpg
../dataset/nanostring/l