# Integration of D14 RGC-iNs with Published Datasets

In [None]:
import scanpy as sc
import scanorama as sr
import pandas as pd
import anndata as ad

sc.settings.verbosity = 3  # scanpy logging verbosity; level 3 also prints hints

### Data Collection

In [None]:
### Read each dataset into its own AnnData object (this step may take a few minutes the first time)


def _read_10x_dataset(directory):
    """Read one 10x matrix directory with genes indexed by symbol and caching enabled."""
    return sc.read_10x_mtx(directory, var_names='gene_symbols', cache=True)


# NAIP2 D14 RGC-iNs
adata_14 = _read_10x_dataset('./../../sc_data/d14/')

# Other published datasets
adata_gudiseva = _read_10x_dataset('./../../sc_data/gudiseva_ipsc_rgcs')
adata_lu = _read_10x_dataset('./../../sc_data/lu_organoid_rgcs')
adata_sridhar = _read_10x_dataset('./../../sc_data/sridhar_fetal_rgcs')

# Shared handle used to apply the same steps to every dataset
data = [adata_14, adata_gudiseva, adata_lu, adata_sridhar]

for adata in data:
    adata.var_names_make_unique()
    display(adata)

# Tag every cell with its dataset of origin so the sources can be told apart later on
# (labels are listed in the same order as `data`)
for adata, sample_label in zip(data, ('d14', 'gudiseva', 'lu', 'sridhar')):
    adata.obs['sample'] = sample_label

### Preprocessing

In [None]:
# Run the same quality-control pass on every dataset, one at a time
qc_metrics = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']

for adata in data:
    adata.var_names_make_unique()

    # Cell-level (min_genes=200) and gene-level (min_cells=3) filters
    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.filter_genes(adata, min_cells=3)

    # Flag genes whose symbol starts with 'MT-' (mitochondrial genes) in the 'mt' column
    is_mito = adata.var_names.str.startswith('MT-')
    adata.var['mt'] = is_mito

    # Compute QC metrics, including the ones derived from the 'mt' flag
    sc.pp.calculate_qc_metrics(
        adata,
        qc_vars=['mt'],
        percent_top=None,
        log1p=False,
        inplace=True,
    )

    # Distribution of each QC metric
    sc.pl.violin(adata, qc_metrics, jitter=0.4, multi_panel=True)

    # Total counts against mitochondrial percentage, then against detected genes
    for y_metric in ('pct_counts_mt', 'n_genes_by_counts'):
        sc.pl.scatter(adata, x='total_counts', y=y_metric)

In [None]:
def _filter_and_normalize(adata, max_total_counts, max_pct_mt=5):
    """Filter outlier cells, then normalize, log-transform and flag highly variable genes.

    Parameters
    ----------
    adata
        AnnData whose ``obs`` already holds ``total_counts`` and ``pct_counts_mt``.
    max_total_counts
        Exclusive upper bound on ``obs.total_counts``; cells at or above it are dropped.
    max_pct_mt
        Exclusive upper bound on ``obs.pct_counts_mt``; cells at or above it are dropped.

    Returns
    -------
    A new AnnData (a copy, not a view of the input) holding the retained cells after
    ``normalize_total``, ``log1p`` and ``highly_variable_genes`` have been applied.
    """
    # Both criteria are per-cell, so one combined mask selects the same cells as
    # applying the two filters one after the other.
    keep = (adata.obs.total_counts < max_total_counts) & (adata.obs.pct_counts_mt < max_pct_mt)

    # Subsetting yields a view; copy it so the in-place steps below operate on a
    # real AnnData instead of relying on an implicit view-to-actual conversion.
    filtered = adata[keep, :].copy()

    # normalize and logarithmize data
    sc.pp.normalize_total(filtered, target_sum=1e4)
    sc.pp.log1p(filtered)

    sc.pp.highly_variable_genes(filtered, min_mean=0.0125, max_mean=3, min_disp=0.5)
    return filtered


# Each dataset gets its own total_counts ceiling; the mitochondrial cutoff (< 5) is shared.

### Wahlin D14 RGC-iNs
adata_14 = _filter_and_normalize(adata_14, max_total_counts=250000)
display(adata_14)

### Gudiseva D40 iPSC-RGCs
adata_gudiseva = _filter_and_normalize(adata_gudiseva, max_total_counts=40000)
display(adata_gudiseva)

### Lu D45 Organoid
adata_lu = _filter_and_normalize(adata_lu, max_total_counts=30000)
display(adata_lu)

### Sridhar D59 Fetal
adata_sridhar = _filter_and_normalize(adata_sridhar, max_total_counts=10000)
display(adata_sridhar)

In [None]:
# Pair every published dataset with the D14 RGC-iNs
# (published cells come first in each combined object, D14 cells second)
adata_gudiseva, adata_sridhar, adata_lu = (
    ad.concat([published, adata_14])
    for published in (adata_gudiseva, adata_sridhar, adata_lu)
)

# The combined objects, processed together in the integration steps
adatas = [adata_gudiseva, adata_sridhar, adata_lu]
display(*adatas)

In [None]:
def _pca_scanorama_combat(adata, batch_key='sample'):
    """Compute principal components, integrate with scanorama, then batch correct with combat.

    Return values of the scanpy calls are discarded; the calls are relied on to
    store their results on `adata` itself.
    """
    # Principal components plus two diagnostic plots
    sc.tl.pca(adata, svd_solver='arpack', use_highly_variable=False)
    sc.pl.pca(adata)
    sc.pl.pca_variance_ratio(adata, log=True)

    # Integrate across the batches identified by `batch_key`
    sc.external.pp.scanorama_integrate(adata, key=batch_key)

    # combat runs last, i.e. after the PCA and scanorama steps above have already been computed
    sc.pp.combat(adata, key=batch_key)


for adata in adatas:
    _pca_scanorama_combat(adata)

In [None]:
def _embed_and_cluster(adata):
    """Build the neighbor graph on 'X_scanorama', then run UMAP and louvain clustering on `adata`."""
    sc.pp.neighbors(adata, use_rep='X_scanorama')
    sc.tl.umap(adata)
    sc.tl.louvain(adata, resolution=0.3)


# neighborhood generation, UMAP dimensionality reduction, and louvain clustering
for adata in adatas:
    _embed_and_cluster(adata)

### Gudiseva Integration

In [None]:
# Sample colors: published (Gudiseva) cells in gray, D14 cells in dark blue
highlight_d14 = dict(gudiseva='gray', d14='darkblue')

# UMAP of the combined data, first by louvain cluster ...
sc.pl.umap(adata_gudiseva, color=['louvain'])
# ... then by sample of origin, using the highlight colors
sc.pl.umap(adata_gudiseva, color=['sample'], palette=highlight_d14)

In [None]:
# Drop the non-integrated louvain clusters so only the integration areas remain
remove = [0, 3, 4, 5]

# louvain labels are compared as strings, so convert the cluster numbers once
# and exclude all of them with a single mask
non_integrated = [str(clust) for clust in remove]
adata_gudiseva = adata_gudiseva[~adata_gudiseva.obs['louvain'].isin(non_integrated), :]

In [None]:
# UMAPs of the retained clusters: sample origin first, then one panel per RGC marker gene
sc.pl.umap(
    adata_gudiseva,
    color='sample',
    palette=highlight_d14,
    save='_d14_gudiseva_integration.pdf',
)

marker_outputs = (
    ('GAP43', '_d14_gudiseva_integration_gap43.pdf'),
    ('STMN2', '_d14_gudiseva_integration_stmn2.pdf'),
)
for marker, outfile in marker_outputs:
    # Expression is drawn on the 'Greens' colormap with the color scale fixed to [0, 1]
    sc.pl.umap(adata_gudiseva, color=[marker], cmap='Greens', vmax=1, vmin=0, save=outfile)

### Lu Integration

In [None]:
# Sample colors: published (Lu) cells in gray, D14 cells in dark blue
highlight_d14 = dict(lu='gray', d14='darkblue')

# UMAP of the combined data, first by louvain cluster ...
sc.pl.umap(adata_lu, color=['louvain'])
# ... then by sample of origin, using the highlight colors
sc.pl.umap(adata_lu, color=['sample'], palette=highlight_d14)

In [None]:
# Drop the non-integrated louvain clusters so only the integration areas remain
remove = [0, 2, 3, 4, 6, 7]

# louvain labels are compared as strings, so convert the cluster numbers once
# and exclude all of them with a single mask
non_integrated = [str(clust) for clust in remove]
adata_lu = adata_lu[~adata_lu.obs['louvain'].isin(non_integrated), :]

In [None]:
# UMAPs of the retained clusters: sample origin first, then one panel per RGC marker gene
sc.pl.umap(
    adata_lu,
    color='sample',
    palette=highlight_d14,
    save='_d14_lu_integration.pdf',
)

marker_outputs = (
    ('GAP43', '_d14_lu_integration_gap43.pdf'),
    ('STMN2', '_d14_lu_integration_stmn2.pdf'),
)
for marker, outfile in marker_outputs:
    # Expression is drawn on the 'Greens' colormap with the color scale fixed to [0, 1]
    sc.pl.umap(adata_lu, color=[marker], cmap='Greens', vmax=1, vmin=0, save=outfile)

### Sridhar Integration

In [None]:
# Sample colors: published (Sridhar) cells in gray, D14 cells in dark blue
highlight_d14 = {
    'sridhar':'gray',
    'd14':'darkblue'
}

# Plot UMAPs colored for louvain clusters and sample origin
sc.pl.umap(adata_sridhar, color = ['louvain'])
# Use the highlight palette here as well, matching the Gudiseva and Lu overview plots
sc.pl.umap(adata_sridhar, color=['sample'], palette=highlight_d14)

In [None]:
# Drop the non-integrated louvain clusters so only the integration areas remain
remove = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# louvain labels are compared as strings, so convert the cluster numbers once
# and exclude all of them with a single mask
non_integrated = [str(clust) for clust in remove]
adata_sridhar = adata_sridhar[~adata_sridhar.obs['louvain'].isin(non_integrated), :]

In [None]:
# Plot UMAPs of the filtered clusters showing sample origin and RGC marker expression
sc.pl.umap(adata_sridhar, color = 'sample', palette=highlight_d14, save='_d14_sridhar_integration.pdf')

# Gene expression is continuous and colored through `cmap`; the categorical sample
# palette does not apply to these panels, so it is not passed.
# NOTE(review): the Gudiseva and Lu marker plots fix the color scale with vmax=1, vmin=0,
# while these two use the default scale — confirm whether that difference is intended.
sc.pl.umap(adata_sridhar, color = ['GAP43'], cmap='Greens', save='_d14_sridhar_integration_gap43.pdf')
sc.pl.umap(adata_sridhar, color = ['STMN2'], cmap='Greens', save='_d14_sridhar_integration_stmn2.pdf')