#### Imports

In [1]:
# builtins
import os
from time import time
# general
import numpy as np
import pandas as pd
import scipy
# statistical tests
from scipy.stats import ttest_ind, ranksums, ks_2samp
# spatial transcriptomics packages
import anndata as ad
import scanpy as sc

# plotting
import bokeh.io
import bokeh.plotting
import bokeh.layouts
import iqplot

bokeh.io.output_notebook()

#### Loading data

In [2]:
# Per-bead annotation tables: only one column of each is used downstream,
# so select it directly from the freshly read frame.
cell_assignments = pd.read_csv('cell_assignments_filtered.csv')['first_type']
region_assignments = pd.read_csv('concordex_cerebellum_res_2024-06-10.csv')['concordex_pred']

# Coordinates, count matrix and gene labels stored as NumPy arrays.
cereb_coords = np.load('cerebellum_coords_mat.npy')
cereb_counts = np.load('cerebellum_counts_mat.npy')
# NOTE: allow_pickle=True unpickles arbitrary Python objects; only load this
# file if it comes from a trusted source.
gene_labels = np.load('cerebellum_gene_labels.npy', allow_pickle=True)

# Table consumed by the plotting cell (columns read there: gene_name,
# cell_type, region1, region2).
passes_bonferroni = pd.read_csv('passes_bonferroni.csv')

#### Preprocessing

In [3]:
# Build the AnnData container from the count matrix: observations are named
# Bead_<row index>, variables are named after the gene labels.
cereb_adata = ad.AnnData(cereb_counts)
cereb_adata.var_names = gene_labels
cereb_adata.obs_names = [f'Bead_{bead_idx}' for bead_idx in range(cereb_adata.n_obs)]

# Attach the per-bead annotations as categorical .obs columns.
for obs_key, labels in (('cell_type', cell_assignments), ('region', region_assignments)):
    cereb_adata.obs[obs_key] = pd.Categorical(labels)

# Flip the y axis: the spatial plotting libraries were drawing the cerebellum
# upside down compared with its usual display orientation.
x = cereb_coords[:, 0]
y = cereb_coords[:, 1]
new_y = y.max() - y

# Re-pair x with the flipped y as a two-column array and store it under the
# 'spatial' key of .obsm.
new_cereb_coords = np.column_stack((x, new_y))
cereb_adata.obsm['spatial'] = new_cereb_coords


# Calculating % MT counts
# Column indices of the MT genes in the count matrix, determined manually by
# the original author. They are positional, so this block has to run before
# any gene filtering removes or shifts columns.
mt_gene_ids = [*range(14089, 14160), *range(23068, 23092)]
counts_matrix = cereb_adata.X

# Fraction (0-1, not a percentage) of each bead's counts that fall in the MT
# genes. Computed for all beads in one vectorized expression instead of one
# Python iteration per bead. A bead with zero total counts gives NaN (0/0),
# the same value the per-bead division produced.
mt_pcts = counts_matrix[:, mt_gene_ids].sum(axis=1) / counts_matrix.sum(axis=1)

cereb_adata.obs['pct_counts_mt'] = mt_pcts


# Filtering
print('Initial cells:', cereb_adata.n_obs)
print('Initial genes:', cereb_adata.n_vars)

# Gene-level filters, applied in place: first on total counts (min_counts=50),
# then on the number of beads a gene is detected in (min_cells=25).
for gene_filter in ({'min_counts': 50}, {'min_cells': 25}):
    sc.pp.filter_genes(cereb_adata, **gene_filter)

# Bead-level filter: keep beads whose MT fraction is below 0.1. The copy turns
# the resulting view into an independent AnnData object.
low_mt_mask = cereb_adata.obs["pct_counts_mt"] < 0.1
cereb_adata = cereb_adata[low_mt_mask].copy()

print('Final cells:', cereb_adata.n_obs)
print('Final genes:', cereb_adata.n_vars)


# Finding total_counts (per gene) and n_cells_by_counts
# The gene filtering step leaves 'n_counts' / 'n_cells' columns in .var.
# Rename them to the names used below so no stale duplicates remain; their
# values were computed before the bead filter, so both columns are recomputed
# immediately afterwards.
cereb_adata.var.rename(columns={'n_counts':'total_counts', 'n_cells':'n_cells_by_counts'}, inplace=True)

# Transposed view of the matrix: genes are rows, beads are columns.
X = cereb_adata.X.T

# One vectorized reduction per metric instead of a Python loop over every gene.
expressions = X.sum(axis=1)                     # summed counts per gene
cells_expressing = np.count_nonzero(X, axis=1)  # beads with a non-zero count per gene

cereb_adata.var['n_cells_by_counts'] = cells_expressing
cereb_adata.var['total_counts'] = expressions


# Finding total_counts (cell) and n_genes_by_counts
# Rows of X are beads, so both metrics are row-wise reductions; doing them as
# single vectorized calls avoids a Python loop over every bead.
X = cereb_adata.X
total_counts = X.sum(axis=1)                   # summed counts per bead
genes_expressed = np.count_nonzero(X, axis=1)  # genes with a non-zero count per bead

cereb_adata.obs['n_genes_by_counts'] = genes_expressed
cereb_adata.obs['total_counts'] = total_counts


# Normalization + log1p
# Both calls modify cereb_adata.X in place (scanpy defaults).
sc.pp.normalize_total(cereb_adata)
sc.pp.log1p(cereb_adata)


# Finding post-normalization log1p_total_counts (cell and gene)
# NOTE: despite the column name, these are sums of the already
# log1p-transformed values, not log1p of a summed count.
# The sums are vectorized axis reductions; compared with summing one row at a
# time they may differ in the last floating-point digits.
    # cell
X = cereb_adata.X
total_counts = X.sum(axis=1)
cereb_adata.obs['log1p_total_counts'] = total_counts

    # gene (transposed view: genes are rows)
X = cereb_adata.X.T
total_counts = X.sum(axis=1)
cereb_adata.var['log1p_total_counts'] = total_counts

Initial cells: 9985
Initial genes: 23096
Final cells: 9051
Final genes: 10293


#### Making plots and saving to file

In [7]:
# One colour per region; the list position is the lookup index.
master_palette = [
    '#1A476F',  # blue / gran
    '#55752F',  # green / PB
    '#E69F00',  # orange / oligo
    '#C10534',  # red / MLI
    '#C1C1C1',  # gray
]

In [None]:
# For every (gene, cell type) pair that has at least one row in
# passes_bonferroni, draw a strip/box plot and an ECDF of that gene's
# log1p expression in beads of that cell type, split by region, and save both
# stacked into one HTML file.
gene_names = passes_bonferroni['gene_name'].unique()
cell_types = passes_bonferroni['cell_type'].unique()

# Create the output directory up front: saving into a missing directory would
# otherwise only fail once the first plot has already been built.
out_dir = './PassesBonferroniPlots/'
os.makedirs(out_dir, exist_ok=True)

# The bead mask of a cell type does not depend on the gene, so build each
# mask once instead of once per (gene, cell type) pair.
cell_type_masks = {ct: cereb_adata.obs['cell_type'] == ct for ct in cell_types}

start = time()
print('Counting up to', len(gene_names))

for counter, gene_name in enumerate(gene_names):
    # Progress report every 10 genes.
    if counter % 10 == 0:
        # NOTE(review): betterTime is not defined in any visible cell; it must
        # already exist in the kernel or this line raises NameError.
        print(counter, ':', betterTime(time() - start))

    # Narrow the table to this gene once, then split by cell type.
    gene_rows = passes_bonferroni[passes_bonferroni['gene_name'] == gene_name]

    for cell_type in cell_types:
        rows = gene_rows[gene_rows['cell_type'] == cell_type]
        if rows.empty:
            continue

        # Sorted distinct regions appearing on either side of any row.
        regions = np.sort(pd.concat((rows['region1'], rows['region2'])).unique())
        wanted_regions = set(regions.tolist())  # O(1) membership test per bead
        handle = gene_name+'_'+cell_type

        # making df for plot
        # Beads of this cell type, restricted to the single gene column.
        cell_gene = cereb_adata[cell_type_masks[cell_type]][:, gene_name]
        bead_regions = cell_gene.obs['region']   # region label of each bead
        bead_values = cell_gene.X.flatten()      # expression value of each bead

        # Keep only beads that lie in one of the regions being compared.
        data = [[reg, val] for reg, val in zip(bead_regions, bead_values)
                if reg in wanted_regions]
        newdf = pd.DataFrame(data, columns=['region', 'log1p counts'])

        # Region labels are used directly as indices into master_palette, so
        # they must be integers within range(len(master_palette)).
        palette = [master_palette[i] for i in regions]

        # Arguments common to both plots.
        x_label = gene_name+' counts in '+cell_type+' (log1p)'
        shared_kwargs = dict(
            data=newdf,
            q='log1p counts',
            cats='region',
            order=regions,
            frame_width=500,
            frame_height=300,
            x_axis_label=x_label,
            palette=palette,
        )

        box = iqplot.stripbox(
            spread='jitter',
            y_axis_label='region',
            **shared_kwargs,
        )

        ecdf = iqplot.ecdf(
            y_axis_label='cumulative probability',
            legend_location='bottom_right',
            **shared_kwargs,
        )

        multiplot = bokeh.layouts.gridplot([box, ecdf], ncols=1)

        # File name keeps the original scheme: <gene>_<cell type>_<str(regions)>.html,
        # where str(regions) is NumPy's printed form of the region array.
        fname = out_dir+handle+'_'+str(regions)+'.html'
        bokeh.plotting.output_file(filename=fname)
        bokeh.plotting.save(multiplot)