In [20]:
# Import packages
import functools
import time

import anndata as ad
import pandas as pd
import scanpy as sc

Function definitions

Time wrapper

In [21]:
# Time wrapper
def time_it(func):
    """Decorator that prints how long each call to ``func`` took.

    The returned wrapper forwards all positional and keyword arguments to
    ``func`` and returns its result unchanged. After a successful call it
    prints ``"<name> executed in <seconds> seconds"``. If ``func`` raises,
    the exception propagates and nothing is printed.
    """
    @functools.wraps(func)  # keep func's __name__ and docstring on the wrapper
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the interval is not skewed by system clock adjustments
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start_time
        print(f"{func.__name__} executed in {elapsed} seconds")
        return result
    return wrapper

Filtering cells by QC metrics for UMAP

In [22]:
# DEFAULT QC VALUES. Calibrated to Sarah Teichmann's paper "Cells of the human intestinal tract mapped across space and time." These QC values will apply by default for this entire script.
def filter_cells_for_UMAP(data, min_ct = 2000, min_gen = 500, min_cell = 3, mt_pct = 60, max_genes = 0, normed = 0, d_score = 0.24):
    """Apply the QC filters to a copy of ``data`` and return the surviving cells.

    Parameters
    ----------
    data : AnnData
        Input object. It is copied first, so the caller's object is left untouched.
        ``data.obs`` must contain 'doublet_scores', plus 'n_genes_by_counts' when
        ``max_genes > 0`` and 'pct_counts_mt' when ``normed`` is falsy.
    min_ct : minimum total counts per cell (``sc.pp.filter_cells(min_counts=...)``).
    min_gen : minimum number of detected genes per cell (``sc.pp.filter_cells(min_genes=...)``).
    min_cell : minimum number of cells a gene must be detected in (``sc.pp.filter_genes``).
    mt_pct : cells with ``pct_counts_mt`` at or above this value are dropped (only when ``normed`` is falsy).
    max_genes : when > 0, cells with ``n_genes_by_counts`` at or above this value are dropped.
    normed : when truthy, the mitochondrial-percentage filter is skipped.
    d_score : cells with ``doublet_scores`` at or above this value are dropped.

    Returns
    -------
    The subset of the filtered copy selected by the boolean masks above.
    """
    # The scanpy filters below modify their argument in place, so work on a real copy
    # rather than an alias of the caller's object.
    adata = data.copy()
    sc.pp.filter_cells(adata, min_counts = min_ct) # Filter cells based on number of RNA reads
    sc.pp.filter_cells(adata, min_genes = min_gen) # Filter cells based on the number of recognized genes
    sc.pp.filter_genes(adata, min_cells = min_cell) # Filter genes based on the minimum number of cells expressing it
    # Use the d_score argument (previously a hard-coded 0.24 ignored the parameter)
    adata_prefilt = adata[adata.obs['doublet_scores'] < d_score]
    if max_genes > 0:
        adata_prefilt = adata_prefilt[adata_prefilt.obs['n_genes_by_counts'] < max_genes]

    if not normed:
        adata_filt = adata_prefilt[adata_prefilt.obs['pct_counts_mt'] < mt_pct] # Filtering based on percentage of mitochondrial genes
    else:
        adata_filt = adata_prefilt
    return adata_filt

UMAP processing

In [23]:
@time_it
def process_for_UMAP(data, leiden_res = 0.8, filtering = 1, min_ct = 2000, min_gen = 500, min_cell = 3, mt_pct = 60, max_genes = 0, normed = 0, d_score = 0.24): # DEFAULT QC VALUES
    """Run the full preprocessing, clustering and embedding pipeline on a copy of ``data``.

    Steps: optional QC filtering (``filter_cells_for_UMAP``), total-count
    normalization, log1p, highly-variable-gene annotation, regression of
    'total_counts' and 'pct_counts_mt', scaling, PCA, t-SNE, neighbors,
    Leiden clustering, PAGA (computed and plotted) and UMAP (computed and plotted).

    Parameters
    ----------
    data : AnnData
        Input object; it is copied, so the caller's object is not modified.
        ``data.obs`` must contain 'total_counts' and 'pct_counts_mt' for the regression step.
    leiden_res : resolution passed to ``sc.tl.leiden``.
    filtering : when truthy, apply ``filter_cells_for_UMAP`` with the QC arguments below.
    min_ct, min_gen, min_cell, mt_pct, max_genes, normed, d_score :
        forwarded unchanged to ``filter_cells_for_UMAP``.

    Returns
    -------
    The processed AnnData object.
    """
    adata = data.copy() # Work on a copy so the in-place steps below never write into the caller's object
    if filtering:
        # normed is now forwarded as well; it used to be accepted here but silently dropped
        adata_filt = filter_cells_for_UMAP(data = adata, min_ct = min_ct, min_gen = min_gen, min_cell = min_cell, max_genes = max_genes, mt_pct = mt_pct, normed = normed, d_score = d_score)
    else:
        adata_filt = adata
    sc.pp.normalize_total(adata_filt, target_sum=1e4) # Normalize every cell to 10,000 total counts
    sc.pp.log1p(adata_filt) # Log scaling
    sc.pp.highly_variable_genes(adata_filt, min_mean = 0.0125, max_mean = 3, min_disp = 0.5) # Annotate highly variable genes
    adata_filt.raw = adata_filt # Keep the normalized, log-transformed values in .raw before regression and scaling
    # Subsetting to the highly variable genes is currently disabled, so all genes are kept:
    #adata_filt = adata_filt[:, adata_filt.var.highly_variable] # Filter on genes that are highly variable
    sc.pp.regress_out(adata_filt, ['total_counts', 'pct_counts_mt']) # Regress out the effect of sequencing depth and mitochondrial fraction
    sc.pp.scale(adata_filt, max_value = 10) # Scale the data, clipping values above 10
    sc.tl.pca(adata_filt, svd_solver='arpack') # Compute PCA
    sc.tl.tsne(adata_filt) # Calculate tsne
    sc.pp.neighbors(adata_filt) # Calculate neighbors
    sc.tl.leiden(adata_filt, resolution = leiden_res) # Calculate Leiden clusters
    sc.tl.paga(adata_filt) # Calculate PAGA
    sc.pl.paga(adata_filt, plot = True) # Plot the coarse-grained PAGA graph
    # A PAGA-initialised sc.tl.umap(init_pos='paga') call used to run here, but its result was
    # immediately overwritten by the default-initialised call below. Only the call that
    # determines the final embedding is kept; pass init_pos='paga' to it if that layout is wanted.
    sc.tl.umap(adata_filt) # Calculate UMAP
    sc.pl.umap(adata_filt, color = ['leiden']) # Plot UMAP and show Leiden clusters
    return adata_filt

Function for recalculating the UMAP

In [24]:
@time_it
def recalc_UMAP(data_filt, leiden_res = 0.8):
    """Recompute PCA, t-SNE, neighbors, Leiden clusters, PAGA and UMAP on ``data_filt``.

    The scanpy calls write their results into ``data_filt`` itself; the same
    object is returned for convenience. The PAGA graph and the UMAP colored by
    Leiden cluster are plotted as a side effect.

    Parameters
    ----------
    data_filt : AnnData
        Already filtered and normalized object to re-embed (modified in place).
    leiden_res : resolution passed to ``sc.tl.leiden``.
    """
    adata_filt = data_filt
    sc.tl.pca(adata_filt, svd_solver='arpack') # Compute PCA
    sc.tl.tsne(adata_filt) # Calculate tsne
    sc.pp.neighbors(adata_filt) # Calculate neighbors
    sc.tl.leiden(adata_filt, resolution = leiden_res) # Calculate Leiden clusters
    sc.tl.paga(adata_filt) # Calculate PAGA
    sc.pl.paga(adata_filt, plot = True) # Plot the coarse-grained PAGA graph
    # A PAGA-initialised sc.tl.umap(init_pos='paga') call used to run here, but its result was
    # immediately overwritten by the default-initialised call below. Only the call that
    # determines the final embedding is kept; pass init_pos='paga' to it if that layout is wanted.
    sc.tl.umap(adata_filt) # Calculate UMAP
    sc.pl.umap(adata_filt, color = ['leiden']) # Plot UMAP and show Leiden clusters
    return adata_filt

Isolate cells by gene expression

In [25]:
def isolate_cells_by_gene(data, gene, threshold):
    """Return the cells of ``data`` whose value for ``gene`` is strictly above ``threshold``.

    Parameters
    ----------
    data : object indexable as ``data[:, gene]`` (exposing ``.X``) and by a boolean mask.
    gene : variable name used to select the single-gene slice.
    threshold : cells are kept only where the gene's value in ``.X`` is greater than this.
    """
    # Values of the requested gene for every cell
    gene_values = data[:, gene].X
    # Boolean mask of the cells that exceed the threshold
    above_threshold = gene_values > threshold
    return data[above_threshold]

Filter clusters by differential gene expression

In [26]:
def filter_clusters_by_gene(data, gene, threshold = 0.5):
    """Return the cells belonging to Leiden clusters in which ``gene`` is enriched.

    Differential expression is computed here with ``sc.tl.rank_genes_groups``
    (grouped by ``obs['leiden']``); the results are written into ``data.uns``.
    A cluster is kept when ``gene`` appears in its ranking with a log fold
    change of at least ``threshold``.

    Parameters
    ----------
    data : AnnData
        Object with a 'leiden' column in ``.obs``.
    gene : str
        Gene name, matched exactly against the ranked gene names.
    threshold : float
        Minimum log fold change for a cluster to count as positive.

    Returns
    -------
    A copy of ``data`` restricted to the cells of the positive clusters
    (empty when no cluster qualifies).
    """
    adata = data
    sc.tl.rank_genes_groups(adata, groupby='leiden')
    ranked = adata.uns['rank_genes_groups']
    # One column of ranked gene names per cluster
    de_results = pd.DataFrame(ranked['names'])
    significant_clusters = []
    for cluster in de_results.columns:
        # Exact match: substring/regex matching would also pick up genes whose
        # names merely contain `gene` (e.g. "AGR2" inside "AGR2P1").
        is_gene = (de_results[cluster] == gene).to_numpy()
        gene_lfc = ranked['logfoldchanges'][cluster][is_gene]
        if (gene_lfc >= threshold).any():
            significant_clusters.append(cluster)
    # Subset the data to include only cells from the significant clusters
    adata_subset = adata[adata.obs['leiden'].isin(significant_clusters)].copy()
    return adata_subset

Setting environment and other variables

In [27]:
#%% Environment settings and misc variables
sc.settings.verbosity = 3
sc.set_figure_params(dpi = 600)
# MKI67 = Ki67, TNFRSF19 = TROY
# Keys to inspect for stem-cell markers; 'leiden' and 'Localization' are not gene symbols
# (presumably .obs columns — confirm against the data).
inspect_stem = ['LGR5', 'MKI67', 'TNFRSF19', 'BMI1', 'LRIG1', 'leiden', 'Localization']
# Shared clustering resolution; not used by any cell visible in this part of the notebook.
global_res = 0.5
# Wall-clock timestamp taken when this cell runs.
start_time = time.time()

Reading files

In [28]:
#%% Read the files
# NOTE(review): absolute, machine-specific paths; consider a configurable data directory.
path = 'S:/data cache/code_in_out/agr2/agr2_init_files/Aline/raw_data/'
col_org_unfilt = sc.read_h5ad(path + 'agr2colon_organoids_unfilt.h5ad')
ant_unfilt = sc.read_h5ad(path + 'agr2_unfilt_antrum.h5ad')
# Independent copy of the antrum data, used for the manual step-by-step checks in the following cells.
ant_unfilt_2 = ant_unfilt.copy()
all_org_harmony = sc.read_h5ad('S:/data cache/code_in_out/agr2/agr2_init_files/agr2_nneigh30_pcs75_dir/agr2_bcharmony_meteuclidean_neighbors.h5ad')

# Full pipeline on the antrum data with the default QC values.
ant_file = process_for_UMAP(ant_unfilt)

# Manual replay on the copy: normalize every cell to 10,000 total counts.
sc.pp.normalize_total(ant_unfilt_2, target_sum=1e4)

In [29]:
# Drop genes detected in fewer than 3 cells; the result is not assigned, ant_unfilt_2 itself is updated.
sc.pp.filter_genes(ant_unfilt_2, min_cells = 3)

filtered out 14288 genes that are detected in less than 3 cells


In [30]:
# Show the AnnData summary (dimensions and annotation keys).
# NOTE(review): as the last expression of a cell, str(...) is displayed as an escaped
# string literal; print(...) would render the line breaks.
str(ant_unfilt_2)

"AnnData object with n_obs × n_vars = 9980 × 22313\n    obs: 'sample_id', 'SampleID', 'Patient', 'Biopsies', 'Site', 'Inflammation', 'Age', 'Sex', 'doublet_scores', 'predicted_doublets', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_rp', 'pct_counts_rp', 'total_counts_hb', 'pct_counts_hb', 'total_counts_ig', 'pct_counts_ig', 'S_score', 'G2M_score', 'phase', 'cellbarcode'\n    var: 'gene_ids', 'feature_types', 'hb', 'ig', 'rp', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells'\n    uns: 'sample_id_colors'"

# Drop cells with fewer than 2000 total counts.
# NOTE(review): ant_unfilt_2 was already normalized to target_sum=1e4 in an earlier cell, so this
# threshold is presumably no longer applied to raw read depth — confirm the intended order.
sc.pp.filter_cells(ant_unfilt_2, min_counts = 2000)

In [31]:
# Drop cells with fewer than 500 detected genes.
sc.pp.filter_cells(ant_unfilt_2, min_genes = 500)

filtered out 2331 cells that have less than 500 genes expressed


In [32]:
# Load the barcodes to look up; the outputs further down show a single 'barcodes' column with 79 rows.
# NOTE(review): absolute, machine-specific path.
miss = pd.read_csv('S:/missing_barcodes.csv')

In [33]:
# Check if barcodes are in adata.obs_names
# `miss` is a DataFrame, so the result is a boolean DataFrame of the same shape (one 'barcodes' column).
barcode_in_obs = miss.isin(ant_unfilt_2.obs_names)

In [34]:
# Show the membership table as a string.
# NOTE(review): displayed as an escaped string literal; the print(...) in the next cell shows the same content readably.
str(barcode_in_obs)

'    barcodes\n0       True\n1       True\n2       True\n3       True\n4       True\n..       ...\n74      True\n75      True\n76      True\n77      True\n78      True\n\n[79 rows x 1 columns]'

In [35]:
# Print the boolean membership table (True where the barcode is in ant_unfilt_2.obs_names).
print(barcode_in_obs)

    barcodes
0       True
1       True
2       True
3       True
4       True
..       ...
74      True
75      True
76      True
77      True
78      True

[79 rows x 1 columns]


In [36]:
# Print or store the result
# NOTE(review): barcode_in_obs is a DataFrame here, so .sum() is a per-column Series and the
# second message spans several lines (see the saved output); summing only the 'barcodes'
# column would print a plain count.
print(f"Barcodes in CSV file: {len(miss)}")
print(f"Barcodes in .obs_names: {barcode_in_obs.sum()} out of {len(miss)}")

Barcodes in CSV file: 79
Barcodes in .obs_names: barcodes    79
dtype: int64 out of 79


In [None]:
# Normalize every cell of all_org_harmony to 10,000 total counts.
# NOTE(review): whether all_org_harmony.X still holds raw counts at this point is not visible here — confirm.
sc.pp.normalize_total(all_org_harmony, target_sum=1e4)

In [None]:
# Drop cells with fewer than 200 detected genes.
# NOTE(review): the antrum replay and the function defaults use min_genes = 500 — confirm that 200 is intended here.
sc.pp.filter_cells(all_org_harmony, min_genes = 200)

In [None]:
# Drop cells with fewer than 2000 total counts.
# NOTE(review): this runs after normalize_total(target_sum=1e4) on the same object, so the
# threshold is presumably no longer applied to raw read depth — confirm the intended order.
sc.pp.filter_cells(all_org_harmony, min_counts = 2000)

In [None]:
# Drop genes detected in fewer than 3 cells.
sc.pp.filter_genes(all_org_harmony, min_cells = 3)

In [None]:
# Check if barcodes are in adata.obs_names
# Select the 'barcodes' column so the result is a boolean Series: .sum() on it is then a
# single count instead of a per-column Series.
barcode_in_obs = miss['barcodes'].isin(all_org_harmony.obs_names)

In [None]:
# Show the membership result as a string.
# NOTE(review): displayed as an escaped string literal; the print(...) in the next cell is more readable.
str(barcode_in_obs)

In [None]:
# Print the boolean membership result for all_org_harmony.
print(barcode_in_obs)

In [None]:
# Print or store the result
# Reports how many of the CSV barcodes were found in all_org_harmony.obs_names.
print(f"Barcodes in CSV file: {len(miss)}")
print(f"Barcodes in .obs_names: {barcode_in_obs.sum()} out of {len(miss)}")

In [None]:
# Summary of the object returned by process_for_UMAP for the antrum data.
str(ant_file)

In [None]:
# Summary of the object that was passed to process_for_UMAP, for comparison with ant_file.
str(ant_unfilt)

In [None]:
# Summary of the subset of all_org_harmony whose obs['Site'] equals 'Antrum'.
str(all_org_harmony[all_org_harmony.obs['Site'] == 'Antrum'])