# Differentially expressed genes as identified in cells with cryptic mitochondrial mutations

We use single-cell RNA-sequencing data from Enge et al. to demonstrate that we can identify DEGs.

In [3]:
import numpy as np
import pandas as pd
import scanpy as sc
import os
import argparse
import pickle as pkl
import matplotlib
import matplotlib.pyplot as plt
import scipy.stats as stats

import seaborn as sns

# Configure seaborn in a single call: `sns.set(...)` resets context and style to
# its own defaults, so calling it after set_context/set_style would discard them.
sns.set(context="paper", style="white", font_scale=1.2)

## 0. Data loading and preparation

In [4]:
# NOTE: pickle executes arbitrary code on load; only use it on trusted, locally generated files.
# Load the dataframe with mutation information (context manager closes the file handle)
with open("./../../data/precomputedData/Enge/engeVariantsSTAR200.pkl", "rb") as variantFile:
    varianceData = pkl.load(variantFile)
# expression matrix
adata = sc.read_h5ad("./../../data/precomputedData/Enge/engeFilteredExpression200.h5ad")
# metadata
with open("./../../data/precomputedData/Enge/enge_metadata.pkl", "rb") as metadataFile:
    metadata = pkl.load(metadataFile)

In [5]:
# add age and cell type as annotations to the expression matrix
# Build the SRR_id -> donor_age lookup once instead of rescanning the metadata for
# every cell; when an SRR_id occurs more than once the first row wins.
ageBySrrId = (
    metadata.drop_duplicates(subset='SRR_id', keep='first')
    .set_index('SRR_id')['donor_age']
    .to_dict()
)
# cells without a metadata entry get age 0
age = [ageBySrrId.get(item, 0) for item in adata.obs_names]
# add it to the expression matrix
adata.obs['donor_age'] = age
# cellType = [ metadata[metadata['SRR_id'] == item]['inferred_cell_type'].iloc[0] if (item in metadata['SRR_id'].values) else 0 for item in adata.obs_names]
# adata.obs['cell_type'] = cellType

In [6]:
# normalisation: scale every cell to a total of one million counts, then apply log1p
# (return values are not used; adata itself is passed to both calls)
countsPerMillion = 1e6
sc.pp.normalize_total(adata, target_sum=countsPerMillion)
sc.pp.log1p(adata)

In [7]:
# define a function that computes mitochondrial load and adds it to the data frame
def computeMitochondrialLoad(expressionMatrix,varianceDataFrame,
                             threshold = 0.05, #threshold: heteroplasmy threshold (only mutations above this threshold are considered)
                             mutantType = 'all', # which type of mutations should be considered, list of strings, e.g., ['Common mutation','Developmental']
                             removeSynonymous = 0, # whether to remove synonymous mutations from the mutations
                             pathologyType = 'all', # which type of pathology we look at
                             subsetCells = 'all', # whether to keep only certain type of cells
                             rescaling= False, # rescaling by considering the number of bases passed (not recommended)
                             muName = 'MU' # name given to the mitochondrial load in the expression matrix Anndata object
                            ):
    """Compute the per-cell mitochondrial load and store it in ``expressionMatrix.obs``.

    The load of a cell is the sum of the heteroplasmies (column ``HF``) of all
    variants of that cell (column ``sample_id``) that survive the filters below.

    Parameters
    ----------
    expressionMatrix : object with a pandas ``obs`` data frame indexed by cell id
        (e.g. an AnnData object). It is modified in place: ``obs[muName]`` is
        written. Cells without any remaining variant get a load of 0.
    varianceDataFrame : pandas.DataFrame
        One row per variant and cell. Columns read: ``HF``, ``sample_id`` and,
        depending on the filters used, ``mutant_type``, ``MutPred_Prediction``
        and ``bases_passed``. The frame itself is not modified.
    threshold : float
        Only variants with ``HF`` strictly greater than this value are kept.
    mutantType : 'all' or list-like of str (a single label is also accepted)
        Values of ``mutant_type`` to keep.
    removeSynonymous : {0, 1/True, -1}
        1/True drops rows whose ``MutPred_Prediction`` is ``'Synonymous'``,
        -1 keeps only those rows, any other value applies no filter.
    pathologyType : 'all' or list-like of str (a single label is also accepted)
        Values of ``MutPred_Prediction`` to keep.
    subsetCells : 'all' or list-like of cell ids (a single id is also accepted)
        Values of ``sample_id`` to keep.
    rescaling : bool
        If true, each heteroplasmy is multiplied by ``16569 / bases_passed``
        before summation.
    muName : str
        Name of the ``obs`` column that receives the load.

    Returns
    -------
    The same ``expressionMatrix`` object that was passed in.
    """
    def _isAll(selection):
        # Only the literal string 'all' disables a filter. Testing the type first
        # keeps array-like selections (numpy arrays, pandas Index) from being
        # compared element-wise, which would make the `if` ambiguous.
        return isinstance(selection, str) and selection == 'all'

    def _asListLike(selection):
        # `Series.isin` rejects a bare string, so wrap a single label in a list.
        return [selection] if isinstance(selection, str) else selection

    # Apply heteroplasmy threshold
    variants = varianceDataFrame[varianceDataFrame['HF'] > threshold]

    # Keep only certain type of mutations
    if not _isAll(mutantType):
        variants = variants[variants['mutant_type'].isin(_asListLike(mutantType))]

    # Keep only synonymous or non-synonymous mutations
    isSynonymous = variants['MutPred_Prediction'] == 'Synonymous' if removeSynonymous in (1, -1) else None
    if removeSynonymous == 1:
        variants = variants[~isSynonymous]
    elif removeSynonymous == -1:
        variants = variants[isSynonymous]

    # Keep only mutations of a certain pathology
    if not _isAll(pathologyType):
        variants = variants[variants['MutPred_Prediction'].isin(_asListLike(pathologyType))]

    # Keep only certain cells
    if not _isAll(subsetCells):
        variants = variants[variants['sample_id'].isin(_asListLike(subsetCells))]

    # optional rescaling (it is easier to do this before the summation and the math works out)
    if rescaling:
        # 16569 is presumably the length of the human mitochondrial genome in bp -- TODO confirm.
        # NOTE(review): a row with bases_passed == 0 yields an infinite scaling factor.
        mitoGenomeLength = 16569
        # `assign` builds a new frame, so neither the caller's data nor a filtered
        # slice is written to (no SettingWithCopyWarning).
        variants = variants.assign(HF=variants['HF'] * (mitoGenomeLength / variants['bases_passed']))

    # mitochondrial load = sum of the heteroplasmies of each cell; only the HF column
    # is aggregated so that non-numeric annotation columns are never summed
    mu = variants.groupby('sample_id')['HF'].sum()

    # add the mitoLoad as observable; the assignment aligns on the cell ids and
    # cells without any variant are set to 0
    expressionMatrix.obs[muName] = mu
    expressionMatrix.obs[muName] = expressionMatrix.obs[muName].fillna(0)

    return expressionMatrix

In [8]:
# DEG calculation 

def findDEGMitoLoad(expressionMatrix,varianceDataFrame,
                    threshold = 0.05, #threshold: heteroplasmy threshold (only mutations above this threshold are considered)
                    mutantType = 'all', # which type of mutations should be considered, list of strings, e.g., ['Common mutation','Developmental']
                    removeSynonymous = 0, # whether to remove synonymous mutations from the mutations
                    pathologyType = 'all', # which type of pathology we look at
                    subsetCells = 'all', # whether to keep only certain type of cells
                    muThreshold = 0.0, # threshold to divide cells into with or without mutation
                    significanceLevel=0.05, # if zero, all genes are returned
                    rescaling= False): # rescaling by considering the number of bases passed (not recommended)):
    """Rank genes by differential expression between cells with and without mitochondrial load.

    The mitochondrial load is computed with ``computeMitochondrialLoad`` (all
    filter arguments are forwarded unchanged), cells are split by
    ``MU > muThreshold`` and the two groups are compared with
    ``sc.tl.rank_genes_groups(..., method='wilcoxon')``.

    Side effects on ``expressionMatrix``: the ``obs`` columns ``'MU'`` and
    ``'mitLoadBinary'`` are written and ``uns['rank_genes_groups']`` is read
    back after the scanpy call.

    Parameters
    ----------
    expressionMatrix, varianceDataFrame, threshold, mutantType, removeSynonymous,
    pathologyType, subsetCells, rescaling
        Passed on to ``computeMitochondrialLoad``.
    muThreshold : float
        Cells whose load is strictly greater than this value form the mutant group.
    significanceLevel : float
        If > 0 only genes with ``pvals_adj`` below this level are returned;
        otherwise all genes are returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``names``, ``pvals_adj``, ``pvals`` and ``logfoldchanges`` for the
        mutant group, sorted by ``pvals_adj``. An empty frame without columns is
        returned when fewer than two cells fall into either group.
    """
    # compute the mitochondrial load (stored in obs['MU'], which is read below)
    expressionMatrixMu = computeMitochondrialLoad(expressionMatrix, varianceDataFrame,
                                                  threshold=threshold,
                                                  mutantType=mutantType,
                                                  removeSynonymous=removeSynonymous,
                                                  pathologyType=pathologyType,
                                                  subsetCells=subsetCells,
                                                  rescaling=rescaling,
                                                  muName='MU')

    # we need at least two cells with a mutation and two cells without
    hasMutation = (expressionMatrixMu.obs['MU'] > muThreshold).to_numpy()
    nCellsWithMutation = int(hasMutation.sum())
    nCellsWithoutMutation = len(expressionMatrixMu.obs_names) - nCellsWithMutation
    if nCellsWithMutation < 2 or nCellsWithoutMutation < 2:
        # empty dataframe if not enough mitoload to call
        return pd.DataFrame()

    # Binary grouping derived from the same muThreshold comparison used for the size
    # check above. Float labels (0.0 / 1.0) are kept so that the mutant group is
    # still addressed as '1.0' in the scanpy result.
    expressionMatrixMu.obs['mitLoadBinary'] = pd.Categorical(hasMutation.astype(float))
    mutantGroup = '1.0'

    # do the DEG with a wilcoxon, ranking all genes
    sc.tl.rank_genes_groups(expressionMatrixMu, 'mitLoadBinary', method='wilcoxon',
                            n_genes=expressionMatrixMu.n_vars)
    DEGresult = expressionMatrixMu.uns['rank_genes_groups']

    # same set of columns whether or not a significance filter is applied
    resultColumns = ['names', 'pvals_adj', 'pvals', 'logfoldchanges']
    listSignificantGenes = pd.DataFrame(
        {column: DEGresult[column][mutantGroup] for column in resultColumns}
    )

    if significanceLevel > 0:
        isSignificant = listSignificantGenes['pvals_adj'] < significanceLevel
        # renumber the kept rows from 0 before sorting
        listSignificantGenes = listSignificantGenes[isSignificant].reset_index(drop=True)

    # sort by adjusted p-value
    return listSignificantGenes.sort_values(by='pvals_adj')


# def DEG_threshold(expressionMatrix,varianceDataFrame,
#                   thresholdVec = np.arange(0,1,0.05),
#                   mutantTypeVec = ['all'], # which type of mutations should be considered, list of strings, e.g., ['Common mutation','Developmental']
#                   removeSynonymousVec = [0], # whether to remove synonymous mutations from the mutations
#                   pathologyTypeVec = ['all'], # which type of pathology we look at
#                   subsetCells = 'all', # whether to keep only certain type of cells
#                   rescalingVec= [False]): # rescaling by considering the number of bases passed (not recommended)):

#     outputDf = pd.DataFrame()
#     colNames = ['nSig','nSig (upregulated)','nSig (downregulated)','pVal combined','threshold','mutant type','removed synonomous','pathology','rescaling']
    
#     # go over the heteroplasmy thresholds
#     print('thresholds:')
#     for t in thresholdVec:
#         print(t)
#         for m in mutantTypeVec:
#             for rSyn in removeSynonymousVec:
#                 for pat in pathologyTypeVec:
#                     for rescale in rescalingVec:

#                         listSignificantGenes = findDEGMitoLoad(expressionMatrix,varianceDataFrame,
#                                     threshold = t, #threshold: heteroplasmy threshold (only mutations above this threshold are considered)
#                                     mutantType = m, # which type of mutations should be considered, list of strings, e.g., ['Common mutation','Developmental']
#                                     removeSynonymous = rSyn, # whether to remove synonymous mutations from the mutations
#                                     pathologyType = pat, # which type of pathology we look at
#                                     rescaling= rescale)
#                         # compute the number of significante genes
#                         nSig = listSignificantGenes.shape[0]
#                         if nSig>0:
#                             nSigUp = np.sum(listSignificantGenes['logfoldchanges']>0) # number of upregulated genes
#                             nSigDown = np.sum(listSignificantGenes['logfoldchanges']<0) # number of downregulated genes
#                             pValCombined = stats.combine_pvalues(listSignificantGenes['pvals_adj'])[1] # Fisher's method
                            
#                             df = pd.DataFrame([[nSig,nSigUp,nSigDown,pValCombined,t,m[0],rSyn,pat[0],rescale]], columns=colNames)
#                             outputDf = outputDf.append(df)
#                         else:
#                             # add that no genes are significant
#                             df = pd.DataFrame([[0,0,0,1,t,m[0],rSyn,pat[0],rescale]], columns=colNames)
#                             outputDf = outputDf.append(df)
                        
#     return(outputDf)

In [9]:
# We only look at mutations whose 'cells_possible' value is strictly greater than ten
hasEnoughCells = varianceData['cells_possible'] > 10
varianceSubset = varianceData[hasEnoughCells]

## 2. Compute DEGs at different thresholds

In [10]:
# run analysis: identical settings for every run, only the heteroplasmy threshold differs
crypticSettings = dict(mutantType=['Cryptic'], rescaling=True, removeSynonymous=1, significanceLevel=0.05)
DEGs_Cryptic_10 = findDEGMitoLoad(adata, varianceSubset, threshold=0.10, **crypticSettings)
DEGs_Cryptic_30 = findDEGMitoLoad(adata, varianceSubset, threshold=0.3, **crypticSettings)
DEGs_Cryptic_40 = findDEGMitoLoad(adata, varianceSubset, threshold=0.40, **crypticSettings)
DEGs_Cryptic_95 = findDEGMitoLoad(adata, varianceSubset, threshold=0.95, **crypticSettings)

In [16]:
# save (create the output directory first; to_csv does not create missing folders)
os.makedirs('./DEG_out', exist_ok=True)
DEGs_Cryptic_10.to_csv('./DEG_out/deg_Enge_t10.csv')
DEGs_Cryptic_30.to_csv('./DEG_out/deg_Enge_t30.csv')
DEGs_Cryptic_40.to_csv('./DEG_out/deg_Enge_t40.csv')
DEGs_Cryptic_95.to_csv('./DEG_out/deg_Enge_t95.csv')
