In [1]:
import numpy as np
import pandas as pd
import glob
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Truncate displayed DataFrames that have more than 10 rows.
pd.set_option('display.max_rows', 10)

In [2]:
# Load the tab-separated dN/dS table; the file's second column becomes the row index.
dnds_df = pd.read_csv(
    '../results/Ensembl98_human/human.92_species_dNdS.all_genes.tsv',
    sep='\t',
    index_col=1,
)

In [3]:
# Remove the two annotation columns so that only the dN/dS value columns remain
# (the species-name cell that follows slices a "_dNdS" suffix off every remaining column).
dnds_df = dnds_df.drop(columns=['Gene stable ID', 'Gene description'])

In [4]:
# Species identifiers (plus "average" for the across-species mean column),
# obtained by cutting the trailing "_dNdS" suffix off every column name.
species_list = [column[:-len('_dNdS')] for column in dnds_df.columns]

In [9]:
# Human orthologs of mouse genes, indexed by the mouse 'Gene name' column.
# Rows whose homology type is one2many / many2many are blanked to NaN and then
# removed by dropna(), which also removes rows with any other missing field.
human_ortho_df = (
    pd.read_csv(
        '../data/human.orthologs_of_mouse_genes.tsv',
        sep='\t',
        index_col=['Gene name'],
    )
    .replace(['ortholog_one2many', 'ortholog_many2many'], np.nan)
    .dropna()
)
# Gene names that occur more than once are ambiguous: discard every copy.
is_repeated_name = human_ortho_df.index.duplicated(keep=False)
human_ortho_df = human_ortho_df.loc[~is_repeated_name]

In [10]:
# Cell types analysed; each one has a matching '<celltype>.list.txt' gene list on disk.
celltype_list = [
    'neuron',
    'endothelia',
    'glia',
    'astrocyte',
    'microglia',
    'oligodendrocyte',
]

In [11]:
# For every cell type: read its mouse gene list, map the genes to their human
# orthologs, attach the dN/dS values, and keep three views of the result:
#   df_dict[celltype]  - the per-gene table (without the 'Cell Type' label)
#   arr_dict[celltype] - all non-missing values from the species columns, flattened
#   concat_df          - every cell type stacked, labelled by 'Cell Type'
df_list = []
df_dict = {}
arr_dict = {}
for celltype in celltype_list:
    # Header-less file; its first column becomes the index (gene names).
    df = pd.read_csv(
        '../results/mouse.celltype-specific_genes/protein-coding_w_dNdS/' + celltype + '.list.txt',
        sep='\t',
        index_col=0,
        header=None,
    )
    df.index.name = 'Gene name'
    df = (
        df.merge(human_ortho_df, how='inner', left_index=True, right_index=True)
        .set_index('Human gene name')
        .merge(dnds_df, how='inner', left_index=True, right_index=True)
    )
    df.index.name = 'Human gene name'
    df = df.drop(columns=['Human homology type', 'Human orthology confidence [0 low, 1 high]'])

    # Snapshot before the label column is added, so df_dict stays purely numeric.
    df_dict[celltype] = df.copy(deep=True)
    df['Cell Type'] = celltype
    df_list.append(df)

    # Label-based slice over the species columns; 'average_dNdS' and 'Cell Type'
    # come after 'vvulpes_dNdS' and are therefore left out.
    array = df.loc[:, 'amelanoleuca_dNdS':'vvulpes_dNdS'].to_numpy()
    array = array[~np.isnan(array)]
    arr_dict[celltype] = array

concat_df = pd.concat(df_list)

In [15]:
# Preview the stacked per-gene table for all cell types (display is row-truncated).
concat_df

Unnamed: 0_level_0,amelanoleuca_dNdS,anancymaae_dNdS,bbbison_dNdS,bmutus_dNdS,btaurus_dNdS,capalliatus_dNdS,caperea_dNdS,catys_dNdS,ccanadensis_dNdS,ccapucinus_dNdS,...,tgelada_dNdS,ttruncatus_dNdS,uamericanus_dNdS,umaritimus_dNdS,uparryii_dNdS,vpacos_dNdS,vursinus_dNdS,vvulpes_dNdS,average_dNdS,Cell Type
Human gene name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C12orf75,,0.695652,,,,,,0.263052,0.080182,0.534923,...,0.367989,,,,0.200955,,0.116169,0.197981,0.258465,neuron
C5orf49,0.257951,0.378788,0.141478,,0.145773,0.194595,0.111052,0.329767,,0.260894,...,0.307235,,0.265584,0.264392,0.162530,,0.144164,0.184505,0.238237,neuron
C8orf89,,1.000886,0.547280,0.525451,0.549308,2.626506,,1.065375,0.438141,0.923077,...,0.973684,,0.591868,0.531669,0.567142,,,,0.740657,neuron
C14orf39,0.410969,0.366640,0.385385,0.373490,,0.423208,,0.667488,0.319782,0.445763,...,0.645833,0.343657,,,0.283533,,0.434653,0.350623,0.396867,neuron
C3orf67,0.254692,0.329989,,,0.295218,0.429739,0.292560,0.397203,0.240876,0.301969,...,0.376936,,,0.248072,0.215691,,0.190933,0.231298,0.297918,neuron
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZDHHC2,0.113739,0.443910,0.131903,,0.109912,0.328125,0.187064,0.283439,0.141338,0.122257,...,0.310559,,0.086110,0.059859,0.089984,,0.066772,0.137826,0.153084,oligodendrocyte
ZDHHC9,0.043700,0.064250,0.183082,0.068680,0.064942,0.026585,,0.025948,0.041534,,...,,,0.043820,0.051253,0.030824,0.058460,0.056510,0.041509,0.060469,oligodendrocyte
ZNF239,,,0.110661,0.275347,0.272305,,,,0.319017,,...,0.547965,,0.076876,,0.275821,,,0.357183,0.347662,oligodendrocyte
ZNF365,0.210198,0.517122,0.229111,0.227774,0.228626,0.489510,0.302326,0.468032,0.364662,0.520833,...,0.521570,0.342025,0.180468,0.175917,0.297585,0.297746,0.149125,0.161802,0.314504,oligodendrocyte


# Statistics

In [16]:
from scipy import stats
import heapq

### All data points (no averaging across 92 species)

In [17]:
# NEG three cell types analysis (Neuron, Endothelia, Glia) on the pooled values:
# three pairwise two-sided Mann-Whitney U tests followed by one Kruskal-Wallis test.
# Each call is a top-level expression, so its result is echoed below the cell.
# NOTE(review): arr_dict pools every gene x species value, so one gene contributes
# many observations; the tests treat them as independent - confirm this is intended
# when interpreting the very small p-values.
stats.mannwhitneyu(arr_dict['neuron'],arr_dict['endothelia'],alternative="two-sided")
stats.mannwhitneyu(arr_dict['neuron'],arr_dict['glia'],alternative="two-sided")
stats.mannwhitneyu(arr_dict['glia'],arr_dict['endothelia'],alternative="two-sided")
stats.kruskal(arr_dict['neuron'],arr_dict['glia'],arr_dict['endothelia'])

MannwhitneyuResult(statistic=1679565513.5, pvalue=0.0)

MannwhitneyuResult(statistic=2276857802.0, pvalue=0.0)

MannwhitneyuResult(statistic=2132131296.5, pvalue=1.057431357883925e-18)

KruskalResult(statistic=11159.814305246628, pvalue=0.0)

In [18]:
# NEAMO five cell types analysis (Neuron, Endothelia, Astrocyte, Microglia,
# Oligodendrocyte) on the pooled values: neuron is compared with each glial
# subtype by a two-sided Mann-Whitney U test, then all five groups are compared
# with one Kruskal-Wallis test. Each result is echoed below the cell.
stats.mannwhitneyu(arr_dict['neuron'],arr_dict['astrocyte'],alternative="two-sided")
stats.mannwhitneyu(arr_dict['neuron'],arr_dict['microglia'],alternative="two-sided")
stats.mannwhitneyu(arr_dict['neuron'],arr_dict['oligodendrocyte'],alternative="two-sided")
stats.kruskal(arr_dict['neuron'],arr_dict['endothelia'],arr_dict['astrocyte'],arr_dict['microglia'],arr_dict['oligodendrocyte'])

MannwhitneyuResult(statistic=1750530154.0, pvalue=0.0)

MannwhitneyuResult(statistic=1832554383.5, pvalue=0.0)

MannwhitneyuResult(statistic=1216167086.5, pvalue=1.738289166335243e-171)

KruskalResult(statistic=13704.319236071444, pvalue=0.0)

In [19]:
# Median dN/dS of the pooled values of each cell type, with a distribution-free
# 95% confidence interval of the median: the interval bounds are order statistics
# whose ranks are the 2.5% / 97.5% quantiles of Binomial(n, 0.5).
#
# Rows are collected in a list and turned into a DataFrame once: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
negamo_records = []
for celltype in ['neuron','endothelia','glia','astrocyte','microglia','oligodendrocyte']:
    arr = arr_dict[celltype]
    n_obs = arr.shape[0]
    # NaN is reported when a value cannot be computed (empty sample, or a sample
    # so small that the binomial rank is 0 and no order statistic exists).
    med = CI_low = CI_high = np.nan
    if n_obs:
        med = np.median(arr)
        # The confidence level is passed positionally because its keyword name
        # differs between SciPy releases (`alpha` in older ones, `confidence` in newer).
        low, high = stats.binom.interval(0.95, n=n_obs, p=.5)
        # Sort once; the k-th smallest value is ordered[k - 1].
        ordered = np.sort(arr)
        if low >= 1:
            CI_low = ordered[int(low) - 1]
        if high >= 1:
            CI_high = ordered[int(high) - 1]
    negamo_records.append(
        {'Celltype':celltype,'Median_dNdS':med,
         'Confidence_Interval_Low':CI_low,'Confidence_Interval_High':CI_high})

negamo_stats_df = pd.DataFrame(
    negamo_records,
    columns=['Celltype','Median_dNdS',
             'Confidence_Interval_Low','Confidence_Interval_High'])

In [20]:
# Show the pooled median and confidence interval for each cell type.
negamo_stats_df

Unnamed: 0,Celltype,Median_dNdS,Confidence_Interval_Low,Confidence_Interval_High
0,neuron,0.085396,0.084668,0.08613
1,endothelia,0.145031,0.143702,0.146384
2,glia,0.136756,0.135678,0.137773
3,astrocyte,0.124624,0.12332,0.125948
4,microglia,0.148231,0.146874,0.149367
5,oligodendrocyte,0.103832,0.102714,0.105196


### Individual Species (including each gene's dN/dS ratio averaged across 92 species) 

In [23]:
# Species lookup table, indexed by the first column of the spreadsheet.
common_name_df = pd.read_excel('../data/species.xlsx', index_col=0)

# Add a pseudo-species row labelled 'average' (every field set to 'average') so
# the across-species average column can be joined like a real species.
# Built with pd.concat because DataFrame.append was removed in pandas 2.0; the
# row is sized from the table's columns instead of assuming exactly three.
average_row = pd.DataFrame(
    [['average'] * common_name_df.shape[1]],
    index=pd.Index(['average'], name=common_name_df.index.name),
    columns=common_name_df.columns,
)
common_name_df = pd.concat([common_name_df, average_row])

In [22]:
# List the species in which the median dN/dS of neuron-specific genes exceeds
# that of oligodendrocyte-specific genes (only this one pair is compared).
for species in species_list:
    column = species + '_dNdS'
    neuron_median = np.median(df_dict['neuron'][column].dropna().values)
    oligodendrocyte_median = np.median(df_dict['oligodendrocyte'][column].dropna().values)
    if neuron_median > oligodendrocyte_median:
        print(species) # turn out to be closest relatives to human

ppaniscus
ptroglodytes


##### Descriptive statistics: median and 95% confidence interval of the median

In [30]:
# Median dN/dS and a distribution-free 95% confidence interval of the median for
# every species x cell type. The interval bounds are order statistics whose ranks
# are the 2.5% / 97.5% quantiles of Binomial(n, 0.5).

# species -> cell type -> (n, 1) NumPy array of the non-missing dN/dS values;
# kept for the cells that follow.
full_dict = {}

# Rows are collected in a list and turned into a DataFrame once: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
descriptive_records = []
for species in species_list:
    full_dict[species] = {}
    for celltype in ['neuron','endothelia','glia','astrocyte','microglia','oligodendrocyte']:
        df = df_dict[celltype][[species+'_dNdS']]
        full_dict[species][celltype] = df.dropna().values
        arr = df.iloc[:,0].dropna().values
        n_obs = arr.shape[0]
        # NaN is reported when a value cannot be computed (no data for this
        # species, or so few genes that the binomial rank is 0).
        med = CI_low = CI_high = np.nan
        if n_obs:
            med = np.median(arr)
            # The confidence level is passed positionally because its keyword name
            # differs between SciPy releases (`alpha` in older ones, `confidence` in newer).
            low, high = stats.binom.interval(0.95, n=n_obs, p=.5)
            # Sort once; the k-th smallest value is ordered[k - 1].
            ordered = np.sort(arr)
            if low >= 1:
                CI_low = ordered[int(low) - 1]
            if high >= 1:
                CI_high = ordered[int(high) - 1]
        descriptive_records.append(
            {'Species':species,'Celltype':celltype,'Median_dNdS':med,
             'Confidence_Interval_Low':CI_low,'Confidence_Interval_High':CI_high})

descriptive_stats_df = pd.DataFrame(
    descriptive_records,
    columns=['Species','Celltype','Median_dNdS',
             'Confidence_Interval_Low','Confidence_Interval_High'])

# Replace the species identifier by its common name and drop the helper columns.
descriptive_stats_df = (
    pd.merge(left=common_name_df, right=descriptive_stats_df,
             left_index=True, right_on='Species', how='right')
    .set_index('SpeciesCommonName')
    .drop(columns=['SpeciesLatinName','Order','Species'])
)

In [25]:
# Show the per-species, per-cell-type medians and confidence intervals (row-truncated).
descriptive_stats_df

Unnamed: 0_level_0,Celltype,Median_dNdS,Confidence_Interval_Low,Confidence_Interval_High
SpeciesCommonName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Panda,neuron,0.077993,0.072263,0.084587
Panda,endothelia,0.137035,0.128403,0.145969
Panda,glia,0.131816,0.124442,0.140913
Panda,astrocyte,0.113861,0.104617,0.122374
Panda,microglia,0.140856,0.131009,0.150822
...,...,...,...,...
average,endothelia,0.153537,0.139793,0.164814
average,glia,0.143529,0.135893,0.154208
average,astrocyte,0.130252,0.121575,0.138020
average,microglia,0.155177,0.143620,0.164311


In [26]:
# add a tab to existing descriptive statistics excel file:

from openpyxl import load_workbook

path = '../results/celltype-specific_descriptive_stats.xlsx'

# Open the existing workbook in append mode and write the table to its own sheet.
# This replaces the `writer.book = book` / `writer.save()` pattern, which no longer
# works in pandas 2.0. The context manager saves and closes the file even if
# to_excel raises. `if_sheet_exists='replace'` (pandas >= 1.3) makes the cell
# safe to re-run: an existing sheet of the same name is overwritten.
# The workbook must already exist; to create a new file use mode='w' instead.
with pd.ExcelWriter(path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
    descriptive_stats_df.to_excel(writer, sheet_name = 'human reference genome',index=True)

### Inferential statistics: Kruskal–Wallis H and Mann–Whitney U tests between cell types

In [36]:
# Per-species inferential statistics on the dN/dS samples stored in `full_dict`:
# Kruskal-Wallis H tests across groups of cell types and two-sided Mann-Whitney U
# tests between pairs of cell types.

# Column-label suffix -> cell types compared by one Kruskal-Wallis test.
kruskal_groups = {
    'NEG': ['neuron', 'endothelia', 'glia'],
    'NEAMO': ['neuron', 'endothelia', 'astrocyte', 'microglia', 'oligodendrocyte'],
}
# Column-label suffix -> (first sample, second sample) of one Mann-Whitney U test.
# The U statistic depends on argument order, so 'EG' keeps glia first.
mannwhitney_pairs = {
    'NG': ('neuron', 'glia'),
    'NE': ('neuron', 'endothelia'),
    'EG': ('glia', 'endothelia'),
    'NA': ('neuron', 'astrocyte'),
    'NM': ('neuron', 'microglia'),
    'NO': ('neuron', 'oligodendrocyte'),
}

inferential_columns = (
    ['Species']
    + [f'Kruskal {stat} {label}' for label in kruskal_groups for stat in ('H', 'p')]
    + [f'Mann {stat} {label}' for label in mannwhitney_pairs for stat in ('U', 'p')]
)

# Rows are collected in a list and turned into a DataFrame once: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
inferential_records = []
for species in species_list:
    # The stored samples are single-column 2-D arrays. Flatten them so the tests
    # always receive 1-D samples; 2-D input is not treated the same way by every
    # SciPy release (it can give per-column array results or be rejected).
    samples = {celltype: np.ravel(values)
               for celltype, values in full_dict[species].items()}

    record = {'Species': species}

    for label, group in kruskal_groups.items():
        h_stat, h_p = stats.kruskal(*[samples[celltype] for celltype in group])
        record[f'Kruskal H {label}'] = h_stat
        record[f'Kruskal p {label}'] = h_p

    # alternative="two-sided" is passed explicitly: the old default (None) is
    # deprecated and produced a smaller p value. Missing values were already
    # removed when the samples were stored, so NaN cannot influence the tests.
    for label, (first, second) in mannwhitney_pairs.items():
        u_stat, u_p = stats.mannwhitneyu(
            samples[first], samples[second], alternative="two-sided")
        record[f'Mann U {label}'] = u_stat
        record[f'Mann p {label}'] = u_p

    inferential_records.append(record)

inferential_stats_df = pd.DataFrame(inferential_records, columns=inferential_columns)

# Replace the species identifier by its common name and drop the helper columns.
inferential_stats_df = (
    pd.merge(left=common_name_df, right=inferential_stats_df,
             left_index=True, right_on='Species', how='right')
    .set_index('SpeciesCommonName')
    .drop(columns=['SpeciesLatinName','Order','Species'])
)

In [37]:
# Show the per-species test statistics and p-values (row-truncated).
inferential_stats_df

Unnamed: 0_level_0,Kruskal H NEG,Kruskal p NEG,Kruskal H NEAMO,Kruskal p NEAMO,Mann U NG,Mann p NG,Mann U NE,Mann p NE,Mann U EG,Mann p EG,Mann U NA,Mann p NA,Mann U NM,Mann p NM,Mann U NO,Mann p NO
SpeciesCommonName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Panda,169.438178,1.610524e-37,201.105014,2.174161e-42,354519.0,1.392111e-29,265944.0,2.371244e-27,349596.0,0.409178,274950.0,1.730097e-12,285679.0,2.161542e-34,193295.0,0.000286
Ma's night monkey,153.708697,4.193537e-34,191.272961,2.823224e-40,362141.0,6.256331e-21,244616.0,3.013816e-30,310864.0,0.005674,270677.0,9.549699e-12,287641.0,3.388091e-27,195322.5,0.014637
American bison,154.280262,3.151133e-34,185.912506,4.004405e-39,225572.0,2.153954e-27,172761.0,4.735802e-25,239838.0,0.541960,179929.0,4.357303e-13,186782.0,7.296842e-33,124728.0,0.000013
Wild yak,154.533603,2.776223e-34,185.942005,3.946394e-39,250236.0,5.596982e-28,197399.0,1.007964e-24,274833.5,0.700864,199339.0,2.395754e-14,207459.0,3.542879e-32,137631.0,0.000130
Cow,174.493229,1.286106e-38,219.216257,2.764277e-46,357878.0,7.318747e-30,266602.0,2.123932e-28,341443.0,0.434413,282772.0,2.013142e-13,283212.0,6.898188e-38,199713.0,0.000162
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Arctic ground squirrel,198.549677,7.682197e-44,256.197274,3.008780e-54,353331.0,8.820689e-32,255604.0,4.068322e-34,340973.0,0.090929,280444.0,9.485720e-17,267107.0,8.670766e-42,201831.0,0.000677
Alpaca,34.168475,3.805484e-08,40.155435,4.019590e-08,7526.0,3.334764e-07,7673.0,6.176964e-07,11411.0,0.752722,9520.0,8.038924e-03,5262.0,4.956850e-08,3323.0,0.013120
Common wombat,100.223879,1.724492e-22,111.762508,3.062308e-23,286891.0,1.582962e-15,208537.0,3.748905e-19,245321.0,0.114805,221431.0,1.285878e-10,236630.0,6.466586e-17,145569.0,0.000180
Red fox,112.052744,4.656460e-25,153.642552,3.373201e-32,340480.0,9.202963e-19,260052.0,6.488902e-20,311713.0,0.257204,270399.0,2.842958e-07,277653.0,4.385953e-26,188781.5,0.055176


In [38]:
# Write to existing excel file for inferential stats
path = '../results/celltype-specific_inferential_stats.xlsx'

# Open the existing workbook in append mode and write the table to its own sheet.
# This replaces the `writer.book = book` / `writer.save()` pattern, which no longer
# works in pandas 2.0. The context manager saves and closes the file even if
# to_excel raises. `if_sheet_exists='replace'` (pandas >= 1.3) makes the cell
# safe to re-run: an existing sheet of the same name is overwritten.
# The workbook must already exist; to create a new file use mode='w' instead.
with pd.ExcelWriter(path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
    inferential_stats_df.to_excel(writer, sheet_name = 'human reference genome', index=True)

# Visualization

In [79]:
import matplotlib
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import matplotlib.ticker as mtick
from statannot import add_stat_annotation
# Render every figure at 300 dpi.
matplotlib.rcParams['figure.dpi']= 300

In [80]:
# Use seaborn's 'ticks' style for the plots that follow.
sns.set_style('ticks')
# Colour assigned to each cell type (matplotlib colour names).
palette = dict(
    endothelia='tab:blue',
    glia='tab:green',
    neuron='tab:red',
    oligodendrocyte='lightblue',
    astrocyte='lightgreen',
    microglia='tab:orange',
)