# Tissue-specific genes based on MGI Gene Expression Database

In [1]:
import numpy as np
import pandas as pd

Import mean dNdS scores for each mouse protein coding gene, indexed with Ensembl ID. 

In [2]:
info_df = pd.read_csv('../data/info.mouse_protein_coding_genes.tsv',sep='\t',header=0,index_col=0)
info_df.drop_duplicates(subset='Gene name',inplace=True) # Drop the duplicated gene names

In [3]:
stats_df = pd.read_csv('../results/Ensembl98_mouse/mouse.dNdS_stats.all_genes.tsv',sep='\t',index_col=0)

Import MGI IDs for genes specific to each organ.

In [4]:
MGI_df = pd.read_csv('../results/MGI_organs/MGI_ID-tissue_type.tsv',sep='\t',index_col=0,names=['MGI ID','Tissue'])

Import the matching ensembl IDs

In [5]:
Ensembl_df = pd.read_csv('../data/MGI_organs/Ensembl_ID-MGI_ID.tsv',sep='\t',index_col=1)
Ensembl_df = Ensembl_df[~Ensembl_df.index.duplicated(keep='first')] # remove duplicated Ensembl IDs for the same gene

In [6]:
tissue_df = pd.merge(MGI_df,Ensembl_df, left_index=True, right_index=True, how='right')
tissue_df = tissue_df.reset_index().set_index('Gene stable ID')
tissue_df = pd.merge(tissue_df,stats_df,left_index=True,right_index=True,how='left')
tissue_df.dropna(inplace=True)
tissue_df=pd.merge(info_df,tissue_df,left_index=True,right_index=True,how='right')

In [7]:
tissue_df

Unnamed: 0_level_0,Gene name,Gene description,MGI ID,Tissue,count,mean,std,min,25%,50%,75%,max
Gene stable ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ENSMUSG00000000049,Apoh,apolipoprotein H [Source:MGI Symbol;Acc:MGI:88...,MGI:88058,liver,74.0,0.215338,0.047188,0.125469,0.186157,0.204020,0.224057,0.389129
ENSMUSG00000000049,Apoh,apolipoprotein H [Source:MGI Symbol;Acc:MGI:88...,MGI:88058,liver,74.0,0.215338,0.047188,0.125469,0.186157,0.204020,0.224057,0.389129
ENSMUSG00000000605,Clcn4,"chloride channel, voltage-sensitive 4 [Source:...",MGI:104571,heart,77.0,0.016013,0.025864,0.005139,0.008509,0.009622,0.011225,0.189252
ENSMUSG00000000605,Clcn4,"chloride channel, voltage-sensitive 4 [Source:...",MGI:104571,heart,77.0,0.016013,0.025864,0.005139,0.008509,0.009622,0.011225,0.189252
ENSMUSG00000000794,Kcnn3,potassium intermediate/small conductance calci...,MGI:2153183,heart,77.0,0.047890,0.032956,0.012382,0.037018,0.043321,0.050230,0.280899
...,...,...,...,...,...,...,...,...,...,...,...,...
ENSMUSG00000109336,Samd4b,sterile alpha motif domain containing 4B [Sour...,MGI:2448542,brain,79.0,0.049798,0.039833,0.017568,0.031868,0.039254,0.051114,0.272936
ENSMUSG00000110086,Gm45623,predicted gene 45623 [Source:MGI Symbol;Acc:MG...,MGI:5791459,brain,50.0,0.053854,0.025075,0.012989,0.044657,0.050247,0.062072,0.161806
ENSMUSG00000110086,Gm45623,predicted gene 45623 [Source:MGI Symbol;Acc:MG...,MGI:5791459,brain,50.0,0.053854,0.025075,0.012989,0.044657,0.050247,0.062072,0.161806
ENSMUSG00000112129,Pbld1,phenazine biosynthesis-like protein domain con...,MGI:1915621,kidney,9.0,0.204431,0.062927,0.144982,0.168154,0.174836,0.242931,0.314268


In [8]:
brain_df = tissue_df[tissue_df['Tissue']=='brain']
brain_arr = brain_df['mean'].dropna().values
heart_df = tissue_df[tissue_df['Tissue']=='heart']
heart_arr = heart_df['mean'].dropna().values
kidney_df = tissue_df[tissue_df['Tissue']=='kidney']
kidney_arr = kidney_df['mean'].dropna().values
liver_df = tissue_df[tissue_df['Tissue']=='liver']
liver_arr = liver_df['mean'].dropna().values
lung_df = tissue_df[tissue_df['Tissue']=='lung']
lung_arr = lung_df['mean'].dropna().values
pancreas_df = tissue_df[tissue_df['Tissue']=='pancreas']
pancreas_arr = pancreas_df['mean'].dropna().values
skin_df = tissue_df[tissue_df['Tissue']=='skin']
skin_arr = skin_df['mean'].dropna().values

### Plotting

In [9]:
import matplotlib
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import scipy.stats as stats
import matplotlib.ticker as mtick

In [10]:
matplotlib.rcParams['figure.dpi']= 300

In [11]:
# Creating a figure 
fig = plt.figure(figsize=(10,7.5)) # Size of a letter size paper in horizontal
fig.suptitle('Distribution of dN/dS of Mouse Tissue-specific Genes', fontsize=14)

# Setting subplot space
grid = plt.GridSpec(nrows=1,ncols=1)
#grid.update(wspace=0.5, hspace=0.3)

# The subplot for distribution histogram 
distr_plot = fig.add_subplot(grid[:,:])
#distr_hist = dist_GO_df.plot(ax=distr_plot, kind='hist',alpha=0.3,bins=np.logspace(np.log10(0.001),np.log10(10), 100), logx=True, color=['royalblue','orange','lightgreen'])

# Set up the bins for log scale x-axis, and get the centers
bins=np.logspace(np.log10(0.001),np.log10(10), 50)
bins_cntr = (bins[1:] + bins[:-1]) / 2

# brain-specific genes' dN/dS histogram and curve
brain_counts, brain_bin_edges, ignored = distr_plot.hist(brain_arr,bins,histtype='stepfilled',alpha=0.3,weights=np.ones(len(brain_arr)) / len(brain_arr),color='cyan',label='dN/dS of brain-specific genes (med={0:.3f})'.format(np.median(brain_arr)))
try:
    # calculate area of histograms (area under PDF should be 1)
    brain_area_hist = ((brain_bin_edges[1:] - brain_bin_edges[:-1]) * brain_counts).sum()
    brain_shape, brain_loc, brain_scale = stats.lognorm.fit(brain_arr)
    # pdf-values using cdf 
    brain_fit_log_cntr_ = stats.lognorm.cdf(bins, brain_shape, loc=brain_loc, scale=brain_scale)
    brain_fit_log_cntr = np.diff(brain_fit_log_cntr_)
    # plot fitted and scaled PDFs into histogram
    distr_plot.plot(bins_cntr, brain_fit_log_cntr * brain_counts.sum(),'c-', 
                    label='lognormal fit of brain-specific dN/dS distribution', linewidth=2)
except ValueError:
    pass

# heart-specific genes' dN/dS histogram and curve
heart_counts, heart_bin_edges, ignored = distr_plot.hist(heart_arr,bins,histtype='stepfilled',alpha=0.3,weights=np.ones(len(heart_arr)) / len(heart_arr),color='red',label='dN/dS of heart-specific genes (med={0:.3f})'.format(np.median(heart_arr)))
try:
    # calculate area of histograms (area under PDF should be 1)
    heart_area_hist = ((heart_bin_edges[1:] - heart_bin_edges[:-1]) * heart_counts).sum()
    heart_shape, heart_loc, heart_scale = stats.lognorm.fit(heart_arr)
    # pdf-values using cdf 
    heart_fit_log_cntr_ = stats.lognorm.cdf(bins, heart_shape, loc=heart_loc, scale=heart_scale)
    heart_fit_log_cntr = np.diff(heart_fit_log_cntr_)
    # plot fitted and scaled PDFs into histogram
    distr_plot.plot(bins_cntr, heart_fit_log_cntr * heart_counts.sum(),'r-', 
                    label='lognormal fit of heart-specific dN/dS distribution', linewidth=2)
except ValueError:
    pass

# kidney-specific genes' dN/dS histogram and curve
kidney_counts, kidney_bin_edges, ignored = distr_plot.hist(kidney_arr,bins,histtype='stepfilled',alpha=0.3,weights=np.ones(len(kidney_arr)) / len(kidney_arr),color='black',label='dN/dS of kidney-specific genes (med={0:.3f})'.format(np.median(kidney_arr)))
try:
    # calculate area of histograms (area under PDF should be 1)
    kidney_area_hist = ((kidney_bin_edges[1:] - kidney_bin_edges[:-1]) * kidney_counts).sum()
    kidney_shape, kidney_loc, kidney_scale = stats.lognorm.fit(kidney_arr)
    # pdf-values using cdf 
    kidney_fit_log_cntr_ = stats.lognorm.cdf(bins, kidney_shape, loc=kidney_loc, scale=kidney_scale)
    kidney_fit_log_cntr = np.diff(kidney_fit_log_cntr_)
    # plot fitted and scaled PDFs into histogram
    distr_plot.plot(bins_cntr, kidney_fit_log_cntr * kidney_counts.sum(),'k-', 
                    label='lognormal fit of kidney-specific dN/dS distribution', linewidth=2)
except ValueError:
    pass

# liver-specific genes' dN/dS histogram and curve
liver_counts, liver_bin_edges, ignored = distr_plot.hist(liver_arr,bins,histtype='stepfilled',alpha=0.3,weights=np.ones(len(liver_arr)) / len(liver_arr),color='green',label='dN/dS of liver-specific genes (med={0:.3f})'.format(np.median(liver_arr)))
try:
    # calculate area of histograms (area under PDF should be 1)
    liver_area_hist = ((liver_bin_edges[1:] - liver_bin_edges[:-1]) * liver_counts).sum()
    liver_shape, liver_loc, liver_scale = stats.lognorm.fit(liver_arr)
    # pdf-values using cdf 
    liver_fit_log_cntr_ = stats.lognorm.cdf(bins, liver_shape, loc=liver_loc, scale=liver_scale)
    liver_fit_log_cntr = np.diff(liver_fit_log_cntr_)
    # plot fitted and scaled PDFs into histogram
    distr_plot.plot(bins_cntr, liver_fit_log_cntr * liver_counts.sum(),'g-', 
                    label='lognormal fit of liver-specific dN/dS distribution', linewidth=2)
except ValueError:
    pass

# lung-specific genes' dN/dS histogram and curve
lung_counts, lung_bin_edges, ignored = distr_plot.hist(lung_arr,bins,histtype='stepfilled',alpha=0.3,weights=np.ones(len(lung_arr)) / len(lung_arr),color='magenta',label='dN/dS of lung-specific genes (med={0:.3f})'.format(np.median(lung_arr)))
try:
    # calculate area of histograms (area under PDF should be 1)
    lung_area_hist = ((lung_bin_edges[1:] - lung_bin_edges[:-1]) * lung_counts).sum()
    lung_shape, lung_loc, lung_scale = stats.lognorm.fit(lung_arr)
    # pdf-values using cdf 
    lung_fit_log_cntr_ = stats.lognorm.cdf(bins, lung_shape, loc=lung_loc, scale=lung_scale)
    lung_fit_log_cntr = np.diff(lung_fit_log_cntr_)
    # plot fitted and scaled PDFs into histogram
    distr_plot.plot(bins_cntr, lung_fit_log_cntr * lung_counts.sum(),'m-', 
                    label='lognormal fit of lung-specific dN/dS distribution', linewidth=2)
except ValueError:
    pass

# pancreas-specific genes' dN/dS histogram and curve
pancreas_counts, pancreas_bin_edges, ignored = distr_plot.hist(pancreas_arr,bins,histtype='stepfilled',alpha=0.3,weights=np.ones(len(pancreas_arr)) / len(pancreas_arr),color='blue',label='dN/dS of pancreas-specific genes (med={0:.3f})'.format(np.median(pancreas_arr)))
try:
    # calculate area of histograms (area under PDF should be 1)
    pancreas_area_hist = ((pancreas_bin_edges[1:] - pancreas_bin_edges[:-1]) * pancreas_counts).sum()
    pancreas_shape, pancreas_loc, pancreas_scale = stats.lognorm.fit(pancreas_arr)
    # pdf-values using cdf 
    pancreas_fit_log_cntr_ = stats.lognorm.cdf(bins, pancreas_shape, loc=pancreas_loc, scale=pancreas_scale)
    pancreas_fit_log_cntr = np.diff(pancreas_fit_log_cntr_)
    # plot fitted and scaled PDFs into histogram
    distr_plot.plot(bins_cntr, pancreas_fit_log_cntr * pancreas_counts.sum(),'b-', 
                    label='lognormal fit of pancreas-specific dN/dS distribution', linewidth=2)
except ValueError:
    pass

# skin-specific genes' dN/dS histogram and curve
skin_counts, skin_bin_edges, ignored = distr_plot.hist(skin_arr,bins,histtype='stepfilled',alpha=0.3,weights=np.ones(len(skin_arr)) / len(skin_arr),color='yellow',label='dN/dS of skin-specific genes (med={0:.3f})'.format(np.median(skin_arr)))
try:
    # calculate area of histograms (area under PDF should be 1)
    skin_area_hist = ((skin_bin_edges[1:] - skin_bin_edges[:-1]) * skin_counts).sum()
    skin_shape, skin_loc, skin_scale = stats.lognorm.fit(skin_arr)
    # pdf-values using cdf 
    skin_fit_log_cntr_ = stats.lognorm.cdf(bins, skin_shape, loc=skin_loc, scale=skin_scale)
    skin_fit_log_cntr = np.diff(skin_fit_log_cntr_)
    # plot fitted and scaled PDFs into histogram
    distr_plot.plot(bins_cntr, skin_fit_log_cntr * skin_counts.sum(),'y-', 
                    label='lognormal fit of skin-specific dN/dS distribution', linewidth=2)
except ValueError:
    pass

# Axis labels
distr_plot.set_xlabel(xlabel='dN/dS')
distr_plot.set_ylabel(ylabel='percentage of genes')
distr_plot.set_xscale('log')
distr_plot.legend(loc='best')
distr_plot.yaxis.set_major_formatter(mtick.PercentFormatter(1))

fig.savefig('../figures/mouse.organs.pdf')
fig.savefig('../figures/mouse.organs.eps')
fig.savefig('../figures/mouse.organs.png')
plt.close()

The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript back