Adapted from human.all_genes.ipynb

In [1]:
import numpy as np
import pandas as pd
import glob
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_rows', 10)

In [20]:
from functools import reduce

# Average dN/dS

In [7]:
# Define a function for calculating dN/dS when both dN and dS is zero. 
def weird_division(df):
    if df[5]==0 and df[6]==0:
        return 0
    elif df[5]==0:
        return 0
    elif df[6]==0:
        return np.NaN
    return df[5] / df[6]

Import ortholog information, including dN and dS, which was downloaded from Ensembl 98. 

In [16]:
df_list = []
for file in sorted(glob.glob('../results/Ensembl98_chicken/chicken_protein_coding_genes.*.txt')):
    species_code_name = (file[58:-4])
#     print(species_code_name)
    df = pd.read_csv(file, sep='\t', header=None, na_values=('ortholog_one2many', 'ortholog_many2many') ,index_col=[0,1])
    df = pd.DataFrame(df.dropna().drop_duplicates().apply(weird_division, axis=1),columns=[species_code_name+'_dNdS'])
    df.index.set_names(['Gene stable ID','Gene name'],inplace=True)
    df_list.append(df.dropna().drop_duplicates())

In [28]:
len(df_list)

30

In [19]:
df_list[0]

Unnamed: 0_level_0,Unnamed: 1_level_0,abrachyrhynchus_dNdS
Gene stable ID,Gene name,Unnamed: 2_level_1
ENSGALG00000009692,SH3RF1,0.120964
ENSGALG00000004571,PPP1CC,0.053981
ENSGALG00000009966,FRS2,0.041991
ENSGALG00000034421,SENP1,0.204625
ENSGALG00000036738,IGSF3,0.043960
...,...,...
ENSGALG00000033656,DQX1,0.053220
ENSGALG00000006134,HOGA1,0.091229
ENSGALG00000041309,ALKBH3,0.223664
ENSGALG00000045317,NHLRC4,0.154679


In [21]:
integrate_df = reduce(lambda x, y: pd.merge(x, y, left_index=True, right_index=True, how='outer'),df_list)

In [29]:
integrate_df.columns

Index(['abrachyrhynchus_dNdS', 'acarolinensis_dNdS', 'ahaastii_dNdS',
       'aowenii_dNdS', 'applatyrhynchos_dNdS', 'arowi_dNdS',
       'cabingdonii_dNdS', 'ccaeruleus_dNdS', 'cjaponica_dNdS',
       'cpbellii_dNdS', 'cporosus_dNdS', 'cpugnax_dNdS', 'cpygmaea_dNdS',
       'dnovaehollandiae_dNdS', 'falbicollis_dNdS', 'gagassizii_dNdS',
       'jhyemalis_dNdS', 'lcoronata_dNdS', 'lsdomestica_dNdS',
       'mgallopavo_dNdS', 'mundulatus_dNdS', 'mvitellinus_dNdS',
       'nmeleagris_dNdS', 'nperdicaria_dNdS', 'pmajor_dNdS', 'psinensis_dNdS',
       'scanaria_dNdS', 'smerianae_dNdS', 'spunctatus_dNdS',
       'zalbicollis_dNdS'],
      dtype='object')

Calculate the statistics of each human protein-coding gene.

In [24]:
stats_df = integrate_df.apply(pd.DataFrame.describe, axis=1)

In [25]:
stats_df

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
Gene stable ID,Gene name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ENSGALG00000000003,PANX2,29.0,0.069405,0.024910,0.032274,0.055618,0.063331,0.082282,0.131676
ENSGALG00000000011,C10orf88,19.0,0.396707,0.108293,0.133712,0.320246,0.409735,0.470563,0.586503
ENSGALG00000000044,WFIKKN1,26.0,0.030648,0.031813,0.000999,0.001050,0.034224,0.043835,0.123722
ENSGALG00000000048,,19.0,0.048765,0.043083,0.001242,0.031045,0.035015,0.056529,0.189826
ENSGALG00000000055,LAMTOR3,16.0,0.062023,0.054064,0.002640,0.036443,0.055103,0.064609,0.215841
...,...,...,...,...,...,...,...,...,...
ENSGALG00000055075,FMNL3,18.0,0.028027,0.017700,0.001033,0.016585,0.028823,0.042328,0.050868
ENSGALG00000055081,SEMA4C,16.0,0.046023,0.027878,0.001538,0.044365,0.048761,0.055266,0.118243
ENSGALG00000055094,CD63,8.0,0.042108,0.077337,0.002879,0.005171,0.008934,0.035832,0.229987
ENSGALG00000055101,TASOR2,18.0,0.353952,0.152249,0.072121,0.284065,0.342623,0.441898,0.624340


Save the tables. 

In [30]:
integrate_df.to_csv('../results/Ensembl98_chicken/chicken.30_species_dNdS.all_genes.tsv',sep='\t')

In [31]:
stats_df.to_csv('../results/Ensembl98_chicken/chicken.dNdS_stats.all_genes.tsv',sep='\t')