In [3]:
import numpy as np
import pandas as pd
import os, sys

## Distribution of gene copy number in orthologous groups (2022-03-18) 

In [8]:
# Build {group ID -> ["<tag>|<gene>", ...]} from the tab-separated homologene table.
# Fields used per row: 0 = group ID, 1 = taxon code (presumably an NCBI taxonomy ID), 3 = gene name.
# Only the four taxa listed here are kept; rows for any other taxon are ignored.
taxon_tags = {
    '9606': "Hs",
    '10090': "Mm",
    '7227': "Dm",
    '4932': "Sc",
}

gene_group = {}
with open(r'../../data/Orthology/homologene.data.txt') as handle:
    for line in handle:
        fields = line.strip().split('\t')
        tag = taxon_tags.get(fields[1])
        if tag is None:
            continue
        # Genes are stored with their species tag in front so one list can mix species.
        gene_group.setdefault(fields[0], []).append(tag + "|" + fields[3])

In [37]:
# Spot check: show the tagged genes collected for group '1000' (keys are strings).
gene_group['1000']

['Hs|CLDN4', 'Mm|Cldn4']

In [45]:
# Number of groups that contain at least one gene from the selected species.
len(gene_group)

26477

In [71]:
# Write one line per group: "<group ID>\t<gene>\t<gene>...".
# Group IDs are strings, so they are ordered by their integer value rather than lexically.
groups = sorted(gene_group, key=int)
with open('../../data/Orthology/gene_groups.txt', "w") as out:
    for group_id in groups:
        members = "\t".join(gene_group[group_id])
        out.write(f"{group_id}\t{members}\n")

In [72]:
# Count, for every group in gene_groups.txt, how many genes each species contributes,
# and save the resulting (group x species) table.
#
# Row labels are taken from the first column of the file being parsed rather than from a
# variable created in another cell, so labels and counts cannot get out of step when
# cells are run out of order.
name_list = ["Hs", "Mm", "Dm", "Sc"]
n_species = len(name_list)

group_ids = []
counts = []
with open('../../data/Orthology/gene_groups.txt') as f:
    for row in f:
        data = row.strip().split("\t")
        if not data[0]:
            # Skip blank lines (e.g. a stray trailing newline).
            continue
        group_ids.append(data[0])
        # data[0] is the group ID; only the remaining fields are "<tag>|<gene>" entries.
        members = data[1:]
        counts.append([sum(m.startswith(s + "|") for m in members) for s in name_list])

n_group = len(group_ids)
# Integer matrix from the start: counts are whole numbers, no float round-trip needed.
# reshape keeps the (0, n_species) shape when the file is empty.
matrix = np.array(counts, dtype=int).reshape(n_group, n_species)

df = pd.DataFrame(matrix, index=group_ids, columns=name_list)

# The file name (including its spelling) is kept as-is because a later cell reads this exact path.
df.to_csv(path_or_buf = '../../data/Orthology/gene_gropus_out.txt', sep = '\t')

## One-to-one orthologs (2022-03-25)

In [4]:
# Reload the per-group copy-number table. The first column was written without a header,
# so pandas names it 'Unnamed: 0'; rename it to 'OG'.
gropus_out = (
    pd.read_csv('../../data/Orthology/gene_gropus_out.txt', sep="\t")
    .rename(columns={'Unnamed: 0': 'OG'})
)
gropus_out.head()

Unnamed: 0,OG,Hs,Mm,Dm,Sc
0,3,1,1,1,0
1,5,1,1,1,0
2,6,1,1,1,1
3,7,1,1,1,0
4,9,1,1,0,0


In [5]:
def oneNum(x):
    """Return how many entries of ``x`` are exactly 1."""
    return np.sum(x == 1)


# Select the species columns by name. An open-ended positional slice (iloc[:, 1:]) would
# also pick up 'One_number' / 'Max_number' once they exist, so re-running this cell would
# give different counts.
species_cols = ['Hs', 'Mm', 'Dm', 'Sc']
gropus_out['One_number'] = gropus_out[species_cols].apply(oneNum, axis='columns')

# Keep groups in which every species has exactly one gene.
ortho1to1 = gropus_out[gropus_out.One_number == len(species_cols)]
ortho1to1.head()

Unnamed: 0,OG,Hs,Mm,Dm,Sc,One_number
2,6,1,1,1,1,4
5,12,1,1,1,1,4
8,16,1,1,1,1,4
9,17,1,1,1,1,4
15,32,1,1,1,1,4


In [31]:
# (rows, columns) of the table of groups where every species has exactly one gene.
ortho1to1.shape

(888, 6)

In [69]:
# Export the gene names of every group listed in ortho1to1, one group per line.
OGs = ortho1to1['OG'].tolist()
og_lookup = set(OGs)  # set membership is O(1); the list would be scanned for every row

# Mode 'w' (not 'a'): re-running the cell must replace the file, otherwise the header and
# all rows are appended again on each run.
with open(r'../../data/Orthology/gene_groups.txt') as fr, open('ortho1to1.txt', 'w') as fw:
    # Header order matches the sorted tag order of the entries: Dm < Hs < Mm < Sc.
    fw.write("\t".join(['OG', 'Fruit_fly', 'Human', 'Mouse', 'Yeast']) + "\n")
    for row in fr:
        cols = row.strip().split('\t')
        if int(cols[0]) in og_lookup:
            # Sort on the full "<tag>|<gene>" string, then drop the tag. maxsplit=1 keeps
            # any '|' that is part of the gene name itself.
            genes = [x.split("|", 1)[1] for x in sorted(cols[1:])]
            fw.write(cols[0] + "\t" + "\t".join(genes) + "\n")

### The common one-to-one orthologs with human (2022-04-11)

In [8]:
# Largest per-species gene count in each group, taken over positional columns 1..4
# (the species columns, given the column order OG, Hs, Mm, Dm, Sc).
gropus_out['Max_number'] = gropus_out.iloc[:, 1:5].max(axis='columns')
gropus_out.head()

Unnamed: 0,OG,Hs,Mm,Dm,Sc,One_number,Max_number
0,3,1,1,1,0,3,1
1,5,1,1,1,0,3,1
2,6,1,1,1,1,4,1
3,7,1,1,1,0,3,1
4,9,1,1,0,0,2,1


In [12]:
# Groups with exactly one human gene in which no species has more than one gene.
no_duplicates = gropus_out['Max_number'] == 1
one_human_gene = gropus_out['Hs'] == 1
ortho_hs = gropus_out[no_duplicates & one_human_gene]
ortho_hs.head()

Unnamed: 0,OG,Hs,Mm,Dm,Sc,One_number,Max_number
0,3,1,1,1,0,3,1
1,5,1,1,1,0,3,1
2,6,1,1,1,1,4,1
3,7,1,1,1,0,3,1
4,9,1,1,0,0,2,1


In [25]:
# Reshape to one row per (group, partner species) pair in which the partner has a gene.
ortho_hs2 = (
    # First five columns only (OG plus the species counts, given the column order of gropus_out).
    pd.melt(ortho_hs.iloc[:, :5], id_vars=['OG', 'Hs'])
    # Drop species that have no gene in the group.
    .loc[lambda long: long['value'] > 0]
    # Keep the group ID, the human count column and the melted species label.
    .iloc[:, :3]
    .set_axis(['OG', 'Human', 'Species'], axis='columns')
    # Replace the human count with the constant tag 'Hs' so it can serve as a merge key.
    .assign(Human='Hs')
    .sort_values(by=['OG'])
)
ortho_hs2.head()

Unnamed: 0,OG,Human,Species
0,3,Hs,Mm
18050,3,Hs,Dm
18051,5,Hs,Dm
1,5,Hs,Mm
2,6,Hs,Mm


In [32]:
ortho_hs2.shape

(21347, 3)

In [None]:
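## Prerequisite for the next cell: run the command below in a Linux terminal (it is not executed by this notebook).
## It rewrites gene_groups.txt as one "OG<TAB>Species<TAB>Gene" row per gene and saves the result as gene_groups2.txt.
## NOTE: the paths in the command start with ../data, whereas the notebook cells use ../../data — adjust them to the directory the command is run from.
## cat ../data/Orthology/gene_groups.txt | awk -F '\t' -v OFS='\t' '{for(i = 2;i <= NF; i++) print $1, $i}' | tr '|' '\t' > ../data/Orthology/gene_groups2.txt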

In [29]:
# Load the long-format gene table (one row per gene: group ID, species tag, gene name).
# keep_default_na=False stops pandas from turning gene names that look like missing-value
# markers (e.g. "nan", "NA", "null") into NaN; they must be kept as literal text.
gene_OGs = pd.read_csv(
    '../../data/Orthology/gene_groups2.txt',
    sep="\t",
    header=None,
    keep_default_na=False,
)
gene_OGs.columns = ['OG', 'Species', 'Gene']
gene_OGs.head()

Unnamed: 0,OG,Species,Gene
0,3,Hs,ACADM
1,3,Mm,Acadm
2,3,Dm,CG12262
3,5,Hs,ACADVL
4,5,Mm,Acadvl


In [52]:
# Step 1: attach the human gene of each group (pd.merge performs an inner join by default).
with_human_gene = pd.merge(
    ortho_hs2,
    gene_OGs,
    left_on=['OG', 'Human'],
    right_on=['OG', 'Species'],
)
# Both inputs carry a 'Species' column, so the merge suffixes them: 'Species_x' is the
# partner species from ortho_hs2, 'Species_y' repeats the human tag and is dropped.
with_human_gene = with_human_gene[['OG', 'Human', 'Species_x', 'Gene']]
with_human_gene.columns = ['OG', 'Human', 'Species', 'Human_gene']

# Step 2: attach the partner species' gene for the same group.
ortho_hs3 = pd.merge(with_human_gene, gene_OGs, on=['OG', 'Species'])
ortho_hs3.head()

Unnamed: 0,OG,Human,Species,Human_gene,Gene
0,3,Hs,Mm,ACADM,Acadm
1,3,Hs,Dm,ACADM,CG12262
2,5,Hs,Dm,ACADVL,CG7461
3,5,Hs,Mm,ACADVL,Acadvl
4,6,Hs,Mm,ACAT1,Acat1


In [53]:
# Swap the two-letter species tags for readable names.
partner_names = {'Mm': 'Mouse', 'Dm': 'Fruit fly', 'Sc': 'Yeast'}
ortho_hs3['Human'] = ortho_hs3['Human'].replace({'Hs': 'Human'})
ortho_hs3['Species'] = ortho_hs3['Species'].replace(partner_names)
ortho_hs3.head()

Unnamed: 0,OG,Human,Species,Human_gene,Gene
0,3,Human,Mouse,ACADM,Acadm
1,3,Human,Fruit fly,ACADM,CG12262
2,5,Human,Fruit fly,ACADVL,CG7461
3,5,Human,Mouse,ACADVL,Acadvl
4,6,Human,Mouse,ACAT1,Acat1


In [56]:
# Save the human-anchored ortholog pairs as a tab-separated file without the row index.
output_path = '../../data/Orthology/ortho1to1_with_human.txt'
ortho_hs3.to_csv(output_path, sep='\t', index=False)