In [1]:
# Which genes experienced putative gene family collapse?

In [1]:
import pandas as pd
from tools.sqlInterface import *
from collections import *

In [2]:
# Filter-evaluation table read from the Bonobo database of the bonobo-only-v2.1 run.
bonobo_db_path = '/public/groups/cgl/cat/primates_evan/v2/bonobo-only-v2.1/databases/Bonobo.db'
df = load_filter_evaluation(bonobo_db_path)

In [3]:
# Annotation table read from the Human database of the same run; it is merged
# with the filter evaluation on GeneId/TranscriptId in the next cell.
human_db_path = '/public/groups/cgl/cat/primates_evan/v2/bonobo-only-v2.1/databases/Human.db'
ref_df = load_annotation(human_db_path)

In [4]:
# Inner join (pandas default) of the filter evaluation with the annotation,
# keyed on the gene/transcript identifier pair.
join_keys = ['GeneId', 'TranscriptId']
merged = pd.merge(df, ref_df, on=join_keys)

In [5]:
# Keep only the rows that carry a CollapsedGeneIds value.
has_collapsed_ids = merged['CollapsedGeneIds'].notnull()
merged_collapsed = merged.loc[has_collapsed_ids]

In [6]:
# Build one summary row per gene. The collapse information was calculated
# per gene in the original analysis, so the first transcript row of each gene
# is taken as representative.
collapsed_columns = ['GeneId', 'GeneName', 'GeneBiotype', 'NumberOfCollapsedCopies', 'CollapsedGeneIds', 'CollapsedGeneNames']
r = []
for group_key, gene_rows in merged_collapsed.groupby(['GeneBiotype', 'GeneId', 'GeneName']):
    gene_biotype, gene_id, gene_name = group_key
    first_row = gene_rows.iloc[0]
    collapsed_ids = first_row['CollapsedGeneIds']
    # The IDs are comma-separated, so the number of fields is the copy count.
    n_copies = len(collapsed_ids.split(','))
    r.append([gene_id, gene_name, gene_biotype, n_copies, collapsed_ids, first_row['CollapsedGeneNames']])
collapsed_df = pd.DataFrame(r, columns=collapsed_columns)

In [7]:
# Distribution of collapsed-copy counts among protein-coding genes, together
# with the number of such genes.
is_protein_coding = collapsed_df['GeneBiotype'] == 'protein_coding'
c = collapsed_df[is_protein_coding]
Counter(c['NumberOfCollapsedCopies']), len(c)

(Counter({1: 174, 2: 19, 3: 7, 7: 1, 4: 2, 5: 3}), 206)

In [8]:
# Split gene analysis.
# Split locations were evaluated per transcript, so the reported intervals can
# differ between transcripts of the same gene; every transcript row is kept.
has_split_locations = merged['PossibleSplitGeneLocations'].notnull()
split_source_columns = ['GeneId', 'GeneName', 'TranscriptId', 'PossibleSplitGeneLocations']
split = merged.loc[has_split_locations, split_source_columns].sort_values(['GeneId', 'GeneName'])
split.columns = ['Gene ID', 'Gene Name', 'Transcript ID', 'Locations of split mappings']
# Number of distinct genes with at least one possibly split transcript.
print(len(set(split['Gene ID'])))

128


In [9]:
# Paralogy analysis. Alternate loci were computed per gene, so the table is
# reduced to one row per GeneId.
loci_column = 'Loci of possible paralogous mappings'
count_column = 'Number of possible paralogous mappings'
has_alternate_loci = merged['GeneAlternateLoci'].notnull()
paralogy = (
    merged.loc[has_alternate_loci, ['GeneId', 'GeneName', 'GeneBiotype', 'GeneAlternateLoci']]
    .groupby('GeneId')
    .first()
    .reset_index()
)
paralogy.columns = ['Gene ID', 'Gene Name', 'Gene Biotype', loci_column]
# Loci are comma-separated; the number of fields is the number of mappings.
paralogy[count_column] = [len(loci.split(',')) for loci in paralogy[loci_column]]
paralogy = paralogy.sort_values(['Gene Biotype', 'Gene ID', 'Gene Name'])
Counter(paralogy[count_column]), len(paralogy)

(Counter({2: 621,
          1: 21780,
          3: 200,
          5: 68,
          4: 106,
          7: 38,
          8: 38,
          6: 55,
          9: 10,
          10: 7,
          18: 2,
          16: 2,
          23: 2,
          11: 16,
          13: 5,
          12: 4,
          27: 1,
          14: 5,
          15: 1,
          17: 2,
          19: 1,
          24: 1,
          26: 1}),
 22966)

In [10]:
# Same distribution as above, restricted to protein-coding genes.
coding_mask = paralogy['Gene Biotype'] == 'protein_coding'
paralogy_coding = paralogy.loc[coding_mask]
Counter(paralogy_coding['Number of possible paralogous mappings']), len(paralogy_coding)

(Counter({1: 14298,
          7: 9,
          2: 261,
          4: 35,
          5: 19,
          3: 65,
          6: 23,
          12: 2,
          11: 6,
          8: 18,
          9: 4,
          14: 3,
          19: 1,
          13: 2,
          10: 2,
          16: 1,
          24: 1}),
 14750)

In [11]:
# Write the three result tables to a single workbook, one sheet per table.
# The path is relative, so the file lands in the kernel's working directory.
# NOTE(review): the frames' row indices are written as the first column of each
# sheet (pandas default) — confirm that is wanted in the final spreadsheet.
sheets = [
    ('Gene Family Collapse', collapsed_df),
    ('Split Gene Analysis', split),
    ('Putative Paralogous Loci', paralogy),
]
with pd.ExcelWriter('bonobo_transMap_metrics.xlsx') as fh:
    for sheet_name, table in sheets:
        table.to_excel(fh, sheet_name=sheet_name)

In [96]:
# Show the kernel's working directory; relative paths such as the Excel file
# written by this notebook resolve against it.
# `os` is imported here explicitly because no cell in this notebook imports it
# by name, so the call would otherwise depend on the name leaking in from a
# wildcard import or from earlier kernel state.
import os

os.getcwd()

'/public/groups/cgl/cat/primates_evan/primates-2020/notebooks'