In [1]:
import collections
import pandas as pd

import tools.intervals
import tools.misc
import tools.mathOps
import tools.fileOps
import tools.sqlInterface
import tools.transcripts
import tools.nameConversions
import tools.procOps
from cat.consensus import *
from argparse import ArgumentParser

In [2]:
# Target-genome (Bonobo) database; read by the load_* helpers in the next cell.
db_path = "/public/groups/cgl/cat/primates_evan/v2/bonobo-only-v2.1/databases/Bonobo.db"
# Reference (Human) database; its annotation table is loaded into ref_df and supplies
# the GeneId / TranscriptId / TranscriptBiotype columns used for the denominators.
ref_db_path = "/public/groups/cgl/cat/primates_evan/v2/bonobo-only-v2.1/databases/Human.db"

In [3]:


# Transcript modes to summarise; only transMap is used in this notebook.
tx_modes = ['transMap']

# Tables that do not depend on the transcript mode.
ref_df = tools.sqlInterface.load_annotation(ref_db_path)
tm_eval_df = load_transmap_evals(db_path)

# Per-mode tables: gather one frame per mode, then stack them.
mrna_frames, cds_frames, eval_frames, hgm_frames = [], [], [], []
for mode in tx_modes:
    mrna_frames.append(load_metrics_from_db(db_path, mode, 'mRNA'))
    cds_frames.append(load_metrics_from_db(db_path, mode, 'CDS'))
    eval_frames.append(load_evaluations_from_db(db_path, mode))
    hgm_frames.append(load_hgm_vectors(db_path, mode))

mrna_metrics_df = pd.concat(mrna_frames)
cds_metrics_df = pd.concat(cds_frames)
# the evaluation table is the only one whose index is turned back into columns
eval_df = pd.concat(eval_frames).reset_index()
hgm_df = pd.concat(hgm_frames)


# if chrXOnly:
#     cmd = [['grep', 'chrX', 'gencode.v30.annotation.gp'], ['cut', '-f', '1']]
#     chrx_txs = set(tools.procOps.call_proc_lines(cmd))
#     ref_df = ref_df[ref_df.TranscriptId.isin(chrx_txs)]
# else:
#     # remove chrY
#     cmd = [['grep', 'chrY', 'gencode.v30.annotation.gp'], ['cut', '-f', '1']]
#     chry_txs = set(tools.procOps.call_proc_lines(cmd))
#     ref_df = ref_df[~ref_df.TranscriptId.isin(chry_txs)]


# Denominators for every percentage below: distinct protein-coding transcripts
# and genes in the reference annotation.
ref_coding_df = ref_df[ref_df.TranscriptBiotype == 'protein_coding']
num_txs = len(set(ref_coding_df.TranscriptId))
num_genes = len(set(ref_coding_df.GeneId))


## The code below is adapted from the combine_and_filter_dfs method of the consensus module.
## It is inlined here because that method needs the genePred, but the same information is
## also present in the tables loaded above.

# add the reference information to the homGeneMapping vectors to gain biotype information
hgm_ref_df = pd.merge(hgm_df, ref_df, on=['GeneId', 'TranscriptId'])
# combine in the transMap evaluation results
hgm_ref_tm_df = pd.merge(hgm_ref_df, tm_eval_df, on=['GeneId', 'TranscriptId'])
# split merged_df into coding and noncoding
coding_df = hgm_ref_tm_df[hgm_ref_tm_df.TranscriptBiotype == 'protein_coding']
# explicit copy: a column of this subset is assigned below, and assigning into a
# boolean-mask slice of hgm_ref_tm_df triggers pandas' SettingWithCopyWarning
non_coding_df = hgm_ref_tm_df[hgm_ref_tm_df.TranscriptBiotype != 'protein_coding'].copy()
# add metrics information to coding df
metrics_df = pd.merge(mrna_metrics_df, cds_metrics_df, on='AlignmentId', suffixes=['_mRNA', '_CDS'])
# coding_df becomes a fresh frame here (merge result), so the assignments below are safe
coding_df = pd.merge(coding_df, metrics_df, on='AlignmentId')
# add evaluation information to coding df, where possible. This adds information on frame shifts.
# how='left' keeps alignments without an evaluation row; their Frameshift value is then missing.
coding_df = pd.merge(coding_df, eval_df, on='AlignmentId', how='left')
# fill the original intron values to 100 so we don't filter them out -- means a no-intron gene
coding_df['OriginalIntronsPercent_mRNA'] = coding_df.OriginalIntronsPercent_mRNA.fillna(100)
coding_df['OriginalIntronsPercent_CDS'] = coding_df.OriginalIntronsPercent_CDS.fillna(100)
non_coding_df['TransMapOriginalIntronsPercent'] = non_coding_df.TransMapOriginalIntronsPercent.fillna(100)


# rawest counts. homGeneMapping was run on the unfiltered dataset, so use that
# do this only on coding transcripts for now
tmp = hgm_ref_df[hgm_ref_df.TranscriptBiotype == 'protein_coding']
num_coding_genes = len(set(tmp.GeneId))
num_coding_txs = len(set(tmp.TranscriptId))

unique_genes = 0
unique_txs = 0
for gene_id, gene_rows in tmp.groupby('GeneId'):
    # AlignmentIds have the form '<TranscriptId>-<n>'; the prefix before the first '-'
    # identifies the source transcript, so a count above 1 means it aligned more than once
    alignments_per_source = collections.Counter(x.split('-')[0] for x in gene_rows.AlignmentId)
    # a gene maps uniquely only when every one of its source transcripts has one alignment
    if all(n == 1 for n in alignments_per_source.values()):
        unique_genes += 1
    # transcripts of this gene that have exactly one row
    rows_per_tx = gene_rows.groupby('TranscriptId').size()
    unique_txs += int((rows_per_tx == 1).sum())

# Multiply-mapping counts are taken relative to what was actually FOUND. Subtracting
# from the reference totals (num_genes / num_txs) would also count every gene or
# transcript that is absent from the target, and those are reported separately as missing.
multi_genes = num_coding_genes - unique_genes
multi_txs = num_coding_txs - unique_txs

# All percentages use the reference protein-coding totals as the denominator.
data = {}
data['GenesFound'] = num_coding_genes
data['GenesFoundPercent'] = 100.0 * num_coding_genes / num_genes
data['GenesMultiplyMapping'] = multi_genes
data['GenesMultiplyMappingPercent'] = 100.0 * multi_genes / num_genes
data['TranscriptsFound'] = num_coding_txs
data['TranscriptsFoundPercent'] = 100.0 * num_coding_txs / num_txs
data['TranscriptsMultiplyMapping'] = multi_txs
data['TranscriptsMultiplyMappingPercent'] = 100.0 * multi_txs / num_txs

# full coverage: rows of coding_df whose mRNA / CDS alignment coverage is exactly 100
mrna_fully_covered = coding_df.AlnCoverage_mRNA == 100
cds_fully_covered = coding_df.AlnCoverage_CDS == 100
full_cov_mrna = int(mrna_fully_covered.sum())
full_cov_cds = int(cds_fully_covered.sum())
data.update({
    'FullmRNACoverage': full_cov_mrna,
    'FullmRNACoveragePercent': 100.0 * full_cov_mrna / num_txs,
    'FullCDSCoverage': full_cov_cds,
    'FullCDSCoveragePercent': 100.0 * full_cov_cds / num_txs,
})

# Stringent transcript-level filter. A row passes when it
#   1) has all original introns,
#   2) has full CDS coverage, and
#   3) has no frameshift.
# Each criterion is a boolean mask over coding_df; counts are the number of True rows.
# The explicit comparisons against True are deliberate: Frameshift can be missing after
# the left merge with the evaluation table, and `!= True` counts those rows as
# "no frameshift".
has_frameshift = coding_df.Frameshift == True
no_frameshift = coding_df.Frameshift != True
keeps_all_introns = coding_df.OriginalIntronsPercent_mRNA == 100
full_cds = coding_df.AlnCoverage_CDS == 100

frameshift = int(has_frameshift.sum())
original_introns = int(keeps_all_introns.sum())
cov = int(full_cds.sum())
cov_frameshift = int((full_cds & no_frameshift).sum())
cov_frameshift_original_introns = int((full_cds & no_frameshift & keeps_all_introns).sum())

# percentages are relative to the number of reference protein-coding transcripts
data.update({
    'TranscriptsWithFrameshift': frameshift,
    'TranscriptsWithFrameshiftPercent': 100.0 * frameshift / num_txs,
    'TranscriptsWithOriginalIntrons': original_introns,
    'TranscriptsWithOriginalIntronsPercent': 100.0 * original_introns / num_txs,
    'TranscriptsWithFullCDSCoverage': cov,
    'TranscriptsWithFullCDSCoveragePercent': 100.0 * cov / num_txs,
    'TranscriptsWithFullCDSCoverageAndNoFrameshifts': cov_frameshift,
    'TranscriptsWithFullCDSCoverageAndNoFrameshiftsPercent': 100.0 * cov_frameshift / num_txs,
    'TranscriptsWithFullCDSCoverageAndNoFrameshiftsAndOriginalIntrons': cov_frameshift_original_introns,
    'TranscriptsWithFullCDSCoverageAndNoFrameshiftsAndOriginalIntronsPercent': 100.0 * cov_frameshift_original_introns / num_txs,
})

# naive gene level: a gene is counted when at least one of its rows in coding_df passes
# the criterion. The criteria are the three parts of the stringent filter (all original
# introns, full CDS coverage, no frameshift).
# The explicit comparisons against True are deliberate: `!= True` also counts rows whose
# Frameshift value is missing as "no frameshift".
has_frameshift = coding_df.Frameshift == True
no_frameshift = coding_df.Frameshift != True
keeps_all_introns = coding_df.OriginalIntronsPercent_mRNA == 100
full_cds = coding_df.AlnCoverage_CDS == 100

frameshift = len(set(coding_df[has_frameshift].GeneId))
original_introns = len(set(coding_df[keeps_all_introns].GeneId))
# Full CDS coverage only. An extra `ProperOrf == True` condition used to be applied here
# and nowhere else, which made this count inconsistent with the combined counts below
# (a gene could have full coverage and no frameshift yet not count as fully covered).
cov = len(set(coding_df[full_cds].GeneId))
cov_frameshift = len(set(coding_df[full_cds & no_frameshift].GeneId))
cov_frameshift_original_introns = len(set(coding_df[full_cds & no_frameshift & keeps_all_introns].GeneId))

# genes for which every row in coding_df carries a frameshift
num_genes_all_shifted = int(has_frameshift.groupby(coding_df.GeneId).all().sum())

# percentages are relative to the number of reference protein-coding genes
data['GenesWithFrameshift'] = frameshift
data['GenesWithFrameshiftPercent'] = 100.0 * frameshift / num_genes
data['GenesWithFrameshiftAllIsoforms'] = num_genes_all_shifted
data['GenesWithFrameshiftAllIsoformsPercent'] = 100.0 * num_genes_all_shifted / num_genes
data['GenesWithOriginalIntrons'] = original_introns
data['GenesWithOriginalIntronsPercent'] = 100.0 * original_introns / num_genes
data['GenesWithFullCDSCoverage'] = cov
data['GenesWithFullCDSCoveragePercent'] = 100.0 * cov / num_genes
data['GenesWithFullCDSCoverageAndNoFrameshifts'] = cov_frameshift
data['GenesWithFullCDSCoverageAndNoFrameshiftsPercent'] = 100.0 * cov_frameshift / num_genes
data['GenesWithFullCDSCoverageAndNoFrameshiftsAndOriginalIntrons'] = cov_frameshift_original_introns
data['GenesWithFullCDSCoverageAndNoFrameshiftsAndOriginalIntronsPercent'] = 100.0 * cov_frameshift_original_introns / num_genes

# Reference protein-coding genes that have no row at all in tmp (the protein-coding
# subset of the homGeneMapping/reference merge).
ref_coding_gene_ids = set(ref_df[ref_df.TranscriptBiotype == 'protein_coding'].GeneId)
missing = ref_coding_gene_ids - set(tmp.GeneId)
num_missing = len(missing)

data['MissingGenes'] = num_missing
data['MissingGenesPercent'] = (100.0 * num_missing) / num_genes

# label the summary with the database path, with every '.db' occurrence removed
data['Name'] = db_path.replace('.db', '')


In [4]:
# GeneIds for which every row in coding_df has Frameshift == True.
genes_all_shifted = {
    gene_id
    for gene_id, gene_rows in coding_df.groupby('GeneId')
    if (gene_rows.Frameshift == True).sum() == len(gene_rows)
}

In [16]:
# Reference annotation rows belonging to the all-isoform-frameshifted genes.
shifted_ref_rows = ref_df[ref_df.GeneId.isin(genes_all_shifted)]
# Collapse to one row per gene (pandas GroupBy.first per column), then keep id and name only.
first_row_per_gene = shifted_ref_rows.groupby("GeneId").first().reset_index()
shifted = first_row_per_gene[["GeneId", "GeneName"]]

In [38]:
# Load the filtered bonobo transMap genePred file into a dict of transcript objects.
bonobo_gp_path = "/public/groups/cgl/cat/primates_evan/v2/work/transMap/Bonobo.filtered.gp"
bonobo_txs = tools.transcripts.get_gene_pred_dict(bonobo_gp_path)

In [39]:
# Regroup the loaded transcripts with group_transcripts_by_name2 (grouping by name2, per
# the helper's name); later cells look the groups up by GeneId.
# NOTE(review): this rebinds bonobo_txs, so running the cell a second time would pass the
# groups, not the individual transcripts, back into the helper -- reload the genePred first.
bonobo_tx_objects = bonobo_txs.values()
bonobo_txs = tools.transcripts.group_transcripts_by_name2(bonobo_tx_objects)

In [14]:
# Load the reference (GENCODE v33, per the file name) genePred into a dict of transcript objects.
# NOTE(review): ref_txs is not read by any of the cells shown here.
ref_gp_path = "/public/groups/cgl/cat/primates_evan/v2/work/reference/gencode.v33.annotation.gff3.gp"
ref_txs = tools.transcripts.get_gene_pred_dict(ref_gp_path)

In [41]:
# For each gene whose isoforms are all frameshifted, record the span covered by its bonobo
# transcripts as [GeneId, "chrom:start-end"]. Genes with no bonobo transcripts are printed
# and skipped.
bonobo_positions = []
for gene_id in shifted.GeneId:
    # Use .get() instead of [] for the lookup: bonobo_txs hands back an empty group for an
    # unknown gene (defaultdict-like), and indexing such a mapping would insert a new empty
    # entry for every miss, silently changing bonobo_txs for later cells.
    txs = bonobo_txs.get(gene_id, [])
    if len(txs) == 0:
        print(gene_id)
        continue
    # NOTE(review): the chromosome is taken from the first transcript only; this assumes all
    # transcripts of a gene lie on the same chromosome -- confirm for multi-locus genes.
    chrom = txs[0].chromosome
    start = min(x.start for x in txs)
    end = max(x.stop for x in txs)
    bonobo_positions.append([gene_id, f"{chrom}:{start}-{end}"])

ENSG00000095917.14
ENSG00000134250.20
ENSG00000152076.18
ENSG00000153165.18
ENSG00000162825.16
ENSG00000163040.14
ENSG00000168970.22
ENSG00000185304.15
ENSG00000187627.15
ENSG00000196312.14
ENSG00000196862.9
ENSG00000198064.13
ENSG00000206077.11
ENSG00000219481.10
ENSG00000224712.12
ENSG00000228049.7
ENSG00000228570.7
ENSG00000240403.5
ENSG00000242019.1
ENSG00000254206.5
ENSG00000254852.8
ENSG00000266338.6
ENSG00000268043.7
ENSG00000269713.7
ENSG00000271425.9
ENSG00000276203.5
ENSG00000278662.5


In [43]:
# Display the collected [GeneId, "chrom:start-end"] pairs (the full list is shown).
bonobo_positions

[['ENSG00000027644.5', 'chr1:132183151-132195254'],
 ['ENSG00000069696.7', 'chr11:643268-646775'],
 ['ENSG00000075826.17', 'chr10:97086797-97119951'],
 ['ENSG00000075975.16', 'chr3:12481335-12510196'],
 ['ENSG00000082068.9', 'chr5:72626108-72992623'],
 ['ENSG00000083812.12', 'chr19:55471056-55479455'],
 ['ENSG00000084674.15', 'chr2a:20986463-21029004'],
 ['ENSG00000087074.8', 'chr19:45872609-45876372'],
 ['ENSG00000087086.15', 'chr19:45966446-45968015'],
 ['ENSG00000095203.14', 'chr9:80324948-80473720'],
 ['ENSG00000096384.20', 'chr6:43841920-43849642'],
 ['ENSG00000101204.17', 'chr20:59729909-59766795'],
 ['ENSG00000101350.8', 'chr20:28566748-28623966'],
 ['ENSG00000101435.5', 'chr20:23519972-23523714'],
 ['ENSG00000102053.12', 'chrX:54484692-54623859'],
 ['ENSG00000102174.9', 'chrX:14663154-14859292'],
 ['ENSG00000102794.10', 'chr13:58166018-58176146'],
 ['ENSG00000104974.12', 'chr19:51538613-51547451'],
 ['ENSG00000105205.7', 'chr19:36688131-36691178'],
 ['ENSG00000106648.14', 'chr7

In [45]:
# Inspect the coding_df rows of one gene from the bonobo_positions list.
coding_df[coding_df.GeneId == "ENSG00000069696.7"]

Unnamed: 0,GeneId,TranscriptId,AlignmentId,AllSpeciesIntronRnaSupport,AllSpeciesExonRnaSupport,IntronRnaSupport,ExonRnaSupport,IntronAnnotSupport,CdsAnnotSupport,ExonAnnotSupport,...,OriginalIntrons_CDS,PercentUnknownBases_CDS,ProperOrf_CDS,ValidStart_CDS,ValidStop_CDS,OriginalIntronsPercent_CDS,Frameshift,CodingInsertion,CodingDeletion,CodingMult3Indel
1247,ENSG00000069696.7,ENST00000176183.6,ENST00000176183.6-0,"[1, 1, 0, 1]","[0, 0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0]","[1, 1, 0, 0, 0]","[1, 1, 0, 0, 1]",...,"[1, 1, 1]",0.0,0,1,1,100.0,True,False,True,True
