In [9]:
import pandas as pd
import os
import json
from collections import *
from cat.plots import *

In [15]:
# Genomes to report on; this order is reused when loading metrics and when
# labelling rows further down.
ordered_genomes = [
    "Bonobo",
    "Chimp",
    "Gorilla",
]

# Annotation database handed to the tools.sqlInterface lookups.
annot_db = (
    "/public/groups/cgl/cat/primates_evan/v2/out/databases/Human.db"
)

# Directory holding one sub-directory of JSON metric files per genome.
metric_dir = (
    "/public/groups/cgl/cat/primates_evan/v2/bonobo-only-v2.1/plot_data/"
)

In [3]:
# Fetch the gene biotype map from the annotation database at `annot_db`.
# NOTE(review): no explicit `import tools` is visible in this notebook;
# presumably the name arrives via `from cat.plots import *` -- confirm.
gene_biotype_map = tools.sqlInterface.get_gene_biotype_map(annot_db)

In [6]:
# Transcript biotypes reported by the annotation database, as a sorted list.
biotypes = list(tools.sqlInterface.get_transcript_biotypes(annot_db))
biotypes.sort()

In [16]:
# Load each genome's consensus.json into an OrderedDict keyed by genome name,
# preserving the order of `ordered_genomes`.
# Each file is opened in a `with` block so its handle is closed as soon as the
# JSON is parsed, rather than being left for the garbage collector.
consensus_data = OrderedDict()
for genome in ordered_genomes:
    with open(os.path.join(metric_dir, genome, "consensus.json")) as consensus_handle:
        consensus_data[genome] = json.load(consensus_handle)

In [17]:
# Source biotype labels grouped by the coarse category each one collapses to.
# The grouping is inverted below into the flat {biotype: category} lookup.
_biotypes_by_category = {
    'ncRNA': [
        '3prime_overlapping_ncRNA',
        '3prime_overlapping_ncrna',
        'antisense',
        'antisense_RNA',
        'bidirectional_promoter_lncRNA',
        'bidirectional_promoter_lncrna',
        'lincRNA',
        'lncRNA',
        'macro_lncRNA',
        'miRNA',
        'misc_RNA',
        'non_coding',
        'processed_transcript',
        'retained_intron',
        'ribozyme',
        'sRNA',
        'scaRNA',
        'scRNA',
        'sense_intronic',
        'sense_overlapping',
        'snRNA',
        'snoRNA',
        'vaultRNA',
        'ncRNA',
    ],
    'mRNA': [
        'TEC',
        'non_stop_decay',
        'nonsense_mediated_decay',
        'polymorphic_pseudogene',  # coding in some humans
        'protein_coding',
    ],
    'pseudogene': [
        'processed_pseudogene',
        'pseudogene',
        'rRNA_pseudogene',
        'transcribed_processed_pseudogene',
        'transcribed_unitary_pseudogene',
        'transcribed_unprocessed_pseudogene',
        'translated_unprocessed_pseudogene',
        'translated_processed_pseudogene',
        'unitary_pseudogene',  # only in human
        'unprocessed_pseudogene',
    ],
    'denovo': [
        'unknown_likely_coding',
    ],
    'rRNA': [
        'Mt_rRNA',
        'rRNA',
    ],
    'tRNA': [
        'Mt_tRNA',
        'tRNA',
    ],
    'C_segment': [
        'IG_C_gene',
        'IG_C_pseudogene',
    ],
    'C_region': [
        'TR_C_gene',
        'TR_C_pseudogene',
    ],
    'D_region': [
        'IG_D_gene',
        'IG_D_pseudogene',
    ],
    'J_segment': [
        'IG_J_gene',
        'IG_J_pseudogene',
        'IG_pseudogene',
        'TR_J_gene',
        'TR_J_pseudogene',
    ],
    'V_region': [
        'IG_V_gene',
        'IG_V_pseudogene',
        'TR_V_gene',
        'TR_V_pseudogene',
    ],
}

# Flat lookup: source biotype label -> coarse category.
biotype_map = {
    source_biotype: category
    for category, source_biotypes in _biotypes_by_category.items()
    for source_biotype in source_biotypes
}

In [21]:
# Build one frame per consensus metric: missing genes first, then missing
# transcripts (same call order as two separate statements).
gene_missing_df, transcript_missing_df = (
    json_biotype_counter_to_df(consensus_data, metric_name)
    for metric_name in ('Gene Missing', 'Transcript Missing')
)

In [38]:
# Tally missing items per (genome, coarse biotype category, annotation level).
c = Counter()

# (frame, name of its biotype column, level label written into the key)
missing_sources = (
    (gene_missing_df, 'Gene Missing', 'Gene'),
    (transcript_missing_df, 'Transcript Missing', 'Transcript'),
)

for source_df, biotype_column, level in missing_sources:
    for (genome, raw_biotype), rows in source_df.groupby(["genome", biotype_column]):
        category = biotype_map.get(raw_biotype)
        # Each (genome, biotype) pair must occur exactly once in the frame.
        assert len(rows) == 1
        if not category:
            # Biotypes absent from biotype_map are left out of the tally.
            continue
        c[(genome, category, level)] += rows.iloc[0]['count']

In [63]:
# Turn the tuple-keyed counter into a long-format frame for plotting.
missing_id_columns = ['Genome', "Biotype", "Annotation Type"]

# The counter becomes a single data column; reset_index() then moves the key
# components out of the index into ordinary columns.
missing_df = pd.DataFrame.from_dict({"col": c}, orient="columns")
missing_df = missing_df.reset_index()
missing_df.columns = missing_id_columns + ["Number of items"]

# Melt so the counts land in 'value' and the measure name in 'variable',
# which are the column names the plotting cell reads.
missing_df = pd.melt(missing_df, id_vars=missing_id_columns)

In [65]:
# Restrict the plot data to the three categories of interest.
plotted_biotypes = ["mRNA", "ncRNA", "pseudogene"]
is_plotted_biotype = missing_df["Biotype"].isin(plotted_biotypes)
missing_df = missing_df[is_plotted_biotype]

In [94]:
# Bar plot of missing-item counts per genome, one facet row per annotation
# type, written to a PDF.
# NOTE(review): the filename "bonbo_missing.pdf" looks like a typo for
# "bonobo_missing.pdf" (compare the other PDFs this notebook writes). It is
# left unchanged in case something downstream expects this exact name.
with PdfPages("bonbo_missing.pdf") as pdf:
    # Bind the grid returned by catplot so the title below applies to THIS
    # figure. Without the assignment, `g` is undefined on a fresh kernel, or
    # refers to a grid left over from another plotting cell.
    g = sns.catplot(data=missing_df, x='Genome', y='value', col="variable", kind="bar", hue="Biotype",
                    row="Annotation Type", sharey=False)
    g.fig.suptitle("Missing genes by category in final CAT annotation sets")
    multipage_close(pdf, tight_layout=False)

In [84]:
# One transposed frame per genome from its 'Consensus Indels' metrics,
# in the order of ordered_genomes.
indel_frames = [
    pd.DataFrame.from_dict(consensus_data[genome]['Consensus Indels'], orient='index').T
    for genome in ordered_genomes
]
consensus_df = pd.concat(indel_frames)

# Genome labels are attached positionally, so the stacked frame must have
# exactly one row per entry of ordered_genomes.
consensus_df['genome'] = ordered_genomes

# Long format: one row per (genome, indel category).
indel_categories = ['CodingDeletion', 'CodingInsertion', 'CodingMult3Indel']
consensus_df = pd.melt(consensus_df, id_vars=['genome'], value_vars=indel_categories)

In [93]:
# Grouped bar plot of the melted indel metrics per genome, written to a PDF.
with PdfPages("bonobo_indels.pdf") as pdf:
    g = sns.catplot(
        x='genome',
        y='value',
        hue="variable",
        data=consensus_df,
        kind="bar",
        sharey=False,
    )
    g.fig.suptitle("Indel rates in final CAT annotation sets")
    multipage_close(pdf, tight_layout=False)

In [95]:
# Load each genome's filter_tm_metrics.json into an OrderedDict keyed by
# genome name, preserving the order of `ordered_genomes`.
# Each file is opened in a `with` block so its handle is closed as soon as the
# JSON is parsed, rather than being left for the garbage collector.
tm_data = OrderedDict()
for genome in ordered_genomes:
    with open(os.path.join(metric_dir, genome, "filter_tm_metrics.json")) as tm_handle:
        tm_data[genome] = json.load(tm_handle)

In [96]:
# Frame of 'Split Genes' metrics, relabelled with the column names the
# plotting cell uses.
split_columns = ['category', 'count', 'genome']
split_df = json_biotype_counter_to_df(tm_data, 'Split Genes')
split_df.columns = split_columns

In [102]:
# Grouped bar plot of split-gene counts per genome, written to a PDF.
with PdfPages("bonobo_split_genes.pdf") as pdf:
    g = sns.catplot(
        x='genome',
        y='count',
        hue="category",
        data=split_df,
        kind="bar",
        sharey=False,
    )
    g.fig.suptitle("Discontiguous genes")
    multipage_close(pdf, tight_layout=False)