In [1]:
import collections
import pandas as pd

import tools.intervals
import tools.misc
import tools.mathOps
import tools.fileOps
import tools.sqlInterface
import tools.transcripts
import tools.nameConversions
import tools.procOps
from cat.consensus import *
from argparse import ArgumentParser

In [2]:
# Tab-separated gp_info table from the Bonobo consensus gene set directory.
df = pd.read_csv(
    filepath_or_buffer="/public/groups/cgl/cat/primates_evan/v2/bonobo-only-v2.1/consensus_gene_set/Bonobo.gp_info",
    sep="\t",
)

In [3]:
# Keep only the rows whose frameshift column is populated (non-null).
df_filt = df[df["frameshift"].notna()]

In [4]:
# Collect every gene_id that has at least one row with frameshift == True.
# A missing frameshift value never compares equal to True, so no per-group
# null filtering is needed. groupby() iterates groups in sorted key order by
# default, which fixes the order of the resulting list.
frameshifted = [
    gene_id
    for gene_id, gene_rows in df_filt.groupby("gene_id")
    if (gene_rows["frameshift"] == True).any()  # noqa: E712 - elementwise comparison
]

In [5]:
# Number of gene IDs flagged as having at least one frameshift == True row.
len(frameshifted)

1117

In [6]:
# Load the reference GENCODE genePred and pass its transcripts straight to
# group_transcripts_by_name2. Doing both in one statement keeps the cell
# idempotent: `ref_txs` is never rebound from its own previous value, so
# re-running this cell cannot feed already-grouped data back into the helper.
ref_txs = tools.transcripts.group_transcripts_by_name2(
    tools.transcripts.get_gene_pred_dict(
        "/public/groups/cgl/cat/primates_evan/v2/work/reference/gencode.v33.annotation.gff3.gp"
    ).values()
)

In [8]:
# Load the Bonobo consensus genePred and pass its transcripts straight to
# group_transcripts_by_name2. Doing both in one statement keeps the cell
# idempotent: `bonobo_txs` is never rebound from its own previous value, so
# re-running this cell cannot feed already-grouped data back into the helper.
bonobo_txs = tools.transcripts.group_transcripts_by_name2(
    tools.transcripts.get_gene_pred_dict(
        "/public/groups/cgl/cat/primates_evan/v2/bonobo-only-v2.1/consensus_gene_set/Bonobo.gp"
    ).values()
)

In [10]:
# One representative row per flagged gene: its source gene and an alignment id.
# NOTE(review): GroupBy.first() takes the first non-null value of each column,
# and the rows come from `df` filtered only by gene_id, so the chosen
# alignment_id is not necessarily that of a frameshifted row -- confirm this
# is the intended representative.
frameshifted_df = (
    df.loc[
        df["gene_id"].isin(frameshifted),
        ["gene_id", "source_gene", "alignment_id"],
    ]
    .groupby("gene_id")
    .first()
    .reset_index()
)

In [11]:
def get_coords(txs):
    """Build a ``chrom:start-end`` string spanning all transcripts in ``txs``.

    ``txs`` must be a non-empty, indexable collection whose items expose
    ``chromosome``, ``start`` and ``stop``. The chromosome is read from the
    first item only (presumably all items share it -- not checked here); the
    span runs from the smallest ``start`` to the largest ``stop``.
    """
    chrom = txs[0].chromosome
    starts, stops = zip(*((tx.start, tx.stop) for tx in txs))
    start, end = min(starts), max(stops)
    return f"{chrom}:{start}-{end}"

In [12]:
# Genomic span of each flagged gene in Bonobo, looked up by gene_id.
frameshifted_df["bonobo_gene_coords"] = frameshifted_df["gene_id"].map(
    lambda gene: get_coords(bonobo_txs[gene])
)

In [13]:
# Genomic span of each flagged gene's source gene in the reference annotation.
frameshifted_df["human_gene_coords"] = frameshifted_df["source_gene"].map(
    lambda gene: get_coords(ref_txs[gene])
)

In [14]:
db_path = "/public/groups/cgl/cat/primates_evan/v2/bonobo-only-v2.1/databases/Bonobo.db"
session = tools.sqlInterface.start_session(db_path)

# Read the CDS evaluation table of each transcript mode, then stack them into
# a single frame indexed by AlignmentId.
eval_dfs = [
    pd.read_sql_table(
        tools.sqlInterface.tables['CDS'][tx_mode]['evaluation'].__tablename__,
        session.bind.engine,
    )
    for tx_mode in ('transMap', 'augTM')
]
eval_df = pd.concat(eval_dfs).set_index("AlignmentId")

In [15]:
# Evaluation rows for the representative alignments only, reduced to the
# interval columns, with AlignmentId moved from the index back to a column.
# Built in a single expression so the cell is idempotent: the previous
# two-step form rebound `filt_eval_df` from itself, and re-running the second
# step produced an `index` column instead of `AlignmentId`.
filt_eval_df = (
    eval_df.loc[
        eval_df.index.isin(frameshifted_df["alignment_id"]),
        ["chromosome", "start", "stop", "name"],
    ]
    .reset_index()
)

In [17]:
# Join each flagged gene to the evaluation rows of its representative
# alignment (inner join: genes whose alignment has no evaluation row drop out),
# remove the duplicated join key, and index by the per-gene columns.
# Written as one chain so the cell is idempotent: the separate drop() and
# set_index() steps raised KeyError when re-run on their own output.
final = (
    frameshifted_df
    .merge(filt_eval_df, left_on="alignment_id", right_on="AlignmentId")
    .drop("AlignmentId", axis="columns")
    .set_index(
        ["gene_id", "source_gene", "alignment_id", "bonobo_gene_coords", "human_gene_coords"]
    )
)

In [20]:
# Write the indexed table to a CSV in the current working directory.
final.to_csv(
    path_or_buf="bonobo_frameshifts_any_isoform.csv",
)