In [1]:
import collections
import pandas as pd

import tools.intervals
import tools.misc
import tools.mathOps
import tools.fileOps
import tools.sqlInterface
import tools.transcripts
import tools.nameConversions
import tools.procOps
from cat.consensus import *
from argparse import ArgumentParser

In [15]:
df = pd.read_csv("/public/groups/cgl/cat/primates_evan/v2/bonobo-only-v2.1/consensus_gene_set/Bonobo.gp_info", sep="\t")

In [16]:
df_filt = df[~df.tag.isnull()]
df_filt = df_filt[df_filt.tag.str.contains("MANE")]

In [17]:
frameshifted = []
for gene_id, d in df_filt.groupby("gene_id"):
    d = d[~d.frameshift.isnull()]
    if len(d[d.frameshift == True]) > 0:
        frameshifted.append(gene_id)

In [18]:
len(frameshifted)

234

In [19]:
ref_txs = tools.transcripts.get_gene_pred_dict("/public/groups/cgl/cat/primates_evan/v2/work/reference/gencode.v33.annotation.gff3.gp")

In [20]:
ref_txs = tools.transcripts.group_transcripts_by_name2(ref_txs.values())

In [21]:
bonobo_txs = tools.transcripts.get_gene_pred_dict("/public/groups/cgl/cat/primates_evan/v2/bonobo-only-v2.1/consensus_gene_set/Bonobo.gp")

In [22]:
bonobo_txs = tools.transcripts.group_transcripts_by_name2(bonobo_txs.values())

In [23]:
frameshifted_df = df[df.gene_id.isin(frameshifted)][["gene_id", "source_gene", "alignment_id"]].groupby("gene_id").first().reset_index()

In [24]:
def get_coords(txs):
    chrom = txs[0].chromosome
    start = min(x.start for x in txs)
    end = max(x.stop for x in txs)
    return f"{chrom}:{start}-{end}"

In [25]:
frameshifted_df["bonobo_gene_coords"] = [get_coords(bonobo_txs[x]) for x in frameshifted_df.gene_id]

In [26]:
frameshifted_df["human_gene_coords"] = [get_coords(ref_txs[x]) for x in frameshifted_df.source_gene]

In [27]:
db_path = "/public/groups/cgl/cat/primates_evan/v2/bonobo-only-v2.1/databases/Bonobo.db"
session = tools.sqlInterface.start_session(db_path)

eval_dfs = []
for tx_mode in ['transMap', 'augTM']:
    cds_table = tools.sqlInterface.tables['CDS'][tx_mode]['evaluation']
    cds_df = pd.read_sql_table(cds_table.__tablename__, session.bind.engine)
    eval_dfs.append(cds_df)
    
    
eval_df = pd.concat(eval_dfs).set_index("AlignmentId")

In [64]:
filt_eval_df = eval_df[eval_df.index.isin(frameshifted_df.alignment_id)]

In [140]:
filt_eval_txs = [tools.transcripts.Transcript(list(x)) for _, x in filt_eval_df.iterrows()]
for i, (_, s) in enumerate(filt_eval_df.iterrows()):
    filt_eval_txs[i].name = f"loc{i}"

In [209]:
with open("coords.bed", "w") as fh:
    for x in filt_eval_txs:
        for exon in x.exon_intervals:
            if exon.start != exon.stop:
                fh.write("\t".join(map(str, [x.chromosome, exon.start - 1, exon.stop, x.name])) + "\n")
            else:
                fh.write("\t".join(map(str, [x.chromosome, exon.start - 1, exon.stop + 1, x.name])) + "\n")


In [210]:
!halLiftover /public/groups/cgl/cat/primates_evan/v2/bonobo-only-v2.1/assemblyHub/primates_eichler-v2.hal Bonobo coords.bed Human human_coords.bed

In [211]:
!bedtools intersect -u -a human_coords.bed -b human_genes.bed > human_genes_intersected.bed

In [212]:
human_coords = pd.read_csv("human_coords.bed", header=None, sep="\t")
human_coords.columns = ["human_chrom", "human_start", "human_stop", "idx"]

In [213]:
human_merged = []
for idx, s in human_coords.groupby('idx'):
    vals = ';'.join([f"{x.human_chrom}:{x.human_start}-{x.human_stop}" for _, x in pd.DataFrame(s).iterrows()])
    human_merged.append([idx, vals])
human_merged_df = pd.DataFrame(human_merged, columns=["idx", "human_indel_coords"])

In [214]:
bonobo_df = [[aln_id, s["name"], f"loc{i}", f"{s.chromosome}:{s.start}-{s.stop}"] for i, (aln_id, s) in enumerate(filt_eval_df.iterrows())]
bonobo_df = pd.DataFrame(bonobo_df, columns=["AlignmentId", "indel type", "idx", "bonobo_indel_coords"])
bonobo_df = frameshifted_df.merge(bonobo_df, left_on="alignment_id", right_on="AlignmentId")
bonobo_df = bonobo_df.drop("AlignmentId", axis="columns")

In [228]:
final = bonobo_df.merge(human_merged_df, on="idx", how="outer")
final = final.drop("idx", axis="columns")
final = final[~final.human_indel_coords.isnull()]
final = final[final["indel type"].isin(["CodingDeletion", "CodingInsertion"])]

In [229]:
final.to_csv("bonobo_frameshifts_MANE_select.csv")

In [249]:
offset = 5

with open('indels.bed', 'w') as fh:
    for _, s in final.iterrows():
        chrom, pos = s.bonobo_indel_coords.split(":")
        start, stop = pos.split("-")
        start = int(start)
        stop = int(stop)
        start -= offset
        stop += offset
        fh.write("\t".join(map(str, [chrom, start, stop, s.bonobo_indel_coords])) + "\n")

In [250]:
!bedtools intersect -wo -a indels.bed -b /public/groups/cgl/cat/primates_evan/v2/bonobo_hifi/filtered.vcf.gz > overlaps.txt

chr16	6412	.	GTG	GCTG	75.3441	.	AB=0;ABP=0;AC=2;AF=1;AN=2;AO=2;CIGAR=1M1I2M;DP=2;DPB=2.66667;DPRA=0;EPP=3.0103;EPPR=0;GTI=0;LEN=1;MEANALT=1;MQM=60;MQMR=0;NS=1;NUMALT=1;ODDS=6.27915;PAIRED=0;PAIREDR=0;PAO=0;PQA=0;PQR=0;PRO=0;QA=95;QR=0;RO=0;RPL=0;RPP=7.35324;RPPR=0;RPR=2;RUN=1;SAF=1;SAP=3.0103;SAR=1;SRF=0;SRP=0;SRR=0;TYPE=ins	GT:DP:AD:RO:QR:AO:QA:GL	.

chr16	6412	.	GTG	GCTG	75.3441	.	AB=0;ABP=0;AC=2;AF=1;AN=2;AO=2;CIGAR=1M1I2M;DP=2;DPB=2.66667;DPRA=0;EPP=3.0103;EPPR=0;GTI=0;LEN=1;MEANALT=1;MQM=60;MQMR=0;NS=1;NUMALT=1;ODDS=6.27915;PAIRED=0;PAIREDR=0;PAO=0;PQA=0;PQR=0;PRO=0;QA=95;QR=0;RO=0;RPL=0;RPP=7.35324;RPPR=0;RPR=2;RUN=1;SAF=1;SAP=3.0103;SAR=1;SRF=0;SRP=0;SRR=0;TYPE=ins	GT:DP:AD:RO:QR:AO:QA:GL	.



In [257]:
overlap_df = pd.read_csv("overlaps.txt", sep='\t', header=None)

In [264]:
# column 3 basically says 'did this have a VCF hit'
final_vcf_filt = final[~final.bonobo_indel_coords.isin(set(overlap_df[3]))]

In [270]:
final_nochrXY = final_vcf_filt[(~final_vcf_filt.human_indel_coords.str.contains('chrX')) & (~final_vcf_filt.human_indel_coords.str.contains('chrY')) & (~final_vcf_filt.human_indel_coords.str.contains('alt'))]

In [271]:
final_nochrXY

Unnamed: 0,gene_id,source_gene,alignment_id,bonobo_gene_coords,human_gene_coords,indel type,bonobo_indel_coords,human_indel_coords
48,Bonobo_G0001089,ENSG00000227124.11,ENST00000652011.2-2,Super_Scaffold_200000119422:68749-72169,chr3:75678659-75785583,CodingDeletion,Super_Scaffold_200000119422:70212-70212,chr3:75737405-75737406
49,Bonobo_G0001089,ENSG00000227124.11,ENST00000652011.2-2,Super_Scaffold_200000119422:68749-72169,chr3:75678659-75785583,CodingDeletion,Super_Scaffold_200000119422:70375-70375,chr3:75737483-75737485
50,Bonobo_G0001089,ENSG00000227124.11,ENST00000652011.2-2,Super_Scaffold_200000119422:68749-72169,chr3:75678659-75785583,CodingDeletion,Super_Scaffold_200000119422:70409-70409,chr3:75737517-75737519
51,Bonobo_G0001089,ENSG00000227124.11,ENST00000652011.2-2,Super_Scaffold_200000119422:68749-72169,chr3:75678659-75785583,CodingInsertion,Super_Scaffold_200000119422:70420-70421,chr3:75737528-75737530
52,Bonobo_G0001089,ENSG00000227124.11,ENST00000652011.2-2,Super_Scaffold_200000119422:68749-72169,chr3:75678659-75785583,CodingDeletion,Super_Scaffold_200000119422:70503-70503,chr3:75737611-75737612;chr3:75737613-75737614
...,...,...,...,...,...,...,...,...
899,Bonobo_G0054032,ENSG00000177992.10,ENST00000325643.6-0,chr9:58978124-58984131,chr9:87882876-87888903,CodingDeletion,chr9:58981308-58981308,chr9:87886074-87886075;chr9:87886077-87886078
900,Bonobo_G0054032,ENSG00000177992.10,ENST00000325643.6-0,chr9:58978124-58984131,chr9:87882876-87888903,CodingDeletion,chr9:58982997-58982997,chr9:87887765-87887767
901,Bonobo_G0054032,ENSG00000177992.10,ENST00000325643.6-0,chr9:58978124-58984131,chr9:87882876-87888903,CodingDeletion,chr9:58983062-58983062,chr9:87887832-87887834
903,Bonobo_G0054608,ENSG00000136834.3,augTM-ENST00000259357.3-0,chr9:93603026-93603832,chr9:122476957-122477926,CodingDeletion,chr9:93603821-93603821,chr9:122477719-122477720;chr9:122477724-122477725


In [273]:
final_nochrXY.to_csv("bonobo_frameshifts_MANE_select_hifi_filtered.csv")