In [1]:
import pandas as ps
import sys
from pysam import FastaFile
from Bio.Seq import Seq
from IPython.core.debugger import set_trace

In [2]:
def parse_input_paths(argv):
    """Extract the two annotation-file paths from a command line.

    Parameters
    ----------
    argv : sequence of str
        Full argument vector, including the program name at index 0.

    Returns
    -------
    tuple of (str, str) or None
        ``(argv[1], argv[2])`` when at least two user arguments are present;
        otherwise the usage message is printed and ``None`` is returned.
    """
    # argv[0] is the program name, so two user-supplied paths need len(argv) >= 3.
    if len(argv) < 3:
        print("Please pass the new annotations and the previous annotations to do a test merge")
        return None
    return argv[1], argv[2]


_cli_paths = parse_input_paths(sys.argv)
if _cli_paths is not None:
    # The cells that load the tables read `indel1` / `indel2`.
    indel1, indel2 = _cli_paths
    # Keep the names this cell originally assigned, in case anything still uses them.
    indels1, indels2 = indel1, indel2

In [3]:
# Open the reference FASTA 'hg38.fa' (path is relative to the working directory).
# convertToVcfFormat uses this handle to look up reference bases.
sequence_object = FastaFile('hg38.fa')

In [298]:
# Temporary hard-coded input paths; remove this cell once the notebook is converted to a script.
indel1, indel2 = "PMLB_indels_mut.tsv", "PMLB_Indels_annotated.tsv"

In [299]:
# Load both tab-separated annotation tables.
# 'chromosome' is pinned to str in both reads: left to type inference, pandas can
# parse numeric chromosome names as integers, which breaks string concatenation on
# that column and makes the merge keys of the two tables compare unequal.
original_annos = ps.read_csv(indel1, sep='\t', dtype={'chromosome': str})
# The first five columns are dropped by position, then exact duplicate rows are removed.
reannotated_annos = (
    ps.read_csv(indel2, sep='\t', dtype={'chromosome': str})
    .iloc[:, 5:]
    .drop_duplicates()
)

In [300]:
def convertToVcfFormat(row, fasta=None):
    """Rewrite one indel from dash notation into VCF-style anchored alleles.

    An allele given as '-' is treated as empty, and the reference base
    immediately preceding ``seq_start_position`` is prepended to both alleles.
    The reported position is moved back by one so that it points at that base.

    Parameters
    ----------
    row : mapping
        Must provide 'chromosome', 'seq_start_position', 'ref_allele' and
        'alt_allele'. The chromosome may be an int or a str, with or without
        a leading 'chr'.
    fasta : object with ``fetch(reference, start, end)``, optional
        Reference sequence source. Defaults to the notebook-level
        ``sequence_object``.

    Returns
    -------
    tuple
        ``(seq_start_position - 1, anchor_base + ref, anchor_base + alt)``.
    """
    if fasta is None:
        fasta = sequence_object

    # str() guards against chromosome names that pandas parsed as integers;
    # the prefix is only added when it is not already present.
    chromosome = str(row['chromosome'])
    chromo = chromosome if chromosome.startswith("chr") else "chr" + chromosome

    pos = row['seq_start_position']
    ref = '' if row['ref_allele'] == '-' else row['ref_allele']
    alt = '' if row['alt_allele'] == '-' else row['alt_allele']

    anchor_pos = pos - 1
    # fetch() takes a 0-based, half-open interval, so [pos - 2, pos - 1) is the
    # single base at 1-based position pos - 1.
    # NOTE(review): assumes seq_start_position is 1-based; confirm against the data source.
    anchor_base = fasta.fetch(chromo, anchor_pos - 1, anchor_pos).upper()

    return anchor_pos, anchor_base + ref, anchor_base + alt

In [301]:
# Compute the VCF-style (position, ref, alt) triple for every row; with
# result_type='expand' the returned tuple becomes columns labelled 0, 1 and 2.
_vcf_triples = original_annos.apply(convertToVcfFormat, axis=1, result_type='expand')

# One rename pass: the original coordinate/allele columns move to 'prev_*' names
# while the newly computed columns 0/1/2 take over the original names.
_column_renames = {
    'seq_start_position': 'prev_seq_start_position',
    'ref_allele': 'prev_ref',
    'alt_allele': 'prev_alt',
    0: 'seq_start_position',
    1: 'ref_allele',
    2: 'alt_allele',
    'codon_change': 'original_codon_change',
}
fixedOriginalAnnos = original_annos.join(_vcf_triples).rename(columns=_column_renames)

In [302]:
# Sanity check: (rows, columns) of the table after the VCF-style columns were added.
fixedOriginalAnnos.shape

(3529, 30)

In [303]:
# (rows, columns) after dropping the first five columns by position and removing
# exact duplicate rows, the same reduction applied to the re-annotated table on load.
fixedOriginalAnnos.iloc[:,5:].drop_duplicates().shape

(3429, 25)

In [304]:
# (rows, columns) of the re-annotated table, for comparison with the de-duplicated original.
reannotated_annos.shape

(3426, 22)

In [305]:
# Left join on the variant key: every row of fixedOriginalAnnos is kept, and
# non-key columns present in both tables receive pandas' default _x / _y suffixes.
_merge_keys = ['read_depth', 'chromosome', 'seq_start_position', 'ref_allele', 'alt_allele']
mergedTables = fixedOriginalAnnos.merge(reannotated_annos, how='left', on=_merge_keys)

In [306]:
# List the merged column names; columns present in both inputs carry _x / _y suffixes.
mergedTables.columns

Index(['model_id', 'sample_id', 'sample_origin', 'host_strain_nomenclature',
       'passage', 'symbol_x', 'biotype_x', 'coding_sequence_change_x',
       'variant_class_x', 'original_codon_change', 'amino_acid_change_x',
       'consequence_x', 'functional_prediction_x', 'read_depth',
       'allele_frequency_x', 'chromosome', 'prev_seq_start_position',
       'prev_ref', 'prev_alt', 'ucsc_gene_id_x', 'ncbi_gene_id_x',
       'ncbi_transcript_id_x', 'ensembl_gene_id_x', 'ensembl_transcript_id_x',
       'variation_id_x', 'genome_assembly_x', 'platform_x',
       'seq_start_position', 'ref_allele', 'alt_allele', 'symbol_y',
       'biotype_y', 'coding_sequence_change_y', 'variant_class_y',
       'codon_change', 'amino_acid_change_y', 'consequence_y',
       'functional_prediction_y', 'allele_frequency_y', 'ucsc_gene_id_y',
       'ncbi_gene_id_y', 'ncbi_transcript_id_y', 'ensembl_gene_id_y',
       'ensembl_transcript_id_y', 'variation_id_y', 'genome_assembly_y',
       'platform_y'],
      dtype='object')

In [307]:
# Narrow view that puts the original and re-annotated symbol / codon change
# next to each other, together with the variant key columns.
_comparison_columns = [
    'model_id', 'sample_id',
    'symbol_x', 'symbol_y',
    'read_depth', 'chromosome', 'seq_start_position',
    'original_codon_change', 'codon_change',
    'ref_allele', 'alt_allele',
]
comparisonTable = mergedTables[_comparison_columns]

In [289]:
# Write the comparison view as a tab-separated file, without the row index.
comparisonTable.to_csv(
    'comparison_all.tsv',
    index=False,
    sep='\t',
)

In [308]:
# Columns of the corrected indel table: sample metadata and allele frequency come
# from the original table (_x), annotation fields from the re-annotated table (_y),
# and position/alleles are the VCF-style values.
# NOTE(review): the _x/_y suffixes are written into the header as-is; confirm
# whether downstream consumers expect the unsuffixed column names.
_fixed_indel_columns = [
    "model_id", "sample_id", "sample_origin", "host_strain_nomenclature", "passage",
    "symbol_x", "biotype_y", "coding_sequence_change_y", "variant_class_y",
    "codon_change", "amino_acid_change_y", "consequence_y", "functional_prediction_y",
    "read_depth", "allele_frequency_x", "chromosome", "seq_start_position",
    "ref_allele", "alt_allele", "ucsc_gene_id_y", "ncbi_gene_id_y",
    "ncbi_transcript_id_y", "ensembl_gene_id_y", "ensembl_transcript_id_y",
    "variation_id_y", "genome_assembly_x", "platform_x",
]
# index=False matches the comparison export and keeps the pandas row index from
# being written as an extra unnamed first column.
mergedTables[_fixed_indel_columns].to_csv('fixed_indels', sep='\t', index=False)