In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import os
import shutil
from Bio import SeqIO
from Bio import AlignIO
import distance
import editdistance
import math
import subprocess
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import collections
import pybedtools
from sklearn.externals.joblib import Parallel, delayed
import itertools as it

In [7]:
# Directory holding the input FASTA / pairing files and the yn00 control file.
PAML_PATH = '/home/benjamin/genome_assembly/Warrior/DK0911_v04/comp_orthology/paml'

# Per-genome protein FASTA files.
PROTEIN_DICT = {
    'DK0911': os.path.join(PAML_PATH, 'DK_0911_v04LT_ph_ctg.protein.fa'),
    'Pst104E': os.path.join(PAML_PATH, 'Pst_104E_v13_ph_ctg.protein.fa'),
}

# Per-genome CDS FASTA files.
CDS_DICT = {
    'DK0911': os.path.join(PAML_PATH, 'DK_0911_v04LT_ph_ctg.cds.fa'),
    'Pst104E': os.path.join(PAML_PATH, 'Pst_104E_v13_ph_ctg.cds.fa'),
}

# Per-genome tab-separated pairing tables (file names are spelled "paring" on disk).
PAIRING_DICT = {
    'DK0911': os.path.join(PAML_PATH, 'DK.combined.paring'),
    'Pst104E': os.path.join(PAML_PATH, 'PAu.combined.paring'),
}

# Genome labels; these are the keys of the three dictionaries above.
genomes = ['DK0911', 'Pst104E']

In [8]:
def getFastaDict(fastaFile):
    '''Read a FASTA file and return a dict keyed by record id.

    Each value is the record object yielded by ``SeqIO.parse``. If an id
    occurs more than once in the file, the last record with that id is kept.
    '''
    return {record.id: record for record in SeqIO.parse(fastaFile, 'fasta')}

In [6]:
def writeAllelicFasta(alleleOne, alleleTwo, alleleType, fa_dict, outPath):
    '''Write the records of two alleles to ``<outPath>/<alleletype>.fa``.

    alleleOne, alleleTwo -- keys into ``fa_dict``; a missing key raises KeyError.
    alleleType -- 'CDS', 'GENE' or 'PROTEIN' (case-insensitive); its lower-cased
                  form becomes the output file's base name.
    fa_dict -- mapping from sequence id to a record writable by ``SeqIO.write``.
    outPath -- directory in which the FASTA file is written.

    Returns True once the file has been written.
    '''
    assert alleleType.upper() in ['CDS', 'GENE', 'PROTEIN']

    # Both lookups happen before the output file is opened, so an unknown
    # id fails without creating or truncating the file.
    records = [fa_dict[allele_id] for allele_id in (alleleOne, alleleTwo)]

    fasta_fn = os.path.join(outPath, '%s.fa' % alleleType.lower())
    with open(fasta_fn, 'w') as handle:
        SeqIO.write(records, handle, 'fasta')
    return True

def writeAlignmentScript(alleleOutPath, scriptLoc):
    '''Append the commands that process one allele-pair directory to a script.

    The appended lines change into ``alleleOutPath``, run ``muscle`` on
    ``protein.fa``, run ``pal2nal.pl`` twice (once with ``-output paml``,
    once with default output), copy ``yn00.ctl`` from ``PAML_PATH`` and
    run ``yn00``.

    alleleOutPath -- directory containing ``protein.fa`` and ``cds.fa``.
                     Should be absolute: each appended entry does a ``cd``
                     and never changes back.
    scriptLoc -- path of the shell script; opened in append mode.

    Paths are shell-quoted so directory names containing spaces or shell
    metacharacters do not break the generated script. Paths made only of
    shell-safe characters are emitted unchanged.

    Returns True after the lines are written.
    '''
    import shlex  # function-scope import: only this function needs it

    pal2nal = 'perl /home/benjamin/anaconda3/orthologr/inst/pal2nal/pal2nal.v14/pal2nal.pl'
    commands = [
        'cd %s' % shlex.quote(str(alleleOutPath)),
        'muscle -clwstrict -in protein.fa -out protein.aln',
        '%s -output paml protein.aln cds.fa > cds_codon.aln' % pal2nal,
        '%s protein.aln cds.fa > cds_codon.clustal' % pal2nal,
        'cp %s ./' % shlex.quote('%s/yn00.ctl' % PAML_PATH),
        'yn00',
    ]

    with open(scriptLoc, 'a') as outFile:
        for command in commands:
            print(command, file=outFile)
    return True

In [15]:
def prepareAlignmentBashScript(pairing_fn, protein_dict, cds_dict, script_fn, OUT_PATH):
    '''Create per-pair input folders and the bash script that processes them.

    pairing_fn -- tab-separated file without a header row; it is read with
                  the column names 'Query' and 'Target'.
    protein_dict -- mapping from sequence id to protein record.
    cds_dict -- mapping from sequence id to CDS record.
    script_fn -- path of the bash script; any existing file is overwritten.
    OUT_PATH -- directory under which one ``<Query>_<Target>`` folder per
                pair is created (missing parent directories are created too).

    For every pair this writes ``cds.fa`` and ``protein.fa`` into the pair's
    folder and appends the corresponding commands to ``script_fn``.
    A KeyError is raised if an id from the pairing file is absent from
    ``cds_dict`` or ``protein_dict``.

    Returns True when all pairs have been processed.
    '''
    # Start from a fresh script; writeAlignmentScript appends to it per pair.
    with open(script_fn, 'w') as pamlScript:
        print('#!/bin/bash', file=pamlScript)

    df = pd.read_csv(pairing_fn, sep='\t', header=None, names=['Query', 'Target'])

    for Query, Target in zip(df['Query'], df['Target']):
        alleleOutPath = os.path.join(OUT_PATH, '%s_%s' % (Query, Target))
        # exist_ok avoids the check-then-create race and makes re-runs safe.
        os.makedirs(alleleOutPath, exist_ok=True)

        writeAllelicFasta(Query, Target, 'CDS', cds_dict, alleleOutPath)
        writeAllelicFasta(Query, Target, 'PROTEIN', protein_dict, alleleOutPath)

        writeAlignmentScript(alleleOutPath, script_fn)
    return True

In [9]:
# Pool the protein records of every genome into one id -> record lookup.
# An id that appears in more than one genome keeps the record of the
# genome listed last in `genomes`.
protein_dict = {
    seq_id: record
    for genome in genomes
    for seq_id, record in getFastaDict(PROTEIN_DICT[genome]).items()
}

In [21]:
# Pool the CDS records of every genome into one id -> record lookup.
# An id that appears in more than one genome keeps the record of the
# genome listed last in `genomes`.
cds_dict = {
    seq_id: record
    for genome in genomes
    for seq_id, record in getFastaDict(CDS_DICT[genome]).items()
}

In [22]:
# For each genome: make sure <PAML_PATH>/<genome> exists, then write the
# per-pair FASTA inputs and that genome's paml_script.sh beneath it.
for genome in genomes:
    BASE_OUT = os.path.join(PAML_PATH, genome)
    # exist_ok replaces the separate existence check and keeps re-runs safe.
    os.makedirs(BASE_OUT, exist_ok=True)
    paml_script_fn = os.path.join(BASE_OUT, 'paml_script.sh')
    prepareAlignmentBashScript(PAIRING_DICT[genome], protein_dict, cds_dict, paml_script_fn, BASE_OUT)

#### Execute both PAML scripts on the command line (one `paml_script.sh` per genome, located in `<PAML_PATH>/<genome>/`), e.g. `bash paml_script.sh`