In [3]:
import re

def getIdToLocusTagDict(df):
    '''Builds a dict mapping the ID of every mRNA feature to its locus_tag.

    Only rows of df whose 'type' column equals 'mRNA' are used. Both values
    are parsed from the semicolon-separated string in the 'attributes' column.

    Args:
        df: DataFrame with (at least) 'type' and 'attributes' columns.

    Returns:
        dict of {mRNA ID: locus_tag}. If the same ID occurs more than once a
        message is printed and the locus_tag seen last is kept.

    Raises:
        ValueError: if an mRNA row has no ID or no locus_tag attribute.
    '''
    mrna_df = df[df['type'] == 'mRNA']

    # Anchor each key on the start of the string or on a ';' so that the key
    # is found wherever it sits in the attribute list (not only when it is the
    # first entry) and so that longer keys that merely end in the same text
    # (e.g. 'old_locus_tag=') are not mistaken for it.
    locusSearch = re.compile(r'(?:^|;)\s*locus_tag=([^;]*)')
    idSearch = re.compile(r'(?:^|;)\s*ID=([^;]*)')

    d = {}

    for attr in mrna_df['attributes']:
        idMatch = idSearch.search(attr)
        locusMatch = locusSearch.search(attr)
        if idMatch is None or locusMatch is None:
            # Fail with the offending record instead of an opaque
            # "'NoneType' object has no attribute 'group'".
            raise ValueError("mRNA attributes lack ID and/or locus_tag: %s" % attr)
        key = idMatch.group(1)
        if key in d:
            print("Unexpected: id key: %s already in dictionary." % key)
        d[key] = locusMatch.group(1)
    return d

In [4]:
import pandas as pd
import os
import sys
import subprocess

# Root directories. These are absolute, machine-specific paths and must be
# edited before running on another system.
BASE_OUT_PATH = '/home/gamran/genome_analysis/Warrior/Richard/output/defining_alleles/'
GENOME_PATH = '/home/gamran/genome_analysis/Warrior/Richard/output/genome_v03/'
# Directory setUp() changes into; the proteinortho command is run from here.
PROTEINORTHO_OUT_PATH = os.path.join(BASE_OUT_PATH, 'proteinortho')

# Original annotation gff3 files that are rewritten for proteinortho.
ORIG_H_CTG_GFF_PATH = os.path.join(GENOME_PATH, 'DK_0911_v03_h_ctg.anno.gff3')
ORIG_P_CTG_GFF_PATH = os.path.join(GENOME_PATH, 'DK_0911_v03_p_ctg.anno.gff3')

# Location of the proteinortho script placed at the start of the generated command.
PROTEINORTHO_PATH = '/home/gamran/anaconda3/proteinortho_v5.16b/proteinortho5.pl'
# Used both as the -project name and as the sub-directory (inside
# PROTEINORTHO_OUT_PATH) holding the .faa and rewritten .gff files.
PROJECT_NAME = 'ph_ctg_516'
# Protein fasta file names; setUp() expects them inside the project directory
# and derives the matching .gff file names from them.
H_CTG_FAA = 'DK_0911_v03_h_ctg.protein.faa'
P_CTG_FAA = 'DK_0911_v03_p_ctg.protein.faa'

In [5]:
def rewriteGffForProteinortho(gff_in_loc, gff_out_loc):
    '''Takes a gff file at gff_in_loc and rewrites it to conform to
    the format required for proteinortho analysis.

    Only CDS rows are kept, and the attributes column of each is replaced
    by 'ID=<locus_tag>;', where the locus_tag is looked up through the CDS's
    Parent (an mRNA ID). CDS rows whose Parent is missing or is not a known
    mRNA ID are reported and left out, rather than being written as 'ID=nan;'.

    Args:
        gff_in_loc: path of the gff file to read.
        gff_out_loc: path the rewritten, tab-separated, header-less file
            is written to (overwritten if it exists).

    Returns:
        The DataFrame of CDS rows that was written to gff_out_loc.
    '''
    gff_columns = ['seqid', 'source', 'type', 'start', 'end',
                   'score', 'strand', 'phase', 'attributes']
    # skiprows = 1 drops the first line of the input file.
    ctg_df = pd.read_table(gff_in_loc, skiprows = 1, header = None,
                           names = gff_columns)

    # get dict mapping ID to locus_tag from mRNA entries
    # need attribute to have 'ID=locus_tag' because .faa file has ID=locus_tag
    # will be used in this way: CDS.parent = mRNA.id =(d)> locus_tag
    d = getIdToLocusTagDict(ctg_df)

    # subset DataFrame to only CDS type (only type required for proteinortho
    # analysis); .copy() makes the subset an independent frame so the column
    # assignment below does not trigger pandas' SettingWithCopyWarning.
    cds_df = ctg_df[ctg_df['type'] == 'CDS'].copy()

    # Anchor on the start of the string or a ';' so only the field that is
    # actually named 'Parent' is read.
    parentSearch = re.compile(r'(?:^|;)\s*Parent=([^;]*)')

    def parentOf(attr):
        '''Returns the Parent value of an attribute string, or None.'''
        found = parentSearch.search(attr)
        return found.group(1) if found else None

    # map CDS.parent values (= mRNA.id) to locus_tag; unknown parents become NaN
    locus_tags = cds_df['attributes'].apply(parentOf).map(d)

    unmapped = locus_tags.isnull()
    if unmapped.any():
        print("Unexpected: %d CDS row(s) in %s have no Parent with a known locus_tag and were skipped."
              % (unmapped.sum(), gff_in_loc))
        cds_df = cds_df[~unmapped].copy()
        locus_tags = locus_tags[~unmapped]

    # make attribute column into format 'ID=locus_tag' to be readable by proteinortho
    cds_df['attributes'] = locus_tags.apply(lambda x: 'ID=%s;' % x)

    with open(gff_out_loc, 'w') as outfile:
        cds_df.to_csv(outfile, sep='\t', header = False, index = False)
    return cds_df

In [6]:
def getProteinorthoScript():
    '''Builds the shell commands that run proteinortho in synteny mode
    on the two .faa files and returns them as a single string.

    The commands change into PROTEINORTHO_OUT_PATH, activate the py27
    environment and call the proteinortho script on both .faa files inside
    the PROJECT_NAME directory. The last command has no trailing newline.'''
    faa_args = (PROJECT_NAME, H_CTG_FAA, PROJECT_NAME, P_CTG_FAA)
    commands = [
        'cd %s\n' % PROTEINORTHO_OUT_PATH,
        'source activate py27\n',
        '%s -project=%s -synteny %s/%s %s/%s'
        % ((PROTEINORTHO_PATH, PROJECT_NAME) + faa_args),
    ]
    return ''.join(commands)


In [7]:
def exitIfFileNotInCwd(fileName):
    '''Checks if a file is in the current working directory (cwd)
    and if not, alerts the user to take action.

    Args:
        fileName: path to check with os.path.exists; a relative name is
            resolved against the cwd.

    Returns:
        True if the path exists.

    Raises:
        SystemExit: with exit status 1 if the path does not exist, after
            printing which file to copy into the cwd.
    '''
    if os.path.exists(fileName):
        return True
    print("Please copy %s file to %s" \
          % (fileName, os.getcwd()))
    # A missing input is an error, so exit with a non-zero status
    # (a bare sys.exit() would report success to the calling shell).
    sys.exit(1)

def writeGffIfDoesntExist(new_gff_file_path, orig_gff_file_path):
    '''Creates the proteinortho-compatible gff at new_gff_file_path from
    the gff at orig_gff_file_path, unless new_gff_file_path already
    exists. An existing file is never re-written; a message is printed
    instead.'''
    already_there = os.path.exists(new_gff_file_path)
    if already_there:
        print('gff file at: %s already exists... no new gff file was generated.' % new_gff_file_path)
        return
    rewriteGffForProteinortho(orig_gff_file_path, new_gff_file_path)

def setUp():
    '''Prepares the working directory for the proteinortho analysis.

    Changes into PROTEINORTHO_OUT_PATH, creates the PROJECT_NAME directory
    there if needed and changes into it (the process cwd stays changed
    afterwards). Then checks that both .faa files are present and writes
    the proteinortho-compatible .gff files next to them if they do not
    exist yet.

    Returns:
        True once all required files exist.

    Raises:
        SystemExit: (via exitIfFileNotInCwd) if a required file is missing.
    '''
    print("Checking for correct files and directories...")
    os.chdir(PROTEINORTHO_OUT_PATH)
    if not os.path.isdir(PROJECT_NAME):
        os.mkdir(PROJECT_NAME)
        # Report only after mkdir has succeeded, so the message is never
        # printed for a directory that could not be created.
        print("Created directory: " + os.path.join(os.getcwd(), PROJECT_NAME))
    os.chdir(PROJECT_NAME)

    exitIfFileNotInCwd(H_CTG_FAA)
    exitIfFileNotInCwd(P_CTG_FAA)

    # Swap the real extension for '.gff' rather than assuming that the
    # extension is exactly three characters long.
    h_gff_name = os.path.splitext(H_CTG_FAA)[0] + ".gff"
    p_gff_name = os.path.splitext(P_CTG_FAA)[0] + ".gff"

    writeGffIfDoesntExist(os.path.join(os.getcwd(), h_gff_name), ORIG_H_CTG_GFF_PATH)
    writeGffIfDoesntExist(os.path.join(os.getcwd(), p_gff_name), ORIG_P_CTG_GFF_PATH)

    # Confirm that the gff files really are in place before reporting success.
    exitIfFileNotInCwd(p_gff_name)
    exitIfFileNotInCwd(h_gff_name)

    print("Files and directories required for proteinortho analysis now exist.\n")

    return True
    

In [8]:
def main():
    '''Runs setUp() and prints the shell commands the user has to
    execute to start the proteinortho analysis.

    Raises:
        RuntimeError: if setUp() returns a falsy value.
    '''
    # Call setUp() outside of an assert statement: asserts are removed when
    # Python runs with -O, which would silently skip the whole setup.
    if not setUp():
        raise RuntimeError("setUp() did not complete successfully.")
    print("Execute the following commands on the command line:\n")
    print(getProteinorthoScript())

In [9]:
# Entry point: runs main() when this code is executed as the top-level
# module (__name__ == "__main__").
if __name__ == "__main__":
    main()

Checking for correct files and directories...
gff file at: /home/gamran/genome_analysis/Warrior/Richard/output/defining_alleles/proteinortho/ph_ctg_516/DK_0911_v03_h_ctg.protein.gff already exists... no new gff file was generated.
gff file at: /home/gamran/genome_analysis/Warrior/Richard/output/defining_alleles/proteinortho/ph_ctg_516/DK_0911_v03_p_ctg.protein.gff already exists... no new gff file was generated.
Files and directories required for proteinortho analysis now exist.

Execute the following commands on the command line:

cd /home/gamran/genome_analysis/Warrior/Richard/output/defining_alleles/proteinortho
source activate py27
/home/gamran/anaconda3/proteinortho_v5.16b/proteinortho5.pl -project=ph_ctg_516 -synteny ph_ctg_516/DK_0911_v03_h_ctg.protein.faa ph_ctg_516/DK_0911_v03_p_ctg.protein.faa
