# proteinortho

- Inputs: .*gff* files, .*faa* files
- Programs: **proteinortho**
- Purpose: **proteinortho** implements a blast-based approach to determine sets of (co-)orthologous proteins or nucleic acid sequences that generalises the reciprocal best alignment heuristic

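NB: this notebook, as written here, is meant to be run from another notebook, `DK_0911_defining_alleles_v02` (using the command `%run DK_0911_proteinortho.ipynb`). The calling notebook must define `GENOME_VERSION`, `GENOME_PATH` and `BASE_OUT_PATH` before running it.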

In [None]:
import pandas as pd
import os
import sys
import subprocess
import shutil
import re


## GENOME_VERSION, GENOME_PATH and BASE_OUT_PATH should be defined in the ipynb
## calling this ipynb. Each bare-name lookup below raises NameError when the name
## is missing, which is turned into an explanatory Exception.
try:
    GENOME_VERSION
except NameError:
    raise Exception('Please define GENOME_VERSION in the local ipynb to use this script.')
    
try:
    GENOME_PATH
    # echo the genome directory that is being used
    print(GENOME_PATH)
except NameError:
    raise Exception('Please define GENOME_PATH in the local ipynb to use this script.')
    
try:
    BASE_OUT_PATH
except NameError:
    raise Exception('Please define BASE_OUT_PATH in the local ipynb to use this script.')

# alias of GENOME_VERSION; used below to build the .faa file names
GENOME = GENOME_VERSION

# directory in which the proteinortho bash script is written and run
PROTEINORTHO_OUT_PATH = os.path.join(BASE_OUT_PATH, 'proteinortho')

# original gff3 annotations, read when the proteinortho-compatible gff files are written
ORIG_H_CTG_GFF_PATH = os.path.join(GENOME_PATH, '%s_h_ctg.anno.gff3' % GENOME_VERSION)
ORIG_P_CTG_GFF_PATH = os.path.join(GENOME_PATH, '%s_p_ctg.anno.gff3' % GENOME_VERSION)

# NOTE(review): hard-coded, machine-specific location of the proteinortho script
PROTEINORTHO_PATH = '/home/gamran/anaconda3/proteinortho_v5.16b/proteinortho5.pl'
# used both as the -project name and as the sub-directory (inside
# PROTEINORTHO_OUT_PATH) that holds the .faa and .gff input files
PROJECT_NAME = 'ph_ctg_516'
H_CTG_FAA = '%s_h_ctg.anno.protein.faa' % GENOME
P_CTG_FAA = '%s_p_ctg.anno.protein.faa' % GENOME

In [None]:
## obsolete function in v031

# def getIdToLocusTagDict(df):
#     df = df[df['type'] == 'mRNA']
    
#     locusSearch = re.compile(r'^.*locus_tag=(.*?)(;|$)')
#     idSearch = re.compile(r'ID=(.*?);')
    
#     d = {}
    
#     for attr in df['attributes']:
#         val = locusSearch.match(attr).group(1)
#         key = idSearch.match(attr).group(1)
#         if key in d.keys():
#             print("Unexpected: id key: %s already in dictionary." % key)
#         d[key] = val
#     return d

# def rewriteGffForProteinortho(gffInLoc, gffOutLoc):
#     '''Takes a gff file at gffInLoc and rewrites it to conform to
#     the format required for proteinortho analysis.'''
#     ctgDf = pd.read_table(gffInLoc, skiprows = 1, header = None, \
#                            names = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'])
    
#     # get dict mapping ID to locus_tag from mRNA entries
#     # need attribute to have 'ID=locus_tag' because .faa file has ID=locus_tag
#     # will be used in this way: CDS.parent = mRNA.id =(dict)> locus_tag
#     d = getIdToLocusTagDict(ctgDf)
    
#     # subset DataFrame to only CDS type (only type required for proteinortho analysis)
#     ctgDf = ctgDf[ctgDf['type'] == 'CDS']
    
#     # turn attribute column into parent value
#     parentSearch = re.compile(r'^.*Parent=(.*?)(;|$)')
#     ctgDf['attributes'] = ctgDf['attributes'].apply(lambda x: parentSearch.match(x).group(1))
    
#     # map CDS.parent values (= mRNA.id) in attribute column to locus_tag
#     ctgDf['attributes'] = ctgDf['attributes'].map(d)
    
#     # make attribute column into format 'ID=locus_tag' to be readable by proteinortho
#     ctgDf['attributes'] = ctgDf['attributes'].apply(lambda x: 'ID=%s;'%x)
    
#     with open(gffOutLoc, 'w') as outfile:
#         ctgDf.to_csv(outfile, sep='\t', header = False, index = False)
#     return ctgDf

From the [proteinortho manual](https://www.bioinf.uni-leipzig.de/Software/proteinortho/manual.html):

>Please note that you need additional data to include synteny, namely the gene positions in GFF3 format. As Proteinortho is primarily made for proteins, **it will only accept GFF entries of type CDS** (column #3 in the GFF-file). The **attributes column (#9) must contain Name=GENE IDENTIFIER** where GENE IDENTIFIER corresponds to the respective identifier in the FASTA format. It **may not contain a semicolon (;)**! Alternatively, you can also set **ID=GENE IDENTIFIER**.

Thus, a new gff file that is compatible with these requirements must be written.

In [None]:
def rewriteGffForProteinortho(gffInLoc, gffOutLoc):
    '''Takes a gff file at gffInLoc and rewrites it to conform to
    the format required for proteinortho analysis.

    Only rows of type CDS are kept, and the attributes column of each
    kept row is replaced by 'ID=<Parent value>;', where <Parent value>
    is taken from the row's Parent= attribute (the mRNA ID, consistent
    with the fasta headers).

    gffInLoc  -- path of the original gff file; its first line is skipped.
    gffOutLoc -- path the tab-separated, header-less result is written to.

    Returns the rewritten DataFrame.
    Raises ValueError if a CDS row has no Parent= attribute; nothing is
    written to gffOutLoc in that case.'''
    columns = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']
    ctgDf = pd.read_table(gffInLoc, skiprows = 1, header = None, names = columns)

    # subset DataFrame to only CDS type (only type required for proteinortho analysis);
    # .copy() makes the subset an independent frame so that the column assignment
    # below does not act on a slice of the original (SettingWithCopyWarning)
    ctgDf = ctgDf[ctgDf['type'] == 'CDS'].copy()

    parentSearch = re.compile(r'^.*Parent=(.*?)(;|$)')

    def parentToIdAttribute(attr):
        '''Turns one attributes value into 'ID=<Parent value>;'.'''
        # a missing attributes field is read as a non-string (NaN)
        found = parentSearch.match(attr) if isinstance(attr, str) else None
        if found is None:
            raise ValueError('CDS entry in %s has no Parent attribute: %r' % (gffInLoc, attr))
        return 'ID=%s;' % found.group(1)

    # make attribute column into format 'ID=parent ID tag' to be readable by proteinortho
    ctgDf['attributes'] = ctgDf['attributes'].apply(parentToIdAttribute)

    with open(gffOutLoc, 'w') as outfile:
        ctgDf.to_csv(outfile, sep='\t', header = False, index = False)
    return ctgDf

In [None]:
def getProteinorthoScript():
    '''Builds the body of the bash script that runs proteinortho in
    synteny mode on the two .faa files, and returns it as one string
    (three commands separated by newlines, no trailing newline).'''
    # the .faa files are addressed relative to the output directory
    faaArgs = ['%s/%s' % (PROJECT_NAME, faa) for faa in (H_CTG_FAA, P_CTG_FAA)]
    proteinorthoCall = ' '.join(
        [PROTEINORTHO_PATH, '-project=%s' % PROJECT_NAME, '-synteny', '-singles'] + faaArgs)

    commands = [
        'cd %s' % PROTEINORTHO_OUT_PATH,
        'source activate py27',
        proteinorthoCall,
    ]
    return '\n'.join(commands)


In [None]:
def exitIfFileNotInCwd(fileName):
    '''Makes sure fileName exists (relative to the current working
    directory). A missing .faa file is copied from the matching .fa
    file in GENOME_PATH when that file exists; otherwise the user is
    asked to copy the file and the program exits. Returns True when
    the file is (now) present.'''
    if os.path.exists(fileName):
        return True

    # .faa is same as .fa, but simply indicates that amino acids are in the sequence
    faName = fileName[:-1]
    canCopyFromGenome = fileName.endswith('.faa') and os.path.exists(os.path.join(GENOME_PATH, faName))

    if not canCopyFromGenome:
        print("Please copy %s file to %s" % (fileName, os.getcwd()))
        sys.exit()

    print("Copying %s file to %s" % (fileName, os.getcwd()))
    source = os.path.join(GENOME_PATH, faName)
    destination = os.path.join(os.getcwd(), fileName)
    shutil.copy2(source, destination)
    return True

def writeGffIfDoesntExist(new_gff_file_path, orig_gff_file_path):
    '''Generates the proteinortho-compatible gff file at
    new_gff_file_path from the original gff at orig_gff_file_path,
    unless a file is already present at new_gff_file_path, in which
    case nothing is written.'''
    if os.path.exists(new_gff_file_path):
        print('gff file at: %s already exists... no new gff file was generated.' % new_gff_file_path)
        return

    paths = (new_gff_file_path, orig_gff_file_path)
    print('Writing a new proteinortho-compatible gff file at %s based on original at %s' % paths)
    rewriteGffForProteinortho(orig_gff_file_path, new_gff_file_path)

def setUp():
    '''Prepares the proteinortho project directory: changes into
    PROTEINORTHO_OUT_PATH/PROJECT_NAME (creating PROJECT_NAME if needed),
    checks that both .faa files are available, writes the
    proteinortho-compatible gff files if they do not exist yet and
    checks that they are present afterwards. Returns True.'''
    print("Checking for correct files and directories...")
    os.chdir(PROTEINORTHO_OUT_PATH)
    if not os.path.isdir(PROJECT_NAME):
        print("Created directory: " + os.path.join(os.getcwd(), PROJECT_NAME))
        os.mkdir(PROJECT_NAME)
    os.chdir(PROJECT_NAME)

    for faaName in (H_CTG_FAA, P_CTG_FAA):
        exitIfFileNotInCwd(faaName)

    # the gff files carry the same base name as the .faa files
    hGffName = H_CTG_FAA[:-3] + "gff"
    pGffName = P_CTG_FAA[:-3] + "gff"
    projectDir = os.getcwd()

    gffJobs = (
        (os.path.join(projectDir, hGffName), ORIG_H_CTG_GFF_PATH),
        (os.path.join(projectDir, pGffName), ORIG_P_CTG_GFF_PATH),
    )
    for newGffPath, origGffPath in gffJobs:
        writeGffIfDoesntExist(newGffPath, origGffPath)

    for gffName in (pGffName, hGffName):
        exitIfFileNotInCwd(gffName)

    print("Files and directories required for proteinortho analysis exist.\n")

    return True
    

In [None]:
import subprocess

def writeBashScript(bashScriptName, folderPath):
    '''Writes the proteinortho bash script with name bashScriptName
    and in the folder folderPath.

    The script consists of a bash shebang line followed by the commands
    returned by getProteinorthoScript(). Note that the current working
    directory is changed to folderPath as a side effect.'''
    os.chdir(folderPath)

    # 'with' guarantees the file is closed even if building or writing
    # the script contents raises
    with open(bashScriptName, 'w') as outFile:
        outFile.write('#!/bin/bash\n')
        outFile.write(getProteinorthoScript())

def runBashScript(bashScript, folderPath):
    '''Runs bashScript and output is piped in real-time to Python.

    bashScript -- name of the script, run with bash from within folderPath.
    folderPath -- directory to change into before running (the current
                  working directory stays changed afterwards).

    stderr of the script is merged into its stdout. Each output line is
    decoded to text before printing, so it is not shown as a bytes
    literal (b'...'). A warning is printed if the script exits with a
    non-zero status. Always returns True once the script has finished.'''
    os.chdir(folderPath)
    cmd = ['bash', bashScript]
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

    try:
        # readline returns b'' only at end of output, which ends the loop
        for line in iter(p.stdout.readline, b''):
            # undecodable bytes are replaced rather than aborting the run
            print(line.decode('utf-8', 'replace').rstrip())
    finally:
        # release the pipe and reap the child process
        p.stdout.close()
        returnCode = p.wait()

    if returnCode != 0:
        print('Warning: %s exited with non-zero status %d' % (bashScript, returnCode))

    return True


In [None]:
# NOTE(review): hard-coded, machine-specific scripts directory; the %run below
# resolves 'file_counting.ipynb' relative to it, so the order of these two
# statements matters.
os.chdir('/home/gamran/genome_analysis/Warrior/Richard/scripts')
# IPython magic (not plain Python). Presumably this is what defines getFolderName
# and getDiscrepancies, which reportDiscrepancies calls -- TODO confirm.
%run 'file_counting.ipynb'

# Reference dictionary passed to reportDiscrepancies as refDict.
# NOTE(review): keys look like file extensions and values like the number of
# files expected per extension after a proteinortho run -- confirm against
# getDiscrepancies in file_counting.ipynb.
PROTEINORTHO_OUT_PATH_DICT = {'blast-graph': 1,
                             'faa': 2,
                             'ffadj-graph': 1,
                             'gff': 2,
                             'phr': 2,
                             'pin': 2,
                             'poff': 1,
                             'poff-graph': 1,
                             'proteinortho': 1,
                             'proteinortho-graph': 1,
                             'psq': 2}

def reportDiscrepancies(folderPath, refDict, ignoreExts = None):
    '''Checks whether the proteinortho files already exist, according
    to the refDict.

    folderPath -- folder to check.
    refDict    -- reference dictionary handed to getDiscrepancies.
    ignoreExts -- handed to getDiscrepancies; defaults to an empty list.

    Returns False when getDiscrepancies reports nothing (an empty
    string), otherwise prints the reported discrepancies and returns
    True.'''
    # a fresh list per call instead of a mutable default shared between calls
    if ignoreExts is None:
        ignoreExts = []

    print("Checking whether proteinortho files already exist in %s..." % folderPath)
    # NOTE(review): the result was never used; the call is kept in case the
    # helper validates folderPath -- confirm against file_counting.ipynb
    getFolderName(folderPath)
    print('Folder reference dictionary:\n%s' % refDict)

    discrepancies = getDiscrepancies(folderPath, refDict, ignoreExts)

    if discrepancies == '':
        print("All proteinortho files, according to the reference dictionary, appear to already exist.")
        return False

    print(discrepancies)
    return True

In [None]:
def main():
    '''Runs the whole workflow: sets up the input files and directories,
    then runs proteinortho unless reportDiscrepancies finds that its
    output files already exist in PROTEINORTHO_OUT_PATH. Returns True.

    Raises AssertionError if setUp() does not return a true value.'''
    # setUp() is called outside of an assert statement so that it still runs
    # when Python strips asserts (python -O)
    if not setUp():
        raise AssertionError('setUp() did not complete successfully.')

    fileDiscrepancies = reportDiscrepancies(PROTEINORTHO_OUT_PATH, PROTEINORTHO_OUT_PATH_DICT, ['sh'])
    if fileDiscrepancies:
        print("Not all files supposed to be generated by proteinortho appear to be present. Running proteinortho now (this may take some time)...")
        writeBashScript('run_proteinortho.sh', PROTEINORTHO_OUT_PATH)
        runBashScript('run_proteinortho.sh', PROTEINORTHO_OUT_PATH)
        print("Proteinortho finished running.")
    else:
        print("\nProteinortho appears to have been ran previously, therefore it was not run this time.")
    return True

In [None]:
# Entry point: run the workflow when this notebook/script is executed.
# NOTE(review): presumably __name__ is also "__main__" when this notebook is
# run via %run from the calling notebook -- confirm.
if __name__ == "__main__":
    main()