In [None]:
import glob
from Bio import Phylo
from Bio.Phylo.PhyloXML import Phylogeny

In [None]:
# Minimum support a node must reach to be accepted; it is also embedded in the
# names of the three output files so runs with different cutoffs do not collide.
support_threshold = 0
threshold_tag = str(support_threshold)

# Inputs: the UTEX2797 GFF3 annotation and every gene-tree file matching the glob.
gfffile = '../../../figshare/annotation/genes_scaffolded_assembly/UTEX2797_v1.gff3'
treefiles = glob.glob('../../../figshare/orthofinder/Gene_Trees_IQTREE/*.ntr.tree')

# Outputs: one tab-separated table plus one BED file per sister group (orange / pink).
outfile = ''.join(['../../../figshare/orthofinder/Comparative_Genomics_Statistics/UTEX2797_sisrels_support', threshold_tag, '.txt'])
bedfile1 = ''.join(['../../../figshare/orthofinder/Comparative_Genomics_Statistics/UTEX2797_sisrels_orange', threshold_tag, '.bed'])
bedfile2 = ''.join(['../../../figshare/orthofinder/Comparative_Genomics_Statistics/UTEX2797_sisrels_pink', threshold_tag, '.bed'])

#len(treefiles)

In [None]:
# Map each gene ID to [scaffold, start, stop], read from the 'gene' rows of the GFF3 file.
geneDict = {}

# 'with' closes the handle even if a malformed row raises while parsing.
with open(gfffile) as fi:
    for line in fi:
        # Blank lines and '#' comment/directive lines carry no tab-separated
        # columns, so indexing them would raise IndexError.
        if not line.strip() or line.startswith('#'):
            continue

        # Strip the newline first: otherwise it stays attached to the gene ID
        # whenever column 9 holds a single attribute with no ';' after it.
        fields = line.rstrip('\n').split('\t')

        # Rows with fewer than nine columns (e.g. sequence lines of an embedded
        # FASTA section) cannot describe a gene feature and are skipped.
        if len(fields) < 9 or fields[2] != 'gene':
            continue

        # The gene ID is the value of the first key=value pair in column 9.
        gene = fields[8].split(';')[0].split('=')[1]

        scaffold = fields[0]
        start = int(fields[3])
        stop = int(fields[4])

        geneDict[gene] = [scaffold, start, stop]

### Given a minimum bootstrap support value, iterate through each gene tree and:
1. Identify all UTEX2797 genes
2. For each gene, move up the tree to the first ancestral node that meets the minimum bootstrap support and contains at least one non-UTEX2797 leaf
3. Collect the strains of all non-UTEX2797 descendants of that node as the sister strains of that gene

In [None]:
def all_parents(tree):
    """Build a child -> parent lookup table for ``tree``.

    Walks the clades yielded by ``tree.find_clades(order="level")`` and, for
    every clade, records it as the parent of each child it iterates over.

    Returns
    -------
    dict
        Keys are child clades, values are their parent clade. A clade that is
        nobody's child (such as the root) has no entry.
    """
    return {
        child: clade
        for clade in tree.find_clades(order="level")
        for child in clade
    }

In [None]:
def get_support(node):
    """Return ``node.confidence`` converted to ``int``.

    A confidence that ``int()`` rejects with ``TypeError`` (for example
    ``None``) is reported as 0. Any other conversion error propagates to
    the caller.
    """
    try:
        return int(node.confidence)
    except TypeError:
        # No usable support value on this node: count it as zero support.
        return 0

In [None]:
# Orthogroups in which at least one UTEX2797 gene reached a node meeting the support cutoff.
informative_trees = set()

# Strain sets that define the two sister groups written to the BED files.
orange_set = {'12B1', 'CCMP3037'}
pink_set = {'RCC3703', 'CCMP2941'}


def _sister_strains(clade):
    """Return the strain names, other than UTEX2797, among the leaves below ``clade``.

    The strain is read as the second '_'-separated field of each leaf's string form.
    """
    strains = {str(leaf).split('_')[1] for leaf in clade.get_terminals()}
    strains.discard('UTEX2797')
    return strains


# 'with' closes all three outputs even if a tree cannot be parsed or a gene
# is missing from geneDict part-way through the run.
with open(outfile, 'w') as fo, open(bedfile1, 'w') as bo1, open(bedfile2, 'w') as bo2:
    fo.write('# Orthogroup\tUTEX2797 gene\tUltrafast bootstrap support\tSister strains\n')

    bo1.write('# Sister relationships that correspond to predicted subgenomes\n')
    bo1.write('# orange: UTEX2797 groups with 12B1 and/or CCMP3037\n')

    bo2.write('# Sister relationships that correspond to predicted subgenomes\n')
    bo2.write('# pink: UTEX2797 groups with RCC3703 and/or CCMP2941\n')

    for treefile in treefiles:
        # The orthogroup name is the file name up to its first '.'.
        orthogroup = treefile.split('/')[-1].split('.')[0]

        tree = Phylo.read(treefile, 'newick')
        tree.rooted = True

        parents = all_parents(tree)

        for leaf in tree.get_terminals():
            if 'UTEX2797' not in str(leaf):
                continue

            parent = parents[leaf]
            support = get_support(parent)
            sister_set = _sister_strains(parent)

            # Climb while the current node is under-supported or holds only
            # UTEX2797 leaves. 'parent in parents' is false only for the root,
            # so the climb stops there instead of raising KeyError on
            # parents[root] (the original 'A and B or C' condition let the
            # empty-sister case walk past the root).
            while parent in parents and (support < support_threshold or not sister_set):
                parent = parents[parent]
                support = get_support(parent)
                sister_set = _sister_strains(parent)

            if support < support_threshold:
                continue

            informative_trees.add(orthogroup)
            # Sorted so the sister column is identical from run to run
            # (set iteration order for strings is not stable across runs).
            fo.write(orthogroup + '\t' + str(leaf) + '\t' + str(support) + '\t' + ', '.join(sorted(sister_set)) + '\n')

            # geneDict key: 'UTEX2797' followed by the leaf name up to its first '.'.
            gene = 'UTEX2797' + str(leaf).split('.')[0]
            scaffold, start, stop = geneDict[gene]

            # Reached only when the whole tree holds nothing but UTEX2797
            # leaves. An empty set is a subset of both strain sets, so without
            # this guard the gene would land in both BED files.
            if not sister_set:
                print('warning: UTEX2797 has no sisters', orthogroup, gene)
                continue

            if sister_set.issubset(orange_set):
                bo1.write(scaffold + '\t' + str(start) + '\t' + str(stop) + '\t' + orthogroup + '_' + gene + '_orange\n')

            if sister_set.issubset(pink_set):
                bo2.write(scaffold + '\t' + str(start) + '\t' + str(stop) + '\t' + orthogroup + '_' + gene + '_pink\n')

In [None]:
# Report how many gene trees yielded at least one relationship passing the cutoff.
n_informative = len(informative_trees)
n_trees = len(treefiles)

print(n_informative, ' Gene trees passed support value cutoff')

# glob returns an empty list when the tree path matches nothing; guard the
# division so the cell reports that instead of raising ZeroDivisionError.
if n_trees:
    print(n_informative / n_trees * 100, '% of input trees ')
else:
    print('warning: no input trees were found; percentage is undefined')