# Prepare files for constructing a species tree
using both supermatrix and supertree approaches

In [None]:
import glob
import os.path
import numpy as np
from Bio import Phylo
from ete3 import PhyloTree
from Bio import SeqIO

In [None]:
# Root of the OrthoFinder results; every other path below hangs off it.
ogdir = '../../figshare/orthofinder/'

# Inputs: gene-count table, per-orthogroup gene trees, and alignments.
ogfile = f'{ogdir}Orthogroups/Orthogroups.GeneCount.tsv'
ogtreedir = f'{ogdir}Gene_Trees_IQTREE/'
ogalndir = f'{ogdir}MultipleSequenceAlignments_GUIDANCE/'

# Files this notebook writes as input for the species-tree programs.
intreefile = f'{ogdir}Species_Tree/Supertree_in.tree'
inalnfile = f'{ogdir}Species_Tree/Supermatrix_in.aln'
inparamfile = f'{ogdir}Species_Tree/Supermatrix_in.param'
inparamprotfile = f'{ogdir}Species_Tree/Supermatrix_in.param_prot'

# Species-tree outputs: the ASTRAL tree, and a base path for IQ-TREE
# (presumably the output prefix of the IQ-TREE run -- it is not used in the
# cells of this notebook).
astral_treefile = f'{ogdir}Species_Tree/Supertree_astral.tree'
iqtree_base = f'{ogdir}Species_Tree/Supermatrix_iqtree'

In [None]:
# Collect the single-copy orthogroups: those with exactly one gene in every
# species column of the gene-count table and a gene tree present on disk.
scogs = set()

# `with` closes the table even if a row fails to parse.
with open(ogfile) as fi:
    for line in fi:
        fields = line.rstrip().split('\t')

        # A usable row has at least the orthogroup ID and the trailing total
        # column; this also skips blank lines instead of crashing on them.
        if len(fields) < 2:
            continue

        og = fields[0]
        if og == 'Orthogroup':
            # Header row.
            continue

        # Per-species counts sit between the ID (first column) and the
        # total (last column).
        genecounts = [int(count) for count in fields[1:-1]]

        # Single copy everywhere: no species is missing (0) and none has
        # more than one copy (>1).
        if not all(count == 1 for count in genecounts):
            continue

        # NOTE(review): this checks for '<og>.mpr.tree' directly under
        # ogtreedir, while the supertree cell reads '<og>/<og>.contree' --
        # confirm both files exist for every orthogroup kept here.
        intree = ogtreedir + og + '.mpr.tree'
        if not os.path.exists(intree):
            continue

        scogs.add(og)

In [None]:
# Report how many orthogroups are in `scogs`, i.e. how many will be used for
# the species tree.
print("Number of orthogroups for species tree:", len(scogs))

# Construct the supertree species tree
ASTRAL: https://github.com/smirarab/ASTRAL

In [None]:
# Build the ASTRAL input: one gene tree per single-copy orthogroup, with every
# tip renamed to its species label so tips match across gene trees.
# `phasepairs` records, per orthogroup, the original tip names that belong to
# UTEX2797 or 12B1.
phasepairs = {}
treelist = []

# Iterate in sorted order: `scogs` is a set of strings, whose iteration order
# can change between interpreter sessions, which would reorder the output
# files from run to run.
for og in sorted(scogs):
    ogtreefile = ogtreedir + og + '/' + og + '.contree'
    tree = Phylo.read(ogtreefile, "newick")
    phasepairs[og] = []

    for term in tree.get_terminals():
        # The species label is whatever follows the last underscore of the
        # tip name.
        species = term.name.split('_')[-1]
        if species in ('UTEX2797', '12B1'):
            # Keep the full gene name before it is overwritten below.
            phasepairs[og].append(term.name)
        term.name = species

    treelist.append(tree)

# All relabelled gene trees go into a single Newick file.
Phylo.write(treelist, intreefile, "newick")



In [None]:
# Write one line per orthogroup: the orthogroup ID, a tab, then the
# comma-separated gene names recorded in `phasepairs`.
# `with` closes the file even if a write fails.
with open(ogdir + 'Species_Tree/single_copy_genes_phasegenomes.txt', 'w') as fo:
    for og, genes in phasepairs.items():
        fo.write(og + '\t' + ', '.join(genes) + '\n')

# Run ASTRAL

In [None]:
# Run ASTRAL 5.7.1 on the gene trees in `intreefile` (-i); the result is
# written to `astral_treefile` (-o) and stderr is redirected to
# `<astral_treefile>.log`. IPython substitutes the {name} placeholders with
# the notebook variables before handing the command to the shell.
# NOTE(review): the jar path is an absolute, machine-specific location --
# adjust it when running elsewhere.
! java -jar /depot/jwisecav/apps/bell/ASTRAL-5.7.1/Astral/astral.5.7.1.jar \
  -i {intreefile} \
  -o {astral_treefile} \
  2> {astral_treefile}.log

# Construct the supermatrix species tree
The partition files written below use the RAxML-style format described at http://www.iqtree.org/doc/Complex-Models, for example:

```
DNA, part1 = 1-100
DNA, part2 = 101-384
```

In [None]:
# Load the trimmed alignment of every single-copy orthogroup into
# seqDict[strain][og] = aligned sequence string.
seqDict = {}

# Iterate in sorted order: the insertion order of these dicts becomes the
# partition order of the supermatrix, and iterating the `scogs` set directly
# can give a different order in every interpreter session.
for og in sorted(scogs):
    ogalnfile = ogalndir + og + '.trim.aln'

    for seq_record in SeqIO.parse(ogalnfile, "fasta"):
        # The strain label is whatever follows the last underscore of the
        # record ID.
        strain = seq_record.id.split('_')[-1]
        sequence = str(seq_record.seq)

        # NOTE(review): if two records in one alignment share a strain
        # label, the later one silently replaces the earlier one.
        seqDict.setdefault(strain, {})[og] = sequence

In [None]:
# Write the nucleotide partition file: one 'DNA, <og> = <start>-<end>' line
# per orthogroup, with 1-based inclusive column ranges laid end to end.
# Lengths and order are taken from the first strain in seqDict.
first = next(iter(seqDict))

# `with` closes the file even if a write fails.
with open(inparamfile, 'w') as fo:
    end = 0  # last column of the previous partition
    for og, sequence in seqDict[first].items():
        start = end + 1
        end += len(sequence)
        fo.write('DNA, ' + og + ' = ' + str(start) + '-' + str(end) + '\n')

In [None]:
# Write the partition file for the protein-level version of the supermatrix:
# the same layout as the nucleotide partition file, but every partition is a
# third of the nucleotide alignment length (one column per codon).
# Lengths and order are taken from the first strain in seqDict.
first = next(iter(seqDict))

# `with` closes the file even if a write fails.
with open(inparamprotfile, 'w') as fo:
    end = 0  # last column of the previous partition
    for og, sequence in seqDict[first].items():
        start = end + 1
        # Integer division keeps the arithmetic exact.
        # NOTE(review): a length that is not a multiple of 3 is rounded
        # down silently -- confirm the trimmed alignments are codon-aligned.
        end += len(sequence) // 3
        # NOTE(review): the partition label is still 'DNA' although these
        # ranges describe protein columns -- confirm this is what the
        # downstream IQ-TREE run expects.
        fo.write('DNA, ' + og + ' = ' + str(start) + '-' + str(end) + '\n')

In [None]:
# Write the supermatrix as FASTA: one record per strain, whose sequence is
# the concatenation of that strain's orthogroup alignments.

# Concatenate every strain in the orthogroup order of the first strain, the
# same order the partition files are written in, so the columns always line
# up with the partition ranges. A strain that lacks one of these orthogroups
# raises KeyError rather than producing a silently shifted row.
og_order = list(next(iter(seqDict.values()), {}))

# `with` closes the file even if a write fails.
with open(inalnfile, 'w') as fo:
    for strain, og_seqs in seqDict.items():
        fo.write('>' + strain + '\n')
        fo.write(''.join(og_seqs[og] for og in og_order))
        fo.write('\n')

# Run IQ-TREE

```
scripts/3_run_supermatrix_speciestree.sub 
```