# Extract gene IDs with significant alien index scores
12B1 and UTEX 2797 predicted proteomes were run through the alien index pipeline. 

Genes were considered candidates if:

* AI > 0.1
* total hits in dbs >= 50
* if the top hit is to a eukaryote:
    * hits to other haptophytes >= 5
    * genes whose top hit is a dinoflagellate were skipped

In [None]:
# Input tables used by the cells below.
infile = '../query/alien_index_recipientPRYMNESIUM_ancestralHAPTISTA.txt'
ogfile = '../prymnesium_paper/7_Pangenome/1_OrthoFinder/Orthogroups/Orthogroups.txt'
nodesfile = '../dbs/nodes.dmp'
mergedfile = '../dbs/merged.dmp'

# Gene IDs to ignore: they sit on contaminated contigs that were removed
# from the assembly in v0.4.
skiplist = [
    'g2235.t1_12B1',
    'g2236.t2_12B1',
    'g2237.t1_12B1',
    'g2238.t1_12B1',
    'g2239.t1_12B1',
    'g2240.t1_12B1',
    'g2241.t1_12B1',
    'g2242.t1_12B1',
    'g2243.t1_12B1',
    'g6975.t1_12B1',
    'g6976.t1_12B1',
    'g6977.t1_12B1',
    'g6978.t1_12B1',
    'g6979.t1_12B1',
    'g6980.t1_12B1',
    'g6981.t1_12B1',
    'g6982.t1_12B1',
]


In [None]:
# Parse the orthogroup file into two lookups:
#   ogDict    : orthogroup ID -> list of member gene IDs
#   revogDict : gene ID -> orthogroup ID
# Each line is split on whitespace; the first token (up to its ':') is the
# orthogroup ID and the remaining tokens are the member genes.
ogDict = {}
revogDict = {}

# 'with' guarantees the handle is closed even if parsing raises.
with open(ogfile) as fi:
    for line in fi:
        genelist = line.rstrip().split()
        if not genelist:
            # Blank line: nothing to pop, skip instead of raising IndexError.
            continue
        og = genelist.pop(0).split(':')[0]

        ogDict[og] = genelist

        for gene in genelist:
            revogDict[gene] = og

In [None]:
# Build a child -> parent taxid map (all IDs kept as strings) from the two
# '\t|\t'-delimited taxonomy dump files.
parentDict = {}
lin = {}  # not used in the cells shown here; kept so existing references still resolve

# nodes file: first field is the node taxid, second field is its parent.
with open(nodesfile) as nfi:
    for line in nfi:
        col = line.rstrip().split('\t|\t')
        parentDict[col[0]] = col[1]

# merged file: first field is mapped to the second field, so that walking
# "up" from the first ID continues from the second one.
with open(mergedfile) as mfi:
    for line in mfi:
        col = line.rstrip().split('\t|\t')
        # The last field can still carry a trailing '\t|' terminator; keep only
        # the text before the first tab.
        parentDict[col[0]] = col[1].split('\t')[0]

def taxdump(taxid):
    """Return the chain of taxids from *taxid* up towards the root.

    The walk follows the module-level ``parentDict`` (child -> parent).  It
    stops after appending '1' (the root), after appending an ID that has no
    entry in ``parentDict``, or immediately when the current ID is the empty
    string (which is not appended).

    Returns a list of taxid strings, starting with the query ID.
    """
    lineage = []

    while taxid != '':
        lineage.append(taxid)

        # Root reached or unknown ID: nothing further to climb.
        if taxid == '1' or taxid not in parentDict:
            break

        taxid = parentDict[taxid]

    return lineage


In [None]:
# Filter the alien-index table into HGT candidate genes.
#
# A gene becomes a candidate when total hits >= 50 and AI > 0.1, and either
#   * its top hit is eukaryotic (but not dinoflagellate) and it has more than
#     4 hits in the ancestral group        -> written to outfile1, or
#   * its top hit is not eukaryotic at all -> written to outfile2.
# Genes whose top hit falls in the dinoflagellate lineage are dropped.
# All accepted gene IDs are also collected in the `candidates` set.
outfile1 = '../query/candidates_from_other_euks_no_dinos.txt'
outfile2 = '../query/candidates_from_bacteria.txt'

candidates = set()

# One 'with' for all three handles so every file is closed even if a row
# fails to parse.
with open(infile) as fi, open(outfile1, 'w') as fo1, open(outfile2, 'w') as fo2:
    for line in fi:
        # Skip comment/header lines and blank lines (a blank line would
        # otherwise fail on the column lookups below).
        if not line.strip() or line.startswith('#'):
            continue

        col = line.rstrip().split('\t')
        gene = col[0]
        if gene in skiplist:
            continue

        # 'na' in the top-hit column: nothing to classify.
        if col[9] == 'na':
            continue

        # The top-hit field is '-'-separated; its second piece is used as the taxid.
        bestTaxid = col[9].split('-')[1]

        # Classify the top hit by walking its lineage leaf -> root.  2864 is
        # the taxid treated as dinoflagellates and 2759 as eukaryotes; the
        # first one met wins, so a dinoflagellate is labelled 'Dino' before
        # the walk reaches the eukaryote node.
        bestDom = 'notEuk'
        for taxon_id in taxdump(bestTaxid):
            if taxon_id == '2864':
                bestDom = 'Dino'
                break
            if taxon_id == '2759':
                bestDom = 'Euk'
                break

        ai = float(col[13])
        noAncest = int(col[17])
        # The total-hit column is capped and reported as the literal '>200'.
        noTotal = 200 if col[19] == '>200' else int(col[19])

        if noTotal >= 50 and ai > 0.1:
            if bestDom == 'Euk':
                if noAncest > 4:
                    candidates.add(gene)
                    fo1.write(gene + '\n')
            elif bestDom == 'notEuk':
                candidates.add(gene)
                fo2.write(gene + '\n')

In [None]:
# Number of AI candidate genes: every gene that passed the filters above,
# counting both the eukaryote-top-hit and non-eukaryote-top-hit sets.
len(candidates)

In [None]:
# Collapse candidate genes to the distinct orthogroups that contain them.
# A candidate missing from revogDict raises KeyError.
ogCandidates = {revogDict[gene] for gene in candidates}

In [None]:
# 221 AI candidates collapsed into 95 orthogroups
# (this cell displays the number of distinct candidate orthogroups)
len(ogCandidates)

# Combine orthogroup sequences with top hits from AI pipeline

In [None]:
from Bio import SeqIO
import glob

In [None]:
# Load every protein in the combined FASTA into memory: record ID -> sequence string.
# If an ID occurs more than once, the last record wins.
fastafile = '../prymnesium_paper/6_Genome_annotation/5_Functional/fasta/all_proteins.fa.combined'

seqDict = {
    record.id: str(record.seq)
    for record in SeqIO.parse(fastafile, 'fasta')
}

In [None]:
# hgtDict: query gene -> {record ID -> sequence} for every per-gene FASTA
# produced by the pipeline.  The gene name is the file name up to the first
# '.fa'.  The eukaryote set is read first, then the non-eukaryote set, so a
# gene present in both ends up with the non-eukaryote records.
hgtDict = {}

for fastafiles in ('../tree-out-Euk/fasta/*fa', '../tree-out-notEuk/fasta/*fa'):
    for fastafile in glob.glob(fastafiles):
        gene = fastafile.split('/')[-1].split('.fa')[0]
        hgtDict[gene] = {
            record.id: str(record.seq)
            for record in SeqIO.parse(fastafile, 'fasta')
        }

In [None]:
# For each candidate orthogroup, write one FASTA containing
#   * every member gene of the orthogroup, and
#   * for members that have their own hit file (i.e. are keys of hgtDict),
#     all sequences from that file except the 'QUERY-...' copy of the gene.
# Members that are themselves keys of hgtDict get an 'AI-' prefix in the header.
for og in ogCandidates:
    fastaDict = {}
    for gene in ogDict[og]:
        fastaDict[gene] = seqDict[gene]

        for hitgene, hitseq in hgtDict.get(gene, {}).items():
            # The query itself is already included under its plain gene ID.
            if hitgene.split('-')[0] == 'QUERY':
                continue
            fastaDict[hitgene] = hitseq

    outfile = '../tree-out/fasta/' + og + '.fa'
    # 'with' closes the file even if a write fails part-way through.
    with open(outfile, 'w') as fo:
        for gene, sequence in fastaDict.items():
            header = '>AI-' if gene in hgtDict else '>'
            fo.write(header + gene + '\n' + sequence + '\n')


# Create supplemental table for HGT1-HGT11

In [None]:
import os

In [None]:
def _stem(path):
    """Return the file name of *path* truncated at its first '.'."""
    # NOTE(review): a name with an internal dot (e.g. 'g1.t1_X.fa') is cut at the
    # first dot, so several files could share one key - confirm this is intended.
    return path.split('/')[-1].split('.')[0]


def _first_record_lengths(paths):
    """Map file stem -> length of the first FASTA record, skipping empty files."""
    lengths = {}
    for path in paths:
        if os.stat(path).st_size == 0:
            continue
        first_record = next(SeqIO.parse(path, "fasta"))
        lengths[_stem(path)] = len(str(first_record.seq))
    return lengths


# Names of all input FASTA files.
fastafiles = glob.glob('../tree-out/fasta/*fa')
fastas = [_stem(path) for path in fastafiles]

# Alignment length (first record, gaps included) per alignment file.
alnfiles = glob.glob('../tree-out/mafft/*aln')
alignments = _first_record_lengths(alnfiles)

# Same measurement for the trimmed alignments.
trimfiles = glob.glob('../tree-out/trimal/*trim')
trimmed_alignments = _first_record_lengths(trimfiles)

# Names for which a consensus tree file exists.
treefiles = glob.glob('../tree-out/tree/*contree')
trees = [_stem(path) for path in treefiles]


In [None]:
# Write one row per input FASTA: name, alignment length, trimmed alignment
# length (each left blank when missing) and whether a tree file exists.
outfile = '../query/alien_index_phylogenies.txt'

# Set lookup instead of scanning the list once per row.
built_trees = set(trees)

# 'with' closes the table even if a write fails.
with open(outfile, 'w') as fo:
    fo.write('orthogroup\talignment length\ttrimmed align length\ttree built\n')
    for og in fastas:
        row = [
            og,
            str(alignments[og]) if og in alignments else '',
            str(trimmed_alignments[og]) if og in trimmed_alignments else '',
            'yes' if og in built_trees else 'no',
        ]
        fo.write('\t'.join(row) + '\n')

# Create combined phylogeny for OG0000128, OG0000258, OG0000491, OG0024434, OG0024714 and OG0003216

These 6 orthogroups are part of a larger gene family. 

OG0000128, OG0000258, OG0000491, OG0024434, and OG0024714 all contain genes with positive AI scores with phylogenies that support HGT. 

Genes in OG0003216 do not have a high alien index, but the orthogroup sequences were included as distant homologs in first-pass HGT phylogenies. 

In [8]:
# List the genes of the selected orthogroups found in two columns of the
# tab-separated orthogroup table (indices 2 and 14, counting the orthogroup
# ID as column 0), one gene ID per line.
# NOTE(review): the columns are presumably the 12B1 and UTEX 2797 proteomes -
# confirm against the table header.
oglist = ['OG0000128', 'OG0000258', 'OG0000491', 'OG0010579', 'OG0024434', 'OG0024714', 'OG0003216']
ogfile = '../prymnesium_paper/7_Pangenome/1_OrthoFinder/Orthogroups/Orthogroups.tsv'
outfile = '../query/genes_for_HGT1.txt'

# 'with' closes both handles even if a row is malformed.
with open(ogfile) as fi, open(outfile, 'w') as fo:
    for line in fi:
        # Strip only the newline: a bare rstrip() would also eat trailing tabs,
        # dropping empty trailing columns and making line[14] raise IndexError.
        line = line.rstrip('\n').split('\t')
        og = line[0]
        if og not in oglist:
            continue

        bgenes = line[2].split(', ')
        ugenes = line[14].split(', ')
        for gene in bgenes + ugenes:
            # An empty column splits to [''], which must not be written.
            if gene != '':
                fo.write(gene + '\n')

In [9]:
from Bio import SeqIO
import glob

In [11]:
# Protein lookup: record ID -> sequence string for the combined proteome FASTA.
# A repeated ID keeps the sequence of its last occurrence.
fastafile = '../prymnesium_paper/6_Genome_annotation/5_Functional/fasta/all_proteins.fa.combined'

seqDict = {
    record.id: str(record.seq)
    for record in SeqIO.parse(fastafile, 'fasta')
}

In [12]:
# Pool the sequences from every per-gene FASTA ('g*fa') into one dict keyed by
# record ID.  Records named 'QUERY-<x>-...' are stored under '<x>' (the piece
# after the first '-') instead of the full ID.
finDict = {}
fastafiles = '../tree-out/fasta/g*fa'

for fastafile in glob.glob(fastafiles):
    for record in SeqIO.parse(fastafile, 'fasta'):
        id_parts = record.id.split('-')
        key = id_parts[1] if id_parts[0] == 'QUERY' else record.id
        finDict[key] = str(record.seq)

In [13]:
# Names of the genes that have a hit FASTA in the eukaryote output folder
# (file name up to the first '.fa').
fastafiles = '../tree-out-Euk/fasta/*fa'

hgtList = [
    fastafile.split('/')[-1].split('.fa')[0]
    for fastafile in glob.glob(fastafiles)
]

In [14]:
# Add the protein sequence of every gene in the selected orthogroups to
# finDict (gene ID -> sequence), taking genes from all columns of the table.
oglist = ['OG0000128', 'OG0000258', 'OG0000491', 'OG0010579', 'OG0024434', 'OG0024714', 'OG0003216']
ogfile = '../prymnesium_paper/7_Pangenome/1_OrthoFinder/Orthogroups/Orthogroups.tsv'

# 'with' closes the table even if a gene is missing from seqDict.
with open(ogfile) as fi:
    for line in fi:
        columns = line.rstrip().split('\t')
        og = columns.pop(0)
        if og not in oglist:
            continue

        # The gene loop must sit inside the column loop.  Previously it ran
        # after the column loop had finished, so only the last column's genes
        # were added (and a row with no columns reused the previous row's genes).
        for col in columns:
            for gene in col.split(', '):
                # Empty columns split to [''].
                if gene != '':
                    finDict[gene] = seqDict[gene]


In [15]:
# Write the pooled sequences to a single FASTA.  Genes that have their own
# hit file in the eukaryote output (hgtList) are flagged with an 'AI-' prefix.
outfile = '../tree-out/fasta/HGT1.fa'

# Set lookup instead of scanning hgtList once per sequence.
ai_genes = set(hgtList)

# 'with' closes the file even if a write fails.
with open(outfile, 'w') as fo:
    for gene, sequence in finDict.items():
        header = '>AI-' if gene in ai_genes else '>'
        fo.write(header + gene + '\n' + sequence + '\n')

In [19]:
# Copy records from the HGT1 alignment, in file order, up to and including
# `stopseq`, removing gap characters so the output is unaligned again.
# Everything after `stopseq` in the alignment is left out.
stopseq = 'XP_023337425.1-88015-Eurytemora_affinis-Metazoa'
infile = '../tree-out/mafft/HGT1.aln'
outfile = '../tree-out/fasta/HGT1_cropped.fa'

# 'with' closes the output even if parsing the alignment fails part-way.
with open(outfile, 'w') as fo:
    for record in SeqIO.parse(infile, 'fasta'):
        sequence = str(record.seq).replace('-', '')
        fo.write('>' + record.id + '\n' + sequence + '\n')

        # The stop record itself is written before leaving the loop.
        if record.id == stopseq:
            break

# Create supplemental table for HGT11 viral insertions genomic region

In [20]:
import os

def _stem(path):
    """Return the file name of *path* truncated at its first '.'."""
    return path.split('/')[-1].split('.')[0]


def _first_record_lengths(paths):
    """Map file stem -> length of the first FASTA record, skipping empty files."""
    lengths = {}
    for path in paths:
        if os.stat(path).st_size == 0:
            continue
        first_record = next(SeqIO.parse(path, "fasta"))
        lengths[_stem(path)] = len(str(first_record.seq))
    return lengths


# Names of all input FASTA files.
fastafiles = glob.glob('../tree-out/fasta/*fa')
fastas = [_stem(path) for path in fastafiles]

# Alignment length (first record, gaps included) per alignment file.
alnfiles = glob.glob('../tree-out/mafft/*aln')
alignments = _first_record_lengths(alnfiles)

# Same measurement for the trimmed alignments.
trimfiles = glob.glob('../tree-out/trimal/*trim')
trimmed_alignments = _first_record_lengths(trimfiles)

# Names for which a consensus tree file exists.
treefiles = glob.glob('../tree-out/tree/*contree')
trees = [_stem(path) for path in treefiles]



In [21]:
# Write one row per input FASTA: name, alignment length, trimmed alignment
# length (each left blank when missing) and whether a tree file exists.
outfile = '../query/HGT11_region_phylogenies.txt'

# Set lookup instead of scanning the list once per row.
built_trees = set(trees)

# 'with' closes the table even if a write fails.
with open(outfile, 'w') as fo:
    fo.write('orthogroup\talignment length\ttrimmed align length\ttree built\n')
    for og in fastas:
        row = [
            og,
            str(alignments[og]) if og in alignments else '',
            str(trimmed_alignments[og]) if og in trimmed_alignments else '',
            'yes' if og in built_trees else 'no',
        ]
        fo.write('\t'.join(row) + '\n')