In [43]:
# Distinct sequence lengths found among the records of the .aln FASTA file
records = SeqIO.parse('../data/sequences.fasta.aln', 'fasta')
# len() works directly on the sequence; copying it into a list first is unnecessary
np.unique([len(r.seq) for r in records])

array([414])

In [44]:
# Load two FASTA files with getRecordAlignments (defined outside this cell):
# `alns` from the .aln file and `azo` from the long-labels file.
# Both results are indexed by record label (e.g. '001') in the cells below.
alns = getRecordAlignments('../data/sequences.fasta.aln')
azo = getRecordAlignments('../data/sequencesLongLabels.fasta')

In [56]:
# Positions 108-117 of record '001' as loaded from the long-labels FASTA
azo['001'][108:118]

['F', 'L', 'E', 'E', 'E', 'G', 'A', 'Y', 'S', 'D']

In [57]:
# Positions 193-202 of record '001' as loaded from the .aln file; the printed
# residues are the same as those shown for azo['001'][108:118]
print(alns['001'][193:203])

['F', 'L', 'E', 'E', 'E', 'G', 'A', 'Y', 'S', 'D']


In [2]:
from collections import defaultdict
import pandas as pd
from Bio import SearchIO, SeqIO

# Paths used by this cell: the hmmer tabular output to parse, the FASTA file
# the hit sequences are written to, and the FASTA database whose records are
# matched against the hit ids.
hmmout = '../sout.txt'
hitsout = 'hits.fasta'
pepite_db = '/home/robaina/cleangenomes/results/Marref_V6.faa'

# Hit attributes to collect; each one becomes a column of hits_df
attribs = ['id', 'bias', 'bitscore', 'description']
hits = defaultdict(list)

with open(hmmout) as handle:
    for queryresult in SearchIO.parse(handle, 'hmmer3-tab'):
        for hit in queryresult.hits:
            for attrib in attribs:
                hits[attrib].append(getattr(hit, attrib))

# Build the frame column by column so that every attribute column exists even
# when no hit was parsed (an empty defaultdict would give a frame without an
# 'id' column and the lookup below would fail).
hits_df = pd.DataFrame({attrib: hits[attrib] for attrib in attribs})

# Write hit fasta
# A set gives constant-time membership tests; checking against
# hits_df.id.values would scan the whole array for every database record.
hit_ids = set(hits_df['id'])
hit_records = [record for record in SeqIO.parse(pepite_db, 'fasta')
               if record.id in hit_ids]

with open(hitsout, 'w') as out_handle:
    SeqIO.write(hit_records, out_handle, 'fasta')

In [30]:
from Bio import SeqIO

input_fasta = '/home/robaina/cleangenomes/results/Marref_V6.fasta'

# NOTE: hits_df is not created in this cell; it must already exist in the kernel.
# A set gives constant-time membership tests instead of scanning
# hits_df.id.values once per FASTA record.
hit_ids = set(hits_df['id'])
hit_records = [record for record in SeqIO.parse(input_fasta, 'fasta')
               if record.id in hit_ids]

with open('hits.fasta', 'w') as out_handle:
    SeqIO.write(hit_records, out_handle, 'fasta')

In [1]:
def my_decorator(func):
    """
    Wrap func so that a message is printed before and after each call.

    The wrapper forwards any positional and keyword arguments to func and
    returns whatever func returns.
    """
    import functools  # local import: this cell has no import statement of its own

    @functools.wraps(func)  # keep func's name and docstring on the wrapper
    def wrapper(*args, **kwargs):
        print("Something is happening before the function is called.")
        result = func(*args, **kwargs)
        print("Something is happening after the function is called.")
        return result
    return wrapper

@my_decorator
def say_whee():
    """Print the string Whee!"""
    print("Whee!")


# Calling the decorated name runs the wrapper returned by my_decorator
say_whee()

Something is happening before the function is called.
Whee!
Something is happening after the function is called.


In [14]:
#Python decorators must return a function object
import functools
import os
import smtplib

def pegasoNotify(message: str, receivers: list) -> None:
    """
    Send email from python to receivers.

    message: text passed unchanged to smtplib's sendmail as the message.
    receivers: list containing email addresses as strings.

    The account password is read from the PEGASO_SMTP_PASSWORD environment
    variable so that the credential is not stored in the notebook.

    Raises RuntimeError if PEGASO_SMTP_PASSWORD is not set. Errors raised by
    smtplib while connecting, logging in or sending propagate to the caller.
    """
    sender = 'pegaso.notifications@gmail.com'
    # Checked before opening the connection so a missing password fails fast
    password = os.environ.get('PEGASO_SMTP_PASSWORD')
    if not password:
        raise RuntimeError(
            'Set the PEGASO_SMTP_PASSWORD environment variable to send notifications'
        )
    # NOTE(review): message is sent as-is, without From/To/Subject headers;
    # confirm that it renders as intended in the receiving mail client.
    # The context manager closes the connection even if login or sendmail raises.
    with smtplib.SMTP_SSL(host='smtp.gmail.com', port=465) as server:
        server.login(user=sender, password=password)
        server.sendmail(sender, receivers, message)
    print('Notification sent')


def remotetest(func):
    """
    Decorator to test and report remote
    process success status.

    The wrapper calls func, prints whether it succeeded and sends the outcome
    by email through pegasoNotify. Exceptions raised by func are caught and
    reported rather than re-raised; in that case the wrapper returns None.
    When func succeeds, the wrapper returns func's return value.
    """
    receivers = ['semidan.robaina@gmail.com']

    @functools.wraps(func) # Allows keeping function identity when calling help
    def wrapper(*args, **kwargs):
        try:
            result = func(*args, **kwargs)
        except Exception as e:
            print(f'Function failed with exception: {e}')
            pegasoNotify(f'Aquifex: Job failed with exception: {e}', receivers)
            return None
        # Reported outside the try block so that an error while sending the
        # success notification is not mistaken for a failure of func
        print('Function succeded')
        pegasoNotify('Aquifex: Job done!', receivers)
        return result
    return wrapper


@remotetest
def test(a, b):
    """Print the result of a / b; used to try out the remotetest decorator."""
    print(f'This is a test {a / b}')

# b = 0 makes a / b raise ZeroDivisionError, which exercises the failure branch
test(1, 0)

Function failed with exception: division by zero
Notification sent


In [4]:
import pyfastx

# Open the MAR database protein FASTA with pyfastx, keep its keys in `ids`,
# and display the Fasta object as the cell output
fa = pyfastx.Fasta('/home/robaina/Documents/MAR_database/mardb_proteins_V6.faa')
ids = fa.keys()
fa

<Fasta> /home/robaina/Documents/MAR_database/mardb_proteins_V6.faa contains 46739080 sequences

In [3]:
# Fetch the entry at integer index 0 and display it
s0 = fa[0]
s0

<Sequence> SCZ75963.1_MMP03080599 with length of 391

# Search for duplicates through composition

NOT working: it takes far too long to compute. Storing the compositions would save some RAM, but they would need to be precomputed (perhaps in parallel). It seems too much work for something that can be done on a computer with more RAM...

Any other, more efficient, way to look for duplicated sequences?

# Using CD-HIT to eliminate duplicated sequences

CD-HIT keeps one representative of each group of duplicated sequences, so it can be used on a FASTA file merged
from multiple databases

In [1]:
from phyloplacement.database import runCDHIT

# Run CD-HIT on the MAR database protein FASTA and write the result to a new file.
# NOTE(review): '-c 1' presumably sets a 100% identity threshold, so that only
# exact duplicates are collapsed; confirm both flags against the CD-HIT manual.
runCDHIT(input_fasta='/home/robaina/Documents/MAR_database/mardb_proteins_V6.faa',
         output_fasta='/home/robaina/Documents/MAR_database/mardb_proteins_V6_cdhit_no_duplicates.faa',
         additional_args='-c 1 -t 1'
)

# Preparing test query sequences for EPA-ng

Using data from Kitzinger, 2021, but truncating the sequences so that they all have the same length

In [9]:
from Bio import SeqIO

input_fasta = '/home/robaina/Documents/TRAITS/data/nxr/kitzinger2021/Nxr_kitzinger_2021.fasta'
output_fasta = '/home/robaina/Documents/TRAITS/data/nxr/kitzinger2021/epang_test.fasta'

# Every written sequence is cut to exactly this many residues
min_length = 100

records = SeqIO.parse(input_fasta, 'fasta')
output_records = []

# Records shorter than min_length are dropped; the rest are truncated in place
for record in records:
    if len(record.seq) < min_length:
        continue
    record.seq = record.seq[:min_length]
    output_records.append(record)

with open(output_fasta, 'w') as out_handle:
    SeqIO.write(output_records, out_handle, 'fasta')
