In [1]:
import numpy as np
import pandas as pd
from six import text_type


In [2]:
my_result = pd.read_csv("../../data/primer-finder-result.csv", sep=";")

In [3]:
from src.orf_finder import decide_orfs

In [4]:
#setup
remaining_results = my_result[~my_result['possible_orfs'].str.contains(r'\[\]')]
solved_results = remaining_results[~remaining_results['possible_orfs'].str.contains(',')]
remaining_results = remaining_results[~(remaining_results['Read ID'].isin(solved_results['Read ID']))]

solved_results["ORF"] = solved_results.apply(lambda x: x["possible_orfs"][1], axis=1)

# for faster debug:
remaining_results = remaining_results[~(remaining_results.Order == "o:Lepidoptera")]

In [12]:
import pyhmmer
from src.orf_finder import pad_sequences, build_seq_from_pandas_entry, process_ambiguous_orf

def pad_sequences(sequences, pad_char='X', minimum=0):
    max_length = len(max(sequences, key=len))
    max_length = max(max_length, minimum)
    def _pad(seq):
        pyhmmer.easel.TextSequence(name=seq.name, sequence=(seq.sequence + pad_char * (max_length - len(seq))))
    np.vectorize(_pad)(sequences)
    return sequences



def decide_orfs_here(referenceEntries: pd.DataFrame, questionableEntries: pd.DataFrame):
    alphabet = pyhmmer.easel.Alphabet.amino()

    referenceSequences = []
    referenceEntries.apply(lambda x: referenceSequences.append(build_seq_from_pandas_entry(x)), axis=1)
    questionableSequences = np.zeros(shape=(len(questionableEntries), 3), dtype=pyhmmer.easel.TextSequence)
    # questionableEntries.apply(lambda x: questionableSequences.append(process_ambiguous_orf(x)), axis=1)
    questionableEntries = questionableEntries.reset_index(drop=True)
    for i, entry in enumerate(questionableEntries):
        questionableSequences[i] = process_ambiguous_orf(questionableEntries.loc[i])
    print(questionableSequences)
    # wip
    lengths = np.vectorize(len)(sequences)
    max_length = np.max(lengths)
    max_length = max(max_length, minimum)
    # end wip
    longest_ref_seq = max(len(seq) for seq in referenceSequences)
    longest_que_seq = max(len(seq) for seq in questionableSequences)
    longest_seq_len = max(longest_ref_seq, longest_que_seq)

    referenceSequences = pad_sequences(referenceSequences, minimum=longest_seq_len)
    questionableSequences = pad_sequences(questionableSequences, minimum=longest_seq_len)

    dig_questionable_sequences = [seq.digitize(alphabet=alphabet) for seq in questionableSequences]

    msa = pyhmmer.easel.TextMSA(name=b"myMSA", sequences=referenceSequences)
    background = pyhmmer.plan7.Background(alphabet)
    builder = pyhmmer.plan7.Builder(alphabet)
    digital_msa = msa.digitize(alphabet=alphabet)
    hmm, profile, optimized_profile = builder.build_msa(digital_msa, background)

    pipeline = pyhmmer.plan7.Pipeline(alphabet, background)
    pipeline.bias_filter = False

    sequenceBlock = pyhmmer.easel.DigitalSequenceBlock(alphabet=alphabet, iterable=dig_questionable_sequences)
    hits = pipeline.search_hmm(query=hmm, sequences=sequenceBlock)

    modified_entries = pd.DataFrame(columns=questionableEntries.columns)
    questionableEntries.loc[:, 'ORF'] = ''
    for hit in hits:
        [read_id, correct_orf] = hit.name.decode().split("_")
        questionableEntries.loc[questionableEntries['Read ID'] == read_id, 'ORF'] = correct_orf
        if len(questionableEntries[questionableEntries["Read ID"] == read_id]) == 0:
            modified_entries = pd.concat([modified_entries, questionableEntries.loc[questionableEntries['Read ID'] == read_id].copy()], ignore_index=True)

    return modified_entries

In [9]:
#params
threshold = 4
upper_threshold = 50

last_solved = pd.DataFrame
failed = 0

taxonomic_levels = ['Family', 'Order', 'Class']

while remaining_results.size > 0:
    current_entry = remaining_results.iloc[0]

    # Try to match at each taxonomic level, from specific to general
    for level in taxonomic_levels:
        level_value = current_entry[level]
        comp_group = solved_results[solved_results[level] == level_value]
        group_size = len(comp_group)

        if group_size >= threshold:
            comp_group = comp_group.sample(min(upper_threshold, group_size))
            related_entries = remaining_results[remaining_results[level] == level_value]
            solved = decide_orfs_here(comp_group, related_entries)
            solved_results = pd.concat([solved_results, solved], ignore_index=True)
            remaining_results = remaining_results[~(remaining_results[level] == level_value)]
            last_solved = solved
            break
        else:
            print(f"{group_size} entries of {level} {level_value} is too small.")

            # If we've tried all levels and none are big enough
            if level == taxonomic_levels[-1]:
                print(f"Removing Family '{current_entry['Family']}' with {len(solved_results[solved_results['Family'] == current_entry['Family']])} members to continue.")
                remaining_results = remaining_results[~(remaining_results['Family'] == current_entry['Family'])]
                failed += len(related_entries) if 'related_entries' in locals() else 1

    print("-------------")

print(f"A total of {failed} entries were impossible to match.")

WQFSDYT*QESLLL*EPSTSFQPLSTYDHKEYS*IEPLCLCGL**LRQFCFYCPX
*LFSVFI*QEFHQFLVL*ILFQQ**MYDQSEYP*KKYPFFLEQL*LLLFYFFFHX
*QPMLPMEVLPLI*LFSDSILQEFLQF*EL*ILSLQLSMYDLKE*A*IEYPYSHEQLLLLPFFFFYLS
TII*HRS*SILCWFX
WPFSGYI*QESPQF*GQSILLLQ*SMYDPKV*Q*TEYPYSYELS*LQLFYFCYHX
SQQMLLTVGLP*I*QFSDYT*QESLLL*EPSTSFQPLSTYDHKEYS*IEPLCLCGL**LRQFCFYCPX
*QPMLPMEALPLI*LFSDSILQEFLQF*EL*ILLLQLSMYDLKE*A*IEYPYSHEQLLLLPFFFFYLS
WPFSGYI*QESPQF*GQSILLLQ*SMYDPKV*Q*TEYPYSYELS*LQLFYFCYHX
SQQMLLTVGLP*IWQFSDYT*QESLLL*EPSTSFQPLSTYDHKEYS*IEPLCLCGL**LRQFCFYCPX
*QFSDYI*PEFHLF*VQSILLLQ*SMYDPRA*A*TKYPFLYEQLWLPLSYCYYPX
WPFSGYI*QESPQF*GQSILLLQ*SMYDPKV*Q*TEYPYSYELS*LQLFYFCYHX
WQEMSLTEAPQLI*QFSAYI*QEFPQF*EQ*ILFLQ*SMYDPLE*MLTACHYSLX
PELL*LQLFFYFFLS
*LFSVFI*QEFHQFLVL*ILFQQ**MYDQSEYP*KKYPFFLEQL*LLLFYFFFHX
SQLQLRMLGHRLI*PFSGYT*LESPQL*EQSISFQQLSMYALKEY**IVLPYLFELSLLQLFYSCYPX
*LFSVFI*QEFHQFLVL*ILFQQ**MYDQSEYP*KKYPFFLEQL*LLLFYFFFHX
SQLQLRMLGHRLI*PFSGYT*LESPQL*EQSISFQQLSMYALKEY**IVLPYLFELSLLQLFYSCYPX
FLQMLPMEVLLLI*QFSVYIX
FQQPLLMLGHQLTSQFSDYT*QVYPQL*

AttributeError: 'numpy.ndarray' object has no attribute 'name'

In [26]:
related_entries.reset_index(drop=True)

Unnamed: 0,BOLD ID,Read ID,Country,Phylum,Class,Order,Family,Genus,Species,f_score,f_match,f_index,b_score,b_match,b_index,read,possible_orfs
0,>BOLD:AAG4445,TTCFW920-09,Canada,tax=p:Arthropoda,c:Insecta,o:Coleoptera,f:Chrysomelidae,g:Altica,s:Altica_corni,0,,384,46,CCAGTTTTAGCTGGTGCAATCAC,550,TTTAGCTATTTTTAGTCTTCATTTGGCAGGAATCTCATCAATTTTA...,"[0, 2]"
1,>BOLD:AAG4445,POBGC153-15,Canada,tax=p:Arthropoda,c:Insecta,o:Coleoptera,f:Chrysomelidae,g:Altica,s:Altica_corni,0,,368,46,CCAGTTTTAGCTGGTGCAATCAC,534,TTTAGCTATTTTTAGTCTTCATTTGGCAGGAATCTCATCAATTTTA...,"[0, 2]"
2,>BOLD:AAG4445,SMTPB14426-13,Canada,tax=p:Arthropoda,c:Insecta,o:Coleoptera,f:Chrysomelidae,g:Altica,s:Altica_corni,0,,359,46,CCAGTTTTAGCTGGTGCAATCAC,525,TTTAGCTATTTTTAGTCTTCATTTGGCAGGAATCTCATCAATTTTA...,"[0, 2]"
3,>BOLD:AAG4445,SMTPB6291-13,Canada,tax=p:Arthropoda,c:Insecta,o:Coleoptera,f:Chrysomelidae,g:Altica,s:Altica_corni,0,,371,46,CCAGTTTTAGCTGGTGCAATCAC,537,TTTAGCTATTTTTAGTCTTCATTTGGCAGGAATCTCATCAATTTTA...,"[0, 2]"
4,>BOLD:AAG4445,CNFNQ400-14,Canada,tax=p:Arthropoda,c:Insecta,o:Coleoptera,f:Chrysomelidae,g:Altica,s:Altica_corni,0,,383,46,CCAGTTTTAGCTGGTGCAATCAC,549,TTTAGCTATTTTTAGTCTTCATTTAGCAGGAATCTCATCAATTTTA...,"[0, 2]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19109,>BOLD:AAW6927,GBCL1755-06,,tax=p:Arthropoda,c:Insecta,o:Coleoptera,f:Chrysomelidae,g:Bruchidius,s:Bruchidius_albopubens,52,GGAACAGGTTGAACAGTTTACCCCCC,105,46,CCAGTATTAGCAGGAGCTATTAC,336,TTTAGCTAGAAATATTGCTCATGGCGGATCATCTGTTGACTTAGCT...,"[0, 2]"
19110,>BOLD:AEK3623,GBMND70410-21,,tax=p:Arthropoda,c:Insecta,o:Coleoptera,f:Chrysomelidae,g:Luperomorpha,s:Luperomorpha_xanthodera,52,GGAACTGGTTGAACTGTTTATCCCCC,264,40,CCAGTGTTAGCTGGGGCCATTAC,495,TCTCTCTTCAAATATTGCCCATGGAGGGTCATCCGTAGACCTAGCT...,"[0, 2]"
19111,>BOLD:AAY7837,SICOC688-18,Nicaragua,tax=p:Arthropoda,c:Insecta,o:Coleoptera,f:Chrysomelidae,g:Walterianella,s:Walterianella_oculata,52,GGAACTGGATGAACAGTTTACCCCCC,319,46,CCTGTATTAGCTGGAGCAATTAC,550,TTTATCTTCCAATCTTGCACATGAAGGATCTTCTATTGACTTAGCA...,"[0, 2]"
19112,>BOLD:ACA4092,CNCCE1865-12,Canada,tax=p:Arthropoda,c:Insecta,o:Coleoptera,f:Chrysomelidae,g:Plateumaris,s:Plateumaris_rufa,52,GGAACTGGTTGAACTGTATACCCTCC,319,46,CCAGTTTTAGCAGGAGCAATTAC,550,TTTATCAAGAAATATCGCCCATAGAGGAGCCTCAGTAGACCTCGCT...,"[0, 2]"
