In [67]:
import pandas as pd
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [68]:
# Load the primer-finder output; the file is semicolon-separated.
my_result = pd.read_csv(
    "../../data/primer-finder-result-0.csv",
    sep=";",
)

In [69]:
#setup
# Split the raw results by how many ORF candidates each row lists in 'possible_orfs':
#   "[]"            -> no candidate, row is discarded
#   no comma        -> a single candidate, row is already solved
#   contains a ","  -> several candidates, row still has to be decided
# Literal matching (regex=False) is used because both needles are plain text;
# a missing 'possible_orfs' value is treated like an empty list (na=True).
has_candidates = ~my_result['possible_orfs'].str.contains('[]', regex=False, na=True)
remaining_results = my_result[has_candidates]

is_unambiguous = ~remaining_results['possible_orfs'].str.contains(',', regex=False)
# .copy() makes solved_results an independent frame, so adding the 'ORF' column
# below neither warns (SettingWithCopyWarning) nor depends on view/copy semantics.
solved_results = remaining_results[is_unambiguous].copy()
remaining_results = remaining_results[~(remaining_results['Read ID'].isin(solved_results['Read ID']))]

# The single candidate is the character right after the opening bracket, e.g. "[2]" -> "2".
# The vectorised accessor also works on an empty frame, unlike row-wise apply(axis=1).
solved_results["ORF"] = solved_results["possible_orfs"].str[1]

In [72]:
import os
import subprocess
import tempfile

import numpy as np
import pyhmmer
from pyhmmer.easel import MSAFile

from src.orf_finder import process_ambiguous_orf, build_seq_from_pandas_entry


def decide_orfs_here(
        referenceEntries: pd.DataFrame,
        questionableEntries: pd.DataFrame,
        translation_table,
        e_value = 1000,
        pbar=None,
        muscle_path="/mnt/c/Users/Me/bin/muscle",
):
    """Choose an ORF for ambiguous entries by searching them with an HMM built from solved entries.

    The reference entries are converted to sequences, aligned with MUSCLE, and the
    alignment is turned into a pyhmmer profile HMM. Each questionable entry
    contributes a row of three candidate sequences; the HMM is searched against
    those candidates and the hit with the lowest E-value decides the ORF.

    Args:
        referenceEntries: Already-solved rows used to build the alignment / HMM.
            Must not be empty.
        questionableEntries: Rows whose ORF is still undecided. Needs a
            'Read ID' column. The caller's frame is not modified.
        translation_table: Forwarded unchanged to ``build_seq_from_pandas_entry``
            and ``process_ambiguous_orf``.
        e_value: Reporting E-value threshold assigned to the pyhmmer pipeline.
        pbar: Optional progress object; ``pbar.update(1)`` is called once per
            questionable entry.
        muscle_path: Path to the MUSCLE executable. An absolute path is
            recommended because MUSCLE is started inside a temporary directory.

    Returns:
        A DataFrame with the rows of ``questionableEntries`` that received a hit,
        with the chosen value written to the 'ORF' column. Empty (same columns)
        if nothing was hit.

    Raises:
        ValueError: If ``referenceEntries`` is empty.
        RuntimeError: If MUSCLE exits with an error or yields no alignment.
    """
    if len(referenceEntries) == 0:
        raise ValueError("referenceEntries must contain at least one entry to build an HMM from.")

    alphabet = pyhmmer.easel.Alphabet.amino()

    referenceEntries = referenceEntries.reset_index(drop=True)
    referenceSequences = [
        build_seq_from_pandas_entry(row, translation_table=translation_table)
        for _, row in referenceEntries.iterrows()
    ]

    # One row of three slots per questionable entry, pre-filled with an unnamed,
    # empty placeholder sequence.
    # NOTE(review): presumably process_ambiguous_orf returns the candidate
    # sequences for one entry (one per slot) - confirm against src.orf_finder.
    questionableSequences = np.zeros(shape=(len(questionableEntries), 3), dtype=pyhmmer.easel.TextSequence)
    questionableSequences.fill(pyhmmer.easel.TextSequence("".encode(), sequence=""))
    # reset_index returns a new frame, so the writes below do not touch the caller's data
    # and the positional index i lines up with the rows of questionableSequences.
    questionableEntries = questionableEntries.reset_index(drop=True)
    for i, row in questionableEntries.iterrows():
        questionableSequences[i] = process_ambiguous_orf(row, translation_table=translation_table)

    # Align the reference sequences with MUSCLE inside a throw-away directory so
    # that no stale tmp_in/tmp_out files survive (or get picked up by) a run.
    in_name = "tmp_in.fasta"
    out_name = "tmp_out.fasta"
    with tempfile.TemporaryDirectory(prefix="decide_orfs_", dir=".") as workdir:
        with open(os.path.join(workdir, in_name), "wb") as f:
            for seq in referenceSequences:
                seq.write(f)
        try:
            # Relative file names + cwd keep the invocation identical to the
            # original one from MUSCLE's point of view.
            subprocess.run(
                [f"{muscle_path}", "-align", in_name, "-output", out_name],
                cwd=workdir,
                capture_output=True,
                text=True,
                check=True,
            )
        except subprocess.CalledProcessError as err:
            raise RuntimeError(
                f"MUSCLE alignment failed (exit code {err.returncode}): {err.stderr}"
            ) from err

        # read() loads the alignment into memory, so the file may be deleted afterwards.
        with MSAFile(os.path.join(workdir, out_name)) as msa_file:
            msa = msa_file.read()
    if msa is None:
        raise RuntimeError("MUSCLE produced an empty alignment.")

    msa.name = "tmpMSA".encode()
    background = pyhmmer.plan7.Background(alphabet)
    builder = pyhmmer.plan7.Builder(alphabet)
    digital_msa = msa.digitize(alphabet=alphabet)
    # Only the HMM itself is needed for the search below.
    hmm, _profile, _optimized_profile = builder.build_msa(digital_msa, background)

    pipeline = pyhmmer.plan7.Pipeline(alphabet, background)
    pipeline.bias_filter = False
    pipeline.E = e_value

    questionableEntries.loc[:, 'ORF'] = ''
    # Hit names are text, so compare against the textual form of the read IDs
    # (a no-op when the column already holds strings).
    read_ids = questionableEntries['Read ID'].astype(str)
    modified_chunks = []
    seen_read_ids = set()
    total_hits = 0

    for query in questionableSequences:
        dig_questionable_sequences = [seq.digitize(alphabet=alphabet) for seq in query]
        sequenceBlock = pyhmmer.easel.DigitalSequenceBlock(alphabet=alphabet, iterable=dig_questionable_sequences)
        top_hits = pipeline.search_hmm(query=hmm, sequences=sequenceBlock)
        reported_count = len(top_hits.reported)
        if reported_count > 0:
            total_hits += reported_count

            # Ignore hits on the unnamed placeholder sequence; among the rest,
            # the lowest E-value wins (ties keep the first hit, as before).
            named_hits = [hit for hit in top_hits if hit.name.decode() != ""]
            if named_hits:
                top_hit = min(named_hits, key=lambda hit: hit.evalue)

                # NOTE(review): assumes sequence names look like "<read id>_<orf>";
                # splitting from the right keeps read IDs that contain "_" intact.
                read_id, correct_orf = top_hit.name.decode().rsplit("_", 1)
                is_read = read_ids == read_id
                questionableEntries.loc[is_read, 'ORF'] = correct_orf
                if read_id not in seen_read_ids:
                    seen_read_ids.add(read_id)
                    modified_chunks.append(questionableEntries.loc[is_read].copy())
        # Advance the optional progress bar once per questionable entry.
        if pbar is not None:
            pbar.update(1)

    # Concatenate once at the end instead of growing a DataFrame inside the loop.
    if modified_chunks:
        modified_entries = pd.concat(modified_chunks, ignore_index=True)
    else:
        modified_entries = pd.DataFrame(columns=questionableEntries.columns)

    print(f"modified entries: {len(modified_entries)} (of {total_hits} hits) and {len(questionableEntries)} original entries")
    return modified_entries

In [73]:
#params
threshold = 10          # minimum number of solved entries needed to build a reference group
upper_threshold = 50    # maximum number of solved entries sampled into a reference group
sample_seed = 42        # fixed seed so the sampled reference groups are reproducible

# Empty frame kept for ad-hoc debugging (previously this bound the DataFrame class itself).
debugFrame = pd.DataFrame()
failed = 0
taxonomic_levels = ['Species', 'Genus', 'Family', 'Order', 'Class']

# Work through the undecided entries: take the first one, find the most specific
# taxonomic level with enough solved relatives, and decide every undecided entry
# of that taxon in one go.
while len(remaining_results) > 0:
    current_entry = remaining_results.iloc[0]

    # Try to match at each taxonomic level, from specific to general
    for level in taxonomic_levels:
        level_value = current_entry[level]
        comp_group = solved_results[solved_results[level] == level_value]
        group_size = len(comp_group)

        if group_size >= threshold:
            comp_group = comp_group.sample(min(upper_threshold, group_size), random_state=sample_seed)
            in_taxon = remaining_results[level] == level_value
            related_entries = remaining_results[in_taxon]
            solved = decide_orfs_here(comp_group, related_entries,translation_table=5)
            solved_results = pd.concat([solved_results, solved], ignore_index=True)
            # NOTE(review): related entries that got no hit are dropped here as
            # well and are not counted in `failed` - confirm this is intended.
            remaining_results = remaining_results[~in_taxon]
            break

        print(f"{group_size} entries of {level} {level_value} is too small.")
    else:
        # Reached only when no level had enough solved entries (the loop never hit `break`).
        family = current_entry['Family']
        drop_mask = (remaining_results['Family'] == family).fillna(False).to_numpy(dtype=bool, copy=True)
        # Always drop the current entry itself: a missing Family never compares
        # equal, which would otherwise leave the row in place and loop forever.
        drop_mask[0] = True
        print(f"Removing Family '{current_entry['Family']}' with {len(solved_results[solved_results['Family'] == current_entry['Family']])} members to continue.")
        remaining_results = remaining_results[~drop_mask]
        # Count exactly the rows that were given up on.
        failed += int(drop_mask.sum())

    print("-------------")

print(f"A total of {failed} entries were impossible to match.")

modified entries: 6 (of 6 hits) and 6 original entries
-------------
modified entries: 7 (of 7 hits) and 7 original entries
-------------
modified entries: 1 (of 1 hits) and 1 original entries
-------------
1 entries of Species s:Oedothorax_trilobatus is too small.
modified entries: 66 (of 66 hits) and 66 original entries
-------------
modified entries: 1 (of 1 hits) and 1 original entries
-------------
modified entries: 43 (of 43 hits) and 44 original entries


KeyboardInterrupt: 