In [7]:
import numpy as np
import pandas as pd


In [8]:
# Load the primer-finder results table; the file uses ';' as the column separator.
my_result = pd.read_csv("../../data/primer-finder-result.csv", sep=";")

In [9]:
# Setup: split the loaded results into entries whose ORF is already decided
# ("solved") and entries that still list several candidate ORFs ("remaining").

# Drop rows whose 'possible_orfs' contains the literal empty list "[]".
remaining_results = my_result[~my_result['possible_orfs'].str.contains(r'\[\]')]

# Rows without a comma list a single candidate and therefore count as solved.
# .copy() makes this an independent frame, so adding the 'ORF' column below does
# not write into a slice of my_result (no SettingWithCopyWarning).
solved_results = remaining_results[~remaining_results['possible_orfs'].str.contains(',')].copy()
remaining_results = remaining_results[~(remaining_results['Read ID'].isin(solved_results['Read ID']))]

# The ORF label is the character directly after the opening bracket.
# NOTE(review): this keeps exactly one character, as the original row-wise lambda
# did; confirm that ORF labels are never longer than one character.
solved_results["ORF"] = solved_results["possible_orfs"].str[1]

# for faster debug:
remaining_results = remaining_results[~(remaining_results.Order == "o:Lepidoptera")]

In [10]:
from abc import ABCMeta, abstractmethod

class ABCFileLike(metaclass=ABCMeta):
    """Abstract interface for a minimal file-like object.

    A subclass can only be instantiated once it implements ``__init__``,
    ``write``, ``close`` and ``flush``.
    """

    @abstractmethod
    def __init__(self):
        """Initialise the object; the abstract version does nothing."""

    @abstractmethod
    def write(self, line):
        """Write ``line``; the abstract version does nothing."""

    @abstractmethod
    def close(self):
        """Close the object; the abstract version does nothing."""

    @abstractmethod
    def flush(self):
        """Flush pending output; the abstract version does nothing."""

class FileLike(ABCFileLike):
    """Concrete file-like class: every abstract method of ABCFileLike is implemented.

    ``write`` echoes the line to stdout; the remaining methods are no-ops.
    """

    def __init__(self):
        """Create the object; there is no state to set up."""

    def write(self, line):
        """Print ``line`` prefixed with "Written:"."""
        print("Written:", line)

    def close(self):
        """No-op: there is nothing to close."""

    def flush(self):
        """No-op: there is nothing buffered."""

import io

# io.IOBase does not inherit from ABCFileLike, so the first check prints False.
print(isinstance(io.IOBase(), ABCFileLike))  # -> False

# Registering io.IOBase as a virtual subclass makes isinstance() succeed
# even though io.IOBase still does not inherit from ABCFileLike.
ABCFileLike.register(io.IOBase)
print(isinstance(io.IOBase(), ABCFileLike))  # -> True

False
True


In [12]:
from typing import TextIO
import pyhmmer
from src.orf_finder import build_seq_from_pandas_entry, process_ambiguous_orf

def pad_sequences_2d(sequences, minimum, pad_char='X'):
    """Pad every row of a 2-D collection of sequences with ``pad_sequences``.

    Each row is padded on its own, so a row ends up at
    ``max(longest sequence in that row, minimum)`` characters.

    Args:
        sequences: rectangular 2-D collection (rows of equal length) of
            sequence objects accepted by ``pad_sequences``.
        minimum: minimum length every padded sequence must reach.
        pad_char: character appended to sequences that are too short.

    Returns:
        A 2-D object-dtype ``numpy.ndarray`` with the same number of rows and
        columns as ``sequences``. Empty input yields an empty array instead of
        raising ``IndexError``.
    """
    n_rows = len(sequences)
    if n_rows == 0:
        # Keep the column count when the caller passed an empty 2-D array.
        shape = np.shape(sequences)
        n_cols = shape[1] if len(shape) > 1 else 0
        return np.empty((0, n_cols), dtype=object)

    # Object dtype is what numpy falls back to for arbitrary Python classes,
    # so request it explicitly instead of passing the sequence class as dtype.
    padded_matrix = np.empty((n_rows, len(sequences[0])), dtype=object)
    for i, row in enumerate(sequences):
        padded_matrix[i] = pad_sequences(row, minimum, pad_char)
    return padded_matrix

def pad_sequences(sequences, minimum=0, pad_char='X'):
    """Right-pad sequences so they all share the same length.

    The target length is the longest input sequence, or ``minimum`` if that is
    larger. Each result is a new ``pyhmmer.easel.TextSequence`` built from the
    input's ``name`` and its padded ``sequence`` text only.

    Args:
        sequences: 1-D collection of objects exposing ``name``, ``sequence``
            and ``len()``.
        minimum: lower bound for the padded length.
        pad_char: character appended to sequences that are too short.

    Returns:
        A 1-D object-dtype ``numpy.ndarray`` of padded ``TextSequence`` objects.
        Empty input yields an empty array instead of raising ``ValueError``.
    """
    count = len(sequences)
    # Object dtype is what numpy uses for arbitrary Python classes anyway.
    padded_sequences = np.empty(count, dtype=object)
    if count == 0:
        return padded_sequences

    target_length = max(max(len(seq) for seq in sequences), minimum)
    for i, seq in enumerate(sequences):
        padding = pad_char * (target_length - len(seq))
        padded_sequences[i] = pyhmmer.easel.TextSequence(name=seq.name, sequence=seq.sequence + padding)

    return padded_sequences


def decide_orfs_here(referenceEntries: pd.DataFrame, questionableEntries: pd.DataFrame):
    """Choose an ORF for ambiguous entries by scoring them against an HMM.

    An HMM is built from the (padded) sequences of ``referenceEntries``. Every
    candidate sequence of every questionable entry is searched against that
    HMM; per entry, the best-scoring reported hit decides the ORF.

    Args:
        referenceEntries: rows turned into sequences with
            ``build_seq_from_pandas_entry``.
        questionableEntries: rows with a 'Read ID' column; each row is turned
            into three candidate sequences with ``process_ambiguous_orf``.
            NOTE(review): the hit name is assumed to be "<read id>_<orf>" as
            produced by that helper; confirm against src.orf_finder.

    Returns:
        A DataFrame holding the rows of ``questionableEntries`` that received an
        ORF, with the chosen value in the 'ORF' column. The caller's frames are
        not modified. If either input is empty, an empty frame with the
        expected columns is returned.

    Raises:
        ValueError: if a non-empty hit name contains no underscore.
    """
    # Column layout of the result: the input columns plus 'ORF'.
    result_columns = list(questionableEntries.columns)
    if 'ORF' not in result_columns:
        result_columns.append('ORF')

    # Without references no model can be built; without questions there is
    # nothing to decide. Bail out before touching pyhmmer.
    if len(referenceEntries) == 0 or len(questionableEntries) == 0:
        return pd.DataFrame(columns=result_columns)

    alphabet = pyhmmer.easel.Alphabet.amino()

    # Placeholder-filled object arrays; every slot is overwritten below.
    referenceSequences = np.zeros(shape=len(referenceEntries), dtype=pyhmmer.easel.TextSequence)
    referenceSequences.fill(pyhmmer.easel.TextSequence("".encode(),sequence=""))
    referenceEntries = referenceEntries.reset_index(drop=True)  # positional index for the arrays
    for i, row in referenceEntries.iterrows():
        referenceSequences[i] = build_seq_from_pandas_entry(row)

    questionableSequences = np.zeros(shape=(len(questionableEntries), 3), dtype=pyhmmer.easel.TextSequence)
    questionableSequences.fill(pyhmmer.easel.TextSequence("".encode(),sequence=""))
    questionableEntries = questionableEntries.reset_index(drop=True)
    for i, row in questionableEntries.iterrows():
        questionableSequences[i] = process_ambiguous_orf(row)

    # Pad everything to one common length; the MSA is built from the padded references.
    longest_seq_len = max(
        max(len(seq) for seq in referenceSequences),
        max(len(seq) for seq in questionableSequences.flat),
    )
    referenceSequences = pad_sequences(referenceSequences, minimum=longest_seq_len)
    questionableSequences = pad_sequences_2d(questionableSequences, minimum=longest_seq_len)

    msa = pyhmmer.easel.TextMSA(name=b"myMSA", sequences=referenceSequences.tolist())
    background = pyhmmer.plan7.Background(alphabet)
    builder = pyhmmer.plan7.Builder(alphabet)
    digital_msa = msa.digitize(alphabet=alphabet)
    hmm, _, _ = builder.build_msa(digital_msa, background)

    pipeline = pyhmmer.plan7.Pipeline(alphabet, background)
    pipeline.bias_filter = False

    questionableEntries['ORF'] = ''
    solved_frames = []      # one frame per read ID, in order of first decision
    seen_read_ids = set()   # read IDs already copied into solved_frames
    total_hits = 0

    for query in questionableSequences:
        query_hits = []
        for seq in query:
            sequenceBlock = pyhmmer.easel.DigitalSequenceBlock(
                alphabet=alphabet, iterable=[seq.digitize(alphabet=alphabet)]
            )
            top_hits = pipeline.search_hmm(query=hmm, sequences=sequenceBlock)
            n_reported = len(top_hits.reported)
            if n_reported > 0:
                query_hits.append(top_hits[0])
                total_hits += n_reported

        if not query_hits:
            continue

        # max() returns the first maximal element, like the manual '>' scan did.
        top_hit = max(query_hits, key=lambda hit: hit.score)
        hit_name = top_hit.name.decode()
        if hit_name == "":
            continue

        # Split on the LAST underscore so read IDs may contain underscores themselves.
        read_id, correct_orf = hit_name.rsplit("_", 1)
        is_read = questionableEntries['Read ID'] == read_id
        questionableEntries.loc[is_read, 'ORF'] = correct_orf
        if read_id not in seen_read_ids:
            seen_read_ids.add(read_id)
            solved_frames.append(questionableEntries.loc[is_read].copy())

    # Concatenate once instead of growing a frame inside the loop.
    if solved_frames:
        modified_entries = pd.concat(solved_frames, ignore_index=True)
    else:
        modified_entries = pd.DataFrame(columns=questionableEntries.columns)

    print(f"modified entries: {len(modified_entries)} (of {total_hits} hits) and {len(questionableEntries)} original entries")
    return modified_entries

In [13]:

# Parameters
threshold = 4          # minimum number of solved entries needed to form a reference group
upper_threshold = 50   # at most this many reference entries are sampled per group
sample_seed = 0        # fixed seed so the sampled reference group is reproducible

# NOTE(review): this binds the DataFrame class itself (not an instance) and is not
# read anywhere in this cell; left as-is in case another cell relies on the name.
debugFrame = pd.DataFrame
failed = 0

taxonomic_levels = ['Family', 'Order', 'Class']

while not remaining_results.empty:
    current_entry = remaining_results.iloc[0]

    # Try to match at each taxonomic level, from specific to general
    for level in taxonomic_levels:
        level_value = current_entry[level]
        comp_group = solved_results[solved_results[level] == level_value]
        group_size = len(comp_group)

        if group_size < threshold:
            print(f"{group_size} entries of {level} {level_value} is too small.")
            continue

        comp_group = comp_group.sample(min(upper_threshold, group_size), random_state=sample_seed)
        in_group = remaining_results[level] == level_value
        solved = decide_orfs_here(comp_group, remaining_results[in_group])
        solved_results = pd.concat([solved_results, solved], ignore_index=True)
        # Everything in the group has been attempted; drop it whether or not it got an ORF.
        remaining_results = remaining_results[~in_group]
        break
    else:
        # No level had a big enough reference group: give up on this entry's family.
        print(f"Removing Family '{current_entry['Family']}' with {len(solved_results[solved_results['Family'] == current_entry['Family']])} members to continue.")
        drop_mask = np.array(remaining_results['Family'] == current_entry['Family'], dtype=bool)
        # Always drop the current entry (row 0). A missing Family compares unequal
        # to itself, which would otherwise remove nothing and loop forever.
        drop_mask[0] = True
        # Count exactly the rows removed here, not a stale group from an earlier iteration.
        failed += int(drop_mask.sum())
        remaining_results = remaining_results[~drop_mask]

    print("-------------")

print(f"A total of {failed} entries were impossible to match.")

FSYF*SSFGSNLINFSSN*FYYHDN*YTTPSNIYSSNIFICMSCVYYSYSFTSIIX
*LFLVFIWQESHQF*EQLILLPR*LMYDPKEYL*IKYLYLYELCLLQLFFYFYHX

--
FSYF*SSFGSNLINFSSN*FYYHDN*YTTPSNIYSSNIFICMSCVYYSYSFTSIIX
*LFLVFIWQESHQF*EQLILLPR*LMYDPKEYL*IKYLYLYELCLLQLFFYFYHX

--
FSYF*SSFGSNLINFSSN*FYYHDN*YTTPSNIYSSNIFICMSCVYYSYSFTSIIX
*LFLVFIWQESHQF*EQLILLPR*LMYDPKEYL*IKYLYLYELCLLQLFFYFYHX

--
FSYF*SSFGSNLINFSSN*FYYHDN*YTTPSNIYSSNIFICMSCVYYSYSFTSIIX
*LFLVFIWQESHQF*EQLILLPR*LMYDPKEYL*IKYLYLYELCLLQLFFYFYHX

--
FSYF*SSFSSNLINFSSN*FYYHDN*YTTPSNIYSSNIFICMSCVYYSYSFTSIIX
*LFLVFI*QESHQF*EQLILLPR*LMYDPKEYL*IKYLYLYELCLLQLFFYFYHX

--
FSYF*SSFGSNLINFSSN*FYYHDN*YTTPSNIYSSNIFICMSCVYYSYSFTSIIX
*LFLVFIWQESHQF*EQLILLPR*LMYDPKEYL*IKYLYLYELCLLQLFFYFYHX

--
FSYF*SSFGSNLINFSSN*FYYHDN*YTTPSNIYSSNIFICMSCVYYSYSFTSIIX
*LFLVFIWQESHQF*EQLILLPR*LMYDPKEYL*IKYLYLYELCLLQLFFYFYHX

--
FSYF*SSFGSNLINFSSN*FYYHDN*YTTPSNIYSSNIFICMSCVYYSYSFTSIIX
*LFLVFIWQESHQF*EQLILLPR*LMYDPKEYL*IKYLYLYELCLLQLFFYFYHX

--
FSYF*SSFGSNLINFSSN*FYYHDN*YTTPSNIYSSNIFICMSCVYYSYSFTSIIX
*LFLVFI

KeyboardInterrupt: 

In [77]:
# Scratch arithmetic: half of 35510.
# NOTE(review): what 35510 counts is not stated in this notebook; name it or drop this cell.
35510 /2


17755.0