In [89]:
import sqlite3
import random


def find_comparison_group(current_entry, level, lower_reference_threshold, upper_reference_threshold, random_seed: int = None):
    if random_seed is not None:
        random.seed(random_seed)

    conn = sqlite3.connect("/mnt/z/Uni/Master Thesis/eyeBOLD/eyeBOLD_mini3.db")
    conn.row_factory = sqlite3.Row

    try:
        cursor = conn.cursor()

        # First, get the primer sequences from the input primer pair
        primer_lookup_query = """
                                  SELECT fm.primer_sequence as forward_primer,
                                         fm.*,
                                         rm.primer_sequence as reverse_primer,
                                         s.*
                                  FROM primer_pairs pp
                                           LEFT JOIN primer_matches fm ON pp.forward_match_id = fm.match_id
                                           LEFT JOIN primer_matches rm ON pp.reverse_match_id = rm.match_id
                                           LEFT JOIN specimen s ON pp.specimen_id = s.specimenid
                                  WHERE pp.forward_match_id = ?
                                    AND pp.reverse_match_id = ?
                                  """
        f_id = int(current_entry["forward_match_id"].iloc[0])
        b_id = int(current_entry["reverse_match_id"].iloc[0])

        cursor.execute(primer_lookup_query, (f_id, b_id))

        input_primer_info = cursor.fetchone()

        if not input_primer_info:
            print(
                f"No primer pair found with forward_match_id={f_id}, "
                f"reverse_match_id={b_id}\n"
                f"{input_primer_info}")
            return None, False
        forward_primer_seq = input_primer_info['forward_primer']
        reverse_primer_seq = input_primer_info['reverse_primer']
        input_taxonomic_group = input_primer_info[level]

        print(
f"""Forward primer: {forward_primer_seq}
Reverse primer: {reverse_primer_seq}
Group: {input_taxonomic_group}
""")

        if not input_taxonomic_group:
            print(f"Input specimen has no {level} information")
            return None, False

        # Find all primer pairs with the same primer sequences in the same taxonomic group that are solved
        discovery_query = f"""
            SELECT pp.forward_match_id, pp.reverse_match_id, pp.specimen_id,
                   pp.inter_primer_sequence, pp.orf_candidates, pp.orf_index,
                   pp.orf_aa, pp.matching_flags
            FROM primer_pairs pp
            JOIN primer_matches fm ON pp.forward_match_id = fm.match_id
            JOIN primer_matches rm ON pp.reverse_match_id = rm.match_id
            JOIN specimen s ON pp.specimen_id = s.specimenid
            WHERE fm.primer_sequence = ?
              AND rm.primer_sequence = ?
              AND s.{level} = ?
              AND pp.orf_index IS NOT NULL
            ORDER BY pp.specimen_id
            """

        cursor.execute(discovery_query, (forward_primer_seq, reverse_primer_seq, input_taxonomic_group))
        matching_entries = cursor.fetchall()

        if len(matching_entries) > lower_reference_threshold:
            # Randomly sample up to max_entries from this group
            sample_size = min(upper_reference_threshold, len(matching_entries))
            selected_entries = random.sample(matching_entries, sample_size)
            print(f"Found {len(matching_entries)} matching entries for {forward_primer_seq} and {reverse_primer_seq} in {input_taxonomic_group}")
            return selected_entries, True
        else:
            return None, False

    except Exception as e:
        print(f"Error while finding comparison group: {e}")
        return None, False
    finally:
        conn.close()


In [90]:
import pandas as pd

def get_next_unsolved_sequence():
    db = sqlite3.connect("/mnt/z/Uni/Master Thesis/eyeBOLD/eyeBOLD_mini3.db")
    query = """
            SELECT forward_match_id,
                   reverse_match_id,
                   specimen_id,
                   inter_primer_sequence,
                   orf_candidates,
                   orf_index,
                   orf_aa,
                   matching_flags
            FROM primer_pairs
            WHERE orf_index IS NULL
            LIMIT 1
            """
    df = pd.read_sql_query(query, db)
    db.close()
    return df


In [268]:
current_entry = get_next_unsolved_sequence()
current_entry

Unnamed: 0,forward_match_id,reverse_match_id,specimen_id,inter_primer_sequence,orf_candidates,orf_index,orf_aa,matching_flags
0,100001,100002,75884,ATTATCTTCTAATATTTTTCATAATGGCCCTTCTATTGATTTATCC...,2,,,0


In [279]:
comparison_group, success = find_comparison_group(current_entry, "taxon_species", 5, 10)

Forward primer: GGDACWGGWTGAACWGTWTAYCCHCC
Reverse primer: CCWGTWYTAGCHGGDGCWATYAC
Group: Pyramica ludovici

Found 13 matching entries for GGDACWGGWTGAACWGTWTAYCCHCC and CCWGTWYTAGCHGGDGCWATYAC in Pyramica ludovici


In [2]:
import sqlite3
def create_indexes():
    conn = sqlite3.connect("/mnt/z/Uni/Master Thesis/eyeBOLD/eyeBOLD_mini3.db")
    conn.execute("CREATE INDEX IF NOT EXISTS idx_species ON specimen(taxon_species)")
    conn.execute("CREATE INDEX IF NOT EXISTS idx_genus ON specimen(taxon_genus)")
    conn.execute("CREATE INDEX IF NOT EXISTS idx_family ON specimen(taxon_family)")
    conn.execute("CREATE INDEX IF NOT EXISTS idx_order ON specimen(taxon_order)")
    conn.execute("CREATE INDEX IF NOT EXISTS idx_class ON specimen(taxon_class)")
    conn.execute("CREATE INDEX IF NOT EXISTS idx_sequence ON primer_matches(primer_sequence)")
    conn.close()
create_indexes()

In [3]:
conn = sqlite3.connect("/mnt/z/Uni/Master Thesis/eyeBOLD/eyeBOLD_mini3.db")


In [91]:
import primer_finder.connectors.db_connector as dbc
con = dbc.DbConnector("/mnt/z/Uni/Master Thesis/eyeBOLD/eyeBOLD_mini3.db")

In [92]:
next = con.get_next_unsolved_sequence()
related, _ = con._fetch_related_sequences(next, "taxon_species", False)

Input specimen has no taxon_species information


In [9]:
non_zero = related[related["orf_index"] >= 0]

In [14]:
both = related[related["matching_flags"] == 0 & related["orf_index"] >= 0]

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [32]:
from orf.decider import _trim_to_triplet
from Bio.Seq import Seq
import numpy as np
import pyhmmer
import pandas as pd

def _dna_to_aa(dna_sequence, offset):
    sequence = Seq(dna_sequence)[int(offset):]
    trimmed = _trim_to_triplet(sequence)
    translated = trimmed.translate(table=5)
    return str(translated)

def _get_possible_amino_text_sequences_of(entry: pd.Series):
    possible_orfs = [i for i in range(int(entry["orf_candidates"].iloc[0]).bit_length()) if entry["orf_candidates"] & (1 << i)]
    print(possible_orfs)
    seqs = np.zeros(shape=3, dtype=pyhmmer.easel.TextSequence)
    seqs.fill(pyhmmer.easel.TextSequence("".encode(), sequence=""))
    for i, possible_orf in enumerate(possible_orfs):
        aa_sequence = _dna_to_aa(entry["inter_primer_sequence"], possible_orf)
        text_seq = pyhmmer.easel.TextSequence(name=(str(entry["specimen_id"]).encode() + b"_" + str(possible_orf).encode()),
                                             sequence=str(aa_sequence))
        seqs[i] = text_seq
    return seqs

In [33]:
seqs = _get_possible_amino_text_sequences_of(next)

[]


In [57]:
def _decrypt_oc(possible_orf: int) -> list[int]:
    return [i for i in range(possible_orf.bit_length()) if possible_orf & (1 << i)]

_decrypt_oc(int(next["orf_candidates"].iloc[0]))
#next["orf_candidates"]

[]

In [114]:
id = next["specimen_id"].loc
id in next["specimen_id"].values

False

In [78]:
related.loc[1] = related.loc[0]

In [83]:
related

(None, False)