In [None]:
import sqlite3
import pandas as pd
import numpy as np

# parameters
database_path = "../../eyeBOLD/eyeBOLD_db.db"
table_name = "specimen"
id_column = "specimenid"
num_entries = 100
random_seed = None  # set to an integer to make the sample reproducible

# Random IDs in [min_id, max_id] may not exist in the table, so each round
# draws more candidates than are still needed.
batch_size = num_entries * 2

rng = np.random.default_rng(random_seed)

found_rows = []
tried_ids = set()  # every ID already looked up, whether or not it existed

# connect to the database; the finally-clause closes it even if a query fails
conn = sqlite3.connect(database_path)
try:
    # Take column names from a SELECT on the table itself. PRAGMA table_info
    # rows are (cid, name, type, ...), so their first field is an index, not
    # a column name.
    col_names = [
        desc[0]
        for desc in conn.execute(f"SELECT * FROM {table_name} LIMIT 0").description
    ]
    # Locate the ID column by name instead of assuming it is the first column.
    id_pos = [name.lower() for name in col_names].index(id_column.lower())

    min_id, max_id = conn.execute(
        f"SELECT MIN({id_column}), MAX({id_column}) FROM {table_name}"
    ).fetchone()
    # MIN/MAX are NULL for an empty table; a zero-width ID range skips the loop.
    id_span = 0 if min_id is None else max_id - min_id + 1

    # Stop when enough rows were found OR every ID in the range has been tried,
    # so a table with fewer than num_entries rows cannot loop forever.
    while len(found_rows) < num_entries and len(tried_ids) < id_span:
        if id_span - len(tried_ids) <= batch_size:
            # Few IDs left: sweep all of them instead of waiting for random
            # draws to hit the remaining ones; shuffle to keep the pick random.
            candidate_ids = [
                i for i in range(min_id, max_id + 1) if i not in tried_ids
            ]
            rng.shuffle(candidate_ids)
        else:
            # .tolist() yields plain Python ints; NumPy integer scalars are not
            # a supported sqlite3 parameter type.
            draws = rng.integers(min_id, max_id + 1, size=batch_size).tolist()
            # dict.fromkeys de-duplicates while preserving the random draw order.
            candidate_ids = [i for i in dict.fromkeys(draws) if i not in tried_ids]
        if not candidate_ids:
            continue
        tried_ids.update(candidate_ids)

        placeholders = ",".join("?" for _ in candidate_ids)
        query = f"""
            SELECT *
            FROM {table_name}
            WHERE {id_column} IN ({placeholders})
        """
        rows_by_id = {row[id_pos]: row for row in conn.execute(query, candidate_ids)}

        # Walk the candidates in their random order rather than in the order
        # the rows came back; truncating table-ordered rows at num_entries
        # would favour low IDs.
        for candidate in candidate_ids:
            row = rows_by_id.get(candidate)
            if row is not None:
                found_rows.append(row)
                if len(found_rows) >= num_entries:
                    break
finally:
    conn.close()

# convert to DataFrame
df = pd.DataFrame(found_rows, columns=col_names)

if len(df) < num_entries:
    print(f"Only {len(df)} of the requested {num_entries} rows were found in {table_name}")

df.head()

In [None]:
# Preview the first five rows of the sampled DataFrame.
df.head(n=5)

In [None]:
# Destination file for the two-column CSV export.
csv_path = "sampled_entries.csv"

# Select just the two columns that are exported.
df_subset = df.loc[:, ["specimenid", "nuc_san"]]

# Save to disk, leaving out the DataFrame index.
df_subset.to_csv(path_or_buf=csv_path, index=False)

print(f"DataFrame with columns ['specimenid', 'nuc_san'] saved to {csv_path}")

In [None]:
# specify output FASTA path
fasta_path = "sampled_entries.fasta"

# A missing nuc_san value would otherwise be formatted as the literal text
# "None"/"nan" and written as if it were a sequence, so those rows are skipped.
fasta_records = df[["specimenid", "nuc_san"]].dropna(subset=["nuc_san"])

with open(fasta_path, "w") as fasta:
    # Iterate the two columns directly; iterrows() would build a Series per row.
    for specimen_id, sequence in zip(
        fasta_records["specimenid"], fasta_records["nuc_san"]
    ):
        # One FASTA record: ">" header line with the ID, then the sequence line.
        fasta.write(f">{specimen_id}\n{sequence}\n")

skipped = len(df) - len(fasta_records)
if skipped:
    print(f"Skipped {skipped} rows without a nuc_san value")

print(f"FASTA file saved to {fasta_path}")