In [14]:
import random
import sqlite3
import pandas as pd
from tqdm import tqdm

# parameters

output_base = "sampled_entries_2"
csv_path = output_base + ".csv"
fasta_path = output_base + ".fasta"

database_path = "../../eyeBOLD/eyeBOLD_db.db"
table_name = "specimen"
# NOTE(review): id_column is not used by this cell and its value does not match the
# column actually queried ("specimen_id"); kept as-is in case other cells read it.
id_column = "specimenid"
# Columns fetched from the table; the same list names the DataFrame columns so the
# query and the frame cannot drift apart.
select_columns = ["specimen_id", "nuc_san"]
num_entries = 1000000
# Hard-coded upper bound for the sampled rowids. Sampling assumes every rowid in
# 1..table_size exists -- TODO confirm against the database (e.g. SELECT MAX(rowid)).
table_size = 16185388

# Number of rowids bound per query. Must stay within SQLite's host-parameter limit
# (SQLITE_MAX_VARIABLE_NUMBER: 999 by default before SQLite 3.32.0, 32766 afterwards).
batch_size = 10000

# Fixed seed so that re-running the cell draws the same sample.
random_seed = 42

# random.sample draws without replacement, so the rowids are already unique.
# A dedicated Random instance keeps the global RNG state untouched; sorting makes
# the batch contents (and therefore the fetch order) deterministic.
rng = random.Random(random_seed)
entry_row_nums = sorted(rng.sample(range(1, table_size + 1), k=num_entries))

column_list = ", ".join(select_columns)
all_results = []

# connect to the database; try/finally guarantees the connection is released and the
# `with` block closes the progress bar even if a query fails or the cell is interrupted.
conn = sqlite3.connect(database_path)
try:
    cursor = conn.cursor()
    with tqdm(total=len(entry_row_nums)) as pbar:
        for start in range(0, len(entry_row_nums), batch_size):
            batch = entry_row_nums[start:start + batch_size]

            # One "?" per rowid in this batch (the last batch may be shorter).
            placeholders = ",".join("?" * len(batch))
            query = f"SELECT {column_list} FROM {table_name} WHERE rowid IN ({placeholders})"

            cursor.execute(query, batch)
            all_results.extend(cursor.fetchall())
            pbar.update(len(batch))
finally:
    conn.close()

df = pd.DataFrame(all_results, columns=select_columns)

# Rowids that do not exist in the table simply return nothing, so report a shortfall
# instead of silently producing a smaller sample.
if len(df) < len(entry_row_nums):
    print(
        f"Warning: requested {len(entry_row_nums)} rows but only {len(df)} were found; "
        f"rowids in {table_name} may not be contiguous in 1..{table_size}."
    )

df.head()

  0%|          | 0/1000000 [21:10<?, ?it/s]
100%|██████████| 1000000/1000000 [04:08<00:00, 4021.10it/s]


Unnamed: 0,specimen_id,nuc_san
0,2,GGAATTTGAGCAGGATTAGTTGGTACTTCTTTAAGATTACTTATTC...
1,23,GAACATCTTTAAGATTATTAATTCGAGCTGAATTAGGAAACCCCGG...
2,46,GAACATCTTTAAGTTTATTAATTCGAGCTGAATTAGGTAATCCTGG...
3,81,GAACTTCATTAAGATTACTAATTCGAGCTGAATTAGGAAATCCTGG...
4,120,GAACTTCATTAAGATTATTAATTCGTGCTGAATTAGGGAACCCTGG...


In [15]:
# Show the first five sampled records again as a quick sanity check.
df.head(n=5)

Unnamed: 0,specimen_id,nuc_san
0,2,GGAATTTGAGCAGGATTAGTTGGTACTTCTTTAAGATTACTTATTC...
1,23,GAACATCTTTAAGATTATTAATTCGAGCTGAATTAGGAAACCCCGG...
2,46,GAACATCTTTAAGTTTATTAATTCGAGCTGAATTAGGTAATCCTGG...
3,81,GAACTTCATTAAGATTACTAATTCGAGCTGAATTAGGAAATCCTGG...
4,120,GAACTTCATTAAGATTATTAATTCGTGCTGAATTAGGGAACCCTGG...


In [16]:
# Persist the sampled records to disk as CSV, leaving out the DataFrame index.
df.to_csv(path_or_buf=csv_path, index=False)

print(f"DataFrame with columns ['specimen_id', 'nuc_san'] saved to {csv_path}")

DataFrame with columns ['specimen_id', 'nuc_san'] saved to sampled_entries_2.csv


In [17]:
# Write each sampled record as a two-line FASTA entry: a ">" header carrying the
# specimen id, followed by the nucleotide sequence.
# The two columns are iterated directly instead of via DataFrame.iterrows(), which
# builds a Series object per row and is very slow for a frame of this size; the
# generator expression also leaves no loop variables behind in the notebook namespace.
with open(fasta_path, "w") as fasta:
    fasta.writelines(
        f">{specimen_id}\n{sequence}\n"
        for specimen_id, sequence in zip(df["specimen_id"], df["nuc_san"])
    )

print(f"FASTA file saved to {fasta_path}")

FASTA file saved to sampled_entries_2.fasta
