# Imports and helper functions

In [3]:
import os
import pandas as pd
from Bio import SeqIO
import gzip
import urllib.request
import random
from tqdm import tqdm

def tqdm_hook(t):
    last_b = [0]
    def update_to(block_num=1, block_size=1, total_size=None):
        if total_size is not None:
            t.total = total_size
        downloaded = block_num * block_size
        t.update(downloaded - last_b[0])
        last_b[0] = downloaded
    return update_to

def get_fna_path(asb, asm):
    asm = asm.replace(' ', '_').replace('#', '_')
    srv = 'https://ftp.ncbi.nlm.nih.gov/genomes/all'
    ext = 'genomic.fna.gz'
    return ('%s/%s/%s/%s/%s/%s_%s/%s_%s_%s'
            % (srv, asb[:3], asb[4:7], asb[7:10], asb[10:13],
               asb, asm, asb, asm, ext))

def extract_subsequences_from_fna(fna_path, organism_name, label, window_size=150, stride=150):
    records = []
    with gzip.open(fna_path, "rt") as f:
        for record in SeqIO.parse(f, "fasta"):
            seq = str(record.seq)
            for start in range(0, len(seq) - window_size + 1, stride):
                sub_seq = seq[start:start + window_size]
                records.append({
                    "organism": organism_name,
                    "sequence": sub_seq,
                    "location": (start, start + window_size),
                    "label": label,
                    "contig": record.id,
                    "source_file": os.path.basename(fna_path)
                })
    return pd.DataFrame(records)

def extract_random_subsequences_from_fna(fna_path, organism_name, label, window_size=150, n=10):
    records = []
    all_seqs = []

    with gzip.open(fna_path, "rt") as f:
        for record in SeqIO.parse(f, "fasta"):
            if len(record.seq) >= window_size:
                all_seqs.append((record.id, str(record.seq)))

    while len(records) < n and all_seqs:
        contig_id, seq = random.choice(all_seqs)
        if len(seq) < window_size:
            continue
        start = random.randint(0, len(seq) - window_size)
        sub_seq = seq[start:start + window_size]
        records.append({
            "organism": organism_name,
            "sequence": sub_seq,
            "location": (start, start + window_size),
            "label": label,
            "contig": contig_id,
            "source_file": os.path.basename(fna_path)
        })

    return pd.DataFrame(records)


# Set parameters (replace these as needed)

In [16]:
metadata_path = "microbial_metadata.tsv"  # Replace with your local path
sample_size = 10


 # Read metadata and sample

In [17]:
df = pd.read_csv(metadata_path, sep='\t')

required_cols = ['assembly_accession', 'asm_name']
for col in required_cols:
    if col not in df.columns:
        raise ValueError(f'Missing required column: {col}')

sample_df = df.sample(n=min(sample_size, len(df)), random_state=42)
sample_df


Unnamed: 0,#genome,asm_name,assembly_accession,bioproject,biosample,wgs_master,seq_rel_date,submitter,ftp_path,img_id,...,coding_density,completeness,contamination,strain_heterogeneity,markers,5s_rrna,16s_rrna,23s_rrna,trnas,draft_quality
1284,G000217635,ASM21763v1,GCF_000217635.1,PRJNA224116,SAMN02603413,,2011/06/09,Instituto Cavanilles Biodiversidad y Biologia ...,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,650716012.0,...,80.934764,98.12,0.0,0.0,149,yes,yes,yes,20,high
6994,G001509375,ASM150937v1,GCA_001509375.1,PRJNA278302,SAMN03445147,LGFT00000000.1,2016/01/08,Lawrence Berkeley National Lab,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/001...,,...,86.71264,100.0,0.0,0.0,195,no,yes,yes,17,medium
9187,G001821355,ASM182135v1,GCA_001821355.1,PRJNA288027,SAMN04314511,MHLE00000000.1,2016/10/21,"Banfield Lab, University of California, Berkeley",ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/001...,,...,87.586151,72.7,1.02,0.0,125,no,yes,no,16,medium
7976,G001771365,ASM177136v1,GCA_001771365.1,PRJNA288027,SAMN04315926,METP00000000.1,2016/10/18,"Banfield Lab, University of California, Berkeley",ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/001...,,...,95.098232,88.89,2.56,0.0,274,no,yes,no,18,medium
1393,G000229225,SMUT5-NEX_12-176,GCF_000229225.1,PRJNA224116,SAMN02436514,AGLC00000000.1,2011/10/06,JCVI,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,2548876566.0,...,86.863901,99.95,0.37,0.0,305,no,no,no,18,medium
10057,G900086585,PRJEB13931,GCF_900086585.1,PRJNA224116,SAMEA3959737,FLKM00000000.1,2016/05/19,URMITE,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/900...,,...,88.935879,99.29,0.71,0.0,335,yes,yes,yes,20,high
318,G000017645,ASM1764v1,GCF_000017645.1,PRJNA224116,SAMN02598342,,2007/07/30,US DOE Joint Genome Institute,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,640753059.0,...,87.764231,100.0,0.32,0.0,360,yes,yes,yes,20,high
6927,G001485475,ASM148547v1,GCF_001485475.1,PRJNA224116,SAMD00043586,BCMU00000000.1,2015/12/08,AIST,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001...,,...,89.636847,93.27,0.0,0.0,318,yes,yes,yes,20,high
733,G000154285,ASM15428v1,GCF_000154285.1,PRJNA224116,SAMN00000734,ABAW00000000.2,2007/09/12,Washington University Genome Sequencing Center,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,641380446.0,...,88.808806,100.0,0.0,0.0,287,yes,yes,yes,20,high
1509,G000242915,ASM24291v2,GCF_000242915.1,PRJNA224116,SAMN02470075,AFRZ00000000.1,2012/01/18,IOW-Leibniz Institute for Baltic Sea Research,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,,...,93.910418,100.0,0.2,0.0,338,yes,yes,yes,20,high


# Download microbial genomes and extract sequences

In [6]:

all_microbial_data = []

for _, row in sample_df.iterrows():
    asb, asm, unique_org_name = row['assembly_accession'], row['asm_name'], row['unique_name']
    url = get_fna_path(asb, asm)
    fna_filename = f"{asb}_{asm.replace(' ', '_')}_genomic.fna.gz"

    print(f"Downloading microbial genome: {url}")
    try:
        with tqdm(unit='B', unit_scale=True, desc=fna_filename, leave=True) as t:
            urllib.request.urlretrieve(url, fna_filename, reporthook=tqdm_hook(t))
        df = extract_subsequences_from_fna(fna_filename, unique_org_name, label=1)
        all_microbial_data.append(df)
    except Exception as e:
        print(f"Failed to download or process {url}: {e}")

if not all_microbial_data:
    raise RuntimeError("❌ No microbial data extracted.")

microbial_df = pd.concat(all_microbial_data, ignore_index=True)
microbial_df


Downloading microbial genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/217/635/GCF_000217635.1_ASM21763v1/GCF_000217635.1_ASM21763v1_genomic.fna.gz


GCF_000217635.1_ASM21763v1_genomic.fna.gz: 131kB [00:00, 217kB/s]                            


Downloading microbial genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/001/509/375/GCA_001509375.1_ASM150937v1/GCA_001509375.1_ASM150937v1_genomic.fna.gz


GCA_001509375.1_ASM150937v1_genomic.fna.gz: 713kB [00:00, 1.10MB/s]                           


Downloading microbial genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/001/821/355/GCA_001821355.1_ASM182135v1/GCA_001821355.1_ASM182135v1_genomic.fna.gz


GCA_001821355.1_ASM182135v1_genomic.fna.gz: 172kB [00:00, 331kB/s]                            


Downloading microbial genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/001/771/365/GCA_001771365.1_ASM177136v1/GCA_001771365.1_ASM177136v1_genomic.fna.gz


GCA_001771365.1_ASM177136v1_genomic.fna.gz: 418kB [00:00, 779kB/s]                            


Downloading microbial genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/229/225/GCF_000229225.1_SMUT5-NEX_12-176/GCF_000229225.1_SMUT5-NEX_12-176_genomic.fna.gz


GCF_000229225.1_SMUT5-NEX_12-176_genomic.fna.gz: 541kB [00:00, 924kB/s]                            


Unnamed: 0,organism,sequence,location,label,contig,source_file
0,Buchnera aphidicola (Cinara tujafilina),ATGGGAAATCAGATTCAGAAAACGACAGTGATTGTGGTAGGTGGCG...,"(0, 150)",1,NC_015662.1,GCF_000217635.1_ASM21763v1_genomic.fna.gz
1,Buchnera aphidicola (Cinara tujafilina),TCTATTGGAGGTATTGGTAAAAGTCAATTAGTAAAAGAGATTGATG...,"(150, 300)",1,NC_015662.1,GCF_000217635.1_ASM21763v1_genomic.fna.gz
2,Buchnera aphidicola (Cinara tujafilina),GATAGACAGTTGTATAAGAAAAATTTACAATATTTTTTAAAATTAC...,"(300, 450)",1,NC_015662.1,GCF_000217635.1_ASM21763v1_genomic.fna.gz
3,Buchnera aphidicola (Cinara tujafilina),TCAGTGATATTAACCACTGGGACTTTCTTAAATGGAAAAATATTTA...,"(450, 600)",1,NC_015662.1,GCF_000217635.1_ASM21763v1_genomic.fna.gz
4,Buchnera aphidicola (Cinara tujafilina),GGAACACCACCAAGATTATTAGGTAGTAGTATTAATTTCCAATTGT...,"(600, 750)",1,NC_015662.1,GCF_000217635.1_ASM21763v1_genomic.fna.gz
...,...,...,...,...,...,...
43648,Streptococcus mutans TCI-176,CCCAAAGGTATTATTGCTTCTTTAACGATTGTTACCATTCTTTATG...,"(15900, 16050)",1,NZ_AGLC01000001.1,GCF_000229225.1_SMUT5-NEX_12-176_genomic.fna.gz
43649,Streptococcus mutans TCI-176,AACTATATTTCAGTTGTTGCGATTCTGACTCTTATAACAGTTTGCA...,"(16050, 16200)",1,NZ_AGLC01000001.1,GCF_000229225.1_SMUT5-NEX_12-176_genomic.fna.gz
43650,Streptococcus mutans TCI-176,AATGCTACTCTTCTTGTTGGTTTTGCTTCTATGATTTGTGCTGGAG...,"(16200, 16350)",1,NZ_AGLC01000001.1,GCF_000229225.1_SMUT5-NEX_12-176_genomic.fna.gz
43651,Streptococcus mutans TCI-176,CCTAAAATTGGAGAATTCAAAACACCATTTATCCCATTTTTACCTA...,"(16350, 16500)",1,NZ_AGLC01000001.1,GCF_000229225.1_SMUT5-NEX_12-176_genomic.fna.gz


# Download and extract human sequences

In [10]:
genome_dir = "genomes"
os.makedirs(genome_dir, exist_ok=True)

# Human genome file info
human_url = ("https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/"
             "GCA_000001405.29_GRCh38.p14/GCA_000001405.29_GRCh38.p14_genomic.fna.gz")
human_fna = os.path.join(genome_dir, "human_GRCh38.fna.gz")

# Check if genome already exists
if not os.path.exists(human_fna):
    print(f"Downloading human genome: {human_url}")
    with tqdm(unit='B', unit_scale=True, desc=os.path.basename(human_fna), leave=True) as t:
        urllib.request.urlretrieve(human_url, human_fna, reporthook=tqdm_hook(t))
else:
    print(f"Human genome already exists at {human_fna}, skipping download.")

# Extract random subsequences
human_df = extract_random_subsequences_from_fna(
    human_fna, "Homo sapiens", label=0, window_size=150, n=len(microbial_df)
)
human_df

Human genome already exists at genomes/human_GRCh38.fna.gz, skipping download.


Unnamed: 0,organism,sequence,location,label,contig,source_file
0,Homo sapiens,CCTCCTCACATCCCCCCACTCtgtgcaacctccccaaatccccccc...,"(217480, 217630)",0,KI270774.1,human_GRCh38.fna.gz
1,Homo sapiens,ACTGTGTGGGCAGGTGAAGGACATGTCTACCACCCTATGCTCTGGG...,"(34795, 34945)",0,KZ559116.1,human_GRCh38.fna.gz
2,Homo sapiens,GCCTGGGGGAGGGTAGCAAGGCCTGGACTAGGAGATGGAGGCCTAA...,"(148724, 148874)",0,KI270792.1,human_GRCh38.fna.gz
3,Homo sapiens,CTGACGATATAGGACAAAAGCAGGGAACACCTGCCTTCTGCTCTGG...,"(31442, 31592)",0,KI270778.1,human_GRCh38.fna.gz
4,Homo sapiens,CCGAGACGCACCCTGGGTTCGAACCAGGGACGCCAGGTTCACGGGG...,"(64039, 64189)",0,GL000205.2,human_GRCh38.fna.gz
...,...,...,...,...,...,...
43648,Homo sapiens,TATAATAGCTCCAGGCATGGTTCCTACAGAGACTGGAAGACTATAC...,"(196970, 197120)",0,MU273386.1,human_GRCh38.fna.gz
43649,Homo sapiens,GAGCATGTCCATTGCTTGGTCCCTTGGAGCATCTGCAGCAAAGTGA...,"(41867, 42017)",0,KZ559103.1,human_GRCh38.fna.gz
43650,Homo sapiens,TTAACTCAGCCATGCCAATATGCAGAGGAAGAAGAGCACTCTTAGG...,"(7504, 7654)",0,MU273349.1,human_GRCh38.fna.gz
43651,Homo sapiens,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,"(488950, 489100)",0,GL949749.2,human_GRCh38.fna.gz


# Combine and save

In [None]:
final_df = pd.concat([microbial_df, human_df], ignore_index=True)
final_df.to_csv("subsequences_dataset.csv", index=False)
print("✅ Dataset saved to subsequences_dataset.csv")


✅ Dataset saved to subsequences_dataset.csv
