In [3]:
#!pip install biopython
from io import StringIO
from Bio import Entrez, SeqIO
import os
Entrez.email = "petershamoun80@gmail.com"

In [4]:
MAX_SEQ_LENGTH = 20_000  # Maximum allowed sequence length
MAX_FETCH_ATTEMPTS = 100  # Maximum number of fetch attempts before giving up

# Define search queries
queries = {
    "protein_coding": '("Homo sapiens"[Organism] OR "Mus musculus"[Organism]) AND ("CDS"[Feature] OR "mRNA"[Feature])',
    "non_protein_coding": '("Homo sapiens"[Organism] OR "Mus musculus"[Organism]) AND ("ncRNA"[Feature] OR "lncRNA"[Feature])',
    "enhancer": '("Homo sapiens"[Organism] OR "Mus musculus"[Organism]) AND ("enhancer"[Title] OR "enhancer"[Gene Name])',
    "non_enhancer": '("Homo sapiens"[Organism] OR "Mus musculus"[Organism]) AND NOT ("enhancer"[Title] OR "enhancer"[Gene Name])',
    "promoter": '("Homo sapiens"[Organism] OR "Mus musculus"[Organism]) AND "promoter"[Title]',
    "non_promoter": '("Homo sapiens"[Organism] OR "Mus musculus"[Organism]) AND NOT "promoter"[Title]',
    "splice_site": '("Homo sapiens"[Organism] OR "Mus musculus"[Organism]) AND ("splice site"[Title] OR "splice junction"[Title])',
    "non_splice_site": '("Homo sapiens"[Organism] OR "Mus musculus"[Organism]) AND NOT ("splice site"[Title] OR "splice junction"[Title])',
    "methylated": '("Homo sapiens"[Organism] OR "Mus musculus"[Organism]) AND ("methylation"[Title] OR "CpG island"[Title])',
    "non_methylated": '("Homo sapiens"[Organism] OR "Mus musculus"[Organism]) AND NOT ("methylation"[Title] OR "CpG island"[Title])'
}

# Create 'sequences' directory if it doesn't exist
output_dir = "sequences"
os.makedirs(output_dir, exist_ok=True)

# Function to fetch sequences from NCBI and ensure we get 'max_records' valid sequences
def fetch_fasta(query, filename, max_records=100):
    print(f"Searching NCBI for: {query}")
    
    valid_sequences = []
    fetched_ids = set()
    attempt = 0

    while len(valid_sequences) < max_records and attempt < MAX_FETCH_ATTEMPTS:
        attempt += 1
        remaining = max_records - len(valid_sequences)
        
        # Step 1: Search NCBI to get sequence IDs
        handle = Entrez.esearch(db="nucleotide", term=query, retmax=remaining)
        record = Entrez.read(handle)
        handle.close()

        # Get new sequence IDs, avoiding duplicates
        new_ids = [seq_id for seq_id in record["IdList"] if seq_id not in fetched_ids]
        fetched_ids.update(new_ids)

        if not new_ids:
            print(f"No new sequences found in attempt {attempt}. Retrying...")
            continue

        print(f"Fetching {len(new_ids)} sequences...")

        # Step 2: Fetch FASTA sequences
        handle = Entrez.efetch(db="nucleotide", id=new_ids, rettype="fasta", retmode="text")
        fasta_records = handle.read()
        handle.close()

        # Step 3: Filter sequences by length
        fasta_io = StringIO(fasta_records)
        for seq_record in SeqIO.parse(fasta_io, "fasta"):
            if len(seq_record.seq) <= MAX_SEQ_LENGTH:
                valid_sequences.append(seq_record)
                if len(valid_sequences) == max_records:
                    break  # Stop early if we reach the target count

        print(f"Valid sequences collected: {len(valid_sequences)} (Target: {max_records})")

    # Step 4: Save only the filtered sequences to file
    if valid_sequences:
        file_path = os.path.join(output_dir, filename)
        with open(file_path, "w") as f:
            SeqIO.write(valid_sequences, f, "fasta")
        print(f"Saved {len(valid_sequences)} sequences to {file_path}.")
    else:
        print(f"No sequences within allowed length ({MAX_SEQ_LENGTH}) for {filename}.")

# Run the function for each query
for label, query in queries.items():
    fetch_fasta(query, f"{label}.fasta")

Searching NCBI for: ("Homo sapiens"[Organism] OR "Mus musculus"[Organism]) AND ("CDS"[Feature] OR "mRNA"[Feature])
Fetching 100 sequences...
Valid sequences collected: 100 (Target: 100)
Saved 100 sequences to sequences\protein_coding.fasta.
Searching NCBI for: ("Homo sapiens"[Organism] OR "Mus musculus"[Organism]) AND ("ncRNA"[Feature] OR "lncRNA"[Feature])
Fetching 100 sequences...
Valid sequences collected: 88 (Target: 100)
No new sequences found in attempt 2. Retrying...
No new sequences found in attempt 3. Retrying...
No new sequences found in attempt 4. Retrying...
No new sequences found in attempt 5. Retrying...
No new sequences found in attempt 6. Retrying...
No new sequences found in attempt 7. Retrying...
No new sequences found in attempt 8. Retrying...
No new sequences found in attempt 9. Retrying...
No new sequences found in attempt 10. Retrying...
No new sequences found in attempt 11. Retrying...
No new sequences found in attempt 12. Retrying...
No new sequences found in at