In [None]:
!pip insstall biopython

Collecting biopython
  Downloading biopython-1.85-cp312-cp312-win_amd64.whl.metadata (13 kB)
Downloading biopython-1.85-cp312-cp312-win_amd64.whl (2.8 MB)
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
   ----------- ---------------------------- 0.8/2.8 MB 5.6 MB/s eta 0:00:01
   ---------------------------------------- 2.8/2.8 MB 16.3 MB/s eta 0:00:00
Installing collected packages: biopython
Successfully installed biopython-1.85


In [None]:
import os
from Bio import Entrez, SeqIO

# Set your NCBI email (Required for API usage)
# NOTE(review): this hardcodes a personal address in a shareable notebook;
# consider reading it from an environment variable before publishing.
Entrez.email = "petershamoun80@gmail.com"

# Define search queries
# Every query shares the same human-or-mouse organism restriction; only the
# trailing feature/title clause differs between categories.
_organism_clause = '("Homo sapiens"[Organism] OR "Mus musculus"[Organism])'

# Categories that come as a positive / negated ("non_<label>") pair.
_paired_terms = {
    "enhancer": '("enhancer"[Title] OR "enhancer"[Gene Name])',
    "promoter": '"promoter"[Title]',
    "splice_site": '("splice site"[Title] OR "splice junction"[Title])',
    "methylated": '("methylation"[Title] OR "CpG island"[Title])',
}

# The coding / non-coding pair uses two different feature clauses rather
# than a negation, so it is spelled out explicitly.
queries = {
    "protein_coding": _organism_clause + ' AND ("CDS"[Feature] OR "mRNA"[Feature])',
    "non_protein_coding": _organism_clause + ' AND ("ncRNA"[Feature] OR "lncRNA"[Feature])',
}
for _category, _terms in _paired_terms.items():
    queries[_category] = f"{_organism_clause} AND {_terms}"
    queries["non_" + _category] = f"{_organism_clause} AND NOT {_terms}"

# Create 'sequences' directory if it doesn't exist
# output_dir is read as a module-level name by fetch_fasta when it builds
# the destination path for each downloaded file.
output_dir = "sequences"
os.makedirs(output_dir, exist_ok=True)  # exist_ok=True: no error if the folder is already present

# Function to fetch sequences from NCBI and save them in the 'sequences' folder
def fetch_fasta(query, filename, max_records=100):
    """Search the NCBI nucleotide database and save the hits as a FASTA file.

    Parameters
    ----------
    query : str
        Entrez search term passed to ``Entrez.esearch``.
    filename : str
        Name of the file to create inside the module-level ``output_dir``.
    max_records : int, optional
        Maximum number of IDs requested from the search (``retmax``).

    Returns
    -------
    None
        Progress is reported via ``print``. No file is written when the
        search returns no IDs.
    """
    print(f"Searching NCBI for: {query}")

    # Step 1: Search NCBI to get sequence IDs.
    # try/finally guarantees the network handle is released even when
    # parsing the response raises.
    handle = Entrez.esearch(db="nucleotide", term=query, retmax=max_records)
    try:
        record = Entrez.read(handle)
    finally:
        handle.close()

    # Get list of sequence IDs
    id_list = record["IdList"]
    if not id_list:
        print(f"No sequences found for {filename}.")
        return

    print(f"Found {len(id_list)} sequences. Downloading...")

    # Step 2: Fetch FASTA sequences using IDs (same cleanup guarantee as above)
    handle = Entrez.efetch(db="nucleotide", id=id_list, rettype="fasta", retmode="text")
    try:
        fasta_records = handle.read()
    finally:
        handle.close()

    # Step 3: Save to the 'sequences' folder
    file_path = os.path.join(output_dir, filename)
    with open(file_path, "w") as f:
        f.write(fasta_records)

    print(f"Saved {len(id_list)} sequences to {file_path}.")

# Download one FASTA file per category; each file is named after its label
for label in queries:
    query = queries[label]
    fetch_fasta(query, "{}.fasta".format(label))


Searching NCBI for: ("Homo sapiens"[Organism] OR "Mus musculus"[Organism]) AND ("CDS"[Feature] OR "mRNA"[Feature])
Found 100 sequences. Downloading...
Saved 100 sequences to protein_coding.fasta.
Searching NCBI for: ("Homo sapiens"[Organism] OR "Mus musculus"[Organism]) AND ("ncRNA"[Feature] OR "lncRNA"[Feature])
Found 100 sequences. Downloading...
Saved 100 sequences to non_protein_coding.fasta.
Searching NCBI for: ("Homo sapiens"[Organism] OR "Mus musculus"[Organism]) AND ("enhancer"[Title] OR "enhancer"[Gene Name])
Found 100 sequences. Downloading...
Saved 100 sequences to enhancer.fasta.
Searching NCBI for: ("Homo sapiens"[Organism] OR "Mus musculus"[Organism]) AND NOT ("enhancer"[Title] OR "enhancer"[Gene Name])
Found 100 sequences. Downloading...
Saved 100 sequences to non_enhancer.fasta.
Searching NCBI for: ("Homo sapiens"[Organism] OR "Mus musculus"[Organism]) AND "promoter"[Title]
Found 100 sequences. Downloading...
Saved 100 sequences to promoter.fasta.
Searching NCBI for: ("

In [4]:
import os
import numpy as np

In [5]:
fasta_dir = "sequences"  # Change to your folder path

# Dictionary to store arrays dynamically
sequence_arrays = {}

# Loop through all FASTA files in the directory.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the processing order (and the dictionary's key order) reproducible
# across runs and platforms.
for file in sorted(os.listdir(fasta_dir)):
    if not file.endswith(".fasta"):  # Only process FASTA files
        continue

    file_path = os.path.join(fasta_dir, file)

    # Collect the sequence lines of every record in the file; header lines
    # (starting with ">") are dropped, and blank lines are skipped since they
    # contribute nothing to the joined string.
    sequences = []
    with open(file_path, "r") as f:
        for line in f:
            line = line.strip()
            if line and not line.startswith(">"):
                sequences.append(line)

    # Convert to NumPy array: all records of the file are concatenated into
    # ONE string, stored as a single-element array.
    np_array = np.array(["".join(sequences)])

    # Use filename (without extension) as variable name.
    # NOTE(review): writing into globals() will silently overwrite any existing
    # name that matches a file stem; prefer sequence_arrays[...] in new code.
    var_name = os.path.splitext(file)[0]
    globals()[var_name] = np_array  # Create variable dynamically

    # Also store in dictionary for easier access
    sequence_arrays[var_name] = np_array

    print(f"Stored {var_name} as a NumPy array.")

# Example: Accessing a variable dynamically
print(sequence_arrays.keys())  # List of stored variable names
print(sequence_arrays["protein_coding"])  # Access by file name


Stored enhancer as a NumPy array.
Stored methylated as a NumPy array.
Stored non_enhancer as a NumPy array.
Stored non_methylated as a NumPy array.
Stored non_promoter as a NumPy array.
Stored non_protein_coding as a NumPy array.
Stored non_splice_site as a NumPy array.
Stored promoter as a NumPy array.
Stored protein_coding as a NumPy array.
Stored splice_site as a NumPy array.
dict_keys(['enhancer', 'methylated', 'non_enhancer', 'non_methylated', 'non_promoter', 'non_protein_coding', 'non_splice_site', 'promoter', 'protein_coding', 'splice_site'])
['GGGGAGGGGGTCGAAGGCAGAGACAGGAGACAGGTGTGGTGGCGGATTCTGGGGCCATGGGAGGCGGCAAAGGCTTGAGGAGCCCAGCGCTCTCCACCGCGCCCTGGAGCTGCACTCATGCGCTTCAGAGCGGGTGGCGGAGGAAGGGACCGGCGCGGGGGGGGCGGGGGGCGAGCACGTCGGAAGCCAGAGAACTCCATGTCCCTGCGCTCCGTCGGTCACCAGGCGCAGGCCGGCACGATGGAGCCGCAGCATGTGCACCGTGTGCTTGGAGCCCAGAGCCCAGCCTGGGAGGAACCGAAAATCTTCAGTACTCGTCTCGCAACATGAACGTTTAAAGCTAAAACCTTCACAGAGATGTGAGATGGAGCAGGAGCAGTGGCTAAATGTTGAACTCTTCAAAACTTGGCGTATGGGCTGCTGGTGTACCACCATCATCCCC