In [None]:
# Install required packages
!pip install biopython

# Acinetobacter baumannii

In [None]:
# Imports
import os
import csv
import time
import gzip
import shutil
from Bio import Entrez
from google.colab import drive

# Set your email (required by NCBI)
Entrez.email = "s-salma.hatem@zewailcity.edu.eg"  # <-- Change this to your email!

# File and folder paths
input_csv = "mbio.02852-24-s0002.csv"  # <-- Upload this file to Colab
output_dir = "/content/fasta_files"

# Destination folder on Google Drive (Drive itself is mounted at /content/drive below)
drive_mount = "/content/drive/MyDrive/MRSA datasets/Datasets/Acinetobacter baumannii/PRJNA1014981"

# Mount Google Drive
drive.mount('/content/drive')

# Create directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(drive_mount, exist_ok=True)

# Read accession numbers from CSV
def get_sample_ids(csv_file):
    """Read accession IDs from the first column of a CSV file.

    The first row is treated as a header and skipped. Blank lines and rows
    whose first cell is empty (after stripping whitespace) are ignored.

    Args:
        csv_file: Path to the CSV file.

    Returns:
        List of stripped first-column values, in file order. Empty when the
        file has no data rows (or no rows at all).
    """
    sample_ids = []
    # newline='' lets the csv module do its own newline handling.
    with open(csv_file, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip header; the default avoids StopIteration on an empty file
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line; row[0] would raise IndexError
                continue
            sample_id = row[0].strip()
            if sample_id:
                sample_ids.append(sample_id)
    return sample_ids

# Search NCBI for assembly ID
def search_ncbi(sample_id):
    """Search the NCBI ``assembly`` database for an assembly accession.

    Args:
        sample_id: Accession string, queried as ``<sample_id>[Assembly Accession]``.

    Returns:
        The ``IdList`` from the Entrez search result (at most one entry, since
        ``retmax=1``), or an empty list if the lookup fails for any reason.
    """
    try:
        handle = Entrez.esearch(db="assembly", term=f"{sample_id}[Assembly Accession]", retmax=1)
        try:
            record = Entrez.read(handle)
        finally:
            # Close the handle even when parsing fails, so it is not leaked.
            handle.close()
        return record["IdList"]
    except Exception as e:
        print(f"Error searching for {sample_id}: {e}")
        return []

# Get FTP link
def get_assembly_info(assembly_id):
    """Fetch the Entrez document summary for one assembly UID.

    Args:
        assembly_id: UID in the NCBI ``assembly`` database (as returned by
            ``search_ncbi``).

    Returns:
        The first ``DocumentSummary`` entry of the esummary result, or ``None``
        if the request or parsing fails. Callers must handle ``None``.
    """
    try:
        handle = Entrez.esummary(db="assembly", id=assembly_id, report="full")
        try:
            record = Entrez.read(handle)
        finally:
            # Close the handle even when parsing fails, so it is not leaked.
            handle.close()
        return record["DocumentSummarySet"]["DocumentSummary"][0]
    except Exception as e:
        print(f"Error getting info for {assembly_id}: {e}")
        return None

# Download and unzip FASTA
def download_fna(ftp_path, sample_id):
    """Download ``<ftp_path>/<basename>_genomic.fna.gz`` and decompress it.

    The archive is fetched with ``wget`` into ``output_dir`` and unpacked to
    ``<output_dir>/<sample_id>.fna``.

    Args:
        ftp_path: Assembly directory URL; its last path component is used as
            the file-name prefix.
        sample_id: Identifier used to name the local files.

    Returns:
        Path of the decompressed FASTA file, or ``None`` on any failure. On
        failure no ``.fna.gz`` or partially written output is left behind.
    """
    import subprocess  # local import: this cell's import block does not include subprocess

    # Bound up front so the cleanup in the handler never hits an unbound name.
    gz_path = part_path = None
    try:
        base = ftp_path.split('/')[-1]
        url = f"{ftp_path}/{base}_genomic.fna.gz"
        gz_path = os.path.join(output_dir, f"{sample_id}.fna.gz")
        fasta_path = os.path.join(output_dir, f"{sample_id}.fna")
        part_path = fasta_path + ".part"

        # Download. An argument list avoids shell quoting problems, and
        # check=True turns a non-zero wget exit status into an exception.
        subprocess.run(["wget", "-q", "-O", gz_path, url], check=True)

        # A failed download can leave an empty or non-gzip file; reject it
        # instead of "decompressing" it into a bogus FASTA.
        with open(gz_path, 'rb') as probe:
            if probe.read(2) != b'\x1f\x8b':
                raise ValueError("Not a valid gzip file — probably a failed download.")

        # Decompress to a temporary name, then rename, so a crash mid-way
        # never leaves a truncated .fna under the final name.
        with gzip.open(gz_path, 'rb') as f_in:
            with open(part_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        os.replace(part_path, fasta_path)

        os.remove(gz_path)
        print(f"✅ Downloaded and extracted: {sample_id}.fna")
        return fasta_path
    except Exception as e:
        print(f"❌ Error for {sample_id}: {e}")
        for leftover in (gz_path, part_path):
            if leftover and os.path.exists(leftover):
                os.remove(leftover)
        return None

# Copy to Google Drive
def upload_to_drive(local_file):
    """Copy a local file into the ``drive_mount`` folder, keeping its base name.

    If a file with the same name already exists there, nothing is copied.
    Errors are printed rather than raised.

    Args:
        local_file: Path of the file to copy.

    Returns:
        None.
    """
    # Bound before the try so the error message below can always reference it,
    # even when basename() itself fails (e.g. local_file is None).
    filename = str(local_file)
    try:
        filename = os.path.basename(local_file)
        dest_path = os.path.join(drive_mount, filename)

        # Check if file already exists on Google Drive
        if os.path.exists(dest_path):
            print(f"⏩ Skipped (already on Drive): {filename}")
            return

        shutil.copy(local_file, dest_path)
        print(f"📁 Uploaded to Drive: {filename}")
    except Exception as e:
        print(f"❌ Upload failed for {filename}: {e}")

# Main logic
def process_all():
    """Download every accession listed in ``input_csv`` and copy it to Drive.

    For each ID: skip it if ``<id>.fna`` already exists in ``drive_mount``;
    otherwise look up the assembly UID, fetch its summary, take the RefSeq FTP
    path (falling back to GenBank), download the FASTA and copy it to Drive.
    A failure at any step is reported and the loop moves on to the next ID.
    """
    sample_ids = get_sample_ids(input_csv)
    print(f"🔍 Found {len(sample_ids)} accession IDs")

    for idx, sample_id in enumerate(sample_ids, 1):
        print(f"\n[{idx}/{len(sample_ids)}] Processing: {sample_id}")
        dest_filename = f"{sample_id}.fna"
        drive_path = os.path.join(drive_mount, dest_filename)

        # Skip if already on Drive
        if os.path.exists(drive_path):
            print(f"⏩ Skipped (already on Drive): {dest_filename}")
            continue

        assembly_ids = search_ncbi(sample_id)
        time.sleep(1)  # pause between NCBI requests

        if not assembly_ids:
            print(f"❌ No assembly ID for {sample_id}")
            continue

        info = get_assembly_info(assembly_ids[0])
        time.sleep(1)

        # get_assembly_info returns None on error; without this guard a single
        # failed lookup would raise AttributeError and abort the whole run.
        if not info:
            print(f"❌ No assembly info for {sample_id}")
            continue

        ftp_path = info.get("FtpPath_RefSeq") or info.get("FtpPath_GenBank")
        if not ftp_path:
            print(f"❌ No FTP path for {sample_id}")
            continue

        fasta_file = download_fna(ftp_path, sample_id)
        if fasta_file:
            upload_to_drive(fasta_file)


# Run the process
process_all()


In [None]:
# Imports
import os
import csv
import time
import gzip
import shutil
import requests
from Bio import Entrez
from google.colab import drive

# Set your email (required by NCBI)
Entrez.email = "s-salma.hatem@zewailcity.edu.eg"  # <-- Change this to your email!

# File and folder paths
input_csv = "mbio.02852-24-s0002.csv"  # <-- Upload this file to Colab
output_dir = "/content/fasta_files"

# Destination folder on Google Drive (Drive itself is mounted at /content/drive below)
drive_mount = "/content/drive/MyDrive/MRSA datasets/Datasets/Acinetobacter baumannii/PRJNA1014981"

# Mount Google Drive
drive.mount('/content/drive')

# Create directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(drive_mount, exist_ok=True)

# --- Utilities ---

def get_sample_ids(csv_file):
    """Read accession IDs from the first column of a CSV file.

    The first row is treated as a header and skipped. Blank lines and rows
    whose first cell is empty (after stripping whitespace) are ignored.

    Args:
        csv_file: Path to the CSV file.

    Returns:
        List of stripped first-column values, in file order. Empty when the
        file has no data rows (or no rows at all).
    """
    sample_ids = []
    # newline='' lets the csv module do its own newline handling.
    with open(csv_file, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip header; the default avoids StopIteration on an empty file
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line; row[0] would raise IndexError
                continue
            sample_id = row[0].strip()
            if sample_id:
                sample_ids.append(sample_id)
    return sample_ids

def search_ncbi(sample_id):
    """Search the NCBI ``assembly`` database for an assembly accession.

    Args:
        sample_id: Accession string, queried as ``<sample_id>[Assembly Accession]``.

    Returns:
        The ``IdList`` from the Entrez search result (at most one entry, since
        ``retmax=1``), or an empty list if the lookup fails for any reason.
    """
    try:
        handle = Entrez.esearch(db="assembly", term=f"{sample_id}[Assembly Accession]", retmax=1)
        try:
            record = Entrez.read(handle)
        finally:
            # Close the handle even when parsing fails, so it is not leaked.
            handle.close()
        return record["IdList"]
    except Exception as e:
        print(f"Error searching for {sample_id}: {e}")
        return []

def get_assembly_info(assembly_id):
    """Fetch the Entrez document summary for one assembly UID.

    Args:
        assembly_id: UID in the NCBI ``assembly`` database (as returned by
            ``search_ncbi``).

    Returns:
        The first ``DocumentSummary`` entry of the esummary result, or ``None``
        if the request or parsing fails. Callers must handle ``None``.
    """
    try:
        handle = Entrez.esummary(db="assembly", id=assembly_id, report="full")
        try:
            record = Entrez.read(handle)
        finally:
            # Close the handle even when parsing fails, so it is not leaked.
            handle.close()
        return record["DocumentSummarySet"]["DocumentSummary"][0]
    except Exception as e:
        print(f"Error getting info for {assembly_id}: {e}")
        return None

def download_fna(ftp_path, sample_id):
    """Download ``<ftp_path>/<basename>_genomic.fna.gz`` and decompress it.

    The archive is fetched with ``wget`` into ``output_dir`` and unpacked to
    ``<output_dir>/<sample_id>.fna``.

    Args:
        ftp_path: Assembly directory URL; its last path component is used as
            the file-name prefix.
        sample_id: Identifier used to name the local files.

    Returns:
        Path of the decompressed FASTA file, or ``None`` on any failure. On
        failure no ``.fna.gz`` or partially written output is left behind.
    """
    import subprocess  # local import: this cell's import block does not include subprocess

    # Bound up front so the cleanup in the handler never hits an unbound name.
    gz_path = part_path = None
    try:
        base = ftp_path.split('/')[-1]
        url = f"{ftp_path}/{base}_genomic.fna.gz"
        gz_path = os.path.join(output_dir, f"{sample_id}.fna.gz")
        fasta_path = os.path.join(output_dir, f"{sample_id}.fna")
        part_path = fasta_path + ".part"

        # Download. An argument list avoids shell quoting problems, and
        # check=True turns a non-zero wget exit status into an exception.
        subprocess.run(["wget", "-q", "-O", gz_path, url], check=True)

        # A failed download can leave an empty or non-gzip file; reject it
        # instead of "decompressing" it into a bogus FASTA.
        with open(gz_path, 'rb') as probe:
            if probe.read(2) != b'\x1f\x8b':
                raise ValueError("Not a valid gzip file — probably a failed download.")

        # Decompress to a temporary name, then rename, so a crash mid-way
        # never leaves a truncated .fna under the final name.
        with gzip.open(gz_path, 'rb') as f_in:
            with open(part_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        os.replace(part_path, fasta_path)

        os.remove(gz_path)
        print(f"✅ Downloaded and extracted: {sample_id}.fna")
        return fasta_path
    except Exception as e:
        print(f"❌ Error for {sample_id}: {e}")
        for leftover in (gz_path, part_path):
            if leftover and os.path.exists(leftover):
                os.remove(leftover)
        return None

def upload_to_drive(local_file):
    """Copy a local file into the ``drive_mount`` folder, keeping its base name.

    Errors are printed rather than raised.

    Args:
        local_file: Path of the file to copy.

    Returns:
        None.
    """
    # Bound before the try so the error message below can always reference it,
    # even when basename() itself fails (e.g. local_file is None).
    filename = str(local_file)
    try:
        filename = os.path.basename(local_file)
        dest_path = os.path.join(drive_mount, filename)
        shutil.copy(local_file, dest_path)
        print(f"📁 Uploaded to Drive: {filename}")
    except Exception as e:
        print(f"❌ Upload failed for {filename}: {e}")

def get_sample_accession_from_ena(sra_id):
    """Look up the sample accession for a run accession via the ENA Portal API.

    Args:
        sra_id: Run accession passed as the ``accession`` query parameter.

    Returns:
        The ``sample_accession`` field of the first record in the response, or
        ``None`` if the response has no such record or the request fails.
    """
    try:
        url = f"https://www.ebi.ac.uk/ena/portal/api/filereport?accession={sra_id}&result=read_run&fields=sample_accession&format=json"
        # requests has no default timeout; without one a stalled connection
        # would block the whole pipeline indefinitely.
        r = requests.get(url, timeout=60)
        # Surface HTTP errors as such instead of as an obscure JSON/KeyError.
        r.raise_for_status()
        data = r.json()
        if data and "sample_accession" in data[0]:
            return data[0]["sample_accession"]
        else:
            print(f"❌ No sample accession found for {sra_id} in ENA")
            return None
    except Exception as e:
        print(f"❌ ENA lookup error for {sra_id}: {e}")
        return None

def download_from_bvbrc(sample_accession, sample_id):
    """Try to fetch a genome FASTA from BV-BRC for a BioSample accession.

    Steps: (1) look up the genome by ``biosample_accession``; (2) list that
    genome's ``genome_sequence`` records; (3) pick the first record whose
    ``accession`` value contains ``.fna``, ``.fa`` or ``.fasta``; (4) download
    it to ``<output_dir>/<sample_id>.fna``, gunzipping when the URL ends in
    ``.gz``.

    Args:
        sample_accession: BioSample accession used for the genome lookup.
        sample_id: Identifier used to name the local output file.

    Returns:
        Path of the downloaded FASTA, or ``None`` if any step fails. Files
        written during a failed attempt are removed.
    """
    # requests has no default timeout; a stalled connection would otherwise
    # block the whole loop.
    request_timeout = 60
    # Files created so far in this call; deleted again if a later step fails.
    written_paths = []
    try:
        # Step 1: Get genome_id via biosample lookup
        genome_url = f"https://www.bv-brc.org/api/genome/?eq(biosample_accession,{sample_accession})&select(genome_id,genome_name)&http_accept=application/json"
        genome_response = requests.get(genome_url, timeout=request_timeout)

        if genome_response.status_code != 200:
            print(f"❌ BV-BRC API error ({genome_response.status_code}) for {sample_accession}")
            return None

        try:
            genome_data = genome_response.json()
        except ValueError:
            print(f"❌ Failed to parse genome JSON for {sample_accession}")
            return None

        if not genome_data:
            print(f"❌ No genome found for BioSample {sample_accession}")
            return None

        genome_id = genome_data[0]["genome_id"]
        genome_name = genome_data[0]["genome_name"]
        print(f"🔗 Found Genome: {genome_id} - {genome_name}")

        # Step 2: Query genome_sequence for actual file paths
        seq_url = f"https://www.bv-brc.org/api/genome_sequence/?eq(genome_id,{genome_id})&http_accept=application/json"
        seq_response = requests.get(seq_url, timeout=request_timeout)

        if seq_response.status_code != 200:
            print(f"❌ Failed to query genome_sequence API for {genome_id}")
            return None

        try:
            seq_data = seq_response.json()
        except ValueError:
            print(f"❌ Failed to parse sequence JSON for {genome_id}")
            return None

        # Step 3: Pick the best sequence file
        # NOTE(review): this treats the record's `accession` as a file name —
        # confirm against the BV-BRC data model.
        fasta_url = None
        for seq in seq_data:
            accession = seq.get("accession")
            if accession and any(ext in accession.lower() for ext in ['.fna', '.fa', '.fasta']):
                # BV-BRC file structure
                fasta_url = f"https://www.bv-brc.org/downloads/{genome_id}/{accession}"
                break

        if not fasta_url:
            print(f"❌ No usable FASTA file found for genome {genome_id}")
            return None

        # Step 4: Download the file
        dest_path = os.path.join(output_dir, f"{sample_id}.fna")
        is_gzipped = fasta_url.endswith(".gz")
        download_path = dest_path + ".gz" if is_gzipped else dest_path

        r = requests.get(fasta_url, stream=True, timeout=request_timeout)
        try:
            if r.status_code != 200:
                print(f"❌ File not found at {fasta_url}")
                return None

            written_paths.append(download_path)
            with open(download_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        finally:
            # A streamed response holds its connection until closed.
            r.close()

        if is_gzipped:
            # If gzipped, decompress
            written_paths.append(dest_path)
            with gzip.open(download_path, 'rb') as f_in, open(dest_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
            os.remove(download_path)

        print(f"✅ Downloaded dynamic FASTA from BV-BRC for {sample_id}")
        return dest_path

    except Exception as e:
        print(f"❌ Error downloading from BV-BRC for {sample_accession}: {e}")
        for path in written_paths:
            if os.path.exists(path):
                os.remove(path)
        return None




# --- Main Process ---

def process_all():
    """Download every accession listed in ``input_csv`` and copy it to Drive.

    IDs whose ``<id>.fna`` already exists in ``drive_mount`` are skipped.
    IDs starting with ``SRR``/``ERR``/``DRR`` are resolved to a sample
    accession through ENA and fetched from BV-BRC. All other IDs are treated
    as assembly accessions and fetched from the NCBI FTP path (RefSeq
    preferred, GenBank as fallback). A failure for one ID is reported and the
    loop continues with the next.
    """
    sample_ids = get_sample_ids(input_csv)
    print(f"🔍 Found {len(sample_ids)} accession IDs")

    for idx, sample_id in enumerate(sample_ids, 1):
        print(f"\n[{idx}/{len(sample_ids)}] Processing: {sample_id}")
        dest_filename = f"{sample_id}.fna"
        drive_path = os.path.join(drive_mount, dest_filename)

        # Skip if already on Drive
        if os.path.exists(drive_path):
            print(f"⏩ Skipped (already on Drive): {dest_filename}")
            continue

        # --- Handle SRA accessions ---
        if sample_id.startswith(("SRR", "ERR", "DRR")):
            sample_accession = get_sample_accession_from_ena(sample_id)
            if not sample_accession:
                continue
            fasta_file = download_from_bvbrc(sample_accession, sample_id)
            if fasta_file:
                upload_to_drive(fasta_file)
            continue  # move to next sample

        # --- Handle Assembly accessions ---
        assembly_ids = search_ncbi(sample_id)
        time.sleep(1)  # pause between NCBI requests

        if not assembly_ids:
            print(f"❌ No assembly ID for {sample_id}")
            continue

        info = get_assembly_info(assembly_ids[0])
        time.sleep(1)

        # get_assembly_info returns None on error; without this guard a single
        # failed lookup would raise AttributeError and abort the whole run.
        if not info:
            print(f"❌ No assembly info for {sample_id}")
            continue

        ftp_path = info.get("FtpPath_RefSeq") or info.get("FtpPath_GenBank")
        if not ftp_path:
            print(f"❌ No FTP path for {sample_id}")
            continue

        fasta_file = download_fna(ftp_path, sample_id)
        if fasta_file:
            upload_to_drive(fasta_file)

# Run the process
process_all()


# Salmonella enterica

In [None]:
!pip install biopython requests

import os
import csv
import time
import gzip
import shutil
import requests
import pandas as pd
from Bio import Entrez
from google.colab import drive

# 🔧 Configuration
Entrez.email = "s-salma.hatem@zewailcity.edu.eg"  # 🔁 CHANGE THIS
csv_file = "PRJNA292666_PRJNA292661_labels.xlsx"
output_dir = "/content/fasta_files"
drive_dir = "/content/drive/MyDrive/MRSA datasets/Datasets/Salmonella enterica/PRJNA292666 & PRJNA292661"

# 📂 Mount Google Drive
drive.mount('/content/drive')
os.makedirs(output_dir, exist_ok=True)
os.makedirs(drive_dir, exist_ok=True)

# 🧾 Read SRA run IDs from the Excel sheet
def get_sra_ids(excel_file):
    """Return the distinct, non-missing run accessions from an Excel sheet.

    Args:
        excel_file: Path handed to ``pd.read_excel``.

    Returns:
        List of unique values from the ``SRA Run Accession`` column, as
        strings, in order of first appearance.

    Raises:
        ValueError: If the sheet has no ``SRA Run Accession`` column.
    """
    sheet = pd.read_excel(excel_file)
    has_run_column = "SRA Run Accession" in sheet.columns
    if not has_run_column:
        raise ValueError("Column 'SRA Run Accession' not found in Excel file.")

    run_series = sheet["SRA Run Accession"].dropna()
    return run_series.astype(str).unique().tolist()

# 🔗 Step 1: SRR ➝ ENA ➝ Sample Accession (ERSxxx)
def get_sample_accession_from_ena(sra_id):
    """Look up the sample accession for a run accession via the ENA Portal API.

    Args:
        sra_id: Run accession passed as the ``accession`` query parameter.

    Returns:
        The ``sample_accession`` field of the first record in the response, or
        ``None`` if the response has no such record or the request fails.
    """
    try:
        url = f"https://www.ebi.ac.uk/ena/portal/api/filereport?accession={sra_id}&result=read_run&fields=sample_accession&format=json"
        # requests has no default timeout; without one a stalled connection
        # would block the whole pipeline indefinitely.
        r = requests.get(url, timeout=60)
        # Surface HTTP errors as such instead of as an obscure JSON/KeyError.
        r.raise_for_status()
        data = r.json()
        if data and "sample_accession" in data[0]:
            return data[0]["sample_accession"]
        else:
            print(f"❌ No sample accession found for {sra_id} in ENA")
            return None
    except Exception as e:
        print(f"❌ ENA lookup error for {sra_id}: {e}")
        return None

# 🔗 Step 2: Sample Accession ➝ Assembly ➝ FTP path
def get_assembly_ftp_from_ncbi(sample_accession):
    """Resolve a sample accession to an NCBI assembly FTP directory.

    Searches the ``assembly`` database for ``<sample_accession>[Sample]``,
    fetches the summary of the first hit and returns its RefSeq FTP path,
    falling back to the GenBank path when the RefSeq one is empty.

    Args:
        sample_accession: Sample accession to search for.

    Returns:
        The FTP path string (may be empty if both fields are empty), or
        ``None`` when no assembly is found or any step fails.
    """
    try:
        # Search assembly DB with sample accession
        handle = Entrez.esearch(db="assembly", term=f"{sample_accession}[Sample]", retmax=1)
        try:
            result = Entrez.read(handle)
        finally:
            handle.close()  # release the handle even if parsing fails
        if not result["IdList"]:
            print(f"❌ No assembly found for {sample_accession}")
            return None

        assembly_id = result["IdList"][0]
        handle = Entrez.esummary(db="assembly", id=assembly_id, report="full")
        try:
            summary = Entrez.read(handle)
        finally:
            handle.close()
        doc = summary["DocumentSummarySet"]["DocumentSummary"][0]

        # Prefer RefSeq path
        ftp_path = doc["FtpPath_RefSeq"] or doc["FtpPath_GenBank"]
        return ftp_path
    except Exception as e:
        print(f"❌ Error getting FTP path for {sample_accession}: {e}")
        return None

# ⬇️ Step 3: Download and extract .fna.gz
def download_fna_file(ftp_path, sra_id):
    """Download ``<ftp_path>/<basename>_genomic.fna.gz`` and decompress it.

    The archive is fetched with ``wget`` into ``output_dir``, checked for the
    gzip magic bytes, and unpacked to ``<output_dir>/<sra_id>.fna``.

    Args:
        ftp_path: Assembly directory URL; its last path component is used as
            the file-name prefix.
        sra_id: Identifier used to name the local files.

    Returns:
        Path of the decompressed FASTA file, or ``None`` on any failure. On
        failure the ``.fna.gz`` and any partially written output are removed.
    """
    import subprocess  # local import: this cell's import block does not include subprocess

    # Bound up front: the handler below used to reference gz_path even when an
    # error occurred before it was assigned.
    gz_path = part_path = None
    try:
        base = ftp_path.split("/")[-1]
        url = f"{ftp_path}/{base}_genomic.fna.gz"
        gz_path = os.path.join(output_dir, f"{sra_id}.fna.gz")
        out_path = os.path.join(output_dir, f"{sra_id}.fna")
        part_path = out_path + ".part"

        # An argument list avoids shell quoting problems, and check=True turns
        # a non-zero wget exit status into an exception.
        subprocess.run(["wget", "-q", "-O", gz_path, url], check=True)

        # ✅ Validate if it's a real gzip file
        with open(gz_path, 'rb') as test_f:
            magic = test_f.read(2)
            if magic != b'\x1f\x8b':
                raise ValueError("Not a valid gzip file — probably a 404 page.")

        # Decompress to a temporary name, then rename, so a crash mid-way
        # never leaves a truncated .fna under the final name.
        with gzip.open(gz_path, 'rb') as f_in, open(part_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(part_path, out_path)
        os.remove(gz_path)
        print(f"✅ Downloaded and extracted {sra_id}.fna")
        return out_path

    except Exception as e:
        print(f"❌ Download error for {sra_id}: {e}")
        for leftover in (gz_path, part_path):
            if leftover and os.path.exists(leftover):
                os.remove(leftover)
        return None


# ☁️ Step 4: Upload to Google Drive
def upload_to_drive(filepath):
    """Copy ``filepath`` into ``drive_dir`` under its base name.

    Any exception is printed instead of raised; the function returns None
    either way.
    """
    try:
        base_name = os.path.basename(filepath)
        target = os.path.join(drive_dir, base_name)
        shutil.copy(filepath, target)
        print(f"☁️ Uploaded to Drive: {base_name}")
    except Exception as e:
        print(f"❌ Drive upload error: {e}")

# ▶️ Main pipeline (skips runs whose .fna is already in Drive)
def process_sra_ids():
    """Run the SRA-run → sample → assembly → FASTA → Drive chain for every run.

    Runs whose ``<run>.fna`` already exists in ``drive_dir`` are skipped. When
    any lookup or the download yields nothing, the run is dropped and the loop
    moves on to the next one.
    """
    sra_ids = get_sra_ids(csv_file)
    total = len(sra_ids)
    print(f"🔍 Found {total} SRA IDs")

    for position, run_id in enumerate(sra_ids, start=1):
        print(f"\n[{position}/{total}] Processing {run_id}...")

        already_on_drive = os.path.exists(os.path.join(drive_dir, f"{run_id}.fna"))
        if already_on_drive:
            print(f"⏭️  Skipping {run_id} — already exists in Drive.")
            continue

        # Each stage runs only if the previous one produced a result.
        sample_accession = get_sample_accession_from_ena(run_id)
        if sample_accession:
            ftp_path = get_assembly_ftp_from_ncbi(sample_accession)
            if ftp_path:
                fna_file = download_fna_file(ftp_path, run_id)
                if fna_file:
                    upload_to_drive(fna_file)


process_sra_ids()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
☁️ Uploaded to Drive: SRR1982114.fna

[4004/5278] Processing SRR1773838...
✅ Downloaded and extracted SRR1773838.fna
☁️ Uploaded to Drive: SRR1773838.fna

[4005/5278] Processing SRR3295918...
✅ Downloaded and extracted SRR3295918.fna
☁️ Uploaded to Drive: SRR3295918.fna

[4006/5278] Processing SRR3115182...
✅ Downloaded and extracted SRR3115182.fna
☁️ Uploaded to Drive: SRR3115182.fna

[4007/5278] Processing SRR1573573...
✅ Downloaded and extracted SRR1573573.fna
☁️ Uploaded to Drive: SRR1573573.fna

[4008/5278] Processing SRR3198632...
✅ Downloaded and extracted SRR3198632.fna
☁️ Uploaded to Drive: SRR3198632.fna

[4009/5278] Processing SRR2407619...
✅ Downloaded and extracted SRR2407619.fna
☁️ Uploaded to Drive: SRR2407619.fna

[4010/5278] Processing SRR3295589...
✅ Downloaded and extracted SRR3295589.fna
☁️ Uploaded to Drive: SRR3295589.fna

[4011/5278] Processing SRR2542458...
❌ No sample accession found for SRR254245

# Mycobacterium tuberculosis

In [None]:
!pip install biopython openpyxl requests

import os
import gzip
import shutil
import requests
import pandas as pd
from Bio import Entrez
from google.colab import drive

# 🔧 Configuration
Entrez.email = "s-salma.hatem@zewailcity.edu.eg"
excel_file = "master_table_resistance - Copy.csv"
sample_column = "accessions"
output_dir = "/content/fasta_downloads"
drive_dir = "/content/drive/MyDrive/Ncbi_Fasta_Downloads"


In [None]:
import os
from google.colab import drive

drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/MRSA datasets/Datasets/Mycobacterium tuberculosis/PRJNA343736')
print(os.getcwd())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/.shortcut-targets-by-id/13LGqe__ULhjG0hZmpgxz8WqE0fNXsVNQ/MRSA datasets/Datasets/Mycobacterium tuberculosis/PRJNA343736


In [None]:
!pip install biopython openpyxl requests

import os
import gzip
import shutil
import requests
import pandas as pd
from Bio import Entrez
from google.colab import drive

# 🔧 Configuration
Entrez.email = "s-salma.hatem@zewailcity.edu.eg"
excel_file = "master_table_resistance.csv"
sample_column = "accessions"
output_dir = "/content/fasta_downloads"
drive_dir = "/content/drive/My Drive/MRSA datasets/Datasets/Mycobacterium tuberculosis/PRJNA343736"

# 📂 Mount Google Drive
drive.mount('/content/drive')
os.makedirs(output_dir, exist_ok=True)
os.makedirs(drive_dir, exist_ok=True)

# 🧾 Read sample accessions from the CSV table
def get_sample_accessions(file):
    """Return the distinct, non-missing values of ``sample_column`` from a CSV.

    Args:
        file: Path handed to ``pd.read_csv``.

    Returns:
        List of unique column values as strings, in order of first appearance.

    Raises:
        ValueError: If the table has no column named ``sample_column``.
    """
    table = pd.read_csv(file)
    if sample_column in table.columns:
        accessions = table[sample_column].dropna().astype(str)
        return accessions.unique().tolist()
    raise ValueError(f"Column '{sample_column}' not found.")

# 🔗 Sample ➝ Assembly ➝ FTP path
def get_assembly_ftp_from_ncbi(sample_accession):
    """Resolve a sample accession to an NCBI assembly FTP directory.

    Searches the ``assembly`` database for ``<sample_accession>[Sample]``,
    fetches the summary of the first hit and returns its RefSeq FTP path,
    falling back to the GenBank path when the RefSeq one is empty.

    Args:
        sample_accession: Sample accession to search for.

    Returns:
        The FTP path string (may be empty if both fields are empty), or
        ``None`` when no assembly is found or any step fails.
    """
    try:
        handle = Entrez.esearch(db="assembly", term=f"{sample_accession}[Sample]", retmax=1)
        try:
            result = Entrez.read(handle)
        finally:
            handle.close()  # release the handle even if parsing fails
        if not result["IdList"]:
            print(f"❌ No assembly found for {sample_accession}")
            return None

        assembly_id = result["IdList"][0]
        handle = Entrez.esummary(db="assembly", id=assembly_id, report="full")
        try:
            summary = Entrez.read(handle)
        finally:
            handle.close()
        doc = summary["DocumentSummarySet"]["DocumentSummary"][0]

        # Prefer the RefSeq path; use GenBank when RefSeq is empty
        ftp_path = doc["FtpPath_RefSeq"] or doc["FtpPath_GenBank"]
        return ftp_path
    except Exception as e:
        print(f"❌ FTP path error for {sample_accession}: {e}")
        return None

# ⬇️ Download and extract .fna.gz
def download_fna_file(ftp_path, sample_id):
    """Download ``<ftp_path>/<basename>_genomic.fna.gz`` and decompress it.

    The archive is fetched with ``wget`` into ``output_dir``, checked for the
    gzip magic bytes, and unpacked to ``<output_dir>/<sample_id>.fna``.

    Args:
        ftp_path: Assembly directory URL; its last path component is used as
            the file-name prefix.
        sample_id: Identifier used to name the local files.

    Returns:
        Path of the decompressed FASTA file, or ``None`` on any failure. On
        failure the ``.fna.gz`` and any partially written output are removed.
    """
    import subprocess  # local import: this cell's import block does not include subprocess

    # Bound up front: the handler below used to reference gz_path even when an
    # error occurred before it was assigned.
    gz_path = part_path = None
    try:
        base = ftp_path.split("/")[-1]
        url = f"{ftp_path}/{base}_genomic.fna.gz"
        gz_path = os.path.join(output_dir, f"{sample_id}.fna.gz")
        out_path = os.path.join(output_dir, f"{sample_id}.fna")
        part_path = out_path + ".part"

        # An argument list avoids shell quoting problems, and check=True turns
        # a non-zero wget exit status into an exception.
        subprocess.run(["wget", "-q", "-O", gz_path, url], check=True)

        # Validate gzip
        with open(gz_path, 'rb') as test_f:
            if test_f.read(2) != b'\x1f\x8b':
                raise ValueError("Invalid .gz file (probably a 404 page).")

        # Decompress to a temporary name, then rename, so a crash mid-way
        # never leaves a truncated .fna under the final name.
        with gzip.open(gz_path, 'rb') as f_in, open(part_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(part_path, out_path)
        os.remove(gz_path)
        print(f"✅ Downloaded and extracted: {sample_id}.fna")
        return out_path
    except Exception as e:
        print(f"❌ Download error for {sample_id}: {e}")
        for leftover in (gz_path, part_path):
            if leftover and os.path.exists(leftover):
                os.remove(leftover)
        return None

# ☁️ Upload to Google Drive
def upload_to_drive(filepath):
    """Copy ``filepath`` into ``drive_dir`` under its base name.

    Any exception is printed instead of raised; the function returns None
    either way.
    """
    try:
        base_name = os.path.basename(filepath)
        target = os.path.join(drive_dir, base_name)
        shutil.copy(filepath, target)
        print(f"☁️ Uploaded to Drive: {base_name}")
    except Exception as e:
        print(f"❌ Upload error: {e}")

# ▶️ Main runner (goal: 5000 total files in Drive)
def process_samples(max_total=5000):
    """Download assemblies until ``drive_dir`` holds ``max_total`` FASTA files.

    Counts the ``.fna`` files already in ``drive_dir``, then walks the sample
    accessions from ``excel_file`` in order, skipping ones already on Drive,
    until enough new files have been added or the list is exhausted.

    Args:
        max_total: Target number of ``.fna`` files in ``drive_dir``.
            Defaults to 5000.
    """
    sample_ids = get_sample_accessions(excel_file)
    print(f"🔍 Found {len(sample_ids)} sample accessions")

    # Count existing files in Drive
    existing_files = [f for f in os.listdir(drive_dir) if f.endswith(".fna")]
    existing_count = len(existing_files)
    print(f"📁 {existing_count} files already exist in Drive.")

    needed = max_total - existing_count

    if needed <= 0:
        print(f"✅ Already have {max_total} or more files in Drive. No download needed.")
        return

    print(f"🚀 Attempting to download {needed} more files to reach {max_total}.")

    success_count = 0

    for idx, sample_id in enumerate(sample_ids, 1):
        if success_count >= needed:
            break

        print(f"\n[{idx}/{len(sample_ids)}] Processing: {sample_id}")

        drive_file_path = os.path.join(drive_dir, f"{sample_id}.fna")
        if os.path.exists(drive_file_path):
            print(f"⏭️ Skipping {sample_id} — already exists in Drive.")
            continue

        ftp_path = get_assembly_ftp_from_ncbi(sample_id)
        if not ftp_path:
            continue

        fna_file = download_fna_file(ftp_path, sample_id)
        if not fna_file:
            continue

        upload_to_drive(fna_file)
        # upload_to_drive only prints on failure, so confirm the file actually
        # arrived before counting it towards the target.
        if not os.path.exists(drive_file_path):
            print(f"⚠️ {sample_id} was downloaded but is not on Drive; not counted.")
            continue

        success_count += 1
        print(f"📦 Total new downloads this session: {success_count}")

    print(f"\n✅ Finished. Total files in Drive should now be: {existing_count + success_count}")


# 🚀 Run the pipeline
process_samples()


# Escherichia coli

In [None]:
import os
import urllib.request
import pandas as pd
from google.colab import drive
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed

# 🔧 CONFIGURATION
csv_file_path = "PATRIC_e.coli.xlsx"  # Change to your file name
column_name = "Genome ID"
local_download_dir = "/content/genome_fna_files"
drive_mount_path = "/content/drive"
drive_target_dir = os.path.join(drive_mount_path, "My Drive/MRSA datasets/Datasets/Escherichia coli/PATRIC_e.coli")
MAX_WORKERS = 5  # 🔁 Number of parallel downloads (adjust based on performance)

# 📂 Mount Google Drive
drive.mount(drive_mount_path)
os.makedirs(local_download_dir, exist_ok=True)
os.makedirs(drive_target_dir, exist_ok=True)

# 📄 Load genome IDs
df = pd.read_excel(csv_file_path)
genome_ids = df[column_name].dropna().astype(str).unique()
print(f"🔍 Found {len(genome_ids)} genome IDs to process.")

# 🔁 Function for downloading and uploading
def download_and_upload(genome_id):
    """Fetch one genome FASTA from the PATRIC FTP server and copy it to Drive.

    The file is downloaded to ``local_download_dir``, copied into
    ``drive_target_dir`` and the local copy is removed. Nothing is downloaded
    when the file already exists on Drive.

    Args:
        genome_id: Genome ID used to build both the FTP URL and the file name.

    Returns:
        A one-line status string (skipped / success / failure). Exceptions are
        reported in the string rather than raised.
    """
    file_name = f"{genome_id}.fna"
    ftp_url = f"ftp://ftp.patricbrc.org/genomes/{genome_id}/{file_name}"
    local_path = os.path.join(local_download_dir, file_name)
    drive_path = os.path.join(drive_target_dir, file_name)

    if os.path.exists(drive_path):
        return f"☁️ Skipping {file_name} — already in Drive."

    copy_started = False
    try:
        # ⬇️ Download
        urllib.request.urlretrieve(ftp_url, local_path)

        # ☁️ Copy to Drive
        copy_started = True
        shutil.copy(local_path, drive_path)

        return f"✅ Downloaded and uploaded: {file_name}"
    except Exception as e:
        # A half-written Drive copy would pass the exists() check above on the
        # next run and be skipped as if complete, so remove it.
        if copy_started and os.path.exists(drive_path):
            try:
                os.remove(drive_path)
            except OSError:
                pass
        return f"❌ Failed for {genome_id}: {e}"
    finally:
        # 🧹 Remove local file, also after a failed or partial download
        try:
            if os.path.exists(local_path):
                os.remove(local_path)
        except OSError:
            pass

# 🚀 Run downloads concurrently
# NOTE: `results` is initialised here but never appended to or read in this cell.
results = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # One download_and_upload task per genome ID; the dict keys are the futures.
    future_to_id = {executor.submit(download_and_upload, gid): gid for gid in genome_ids}
    # as_completed yields futures as they finish, so `i` counts completions,
    # not positions in genome_ids.
    for i, future in enumerate(as_completed(future_to_id), 1):
        # download_and_upload returns a status string and catches its own exceptions.
        result = future.result()
        print(f"[{i}/{len(genome_ids)}] {result}")

# 🔄 Ensure everything is synced
drive.flush_and_unmount()

# Staphylococcus aureus

In [None]:
import os
import urllib.request
import pandas as pd
from google.colab import drive
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed

# 🔧 CONFIGURATION
csv_file_path = "PATRIC_genomes.csv"  # Change to your file name
column_name = "genome_id"
local_download_dir = "/content/genome_fna_files"
drive_mount_path = "/content/drive"
drive_target_dir = os.path.join(drive_mount_path, "My Drive/MRSA datasets/Datasets/Staphylococcus aureus/PATRIC_genomes")
MAX_WORKERS = 5  # 🔁 Number of parallel downloads (adjust based on performance)

# 📂 Mount Google Drive
drive.mount(drive_mount_path)
os.makedirs(local_download_dir, exist_ok=True)
os.makedirs(drive_target_dir, exist_ok=True)

# 📄 Load genome IDs
df = pd.read_csv(csv_file_path)
genome_ids = df[column_name].dropna().astype(str).unique()
print(f"🔍 Found {len(genome_ids)} genome IDs to process.")

# 🔁 Function for downloading and uploading
def download_and_upload(genome_id):
    """Fetch one genome FASTA from the PATRIC FTP server and copy it to Drive.

    The file is downloaded to ``local_download_dir``, copied into
    ``drive_target_dir`` and the local copy is removed. Nothing is downloaded
    when the file already exists on Drive.

    Args:
        genome_id: Genome ID used to build both the FTP URL and the file name.

    Returns:
        A one-line status string (skipped / success / failure). Exceptions are
        reported in the string rather than raised.
    """
    file_name = f"{genome_id}.fna"
    ftp_url = f"ftp://ftp.patricbrc.org/genomes/{genome_id}/{file_name}"
    local_path = os.path.join(local_download_dir, file_name)
    drive_path = os.path.join(drive_target_dir, file_name)

    if os.path.exists(drive_path):
        return f"☁️ Skipping {file_name} — already in Drive."

    copy_started = False
    try:
        # ⬇️ Download
        urllib.request.urlretrieve(ftp_url, local_path)

        # ☁️ Copy to Drive
        copy_started = True
        shutil.copy(local_path, drive_path)

        return f"✅ Downloaded and uploaded: {file_name}"
    except Exception as e:
        # A half-written Drive copy would pass the exists() check above on the
        # next run and be skipped as if complete, so remove it.
        if copy_started and os.path.exists(drive_path):
            try:
                os.remove(drive_path)
            except OSError:
                pass
        return f"❌ Failed for {genome_id}: {e}"
    finally:
        # 🧹 Remove local file, also after a failed or partial download
        try:
            if os.path.exists(local_path):
                os.remove(local_path)
        except OSError:
            pass

# 🚀 Run downloads concurrently
# NOTE: `results` is initialised here but never appended to or read in this cell.
results = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # One download_and_upload task per genome ID; the dict keys are the futures.
    future_to_id = {executor.submit(download_and_upload, gid): gid for gid in genome_ids}
    # as_completed yields futures as they finish, so `i` counts completions,
    # not positions in genome_ids.
    for i, future in enumerate(as_completed(future_to_id), 1):
        # download_and_upload returns a status string and catches its own exceptions.
        result = future.result()
        print(f"[{i}/{len(genome_ids)}] {result}")

# 🔄 Ensure everything is synced
drive.flush_and_unmount()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[771/5770] ☁️ Skipping 1280.25102.fna — already in Drive.
[772/5770] ☁️ Skipping 1280.25103.fna — already in Drive.
[773/5770] ☁️ Skipping 1280.25104.fna — already in Drive.
[774/5770] ☁️ Skipping 1280.25101.fna — already in Drive.
[775/5770] ☁️ Skipping 1280.25106.fna — already in Drive.
[776/5770] ☁️ Skipping 1280.25107.fna — already in Drive.
[777/5770] ☁️ Skipping 1280.25105.fna — already in Drive.
[778/5770] ☁️ Skipping 1280.25108.fna — already in Drive.
[779/5770] ☁️ Skipping 1280.25111.fna — already in Drive.
[780/5770] ☁️ Skipping 1280.25109.fna — already in Drive.
[781/5770] ☁️ Skipping 1280.25112.fna — already in Drive.
[782/5770] ☁️ Skipping 1280.25113.fna — already in Drive.
[783/5770] ☁️ Skipping 1280.25115.fna — already in Drive.
[784/5770] ☁️ Skipping 1280.25114.fna — already in Drive.
[785/5770] ☁️ Skipping 1280.25117.fna — already in Drive.
[786/5770] ☁️ Skipping 1280.25116.fna — already in Drive.
[787/57

# Klebsiella pneumoniae

In [None]:
import os
import urllib.request
import pandas as pd
from google.colab import drive
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed

# 🔧 CONFIGURATION
csv_file_path = "001222_2.xlsx"  # Change to your file name
column_name = "Genome_ID"
local_download_dir = "/content/genome_fna_files"
drive_mount_path = "/content/drive"
drive_target_dir = os.path.join(drive_mount_path, "My Drive/MRSA datasets/Datasets/Klebsiella pneumoniae/PATRIC_klebsiella")
MAX_WORKERS = 5  # 🔁 Number of parallel downloads (adjust based on performance)

# 📂 Mount Google Drive
drive.mount(drive_mount_path)
os.makedirs(local_download_dir, exist_ok=True)
os.makedirs(drive_target_dir, exist_ok=True)

# 📄 Load genome IDs
df = pd.read_excel(csv_file_path)
genome_ids = df[column_name].dropna().astype(str).unique()
print(f"🔍 Found {len(genome_ids)} genome IDs to process.")

# 🔁 Function for downloading and uploading
def download_and_upload(genome_id):
    """Fetch one genome FASTA from the PATRIC FTP server and copy it to Drive.

    The file is downloaded to ``local_download_dir``, copied into
    ``drive_target_dir`` and the local copy is removed. Nothing is downloaded
    when the file already exists on Drive.

    Args:
        genome_id: Genome ID used to build both the FTP URL and the file name.

    Returns:
        A one-line status string (skipped / success / failure). Exceptions are
        reported in the string rather than raised.
    """
    file_name = f"{genome_id}.fna"
    ftp_url = f"ftp://ftp.patricbrc.org/genomes/{genome_id}/{file_name}"
    local_path = os.path.join(local_download_dir, file_name)
    drive_path = os.path.join(drive_target_dir, file_name)

    if os.path.exists(drive_path):
        return f"☁️ Skipping {file_name} — already in Drive."

    copy_started = False
    try:
        # ⬇️ Download
        urllib.request.urlretrieve(ftp_url, local_path)

        # ☁️ Copy to Drive
        copy_started = True
        shutil.copy(local_path, drive_path)

        return f"✅ Downloaded and uploaded: {file_name}"
    except Exception as e:
        # A half-written Drive copy would pass the exists() check above on the
        # next run and be skipped as if complete, so remove it.
        if copy_started and os.path.exists(drive_path):
            try:
                os.remove(drive_path)
            except OSError:
                pass
        return f"❌ Failed for {genome_id}: {e}"
    finally:
        # 🧹 Remove local file, also after a failed or partial download
        try:
            if os.path.exists(local_path):
                os.remove(local_path)
        except OSError:
            pass

# 🚀 Run downloads concurrently
# NOTE: `results` is initialised here but never appended to or read in this cell.
results = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # One download_and_upload task per genome ID; the dict keys are the futures.
    future_to_id = {executor.submit(download_and_upload, gid): gid for gid in genome_ids}
    # as_completed yields futures as they finish, so `i` counts completions,
    # not positions in genome_ids.
    for i, future in enumerate(as_completed(future_to_id), 1):
        # download_and_upload returns a status string and catches its own exceptions.
        result = future.result()
        print(f"[{i}/{len(genome_ids)}] {result}")

# 🔄 Ensure everything is synced
drive.flush_and_unmount()


# zip file

In [None]:
import zipfile
import os
from google.colab import drive

# Step 1: Mount Google Drive
drive.mount('/content/drive')

# Step 2: Define the path to your zip file in Drive
zip_path = '/content/drive/MyDrive/MRSA datasets/Datasets_fcgr.zip'  # 🔁 Update this path
extract_dir = '/content/drive/MyDrive/Dataset_fcgr/'  # 🔁 Update this path

# Step 3: Create the target extraction directory if it doesn't exist
os.makedirs(extract_dir, exist_ok=True)

# Step 4: Extract the zip file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print(f'✅ Zip file extracted to: {extract_dir}')


Mounted at /content/drive
✅ Zip file extracted to: /content/drive/MyDrive/Dataset_fcgr/
