In [1]:
import pandas as pd
url = "https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/assembly_summary.txt"

mammal = pd.read_csv(url, sep="\t", skiprows=1)


In [3]:
mammal.columns

Index(['#assembly_accession', 'bioproject', 'biosample', 'wgs_master',
       'refseq_category', 'taxid', 'species_taxid', 'organism_name',
       'infraspecific_name', 'isolate', 'version_status', 'assembly_level',
       'release_type', 'genome_rep', 'seq_rel_date', 'asm_name',
       'asm_submitter', 'gbrs_paired_asm', 'paired_asm_comp', 'ftp_path',
       'excluded_from_refseq', 'relation_to_type_material',
       'asm_not_live_date', 'assembly_type', 'group', 'genome_size',
       'genome_size_ungapped', 'gc_percent', 'replicon_count',
       'scaffold_count', 'contig_count', 'annotation_provider',
       'annotation_name', 'annotation_date', 'total_gene_count',
       'protein_coding_gene_count', 'non_coding_gene_count', 'pubmed_id'],
      dtype='object')

In [32]:
mammal.query("organism_name == 'Mus musculus'").assembly_level

89    Chromosome
Name: assembly_level, dtype: object

In [10]:
import pandas as pd
import time
data = pd.DataFrame()

for group in ["fungi","invertebrate","plant","protozoa","vertebrate_mammalian","vertebrate_other"]:
    
    url = "https://ftp.ncbi.nlm.nih.gov/genomes/refseq/"+group+"/assembly_summary.txt"
    tmp = pd.read_csv(url, sep="\t", skiprows=1)
    data = pd.concat([data,tmp])
    # Don't get kicked out by NCBI
    time.sleep(2)



ftps = data.ftp_path.to_list()

# Turn ftps into list of tuples with assembly accession and ftp path

urls = [ftp+"_transalted_cds.faa.gz" for ftp in ftps]


# Based on URLs, download the files in the "./CDS/" folder and unzip them in the same folder at the same time

import os
import urllib.request
import gzip
import shutil
import time

for url in urls:
    try:
        print(url)
        file_name = url.split("/")[-1]
        print(file_name)
        urllib.request.urlretrieve(url, "./CDS/"+file_name)
        with gzip.open("./CDS/"+file_name, 'rb') as f_in:
            with open("./CDS/"+file_name[:-3], 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        os.remove("./CDS/"+file_name)
        time.sleep(2)
    except:
        print("Error with "+url)
        pass



In [12]:
data.assembly_level = data.assembly_level.map({"Complete Genome":0, "Chromosome":1, "Scaffold":2, "Contig":3, "Fragment":4, "NA":5})


    #assembly_accession    bioproject     biosample         wgs_master  \
0       GCF_000002945.1      PRJNA127  SAMEA3138176                 na   
1       GCF_003054445.1   PRJNA545694  SAMN08558011                 na   
2       GCF_000243375.1    PRJNA79345  SAMEA2272433                 na   
3       GCF_000026365.1    PRJNA39573  SAMEA3138235                 na   
4       GCF_025882075.1   PRJNA926498  SAMN26195980  JAKWFO000000000.1   
..                  ...           ...           ...                ...   
354     GCF_029582105.1   PRJNA975542  SAMN33408179  JARJVX000000000.1   
355     GCF_025583915.1  PRJNA1032142  SAMN29589363  JANCLY000000000.1   
356     GCF_027887155.1   PRJNA938469  SAMN31394215  JAQIHI000000000.1   
357     GCF_029890125.1   PRJNA973061  SAMN33577524  JARGFG000000000.1   
358     GCF_030684315.1  PRJNA1008742  SAMN35823397  JAUUDQ000000000.1   

           refseq_category    taxid  species_taxid  \
0    representative genome     4896           4896   
1  

In [13]:
print(data.assembly_level)

0      1
1      0
2      1
3      1
4      3
      ..
354    1
355    2
356    1
357    3
358    1
Name: assembly_level, Length: 1794, dtype: int64


In [18]:

ftps = data.ftp_path.to_list()

# Turn ftps into list of tuples with assembly accession and ftp path

urls = [ftp+"_transalted_cds.faa.gz" for ftp in ftps]


# Based on URLs, download the files in the "./CDS/" folder and unzip them in the same folder at the same time

import os
import urllib.request
import gzip
import shutil
import time

for url in urls:
    try:
        print(url)
        file_name = url.split("/")[-1]
        print(file_name)
        urllib.request.urlretrieve(url, "./CDS/"+file_name)
        with gzip.open("./CDS/"+file_name, 'rb') as f_in:
            with open("./CDS/"+file_name[:-3], 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        os.remove("./CDS/"+file_name)
        time.sleep(2)
    except:
        print("Error with "+url)
        pass


https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/146/045/GCF_000146045.2_R64/GCF_000146045.2_R64_translated_cds.faa.gz

In [19]:
urls

['https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/002/945/GCF_000002945.1_ASM294v2_transalted_cds.faa.gz',
 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/054/445/GCF_003054445.1_ASM305444v1_transalted_cds.faa.gz',
 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/243/375/GCF_000243375.1_ASM24337v1_transalted_cds.faa.gz',
 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/026/365/GCF_000026365.1_ASM2636v1_transalted_cds.faa.gz',
 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/025/882/075/GCF_025882075.1_ASM2588207v1_transalted_cds.faa.gz',
 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/102/565/GCF_002102565.1_Kocim1_transalted_cds.faa.gz',
 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/011/074/885/GCF_011074885.1_ASM1107488v2_transalted_cds.faa.gz',
 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/002/855/GCF_000002855.4_ASM285v2_transalted_cds.faa.gz',
 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/013/340/325/GCF_013340325.1_ASM1334032v1_transalted_cds.faa.gz',
 'https://ft