In [1]:
import gzip
import re
import zlib

import pandas as pd

In [2]:
# Read the species table and keep only its 'species' column
bacteria_series = pd.read_csv("bacteria_species.csv")["species"]
bacteria_series

0                             bacterium
1         TM7 phylum sp. oral taxon 352
2                [Eubacterium] infirmum
3         TM7 phylum sp. oral taxon 348
4                [Eubacterium] infirmum
                     ...               
5103    Arcticibacter pallidicorallinus
5104              Massilimicrobiota sp.
5105            Pontibacter ummariensis
5106               Bacillus sp. KBS0812
5107          Anaeromusa acidaminophila
Name: species, Length: 5108, dtype: object

In [3]:
import pandas as pd

# Clean bacterial names
def clean_bacteria_name(name):
    """
    Normalise a raw species label before matching.

    Square brackets are removed and surrounding whitespace is trimmed. If the
    label contains the stand-alone abbreviation 'sp.', everything from that
    abbreviation onwards is dropped, so 'Bacillus sp. KBS0812' becomes
    'Bacillus'. An 'sp.' that is merely the tail of a longer word (such as
    'subsp.') is left alone.

    Parameters
    ----------
    name : str
        Raw species label.

    Returns
    -------
    str
        The cleaned label.
    """
    name = name.replace('[', '').replace(']', '').strip()  # Remove brackets
    # Cut at the first 'sp.' that is not preceded by a letter. A plain
    # substring test would also fire inside 'subsp.' and leave '... sub'.
    return re.split(r'(?<![A-Za-z])sp\.', name, maxsplit=1)[0].strip()

# Clean every label, then drop labels that repeat after cleaning
cleaned_names = bacteria_series.apply(clean_bacteria_name)
bacteria_series = cleaned_names.drop_duplicates()
bacteria_series

0                             bacterium
1                            TM7 phylum
2                  Eubacterium infirmum
5                    bacterium 1xD42-67
7                     bacterium YEK0313
                     ...               
5102              Corynebacterium sputi
5103    Arcticibacter pallidicorallinus
5104                  Massilimicrobiota
5105            Pontibacter ummariensis
5107          Anaeromusa acidaminophila
Name: species, Length: 4682, dtype: object

In [4]:
import pandas as pd

# Read the semicolon-delimited file into a DataFrame and display it
df = pd.read_csv("uniprot_reference_proteomes_ids.csv", sep=";")
df


Unnamed: 0,Proteome_ID,Tax_ID,OSCODE,SUPERREGNUM,#(1),#(2),#(3),Species Name
0,UP000521578,47692,,eukaryota,14416,0,14442,Menura novaehollandiae (superb lyrebird)
1,UP000029743,1554474,ROSV2,viruses,1,0,1,Rosavirus A2 (isolate Human/Gambia/GA7403/2008)
2,UP000628017,2035344,,bacteria,3942,0,3942,Amylibacter cionae
3,UP001232722,2664220,,viruses,2,0,2,Tadarida brasiliensis gemykibivirus 1
4,UP000248311,1795041,,bacteria,3295,0,3300,Pseudoroseicyclus aestuarii
...,...,...,...,...,...,...,...,...
24950,UP000203542,2560777,,viruses,83,0,83,Spheniscid alphaherpesvirus 1
24951,UP000273303,2490946,,bacteria,2387,0,2391,Actinobaculum sp. 352
24952,UP000031129,743971,,bacteria,553,0,563,Mesomycoplasma flocculare ATCC 27399
24953,UP000000814,272562,CLOAB,bacteria,3847,0,3847,Clostridium acetobutylicum (strain ATCC 824 / ...


In [5]:


# Keep only the rows whose SUPERREGNUM value is "bacteria"
is_bacteria = df["SUPERREGNUM"].eq("bacteria")
bacteria_df = df[is_bacteria]

# Create a function for loose matching
def find_proteome_id(name, species_column, proteome_column):
    """
    Return the proteome ID of the first species whose name contains ``name``.

    Matching is a case-sensitive substring test, applied in order over
    ``species_column``. For the first hit, the entry at the same position in
    ``proteome_column`` is returned.

    Parameters
    ----------
    name : str
        Name to look for. A blank or non-string name never matches.
    species_column : iterable
        Species names. Entries that are not strings (e.g. NaN) are skipped.
    proteome_column : iterable
        Proteome IDs, positionally aligned with ``species_column``.

    Returns
    -------
    object or None
        The matching entry of ``proteome_column``, or None when nothing matches.
    """
    # An empty string is a substring of every species and would always
    # return the first proteome; a non-string name would make `in` raise.
    if not isinstance(name, str) or not name.strip():
        return None

    # Pair the two columns by position rather than by index label.
    for species, proteome_id in zip(species_column, proteome_column):
        if isinstance(species, str) and name in species:
            return proteome_id
    return None

# Look up a Proteome_ID for every cleaned name via loose matching;
# names with no hit come back as None and are dropped
species_names = bacteria_df["Species Name"]
reference_ids = bacteria_df["Proteome_ID"]
proteome_ids = bacteria_series.apply(
    find_proteome_id, args=(species_names, reference_ids)
).dropna()

# Report how many names were matched, then the matches themselves
print(len(proteome_ids))
print(proteome_ids)


1976
0       UP000198817
2       UP000004504
5       UP000270441
7       UP000236625
8       UP000001410
           ...     
5098    UP000002939
5099    UP000006085
5103    UP000238034
5104    UP000195713
5105    UP000198432
Name: species, Length: 1976, dtype: object


In [None]:
# TRY USING UNIPROT

import requests
import os
import time
from tqdm import tqdm

# Function to download proteome using the new UniProt API
def download_proteome(reference_proteome, output_dir="proteomes", timeout=60):
    """
    Download the proteome of a given bacteria using the updated UniProt REST API.

    The request URL sets ``compressed=true``, so the body is handled as raw
    bytes. If it starts with the gzip magic number it is decompressed before
    being written, so the saved ``.fasta`` file holds the decompressed data.
    A body that is not gzip is written unchanged.

    Parameters
    ----------
    reference_proteome : str
        Proteome identifier inserted into the query (e.g. "UP000005640").
    output_dir : str
        Directory for the downloaded file; created if missing.
    timeout : float
        Seconds passed to ``requests.get`` as its timeout, so a stalled
        connection cannot hang the caller indefinitely.

    Returns
    -------
    bool
        True if the file was written. False if the request raised, the status
        code was not 200, or the gzip payload could not be decompressed.
    """
    url = (f'https://rest.uniprot.org/uniprotkb/stream?compressed=true&format=fasta&query=%28%28proteome%3A{reference_proteome}%29%29')

    # Send the request to the UniProt API; a network failure is reported as
    # an ordinary failed download instead of aborting the caller's loop.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        print(f"Failed to fetch proteome for {reference_proteome}: {error}")
        return False

    # Check if the request was successful
    if response.status_code != 200:
        # Log the failure
        print(f"Failed to fetch proteome for {reference_proteome}: {response.status_code}")
        print(f"Error details: {response.text}")
        return False

    # Work on bytes: decoding a gzip stream as text would corrupt it.
    payload = response.content
    if payload[:2] == b"\x1f\x8b":  # gzip magic number
        try:
            payload = gzip.decompress(payload)
        except (OSError, EOFError, zlib.error) as error:
            print(f"Failed to fetch proteome for {reference_proteome}: {error}")
            return False

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Save the proteome data to a file
    filename = os.path.join(output_dir, f"{reference_proteome}_proteome.fasta")
    with open(filename, "wb") as file:
        file.write(payload)
    print(f"Proteome for {reference_proteome} downloaded successfully.")
    return True

# Main function to handle downloading with resumption
def download_proteomes_with_resumption(proteome_ids, output_dir="proteomes", resume_file="completed_ids.txt", delay=0.3):
    """
    Download proteomes with the ability to resume after interruption.

    IDs listed in ``resume_file`` are skipped. Each successful download is
    appended to that file immediately and is also remembered for the rest of
    the current run, so an ID that appears more than once in ``proteome_ids``
    is fetched only once.

    Parameters
    ----------
    proteome_ids : iterable of str
        Proteome identifiers to download.
    output_dir : str
        Directory passed through to ``download_proteome``.
    resume_file : str
        Text file with one completed ID per line; created if missing.
    delay : float
        Seconds to sleep after each download attempt.
    """
    # Load completed IDs if resume file exists (blank lines are ignored)
    if os.path.exists(resume_file):
        with open(resume_file, "r") as file:
            completed_ids = {line.strip() for line in file if line.strip()}
    else:
        completed_ids = set()

    # Open resume file in append mode to update as downloads complete
    with open(resume_file, "a") as resume_log:
        # Iterate over proteome IDs with a progress bar
        for proteome_id in tqdm(proteome_ids, desc="Downloading Proteomes"):
            if proteome_id in completed_ids:
                print(f"Skipping already downloaded proteome: {proteome_id}")
                continue

            # Attempt to download the proteome
            success = download_proteome(proteome_id, output_dir=output_dir)

            if success:
                # Log the completed ID to the resume file
                resume_log.write(proteome_id + "\n")
                resume_log.flush()  # Ensure the log is written immediately
                # Track it in memory too; otherwise a repeated ID later in
                # this same run would be downloaded and logged again.
                completed_ids.add(proteome_id)

            # Add a delay to avoid hitting rate limits
            time.sleep(delay)

# Download every matched proteome; IDs already listed in the resume file are skipped.
# To try a few IDs by hand, pass a plain list instead, e.g.
#   ["UP000005640", "UP000006548", "UP000000589"]

download_proteomes_with_resumption(proteome_ids=proteome_ids)



Downloading Proteomes:   0%|          | 0/1976 [00:00<?, ?it/s]

Skipping already downloaded proteome: UP000198817
Skipping already downloaded proteome: UP000004504
Skipping already downloaded proteome: UP000270441
Skipping already downloaded proteome: UP000236625
Skipping already downloaded proteome: UP000001410
Skipping already downloaded proteome: UP000008816
Skipping already downloaded proteome: UP000002230
Skipping already downloaded proteome: UP000001118
Skipping already downloaded proteome: UP000001436
Skipping already downloaded proteome: UP000001415
Skipping already downloaded proteome: UP000002438
Skipping already downloaded proteome: UP000000584
Skipping already downloaded proteome: UP000001570
Skipping already downloaded proteome: UP000001986
Skipping already downloaded proteome: UP000006735
Skipping already downloaded proteome: UP000001062
Skipping already downloaded proteome: UP000248731
Skipping already downloaded proteome: UP000000532
Proteome for UP000001735 downloaded successfully.


Downloading Proteomes:   1%|          | 19/1976 [00:01<01:50, 17.68it/s]

Proteome for UP000255382 downloaded successfully.
Proteome for UP000049472 downloaded successfully.


Downloading Proteomes:   1%|          | 21/1976 [00:05<10:17,  3.17it/s]

Proteome for UP000000809 downloaded successfully.


Downloading Proteomes:   1%|          | 22/1976 [00:06<12:57,  2.51it/s]

Proteome for UP000001417 downloaded successfully.


Downloading Proteomes:   1%|          | 23/1976 [00:08<19:41,  1.65it/s]

Proteome for UP000002198 downloaded successfully.


Downloading Proteomes:   1%|          | 24/1976 [00:09<22:27,  1.45it/s]

Proteome for UP000002430 downloaded successfully.


Downloading Proteomes:   1%|▏         | 25/1976 [00:11<24:40,  1.32it/s]

Proteome for UP000001978 downloaded successfully.


Downloading Proteomes:   1%|▏         | 26/1976 [00:13<31:40,  1.03it/s]

Proteome for UP000678348 downloaded successfully.


Downloading Proteomes:   1%|▏         | 27/1976 [00:14<35:09,  1.08s/it]

Proteome for UP000000821 downloaded successfully.


Downloading Proteomes:   1%|▏         | 28/1976 [00:15<36:39,  1.13s/it]

Proteome for UP000000750 downloaded successfully.


Downloading Proteomes:   1%|▏         | 29/1976 [00:17<39:37,  1.22s/it]

Proteome for UP000012675 downloaded successfully.


Downloading Proteomes:   2%|▏         | 30/1976 [00:19<43:59,  1.36s/it]

Proteome for UP000001697 downloaded successfully.


Downloading Proteomes:   2%|▏         | 31/1976 [00:19<38:11,  1.18s/it]

Proteome for UP000001889 downloaded successfully.


Downloading Proteomes:   2%|▏         | 32/1976 [00:21<46:01,  1.42s/it]

Proteome for UP000005156 downloaded successfully.


Downloading Proteomes:   2%|▏         | 33/1976 [00:23<45:48,  1.41s/it]

Proteome for UP000007477 downloaded successfully.


Downloading Proteomes:   2%|▏         | 34/1976 [00:24<49:13,  1.52s/it]

Proteome for UP000001173 downloaded successfully.


Downloading Proteomes:   2%|▏         | 35/1976 [00:26<46:50,  1.45s/it]

Proteome for UP000254807 downloaded successfully.


Downloading Proteomes:   2%|▏         | 36/1976 [00:27<49:10,  1.52s/it]

Proteome for UP000009077 downloaded successfully.


Downloading Proteomes:   2%|▏         | 37/1976 [00:29<47:34,  1.47s/it]

Proteome for UP000001420 downloaded successfully.


Downloading Proteomes:   2%|▏         | 38/1976 [00:30<44:49,  1.39s/it]

Proteome for UP000001020 downloaded successfully.


Downloading Proteomes:   2%|▏         | 39/1976 [00:32<50:00,  1.55s/it]

Proteome for UP000005740 downloaded successfully.


Downloading Proteomes:   2%|▏         | 40/1976 [00:34<50:46,  1.57s/it]

Proteome for UP000028631 downloaded successfully.


Downloading Proteomes:   2%|▏         | 41/1976 [00:36<55:18,  1.71s/it]

Proteome for UP000000556 downloaded successfully.


Downloading Proteomes:   2%|▏         | 42/1976 [00:38<1:03:58,  1.98s/it]

Proteome for UP000646478 downloaded successfully.


Downloading Proteomes:   2%|▏         | 43/1976 [00:40<1:05:36,  2.04s/it]

Proteome for UP000000605 downloaded successfully.


Downloading Proteomes:   2%|▏         | 44/1976 [00:43<1:09:07,  2.15s/it]

Proteome for UP000000432 downloaded successfully.


Downloading Proteomes:   2%|▏         | 45/1976 [00:44<1:05:11,  2.03s/it]

Proteome for UP000000586 downloaded successfully.


Downloading Proteomes:   2%|▏         | 46/1976 [00:46<58:53,  1.83s/it]  

Proteome for UP000000817 downloaded successfully.


Downloading Proteomes:   2%|▏         | 47/1976 [00:47<56:33,  1.76s/it]

Proteome for UP000481043 downloaded successfully.


Downloading Proteomes:   2%|▏         | 48/1976 [00:49<56:30,  1.76s/it]

Proteome for UP000240206 downloaded successfully.


Downloading Proteomes:   2%|▏         | 49/1976 [00:51<53:46,  1.67s/it]

Proteome for UP000008840 downloaded successfully.


Downloading Proteomes:   3%|▎         | 50/1976 [00:53<57:14,  1.78s/it]

Proteome for UP000557042 downloaded successfully.


Downloading Proteomes:   3%|▎         | 51/1976 [00:54<53:28,  1.67s/it]

Proteome for UP000000815 downloaded successfully.


Downloading Proteomes:   3%|▎         | 52/1976 [00:56<59:53,  1.87s/it]

Proteome for UP000001131 downloaded successfully.


Downloading Proteomes:   3%|▎         | 53/1976 [00:58<54:45,  1.71s/it]

Proteome for UP000001036 downloaded successfully.


Downloading Proteomes:   3%|▎         | 54/1976 [00:59<54:46,  1.71s/it]

Proteome for UP000254920 downloaded successfully.


Downloading Proteomes:   3%|▎         | 55/1976 [01:01<49:21,  1.54s/it]

Proteome for UP000254640 downloaded successfully.


Downloading Proteomes:   3%|▎         | 56/1976 [01:03<55:42,  1.74s/it]

Proteome for UP000001726 downloaded successfully.


Downloading Proteomes:   3%|▎         | 57/1976 [01:04<54:27,  1.70s/it]

Proteome for UP000000799 downloaded successfully.


Downloading Proteomes:   3%|▎         | 58/1976 [01:06<49:04,  1.53s/it]

Proteome for UP000003917 downloaded successfully.


Downloading Proteomes:   3%|▎         | 59/1976 [01:08<53:31,  1.68s/it]

Proteome for UP000017248 downloaded successfully.


Downloading Proteomes:   3%|▎         | 60/1976 [01:09<49:48,  1.56s/it]

Proteome for UP000001170 downloaded successfully.


Downloading Proteomes:   3%|▎         | 61/1976 [01:10<45:27,  1.42s/it]

Proteome for UP000001889 downloaded successfully.


Downloading Proteomes:   3%|▎         | 62/1976 [01:12<51:03,  1.60s/it]

Proteome for UP000002196 downloaded successfully.


Downloading Proteomes:   3%|▎         | 63/1976 [01:13<48:22,  1.52s/it]

Proteome for UP000000594 downloaded successfully.


Downloading Proteomes:   3%|▎         | 64/1976 [01:16<56:26,  1.77s/it]

Proteome for UP000000757 downloaded successfully.


Downloading Proteomes:   3%|▎         | 65/1976 [01:18<1:02:42,  1.97s/it]

Proteome for UP000000818 downloaded successfully.


Downloading Proteomes:   3%|▎         | 66/1976 [01:20<57:34,  1.81s/it]  

Proteome for UP000000260 downloaded successfully.


Downloading Proteomes:   3%|▎         | 67/1976 [01:22<59:40,  1.88s/it]

Proteome for UP000000841 downloaded successfully.


Downloading Proteomes:   3%|▎         | 68/1976 [01:23<59:50,  1.88s/it]

Proteome for UP000006135 downloaded successfully.


Downloading Proteomes:   3%|▎         | 69/1976 [01:25<55:38,  1.75s/it]

Proteome for UP000061457 downloaded successfully.


Downloading Proteomes:   4%|▎         | 70/1976 [01:27<57:32,  1.81s/it]

Proteome for UP000029431 downloaded successfully.


Downloading Proteomes:   4%|▎         | 71/1976 [01:29<55:35,  1.75s/it]

Proteome for UP000030341 downloaded successfully.


Downloading Proteomes:   4%|▎         | 72/1976 [01:30<55:03,  1.73s/it]

Proteome for UP000000935 downloaded successfully.


Downloading Proteomes:   4%|▎         | 73/1976 [01:32<1:00:17,  1.90s/it]

Proteome for UP000007137 downloaded successfully.


Downloading Proteomes:   4%|▎         | 74/1976 [01:34<1:01:15,  1.93s/it]

Proteome for UP000002719 downloaded successfully.


Downloading Proteomes:   4%|▍         | 75/1976 [01:36<57:51,  1.83s/it]  

Proteome for UP000006833 downloaded successfully.


Downloading Proteomes:   4%|▍         | 76/1976 [01:38<57:17,  1.81s/it]

Proteome for UP000008520 downloaded successfully.


Downloading Proteomes:   4%|▍         | 77/1976 [01:39<51:31,  1.63s/it]

Proteome for UP000002521 downloaded successfully.


Downloading Proteomes:   4%|▍         | 78/1976 [01:40<48:13,  1.52s/it]

Proteome for UP000001010 downloaded successfully.


Downloading Proteomes:   4%|▍         | 79/1976 [01:42<52:50,  1.67s/it]

Proteome for UP000007065 downloaded successfully.


Downloading Proteomes:   4%|▍         | 80/1976 [01:43<45:57,  1.45s/it]

Proteome for UP000000774 downloaded successfully.


Downloading Proteomes:   4%|▍         | 81/1976 [01:45<44:05,  1.40s/it]

Proteome for UP000000579 downloaded successfully.


Downloading Proteomes:   4%|▍         | 82/1976 [01:46<41:44,  1.32s/it]

Proteome for UP000283360 downloaded successfully.


Downloading Proteomes:   4%|▍         | 83/1976 [01:47<44:23,  1.41s/it]

Proteome for UP000008812 downloaded successfully.


Downloading Proteomes:   4%|▍         | 84/1976 [01:48<39:18,  1.25s/it]

Proteome for UP000001955 downloaded successfully.


Downloading Proteomes:   4%|▍         | 85/1976 [01:52<1:07:00,  2.13s/it]

Proteome for UP000000528 downloaded successfully.


Downloading Proteomes:   4%|▍         | 86/1976 [01:54<57:50,  1.84s/it]  