In [1]:
import pandas as pd

In [None]:
bacteria_series = pd.read_csv("bacteria_species.csv")
bacteria_series = bacteria_series['species']
bacteria_series

0                             bacterium
1         TM7 phylum sp. oral taxon 352
2                [Eubacterium] infirmum
3         TM7 phylum sp. oral taxon 348
4                [Eubacterium] infirmum
                     ...               
5103    Arcticibacter pallidicorallinus
5104              Massilimicrobiota sp.
5105            Pontibacter ummariensis
5106               Bacillus sp. KBS0812
5107          Anaeromusa acidaminophila
Name: species, Length: 5108, dtype: object

In [78]:
import pandas as pd

# Clean bacterial names
def clean_bacteria_name(name):
    name = name.replace('[', '').replace(']', '').strip()  # Remove brackets
    if 'sp.' in name:  # Handle species abbreviations
        name = name.split('sp.')[0].strip()
    return name

bacteria_series = bacteria_series.apply(clean_bacteria_name).drop_duplicates()
bacteria_series
#bacteria_series_cleaned

0                             bacterium
1                            TM7 phylum
2                  Eubacterium infirmum
5                    bacterium 1xD42-67
7                     bacterium YEK0313
                     ...               
5102              Corynebacterium sputi
5103    Arcticibacter pallidicorallinus
5104                  Massilimicrobiota
5105            Pontibacter ummariensis
5107          Anaeromusa acidaminophila
Name: species, Length: 4682, dtype: object

In [63]:
import pandas as pd

# Load the CSV into a pandas DataFrame
df = pd.read_csv("uniprot_reference_proteomes_ids.csv", sep = ';')
df


Unnamed: 0,Proteome_ID,Tax_ID,OSCODE,SUPERREGNUM,#(1),#(2),#(3),Species Name
0,UP000521578,47692,,eukaryota,14416,0,14442,Menura novaehollandiae (superb lyrebird)
1,UP000029743,1554474,ROSV2,viruses,1,0,1,Rosavirus A2 (isolate Human/Gambia/GA7403/2008)
2,UP000628017,2035344,,bacteria,3942,0,3942,Amylibacter cionae
3,UP001232722,2664220,,viruses,2,0,2,Tadarida brasiliensis gemykibivirus 1
4,UP000248311,1795041,,bacteria,3295,0,3300,Pseudoroseicyclus aestuarii
...,...,...,...,...,...,...,...,...
24950,UP000203542,2560777,,viruses,83,0,83,Spheniscid alphaherpesvirus 1
24951,UP000273303,2490946,,bacteria,2387,0,2391,Actinobaculum sp. 352
24952,UP000031129,743971,,bacteria,553,0,563,Mesomycoplasma flocculare ATCC 27399
24953,UP000000814,272562,CLOAB,bacteria,3847,0,3847,Clostridium acetobutylicum (strain ATCC 824 / ...


In [79]:


# Filter the DataFrame for bacteria
bacteria_df = df[df["SUPERREGNUM"] == "bacteria"]

# Create a function for loose matching
def find_proteome_id(name, species_column, proteome_column):
    for index, species in enumerate(species_column):
        if name in species:  # Check if the name is a substring of the species
            return proteome_column.iloc[index]
    return None

# Map the bacteria names in the Series to their Proteome_ID using loose matching
proteome_ids = bacteria_series.apply(
    lambda name: find_proteome_id(name, bacteria_df["Species Name"], bacteria_df["Proteome_ID"])
)

# Drop NaN values and output the result
proteome_ids = proteome_ids.dropna()
print(len(proteome_ids))  # Count of matches
print(proteome_ids)  # Matched Proteome_IDs


1976
0       UP000198817
2       UP000004504
5       UP000270441
7       UP000236625
8       UP000001410
           ...     
5098    UP000002939
5099    UP000006085
5103    UP000238034
5104    UP000195713
5105    UP000198432
Name: species, Length: 1976, dtype: object


In [None]:
# TRY USING UNIPROT

import requests
import os
import time

# Function to download proteome using the new UniProt API
def download_proteome(reference_proteome, output_dir="proteomes"):
    """
    Download the proteome of a given bacteria using the updated UniProt REST API.
    """
    
    url = (f'https://rest.uniprot.org/uniprotkb/stream?compressed=true&format=fasta&query=%28%28proteome%3A{reference_proteome}%29%29')
    
    # Send the request to the UniProt API
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code == 200:
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        
        # Save the proteome data to a file
        filename = os.path.join(output_dir, f"{reference_proteome.replace(' ', '_')}_proteome.fasta")
        with open(filename, "w") as file:
            file.write(response.text)
        print(f"Proteome for {reference_proteome} downloaded successfully.")
    else:
        # Log the failure
        print(f"Failed to fetch proteome for {reference_proteome}: {response.status_code}")
        print(f"Error details: {response.text}")


# Download proteomes for each bacterium
for proteome_id in proteome_ids:
    download_proteome(proteome_id)
    time.sleep(0.3)  # Add a delay to avoid hitting rate limits
