# In [None]:
import time

import pandas as pd
import requests

# Read the interaction table. The file is space-delimited; its first row is
# skipped and the columns are named and typed explicitly instead.
column_dtypes = {"protein1": str, "protein2": str, "combined_score": float}
df = pd.read_csv(
    "PPIDataset.txt",
    sep=" ",
    skiprows=1,
    header=None,
    names=list(column_dtypes),
    dtype=column_dtypes,
)

# Remove the literal "9606." text from the IDs in both protein columns.
for id_column in ("protein1", "protein2"):
    df[id_column] = df[id_column].str.replace("9606.", "", regex=False)

# Distinct IDs seen in either column (protein1 values are scanned first).
all_protein_ids = pd.concat([df["protein1"], df["protein2"]])
unique_proteins = all_protein_ids.unique()
print(f"Total unique proteins: {len(unique_proteins)}")

# Ensembl ID -> UniProt ID lookup, filled in by the mapping step below.
mapping_cache = {}

# Function to map a batch of Ensembl IDs to UniProt IDs
def map_batch_to_uniprot(ensembl_ids, poll_interval=3.0, max_wait=300.0, timeout=30.0):
    """Map one batch of Ensembl protein IDs to UniProt entries.

    Submits an ID-mapping job to the UniProt REST API, waits for the job to
    finish, downloads the TSV results (following pagination links) and stores
    every ``From`` -> ``Entry`` pair in the module-level ``mapping_cache``.
    When one Ensembl ID maps to several entries, the last row seen wins.

    HTTP-level failures (non-200 responses, a job that fails or does not
    finish within ``max_wait``) are reported with ``print`` and the function
    returns without raising, so a caller looping over batches can continue.

    Args:
        ensembl_ids: Iterable of Ensembl protein ID strings.
        poll_interval: Seconds to sleep between job-status polls.
        max_wait: Maximum number of seconds to wait for the job to finish.
        timeout: Per-request timeout in seconds passed to ``requests``.

    Returns:
        None. Results are written into ``mapping_cache``.

    Raises:
        requests.RequestException: On network errors or request timeouts.
    """
    # Materialise first: truth-testing a NumPy array directly is ambiguous.
    ids = list(ensembl_ids)
    if not ids:
        return

    response = requests.post(
        "https://rest.uniprot.org/idmapping/run",
        data={"from": "Ensembl_Protein", "to": "UniProtKB", "ids": ",".join(ids)},
        timeout=timeout,
    )
    if response.status_code != 200:
        print(f"ID mapping submission failed (HTTP {response.status_code})")
        return

    job_id = response.json().get("jobId")
    if not job_id:
        print("ID mapping submission did not return a jobId")
        return

    if not _wait_for_mapping_job(job_id, poll_interval, max_wait, timeout):
        return

    results_url = f"https://rest.uniprot.org/idmapping/uniprotkb/results/{job_id}?format=tsv"
    first_page = True
    while results_url:
        results_response = requests.get(results_url, timeout=timeout)
        if results_response.status_code != 200:
            print(f"Fetching mapping results failed (HTTP {results_response.status_code})")
            return

        header, page_mapping = _parse_mapping_tsv(results_response.text)
        if first_page:
            print("TSV Header:", header)  # Inspect the header
            first_page = False
        mapping_cache.update(page_mapping)

        # The results endpoint is paginated; a "next" Link header points to
        # the following page. No such header means this was the last page.
        results_url = results_response.links.get("next", {}).get("url")
    return


def _wait_for_mapping_job(job_id, poll_interval, max_wait, timeout):
    """Poll the job status; return True once results can be fetched, else False."""
    status_url = f"https://rest.uniprot.org/idmapping/status/{job_id}"
    deadline = time.monotonic() + max_wait

    while True:
        status_response = requests.get(status_url, timeout=timeout)
        if status_response.status_code == 200:
            status_data = status_response.json()
            job_status = status_data.get("jobStatus")
            # A finished job is detected by key presence, not truthiness: a
            # batch in which no ID could be mapped has an empty "results"
            # list and must still end the wait.
            if (
                "results" in status_data
                or "failedIds" in status_data
                or job_status == "FINISHED"
            ):
                return True
            if job_status not in (None, "NEW", "RUNNING"):
                print(f"ID mapping job {job_id} ended with status {job_status}")
                return False

        if time.monotonic() >= deadline:
            print(f"ID mapping job {job_id} did not finish within {max_wait} seconds")
            return False
        time.sleep(poll_interval)


def _parse_mapping_tsv(tsv_text):
    """Parse one TSV results page into ``(header, {from_id: entry_id})``."""
    lines = tsv_text.strip().split("\n")
    header = lines[0].split("\t")
    if "From" not in header or "Entry" not in header:
        if len(lines) > 1:
            print(f"Unexpected TSV header, skipping page: {header}")
        return header, {}

    # Column positions are fixed for the whole page; look them up once.
    from_index = header.index("From")
    entry_index = header.index("Entry")

    mapping = {}
    for line in lines[1:]:
        columns = line.split("\t")
        # Skip malformed rows whose column count differs from the header.
        if len(columns) == len(header):
            mapping[columns[from_index]] = columns[entry_index]
    return header, mapping

# Batch size for UniProt API (max 100,000 per request)
# NOTE(review): 25 is far below that limit; presumably chosen to match the
# results page size of the mapping endpoint — confirm before raising it.
BATCH_SIZE = 25  # Adjust based on API limits and performance

# Ceiling division, so the reported total is exact even when the number of
# proteins is a multiple of BATCH_SIZE (or zero).
total_batches = (len(unique_proteins) + BATCH_SIZE - 1) // BATCH_SIZE

# Map unique proteins in batches
for batch_number, start in enumerate(range(0, len(unique_proteins), BATCH_SIZE), start=1):
    batch = unique_proteins[start:start + BATCH_SIZE]
    print(f"Mapping batch {batch_number} of {total_batches}")
    map_batch_to_uniprot(batch)
    # Running count of mapped IDs, printed as a progress indicator.
    print(len(mapping_cache))

# Apply mappings to the dataset; IDs absent from mapping_cache become NaN.
df["protein1_uniprot"] = df["protein1"].map(mapping_cache)
df["protein2_uniprot"] = df["protein2"].map(mapping_cache)

# Save the updated dataset
df.to_csv("mapped_dataset.csv", index=False)

# Display the updated dataset
print(df.head())