In [None]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
# Mount Google Drive at /content/drive; later cells read and write project
# files under /content/drive/MyDrive/...
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Project locations on the mounted Drive: the project root and its "data" subfolder.
project_root = "/content/drive/MyDrive/Rebuilding_and_Modifying_GraphDTA"
data_path = "/".join([project_root, "data"])

In [None]:
#Test parse and submit to NCBI Blast
from Bio import SeqIO
from Bio.Blast import NCBIWWW, NCBIXML
import time

#Load FASTA File from drive
fasta_path = f"{data_path}/davis_proteins.fasta"
records = list(SeqIO.parse(fasta_path, "fasta"))

#Select first 5 for testing
test_records = records[:5]

#One {"Protein_ID", "UniProt_ID"} entry per queried record; UniProt_ID is None
#when BLAST returns no alignment or the query fails.
results = []

for record in test_records:
  print(f"Querying {record.id}...")
  top_hit = None
  try:
    result_handle = NCBIWWW.qblast("blastp", "swissprot", record.format("fasta"))
    try:
      blast_record = NCBIXML.read(result_handle)
    finally:
      #Release the result handle even if parsing fails
      result_handle.close()

    if blast_record.alignments:
      #Accession of the first (top-ranked) alignment
      top_hit = blast_record.alignments[0].accession

  except Exception as e:
    #Best-effort: report the failure and record the protein as unmapped
    print(f"Error for {record.id}: {e}")

  finally:
    #Respect NCBI rate limit, including after a failed request
    time.sleep(2)

  results.append({"Protein_ID": record.id, "UniProt_ID": top_hit})

#Takes almost 10 minutes for 5 proteins, not feasible for 433 sequences.

Querying Protein_0...
Querying Protein_1...
Querying Protein_3...
Querying Protein_5...
Querying Protein_7...


In [None]:
# Download the Swiss-Prot FASTA from the UniProt "current_release" directory into
# /content, then decompress it in place to /content/uniprot_sprot.fasta
# (gunzip -f overwrites an existing output file).
# NOTE(review): "current_release" is a moving target, so re-runs may fetch a
# different release than the one used to produce the saved mapping.
!wget -O /content/uniprot_sprot.fasta.gz "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz"
!gunzip -f /content/uniprot_sprot.fasta.gz


--2025-05-17 02:42:21--  https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
Resolving ftp.uniprot.org (ftp.uniprot.org)... 128.175.240.195
Connecting to ftp.uniprot.org (ftp.uniprot.org)|128.175.240.195|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 92986586 (89M) [application/x-gzip]
Saving to: ‘/content/uniprot_sprot.fasta.gz’


2025-05-17 02:42:22 (84.8 MB/s) - ‘/content/uniprot_sprot.fasta.gz’ saved [92986586/92986586]



In [None]:
from Bio import SeqIO
import pandas as pd

# Map each Davis protein to a Swiss-Prot accession by exact sequence identity,
# then save the mapping as a CSV on Drive.

# Load Davis sequences
# NOTE(review): the BLAST test cell reads this file from data_path on Drive,
# while this cell reads it from /content -- confirm which copy is intended.
davis_records = list(SeqIO.parse("/content/davis_proteins.fasta", "fasta"))
# sequence string -> Davis protein ID. If two Davis records share a sequence,
# the later ID overwrites the earlier one in this dict.
davis_map = {str(record.seq): record.id for record in davis_records}

# Load UniProt (Swiss-Prot) FASTA
uniprot_records = list(SeqIO.parse("/content/uniprot_sprot.fasta", "fasta"))

# One row per Swiss-Prot entry whose full sequence equals a Davis sequence.
# Several Swiss-Prot entries with the same sequence yield several rows for the
# same Protein_ID.
matches = []
for record in uniprot_records:
    seq = str(record.seq)
    if seq in davis_map:
        matches.append({
            "Protein_ID": davis_map[seq],
            # Headers of the form "db|ACCESSION|NAME" -> keep the accession field
            "UniProt_ID": record.id.split("|")[1] if "|" in record.id else record.id
        })

# Save mapping
# Explicit columns keep the CSV header present even when there are no matches,
# so a later pd.read_csv on this file does not fail with EmptyDataError.
df = pd.DataFrame(matches, columns=["Protein_ID", "UniProt_ID"])
output_path = "/content/drive/MyDrive/Rebuilding_and_Modifying_GraphDTA/data/uniprot_mapping.csv"
df.to_csv(output_path, index=False)

print(f"✅ Saved {len(df)} matches to: {output_path}")


✅ Saved 267 matches to: /content/drive/MyDrive/Rebuilding_and_Modifying_GraphDTA/data/uniprot_mapping.csv


In [None]:
import pandas as pd
import os
import requests

# Download one AlphaFold PDB file per row of the saved UniProt mapping,
# naming each file after the Davis protein ID.

# Load UniProt mapping
df = pd.read_csv("/content/drive/MyDrive/Rebuilding_and_Modifying_GraphDTA/data/uniprot_mapping.csv")

# Set output directory for PDB files
output_dir = "/content/drive/MyDrive/Rebuilding_and_Modifying_GraphDTA/data/davis_alphafold_structures"
os.makedirs(output_dir, exist_ok=True)

# Download AlphaFold .pdb files
for _, row in df.iterrows():
    protein_id = row["Protein_ID"]
    uniprot_id = row["UniProt_ID"]
    # NOTE(review): the model version ("v4") is hard-coded in the URL; confirm it
    # matches the release currently served by the AlphaFold DB.
    url = f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-model_v4.pdb"
    save_path = os.path.join(output_dir, f"{protein_id}.pdb")

    # Already downloaded on a previous run (or by an earlier row with the same
    # Protein_ID): skip so the cell is cheap to re-run.
    if os.path.exists(save_path):
        continue

    try:
        # A timeout keeps one stalled connection from hanging the whole loop.
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            with open(save_path, "wb") as f:
                f.write(response.content)
            print(f"✅ Downloaded: {protein_id} → {uniprot_id}")
        else:
            print(f"❌ Not found: {uniprot_id} (status: {response.status_code})")
    except Exception as e:
        # Best-effort: report the failure and move on to the next protein.
        print(f"❌ Error for {uniprot_id}: {e}")


✅ Downloaded: Protein_29 → P54646
✅ Downloaded: Protein_15 → P00519
✅ Downloaded: Protein_18 → P36896
✅ Downloaded: Protein_17 → Q04771
✅ Downloaded: Protein_24 → P31749
✅ Downloaded: Protein_25 → P31751
✅ Downloaded: Protein_26 → Q9Y243
✅ Downloaded: Protein_27 → Q9UM73
✅ Downloaded: Protein_30 → Q8NFD2
✅ Downloaded: Protein_34 → O14965
✅ Downloaded: Protein_19 → P27037
✅ Downloaded: Protein_20 → Q13705
✅ Downloaded: Protein_39 → P51451
✅ Downloaded: Protein_42 → Q13873
✅ Downloaded: Protein_40 → P36894
✅ Downloaded: Protein_41 → O00238
✅ Downloaded: Protein_43 → P51813
✅ Downloaded: Protein_44 → P15056
✅ Downloaded: Protein_47 → Q8TDC3
✅ Downloaded: Protein_49 → Q06187
✅ Downloaded: Protein_63 → Q14004
✅ Downloaded: Protein_294 → Q00536
✅ Downloaded: Protein_296 → Q07002
✅ Downloaded: Protein_64 → Q9BWU1
✅ Downloaded: Protein_301 → P61075
✅ Downloaded: Protein_65 → P24941
✅ Downloaded: Protein_66 → Q00526
✅ Downloaded: Protein_69 → Q00535
✅ Downloaded: Protein_70 → P50613
✅ Downloade