In [1]:
from Bio import PDB
import math
import os
import requests
import numpy as np

In [5]:
def load_codes(filepath):
    """Read whitespace-separated codes from a text file.

    The file is decoded as "utf-8-sig", so a leading byte-order mark (if
    present) is stripped instead of being attached to the first code.

    Args:
        filepath: Path of the text file to read.

    Returns:
        list[str]: The codes in file order, split on any run of whitespace.
        An empty or whitespace-only file yields an empty list.
    """
    with open(filepath, "r", encoding="utf-8-sig") as handle:
        return handle.read().split()

# Load the PDB codes (path is relative to the working directory) and preview
# the first ten entries.
codes = load_codes("protein_codes.txt")
print(codes[:10])

['101M', '102L', '102M', '103L', '103M', '104L', '104M', '105M', '106M', '107L']


In [6]:
from Bio import PDB
import math

def get_ramachandran_coordinates(file_path):
    """
    Extracts the backbone phi/psi angle pairs from a PDB file.

    Every model and every chain in the structure is scanned. A residue is
    kept only when both of its angles are defined; entries where
    get_phi_psi_list() reports None for either angle are skipped, so the
    number of rows can be smaller than the number of residues.

    Args:
        file_path: Path to the PDB file to parse.

    Returns:
        numpy.ndarray of shape (N, 2). Column 0 holds phi (x) and column 1
        holds psi (y), both converted from radians to degrees. When no
        residue has both angles the array is empty with shape (0, 2).
    """
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure('input_structure', file_path)

    # Create the peptide builder once instead of once per chain.
    builder = PDB.PPBuilder()

    phi_degrees = []
    psi_degrees = []

    for model in structure:
        for chain in model:
            for poly in builder.build_peptides(chain):
                for phi, psi in poly.get_phi_psi_list():
                    # Skip residues where either angle is missing (None).
                    if phi is None or psi is None:
                        continue
                    phi_degrees.append(math.degrees(phi))
                    psi_degrees.append(math.degrees(psi))

    # column_stack keeps the (0, 2) shape even when both lists are empty.
    return np.column_stack((np.array(phi_degrees), np.array(psi_degrees)))

In [8]:
def download_pdb_file(pdb_id, download_dir=".", timeout=30):
    """
    Downloads the .pdb file directly from RCSB using the PDB ID.

    Args:
        pdb_id: PDB identifier; it is upper-cased before being used in the
            file name and the URL.
        download_dir: Directory in which "<PDB_ID>.pdb" is stored. It is
            created if it does not exist yet.
        timeout: Seconds passed to requests.get so that a stalled
            connection cannot block the notebook indefinitely.

    Returns:
        The path of the local file (an existing copy or the fresh download),
        or None when the download failed. Failures are printed, not raised.
    """
    pdb_id = pdb_id.upper()
    pdb_file = f"{pdb_id}.pdb"
    file_path = os.path.join(download_dir, pdb_file)

    # Check if file already exists to save time/bandwidth
    if os.path.exists(file_path):
        print(f"File {pdb_file} already exists. Using local copy.")
        return file_path

    url = f"https://files.rcsb.org/download/{pdb_id}.pdb"

    try:
        print(f"Downloading {pdb_id}...")
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raises HTTPError for 4xx/5xx responses

        # Make sure the target directory exists before writing; an empty
        # download_dir means "current directory" and needs no creation.
        if download_dir:
            os.makedirs(download_dir, exist_ok=True)

        with open(file_path, 'wb') as f:
            f.write(response.content)
        print(f"Download complete: {file_path}")
        return file_path
    except requests.exceptions.HTTPError as e:
        # Only a 404 means the ID is unknown; report other statuses as-is
        # so that server-side problems are not mistaken for a bad ID.
        status = getattr(getattr(e, "response", None), "status_code", None)
        if status == 404:
            print(f"Error: PDB ID '{pdb_id}' not found on RCSB.")
        else:
            print(f"Error: HTTP request for PDB ID '{pdb_id}' failed: {e}")
        return None
    except Exception as e:
        print(f"Error downloading PDB: {e}")
        return None

def extract_sequence(file_path):
    """
    Extracts the amino acid sequence from the PDB file.

    Peptides are built with PPBuilder for every chain of every model, and
    their sequences are concatenated in iteration order into one string.

    Note: get_ramachandran_coordinates uses the same PPBuilder segmentation
    but drops residues whose phi or psi is None, so the returned sequence is
    not index-aligned with the rows of the angle array and is usually longer.

    Args:
        file_path: Path to the PDB file to parse.

    Returns:
        str: The concatenated one-letter sequence ("" if no peptide is built).
    """
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure('seq_extraction', file_path)
    ppb = PDB.PPBuilder()

    # build_peptides filters out waters and heteroatoms automatically.
    # Join the fragments once instead of growing a string inside the loops.
    return "".join(
        str(pp.get_sequence())
        for model in structure
        for chain in model
        for pp in ppb.build_peptides(chain)
    )

# --- Main Pipeline ---

def process_pdb_pipeline(pdb_id):
    """
    Run the full per-protein workflow for one PDB ID.

    The structure file is fetched with download_pdb_file, then the phi/psi
    angles and the sequence are read from that same local file.

    Args:
        pdb_id: PDB identifier; stored in the result exactly as passed in.

    Returns:
        dict with keys "pdb_id", "sequence" and "ramachandran", or None when
        download_pdb_file did not yield a usable path.
    """
    local_path = download_pdb_file(pdb_id)
    if not local_path:
        # Nothing to analyse without a structure file.
        return None

    # Angles first, then sequence, both from the same downloaded file.
    angles = get_ramachandran_coordinates(local_path)
    residues = extract_sequence(local_path)

    return {
        "pdb_id": pdb_id,
        "sequence": residues,
        "ramachandran": angles,
    }


In [9]:
# -- Run the main pipeline over the first ten PDB codes --
# Every successful result is kept in all_results, keyed by PDB ID; the loop
# variable `data` on its own only ever holds the most recent iteration.
all_results = {}

for pdb_id in codes[:10]:
    print(f"Processing {pdb_id}...")
    data = process_pdb_pipeline(pdb_id)

    # process_pdb_pipeline returns None when the download failed.
    if data:
        all_results[data['pdb_id']] = data
        print(f"\n--- Results for {data['pdb_id']} ---")
        print(f"Sequence ({len(data['sequence'])} residues): {data['sequence']}")

Processing 101M...
Downloading 101M...
Download complete: ./101M.pdb

--- Results for 101M ---
Sequence (154 residues): MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRVKHLKTEAEMKASEDLKKHGVTVLTALGAILKKKGHHEAELKPLAQSHATKHKIPIKYLEFISEAIIHVLHSRHPGNFGADAQGAMNKALELFRKDIAAKYKELGYQG
Processing 102L...
Downloading 102L...
Download complete: ./102L.pdb

--- Results for 102L ---
Sequence (163 residues): MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYK
Processing 102M...
Downloading 102M...
Download complete: ./102M.pdb

--- Results for 102M ---
Sequence (154 residues): MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRFKHLKTEAEMKASEDLKKAGVTVLTALGAILKKKGHHEAELKPLAQSHATKHKIPIKYLEFISEAIIHVLHSRHPGNFGADAQGAMNKALELFRKDIAAKYKELGYQG
Processing 103L...
Downloading 103L...
Download complete: ./103L.pdb

--- Results for 103L ---
Sequence (159 residues): MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTSLDAAKSELDKA

In [10]:
# Maps PDB ID -> (sequence, Ramachandran angle array); populated in the next cell.
protein_dict = {}

In [11]:

# `data` is whatever the processing loop assigned last, so this stores a
# single entry. It is None when that last download failed, hence the guard.
if data:
    protein_dict[data['pdb_id']] = (data['sequence'], data['ramachandran'])

In [12]:
# Bare last expression: renders the stored entries as the cell output.
protein_dict

{'107L': ('MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKGELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYK',
  array([[-145.87450235,  175.53809219],
         [ -55.82793668,  -41.09638299],
         [ -61.94609212,  -49.93547553],
         [ -67.00905809,  -34.60037108],
         [ -63.78804171,  -53.68502038],
         [ -63.17517412,  -33.64460437],
         [ -61.99941578,  -30.16338333],
         [ -78.13103762,  -41.40244174],
         [ -70.75995328,  -35.24122609],
         [-100.37572338,  -25.62298745],
         [  76.81055052, -173.49859269],
         [-139.10031203,  126.98060169],
         [-137.78557477,  127.69274472],
         [ -89.43798077,   -1.06667158],
         [-125.04445621,  148.16328167],
         [ -48.40189062,  139.49093154],
         [-144.92083932,  171.9451595 ],
         [-103.10467497,  134.83686852],
         [ -76.53526049, -169.44520924],
         [ -70.33750302,  -21.61089684],
    