In [22]:
from Bio import PDB
import math
import os
import requests
import numpy as np

In [23]:
from Bio import PDB
import math

def get_ramachandran_coordinates(file_path):
    """
    Extract backbone phi/psi dihedral angles from a PDB file.

    Every model and every chain of the parsed structure is scanned. Each
    chain is split into polypeptides with ``PDB.PPBuilder`` and only the
    residues for which BOTH phi and psi are defined are kept; entries where
    either angle is ``None`` are skipped.

    Args:
        file_path: Path of the PDB file to parse.

    Returns:
        numpy.ndarray of shape (n, 2) with angles in degrees. Column 0 is
        phi (the x coordinate of a Ramachandran plot), column 1 is psi (y).
        If no residue has both angles, an empty array of shape (0, 2) is
        returned.
    """
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure('input_structure', file_path)
    # Build the peptide builder once and reuse it for every chain,
    # the same way extract_sequence does.
    ppb = PDB.PPBuilder()

    angle_pairs = [
        # get_phi_psi_list() yields radians; convert to degrees here.
        (math.degrees(phi), math.degrees(psi))
        for model in structure
        for chain in model
        for poly in ppb.build_peptides(chain)
        for phi, psi in poly.get_phi_psi_list()
        if phi is not None and psi is not None
    ]

    # reshape(-1, 2) keeps the two-column layout even when the list is empty.
    return np.array(angle_pairs, dtype=float).reshape(-1, 2)

In [25]:
def download_pdb_file(pdb_id, download_dir=".", timeout=30):
    """
    Download a .pdb file from RCSB for the given PDB ID, reusing a local copy.

    The ID is upper-cased and the file is stored as ``<ID>.pdb`` inside
    ``download_dir``. If that file already exists, no request is made.

    Args:
        pdb_id: PDB identifier (case-insensitive), e.g. "1crn".
        download_dir: Directory in which the file is stored. It is created
            if it does not exist yet.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the notebook indefinitely.

    Returns:
        The path of the local file, or ``None`` if the download failed.
        Failures are reported with ``print`` rather than raised.
    """
    pdb_id = pdb_id.upper()
    pdb_file = f"{pdb_id}.pdb"
    file_path = os.path.join(download_dir, pdb_file)

    # Check if file already exists to save time/bandwidth
    if os.path.exists(file_path):
        print(f"File {pdb_file} already exists. Using local copy.")
        return file_path

    url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    # Write to a temporary name first: a half-written "<ID>.pdb" would
    # otherwise be picked up as a valid cached copy on the next call.
    partial_path = file_path + ".part"

    try:
        print(f"Downloading {pdb_id}...")
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        if download_dir:
            os.makedirs(download_dir, exist_ok=True)
        with open(partial_path, 'wb') as f:
            f.write(response.content)
        os.replace(partial_path, file_path)

        print(f"Download complete: {file_path}")
        return file_path
    except requests.exceptions.HTTPError as e:
        # Only a 404 means "unknown ID"; other statuses (e.g. 5xx) are
        # server-side problems and should not be reported as "not found".
        status = getattr(e.response, "status_code", None)
        if status == 404:
            print(f"Error: PDB ID '{pdb_id}' not found on RCSB.")
        else:
            print(f"Error downloading PDB: HTTP {status} for {url}")
        return None
    except Exception as e:
        print(f"Error downloading PDB: {e}")
        return None
    finally:
        # Best-effort cleanup of a leftover temporary file after a failure.
        try:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        except OSError:
            pass

def extract_sequence(file_path):
    """
    Return the amino acid sequence found in a PDB file as a single string.

    The structure is walked model by model and chain by chain; each chain is
    split into polypeptides with PPBuilder (the same segmentation that
    get_ramachandran_coordinates uses) and the polypeptide sequences are
    concatenated in that order, with no separator between chains or models.

    Note: get_ramachandran_coordinates drops residues that lack phi or psi,
    so the length of this string can exceed the number of angle rows.
    """
    structure = PDB.PDBParser(QUIET=True).get_structure('seq_extraction', file_path)
    builder = PDB.PPBuilder()

    # One string per polypeptide, collected in traversal order.
    fragments = [
        str(peptide.get_sequence())
        for model in structure
        for chain in model
        for peptide in builder.build_peptides(chain)
    ]
    return "".join(fragments)

# --- Main Pipeline ---

def process_pdb_pipeline(pdb_id):
    """
    Run the full pipeline for one PDB ID: fetch, compute angles, read sequence.

    Returns a dict with the keys "pdb_id" (the ID exactly as passed in),
    "sequence" (result of extract_sequence) and "ramachandran" (result of
    get_ramachandran_coordinates), or None when the file could not be
    obtained.
    """
    local_path = download_pdb_file(pdb_id)
    if not local_path:
        # Download failed; download_pdb_file has already printed the reason.
        return None

    # Angles first, then the sequence; both are read from the same local file.
    phi_psi = get_ramachandran_coordinates(local_path)
    residues = extract_sequence(local_path)

    return {
        "pdb_id": pdb_id,
        "sequence": residues,
        "ramachandran": phi_psi,
    }

if __name__ == "__main__":
    # Demo run: process a single entry and, if the pipeline returned a
    # result, print its ID and sequence.
    target_pdb = "1CRN"
    data = process_pdb_pipeline(target_pdb)

    if data:
        print(
            f"\n--- Results for {data['pdb_id']} ---",
            f"Sequence ({len(data['sequence'])} residues): {data['sequence']}",
            sep="\n",
        )

File 1CRN.pdb already exists. Using local copy.

--- Results for 1CRN ---
Sequence (46 residues): TTCCPSIVARSNFNVCRLPGTPEAICATYTGCIIIPGATCPGDYAN


In [26]:
# Accumulator for processed entries: pdb_id -> (sequence, ramachandran array).
protein_dict = dict()

In [27]:
target_pdb = "3CRN"
data = process_pdb_pipeline(target_pdb)
# process_pdb_pipeline returns None when the download fails; stop with a
# clear message instead of an opaque "'NoneType' is not subscriptable".
if data is None:
    raise RuntimeError(f"Could not process PDB entry {target_pdb!r}")
protein_dict[data['pdb_id']] = (data['sequence'], data['ramachandran'])

File 3CRN.pdb already exists. Using local copy.


In [28]:
# Display the accumulated entries. Note: this renders each full
# (sequence, angle array) tuple, so the output grows with every entry added.
protein_dict

{'3CRN': ('LKRILIVDDDTAILDSTKQILEFEGYEVEIAATAGEGLAKIENEFFNLALFIKLPDEGTELLEKAHKLRPGKKIVTGYASLENSVFSLNAGADAYIKPVNPRDLLEKIKEKLDEQEKEGHHHHHSLKRILIVDDDTAILDSTKQILEFEGYEVEIAATAGEGLAKIENEFFNLALFIKLPDEGTELLEKAHKLRPGKKIVTGYASLENSVFSLNAGADAYIKPVNPRDLLEKIKEKLDEQEKEG',
  array([[ -73.79127199,  143.02404401],
         [-122.62705447,  136.60436491],
         [-129.35523554,  138.92850671],
         [-107.20537999,  132.94934073],
         [-114.25108985,  126.25579493],
         [-121.46467601,  112.85075215],
         [-166.89582099,  134.30174572],
         [ -77.03159681,   -8.44518855],
         [ -81.96815195,   97.52096858],
         [ -66.54872116,  -25.56403233],
         [ -68.42239034,  -38.54725628],
         [ -62.74915175,  -45.12577913],
         [ -55.16237417,  -53.32335782],
         [ -64.23255603,  -44.95776304],
         [ -64.35585877,  -42.66941786],
         [ -68.94278549,  -40.3698185 ],
         [ -55.94510642,  -49.17718106],
         [ -53.10582981,  -50.05035275],
    