In [5]:
!pip install biopython
import torch
import numpy as np
import json
from Bio.PDB import PDBParser

# 1. Function to extract pLDDT (Confidence)
def extract_plddt_from_pdb(pdb_file):
    """Collect one confidence (pLDDT) value per residue from a PDB file.

    The value is taken from the B-factor field of the first atom of every
    residue, which is where AlphaFold PDB files store pLDDT. Residues are
    visited model by model, then chain by chain, in iteration order; a
    residue that yields no atoms contributes nothing to the result.

    Args:
        pdb_file: Passed unchanged to ``PDBParser.get_structure``.

    Returns:
        NumPy array built from the collected B-factor values, one entry per
        non-empty residue.
    """
    structure = PDBParser(QUIET=True).get_structure('protein', pdb_file)

    # Flatten the model -> chain -> residue hierarchy into a single stream.
    all_residues = (
        residue
        for model in structure
        for chain in model
        for residue in chain
    )

    per_residue_scores = []
    for residue in all_residues:
        # Only the first atom is consulted: one value per residue is enough.
        first_atom = next(iter(residue), None)
        if first_atom is not None:
            per_residue_scores.append(first_atom.get_bfactor())

    return np.array(per_residue_scores)



In [6]:
# 2. Function to load PAE (Error Matrix)
def load_pae_from_json(json_file):
    """Load the predicted aligned error (PAE) matrix from a JSON file.

    Two top-level layouts are accepted:

    * a list whose first element is the record object (the layout the
      original code required), or
    * the record object itself.

    In both cases the matrix is read from the record's
    ``'predicted_aligned_error'`` field.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        NumPy array built from the ``'predicted_aligned_error'`` value
        (expected shape ``[N_res, N_res]``).

    Raises:
        KeyError: If the record has no ``'predicted_aligned_error'`` field.
        IndexError: If the top level is an empty list.
    """
    with open(json_file, 'r') as f:
        data = json.load(f)

    # Unwrap the one-element list if present; otherwise treat the top-level
    # object as the record so a bare-object file does not die with KeyError(0).
    record = data[0] if isinstance(data, list) else data

    if 'predicted_aligned_error' not in record:
        raise KeyError(
            f"'predicted_aligned_error' not found in {json_file}"
        )

    # Extract matrix. Shape: [N_res, N_res]
    pae_matrix = np.array(record['predicted_aligned_error'])
    return pae_matrix

# 3. REVISED Function to create Residue-Level Embeddings
def create_residue_embedding(plddt, pae, output_pt, embed_dim=384):
    """Build a per-residue embedding from pLDDT and PAE and save it to disk.

    Each residue gets two scalar features: its pLDDT value and the mean of
    its row in the PAE matrix. Each feature is repeated ``embed_dim // 2``
    times, so the first half of every embedding vector holds the pLDDT value
    and the second half holds the mean PAE. A leading batch dimension of 1 is
    added before saving.

    Args:
        plddt: Per-residue confidence values, shape ``[N_res]``.
        pae: Error matrix; it is averaged over axis 1, and the result must
            have the same shape as ``plddt``.
        output_pt: Destination passed to ``torch.save``.
        embed_dim: Width of the embedding vector. Must be a positive even
            number. Defaults to 384, the value previously hard-coded.

    Returns:
        None. The tensor of shape ``[1, N_res, embed_dim]`` is written to
        ``output_pt`` and its shape is printed.

    Raises:
        ValueError: If ``embed_dim`` is not a positive even number, or if the
            per-residue PAE means do not line up with ``plddt``.
    """
    n_features = 2  # pLDDT + mean PAE
    if embed_dim <= 0 or embed_dim % n_features != 0:
        raise ValueError(
            f"embed_dim must be a positive multiple of {n_features}, got {embed_dim}"
        )

    plddt = np.asarray(plddt)
    pae = np.asarray(pae)

    # A. Process PAE: average the error for each residue. Shape: [N_res]
    pae_mean = np.mean(pae, axis=1)

    # Fail early with a readable message instead of np.stack's generic one.
    if pae_mean.shape != plddt.shape:
        raise ValueError(
            f"pLDDT has shape {plddt.shape} but per-residue PAE has shape "
            f"{pae_mean.shape}; both must describe the same residues"
        )

    # B. Combine pLDDT and PAE side by side. Shape: [N_res, 2]
    features = np.stack([plddt, pae_mean], axis=1)

    # C. Convert to tensor and widen to embed_dim columns.
    x = torch.tensor(features, dtype=torch.float32)

    # Repeat each column in place: [p, e] -> [p, ..., p, e, ..., e].
    # This matches unsqueeze(-1).repeat(1, 1, k).reshape(N, -1).
    x = x.repeat_interleave(embed_dim // n_features, dim=1)  # [N_res, embed_dim]

    # D. Add batch dimension. Final shape: [1, N_res, embed_dim]
    x = x.unsqueeze(0)

    print(f"Generated embedding shape: {x.shape}")

    # Save ONLY the residue embedding (no global embedding is produced).
    torch.save(x, output_pt)
    print(f"Saved to {output_pt}")

In [7]:
# --- EXECUTION BLOCK ---
# For each protein: read per-residue confidence from the PDB file, read the
# error matrix from the matching JSON file, then write the embedding tensor.
# The intermediate arrays stay bound to module-level names for later cells.

print("Processing EGFR...")
egfr_plddt, egfr_pae = (
    extract_plddt_from_pdb('AF-P00533-F1-model_v6.pdb'),
    load_pae_from_json('AF-P00533-F1-predicted_aligned_error_v6.json'),
)
create_residue_embedding(
    plddt=egfr_plddt,
    pae=egfr_pae,
    output_pt='EGFR_embedding_residue.pt',
)

print("\nProcessing MET...")
met_plddt, met_pae = (
    extract_plddt_from_pdb('AF-P08581-F1-model_v6.pdb'),
    load_pae_from_json('AF-P08581-F1-predicted_aligned_error_v6.json'),
)
create_residue_embedding(
    plddt=met_plddt,
    pae=met_pae,
    output_pt='MET_embedding_residue.pt',
)

Processing EGFR...
Generated embedding shape: torch.Size([1, 1210, 384])
Saved to EGFR_embedding_residue.pt

Processing MET...
Generated embedding shape: torch.Size([1, 1390, 384])
Saved to MET_embedding_residue.pt
