In [None]:
# Ch16-2 Protein Design with LLMs

In [None]:
# Install packages 
! pip install torch transformers numpy

In [1]:
"""
ProtGPT2 Protein Design Example
This script demonstrates how to use ProtGPT2 to generate novel protein sequences.
"""

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import re
import numpy as np
from typing import List, Optional
import warnings
warnings.filterwarnings('ignore')

class ProtGPT2Designer:
    """A wrapper class for protein design using ProtGPT2.

    Bundles model/tokenizer loading, sequence generation, a basic amino-acid
    validity check, and a simple generate-and-filter design loop.
    """

    # One-letter codes of the 20 standard amino acids, kept as an ordered
    # string so composition dictionaries are always built in the same order.
    STANDARD_AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"

    def __init__(self, model_name: str = "nferruz/ProtGPT2"):
        """
        Initialize the ProtGPT2 model and tokenizer.

        Loads both from ``model_name``, moves the model to CUDA when
        available (CPU otherwise) and switches it to eval mode.

        Args:
            model_name: HuggingFace model identifier for ProtGPT2
        """
        print("Loading ProtGPT2 model and tokenizer...")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        # Load model and tokenizer
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()

        # The tokenizer may ship without a pad token; reuse EOS so that
        # generation has a padding id available.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        print("Model loaded successfully!")

    def generate_protein(self,
                         prompt: str = "<|endoftext|>",
                         max_length: int = 200,
                         temperature: float = 1.0,
                         num_sequences: int = 1,
                         do_sample: bool = True,
                         top_p: float = 0.9) -> List[str]:
        """
        Generate protein sequences using ProtGPT2.

        Args:
            prompt: Starting sequence or prompt (use "<|endoftext|>" for de novo generation)
            max_length: Forwarded to ``generate`` as ``max_length``. This is a
                budget counted in *tokens* (prompt tokens included), not in
                amino acids; the decoded sequence can contain more residues
                than this number.
            temperature: Sampling temperature (higher = more random)
            num_sequences: Number of sequences to generate
            do_sample: Whether to use sampling (vs greedy decoding)
            top_p: Top-p sampling parameter

        Returns:
            List of generated continuations, one per requested sequence, with
            the prompt removed and surrounding whitespace stripped. Line
            breaks emitted by the model inside a sequence are preserved;
            ``validate_sequence`` removes them.
        """
        print(f"Generating {num_sequences} protein sequence(s)...")

        # Tokenize via __call__ so we also get the attention mask. Passing it
        # explicitly avoids relying on mask inference, which is ambiguous here
        # because the pad token id is the same as the EOS token id.
        encoded = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        input_ids = encoded["input_ids"]
        prompt_token_count = input_ids.shape[-1]

        # Generate sequences
        with torch.no_grad():
            outputs = self.model.generate(
                input_ids,
                attention_mask=encoded["attention_mask"],
                max_length=max_length,
                temperature=temperature,
                num_return_sequences=num_sequences,
                do_sample=do_sample,
                top_p=top_p,
                pad_token_id=self.tokenizer.eos_token_id,
                eos_token_id=self.tokenizer.eos_token_id
            )

        # Decode only the newly generated tokens. Dropping the prompt at the
        # token level is reliable even when decoding does not reproduce the
        # prompt text verbatim, whereas a string-prefix check would then leave
        # the prompt in place and callers would end up with it duplicated.
        sequences = []
        for output in outputs:
            continuation = output[prompt_token_count:]
            sequence = self.tokenizer.decode(continuation, skip_special_tokens=True)
            sequences.append(sequence.strip())

        return sequences

    def validate_sequence(self, sequence: str) -> dict:
        """
        Basic validation of a protein sequence.

        Whitespace is removed and the sequence is upper-cased before any
        statistic is computed.

        Args:
            sequence: Protein sequence to validate

        Returns:
            Dictionary with validation results:
                'sequence': cleaned sequence
                'length': number of characters in the cleaned sequence
                'valid_amino_acids': count of standard amino-acid letters
                'invalid_amino_acids': count of all other characters
                'validity_ratio': valid / length (0.0 for an empty sequence)
                'is_valid': True when at least 95% of characters are standard
                'amino_acid_composition': frequency of each of the 20
                    standard amino acids (all 0.0 for an empty sequence)
        """
        # Clean sequence (remove whitespace and convert to uppercase)
        clean_seq = re.sub(r'\s+', '', sequence.upper())
        length = len(clean_seq)

        # Per-residue counts; their sum is the number of standard residues.
        aa_counts = {aa: clean_seq.count(aa) for aa in self.STANDARD_AMINO_ACIDS}
        valid_aa = sum(aa_counts.values())
        invalid_aa = length - valid_aa
        validity_ratio = valid_aa / length if length > 0 else 0.0

        # Always expose all 20 keys so the result has the same shape whether
        # or not the sequence is empty.
        aa_frequencies = {
            aa: (count / length if length > 0 else 0.0)
            for aa, count in aa_counts.items()
        }

        return {
            'sequence': clean_seq,
            'length': length,
            'valid_amino_acids': valid_aa,
            'invalid_amino_acids': invalid_aa,
            'validity_ratio': validity_ratio,
            # ">=" so that exactly 95% counts as valid ("at least 95%"),
            # matching the ">= min_validity" check used by the design loop.
            'is_valid': validity_ratio >= 0.95,
            'amino_acid_composition': aa_frequencies
        }

    def design_proteins_with_constraints(self,
                                         target_length: int = 150,
                                         num_attempts: int = 10,
                                         min_validity: float = 0.95,
                                         max_tokens: Optional[int] = None,
                                         temperature: float = 0.8,
                                         length_tolerance: float = 0.2) -> List[dict]:
        """
        Design proteins with specific constraints.

        Generates one sequence per attempt and keeps those whose validity
        ratio and length fall within the requested bounds.

        Args:
            target_length: Desired protein length, in amino acids
            num_attempts: Number of generation attempts
            min_validity: Minimum validity ratio required
            max_tokens: Token budget passed to ``generate_protein`` as
                ``max_length``. Defaults to ``target_length + 50``. Note that
                this budget is counted in tokens while the acceptance window
                is counted in amino acids, so generated sequences can overshoot
                the window; lowering this value may raise the hit rate.
            temperature: Sampling temperature used for every attempt
            length_tolerance: Accepted relative deviation from
                ``target_length`` (0.2 means within 20%)

        Returns:
            List of validation dictionaries (see ``validate_sequence``) for
            the accepted designs, each with an extra 'attempt' key holding the
            1-based attempt number that produced it.
        """
        print(f"Designing proteins with target length ~{target_length} amino acids...")

        token_budget = target_length + 50 if max_tokens is None else max_tokens
        allowed_deviation = target_length * length_tolerance

        valid_designs = []

        for attempt in range(num_attempts):
            # Generate sequence
            sequences = self.generate_protein(
                max_length=token_budget,
                temperature=temperature,
                num_sequences=1
            )

            for seq in sequences:
                validation = self.validate_sequence(seq)

                # Check constraints: validity first, then length window.
                if (validation['validity_ratio'] >= min_validity and
                        abs(validation['length'] - target_length) <= allowed_deviation):

                    validation['attempt'] = attempt + 1
                    valid_designs.append(validation)
                    print(f"Valid design found (attempt {attempt + 1}): "
                          f"Length={validation['length']}, "
                          f"Validity={validation['validity_ratio']:.3f}")

        return valid_designs

def main():
    """Run the three ProtGPT2 demos: de novo, motif-seeded, and constrained design."""

    # Initialize the designer
    designer = ProtGPT2Designer()

    def show_banner(title):
        # Section header framed by two 60-character rules.
        print("\n" + "="*60)
        print(title)
        print("="*60)

    def show_length_and_validity(report):
        # The two statistics every example prints for a validated sequence.
        print(f"Length: {report['length']} amino acids")
        print(f"Validity: {report['validity_ratio']:.3f}")

    # ---- Example 1: unconditioned generation -------------------------------
    show_banner("EXAMPLE 1: De novo protein generation")

    de_novo = designer.generate_protein(
        prompt="<|endoftext|>",
        max_length=100,
        temperature=0.8,
        num_sequences=3
    )

    number = 0
    for seq in de_novo:
        number += 1
        print(f"\nGenerated Protein {number}:")
        print(f"Sequence: {seq}")

        # Validate the sequence
        report = designer.validate_sequence(seq)
        show_length_and_validity(report)
        print(f"Valid: {report['is_valid']}")

    # ---- Example 2: generation seeded with a motif -------------------------
    show_banner("EXAMPLE 2: Protein generation with a starting motif")

    seeded = designer.generate_protein(
        prompt="MKKLLF",  # Starting with a signal peptide-like motif
        max_length=150,
        temperature=0.7,
        num_sequences=2
    )

    number = 0
    for seq in seeded:
        number += 1
        print(f"\nMotif-based Protein {number}:")
        # generate_protein returns the continuation only, so the motif is
        # put back in front for display and validation.
        print(f"Sequence: MKKLLF{seq}")
        show_length_and_validity(designer.validate_sequence(f"MKKLLF{seq}"))

    # ---- Example 3: generate-and-filter design -----------------------------
    show_banner("EXAMPLE 3: Constrained protein design")

    accepted = designer.design_proteins_with_constraints(
        target_length=120,
        num_attempts=15,
        min_validity=0.98
    )

    print(f"\nFound {len(accepted)} valid designs:")
    preview = accepted[:3]  # Show first 3
    for position in range(len(preview)):
        design = preview[position]
        print(f"\nDesign {position + 1}:")
        print(f"Sequence: {design['sequence']}")
        show_length_and_validity(design)

        # Summarise composition for two residue classes of interest.
        composition = design['amino_acid_composition']
        hydrophobic = sum(composition.get(aa, 0) for aa in 'AILMFPWV')
        charged = sum(composition.get(aa, 0) for aa in 'DEKR')
        print(f"Hydrophobic content: {hydrophobic:.3f}")
        print(f"Charged content: {charged:.3f}")

if __name__ == "__main__":
    # Prerequisite (run once beforehand):
    # pip install torch transformers numpy

    # Intro banner, emitted as one call; the trailing empty string yields the
    # blank separator line.
    print(
        "ProtGPT2 Protein Design Example",
        "===============================",
        "This example demonstrates how to use ProtGPT2 for protein design.",
        "Make sure you have installed: torch, transformers, numpy",
        "",
        sep="\n",
    )

    # Best-effort run: report problems as friendly messages instead of a
    # traceback. ImportError is handled first because it is also an Exception.
    try:
        main()
    except ImportError as exc:
        print(f"Missing required package: {exc}")
        print("Please install required packages:")
        print("pip install torch transformers numpy")
    except Exception as exc:
        print(f"An error occurred: {exc}")
        print("Make sure you have an internet connection to download the model.")

ProtGPT2 Protein Design Example
This example demonstrates how to use ProtGPT2 for protein design.
Make sure you have installed: torch, transformers, numpy

Loading ProtGPT2 model and tokenizer...
Using device: cpu


Using pad_token, but it is not set yet.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Model loaded successfully!

EXAMPLE 1: De novo protein generation
Generating 3 protein sequence(s)...

Generated Protein 1:
Sequence: MNRIACIDIGLKTIGFAVSDKTNTLAFPVKVLKRKNIKKELIKLKKIIEEEKPEIVIVGL
PLNMDGTLGPMAKKTQKFAYLLKEKIKLPIYTIDERLSSFEADKILIESGASKKKRKKIV
DKIAAVYILQGYLDAI
Length: 136 amino acids
Validity: 1.000
Valid: True

Generated Protein 2:
Sequence: MDTPDWQQAWQTHFQNQPAIPHQPHHQGQNQPFPHQPHHQGQNQPFPHQPHHQGQNQPFP
HQPHHQGQNQPFPHQPHHQGQNQPFPHQPHHQGQNQPFPHQPHHQGQNQPFPHQPHHQGQ
NQPFPHQPHHQGQNQPFPHQPHHQGQNQPFPHQPHHQGQNQPFPHQPHHQGQNQPFPHQP
HHQGQNQPFPHQPHHQGQNQPFPHQP
Length: 206 amino acids
Validity: 1.000
Valid: True

Generated Protein 3:
Sequence: MIYDYFIFCSRNYSNDRTNYIIFHKNEFIRYNTSEYINNIIYYHNIFRYNYYNKYIFRYN
KYIFRYNNYYKYIFRYNNYILRYNNYILRYNNYILRYNNYILRYNNYILRYNNYILRYNN
YILRYNNYILRYNNYILRYNNYILRYNNYILRYNNYILRYNNYILRYNNYILRYNNYILR
YNNYILRYNNYILRYNNYILRYNNYILRYNNYIL
Length: 214 amino acids
Validity: 1.000
Valid: True

EXAMPLE 2: Protein generation with a starting motif
Generating 2 protein seque

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



Motif-based Protein 1:
Sequence: MKKLLFLLLILGLSISCSDDDNNGGGNSNPPPDYNALIIGKWESVYQYTGTNENGNVVTN
EYTCDDDNTFVFNDDGTVEIYNDGTEDCNETTSTGTWSLSADGLTLTTEEGTETYTLTTL
NSTTLTITESATVDGVTETETEEYTF
Length: 146 amino acids
Validity: 1.000

Motif-based Protein 2:
Sequence: MKKLLFASLFLFAGLLFAQENTLKVEKDFFELKSSFNYESYDENTKRNTAKLNYYNFNLN
GYYYPNEKWSVGLGMGYNKDKINTNIDGVTYKSSGDGFAITPFIKYYFNQENKLAPFVSF
GYNYQTFKYKNEDLKTSTSAFGVGVKVGVNYFINDQLGLDVNVGYLGYKKLESENDNKNT
NNISFGFNIGYRF
Length: 193 amino acids
Validity: 1.000

EXAMPLE 3: Constrained protein design
Designing proteins with target length ~120 amino acids...
Generating 1 protein sequence(s)...


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generating 1 protein sequence(s)...


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generating 1 protein sequence(s)...


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generating 1 protein sequence(s)...


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generating 1 protein sequence(s)...


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generating 1 protein sequence(s)...


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generating 1 protein sequence(s)...


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generating 1 protein sequence(s)...


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generating 1 protein sequence(s)...


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generating 1 protein sequence(s)...


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Valid design found (attempt 10): Length=125, Validity=1.000
Generating 1 protein sequence(s)...


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generating 1 protein sequence(s)...


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generating 1 protein sequence(s)...


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generating 1 protein sequence(s)...


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generating 1 protein sequence(s)...

Found 1 valid designs:

Design 1:
Sequence: MTALLALLALLVLVPLAVRRLPRGPVGTLLAGALALAVAAAVAGGLLAAPDGRAGARLLGLALLLAGLAALVARRARPAPAAPRGVAALLAAAGALLLLGLLAPGLLPAPPGLAPPPGAAAGAGP
Length: 125 amino acids
Validity: 1.000
Hydrophobic content: 0.768
Charged content: 0.080
