In [59]:
import numpy as np
from collections import defaultdict
from collections import OrderedDict
from Bio import SeqIO
from Bio.Seq import Seq
from Bio import SeqRecord
import random

FILE_PATH = "8anq.pdb"
random.seed(25032024)

def size_picker_v2(fasta_file = "Scer_NCBI_iORF.faa", min_size = 20, max_size = 70):

    sizes = []

    for record in SeqIO.parse(fasta_file, "fasta"):
        length = len(record.seq)
        if min_size <= length <= max_size:

            sizes.append(length)
    
    return random.choice(sizes)


def read_pdb(file_path):

    pdb_struct = {}

    pdb_struct["protein_name"] = file_path.split(".")[0]

    pdb_struct["full"] = defaultdict(defaultdict)
    pdb_struct["CA"] = defaultdict(defaultdict)
    pdb_struct["membrane_coord"] = []

    array = []

    protein_length = 0

    with open(file_path,"r") as f:

        line = f.readline()

        while line:

            line = line.split()

            if line[0] == "ATOM":

                # Line format : 
                # ATOM      2  CA  MET A   1      24.767  -2.102  13.513  1.00  0.00      A1A9 C  

                x = float(line[6])
                y = float(line[7])
                z = float(line[8])

                atom_name = line[2]
                atom_number = line[1]
                res_name = line[3]
                res_number = line[5]
                
                chain_id = line[4]

                if chain_id not in pdb_struct["full"]:
                    pdb_struct["full"][chain_id] = OrderedDict()


                else:
                    pdb_struct["full"][chain_id][res_number] = {

                        "coord" : [x,y,z],
                        "atom_name" : atom_name,
                        "res_name" : res_name,
                        "res_number" : res_number,
                        "atom_number" : atom_number
                    }

                
                    if line[2] == "CA":

                        pdb_struct["CA"][chain_id][res_number] = {

                            "coord" : [x,y,z],
                            "res_name" : res_name,
                            "res_number" : res_number,
                        }

                        protein_length += 1

            elif line[0] == "HETATM" and "DUM" in line:

                # Line format :
                # HETATM  643  O   DUM   643     -24.000  -6.000  14.200   
            
                x = float(line[5])
                y = float(line[6])    
                z = float(line[7])

                array.append([x,y,z])

            line = f.readline()

    pdb_struct["membrane_coord"] = np.array(array)

    pdb_struct["protein_length"] = protein_length

    return pdb_struct


def pdb_struct_to_fasta(pdb_struct):

    aa_dict = {
                    'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E',
                    'PHE': 'F', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
                    'LYS': 'K', 'LEU': 'L', 'MET': 'M', 'ASN': 'N',
                    'PRO': 'P', 'GLN': 'Q', 'ARG': 'R', 'SER': 'S',
                    'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y'
                }
    fasta = ""

    for res_number in pdb_struct["CA"]:

        res_name = pdb_struct["CA"][res_number]["res_name"]

        fasta += aa_dict[res_name]

    record = SeqRecord.SeqRecord(Seq(fasta), id="1uaz", description="1uaz")
    filename = f"{pdb_struct['protein_name']}.fasta"
    SeqIO.write(record, filename, "fasta")

    return 0


def return_binaries(pdb_struct : dict, lower_margin = 0, margin = 5):

    bottom_membrane = np.min(pdb_struct["membrane_coord"][:,2])
    top_membrane = np.max(pdb_struct["membrane_coord"][:,2])

    for chain_id in pdb_struct["CA"]:

        for res_number in pdb_struct["CA"][chain_id]:

            z = pdb_struct["CA"][chain_id][res_number]["coord"][2]

            if bottom_membrane  <= z <= top_membrane + lower_margin :

                pdb_struct["CA"][chain_id][res_number].update({"in_membrane" : "1"})

            else:

                pdb_struct["CA"][chain_id][res_number].update({"in_membrane" : "0"})

            if bottom_membrane - margin <= z <= top_membrane + margin:

                pdb_struct["CA"][chain_id][res_number].update({"in_margin" : "1"})

            else:

                pdb_struct["CA"][chain_id][res_number].update({"in_margin" : "0"})

    in_membrane = []
    in_margin = []
    for chain_id in pdb_struct["CA"]:
        for res_number in pdb_struct["CA"][chain_id]:

            in_membrane.append(pdb_struct["CA"][chain_id][res_number]["in_membrane"])
            in_margin.append(pdb_struct["CA"][chain_id][res_number]["in_margin"])

    return "".join(in_membrane), "".join(in_margin)

def extract_tm_segments_indices(binary_sequence):

    segment_indices = []

    start_index = None

    for i, bit in enumerate(binary_sequence):
        if bit == "1":
            if start_index is None:
                start_index = i+1
        else:
            if start_index is not None:
                length = i-start_index+1
                if length >= 15: # minimum length of a TM segment, although 20 is the length of a typical alpha helical TM segment
                    
                    # python indices are 0-based, so we add 1 to match 
                    # the 1-based residue numbering in the PDB file
                    segment_indices.append((start_index, i+1, length))

                # Wether the segment is long enough or not, we reset the start_index
                start_index = None

    if start_index is not None:
        segment_indices.append((start_index, len(binary_sequence)))

    return segment_indices

def pdb_struct_to_fasta(pdb_struct):

    aa_dict = {
                    'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E',
                    'PHE': 'F', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
                    'LYS': 'K', 'LEU': 'L', 'MET': 'M', 'ASN': 'N',
                    'PRO': 'P', 'GLN': 'Q', 'ARG': 'R', 'SER': 'S',
                    'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y'
                }
    fasta = ""

    for res_number in pdb_struct["CA"]:

        res_name = pdb_struct["CA"][res_number]["res_name"]

        fasta += aa_dict[res_name]

    record = SeqRecord.SeqRecord(Seq(fasta), id="1uaz", description="1uaz")
    filename = f"{pdb_struct['protein_name']}.fasta"
    SeqIO.write(record, filename, "fasta")

    return 0


def elongate_tm_segments(tm_indices: list, protein_length : int, min_length=20, max_length=70):
    """
    This function takes a list of tuples containing the start and end indices of putative transmembrane (tm) segments
    Extracted for the same multiple-fragments transmembrane protein.
    For example, GPCR proteins have 7 transmembrane segments, they will end up in a list of 7 tuples.

    For each tm segment, the function will elongate the segment to a random size drawn from a given size distribution,
    given by the size_picker_v2 function.

    The function will elongate the segment only if the size of the segment is smaller than the size drawn from the distribution.
    The goal here is to "draw" from the parts of the sequence that are not transmembrane segments, and elongate the tm segments.
    One main goal is to avoid drawing twice from the same region to elongate two tm segments that are adjacent to each other.

    Input:

    tm_indices : list of tuples 
                # [ (12,26,15), (45, 60, 16), (80, 100, 21) ...]
                # [ (start, end, length), ... ]

    min_length : int
                # minimum length of the elongated segment

    max_length : int
                # maximum length of the elongated segment
    """
    
    ##### Treat first TM Segment separately ##### 


    desired_length = size_picker_v2(min_size=min_length, max_size=max_length)

    
    # First TM Segment
    start_current = tm_indices[0][0]
    end_current = tm_indices[0][1]
    length_current = tm_indices[0][2]


    if desired_length > length_current:


        # Second TM Segment
        start_next = tm_indices[1][0]

        elongation_left_to_do = desired_length - length_current


        downstream = random.randint(0, elongation_left_to_do)

        
        lefts = None

        # The new end of this tm segment should not exceed the start of the next tm segment
        if downstream + end_current > start_next:

            new_end_coordinates = start_next - 1

            lefts = downstream - ( start_next - end_current )



        else:

            new_end_coordinates = end_current + downstream


        upstream = elongation_left_to_do - downstream

        

        if lefts:

            upstream += lefts



        if start_current - upstream < 1:

            new_start_coordinates = 1



        else:

            new_start_coordinates = start_current - upstream

        tm_indices[0] = (new_start_coordinates, new_end_coordinates, new_end_coordinates - new_start_coordinates)


    ##### Treat from the second TM Segment to the penultimate one ( n-1 ) #####

    for i in range(1, len(tm_indices) - 1):

        # Target size that the current tm should reach
        desired_length = size_picker_v2(min_size=min_length, max_size=max_length)

        # ith TM Segment
        start_current = tm_indices[i][0]
        end_current = tm_indices[i][1]
        length_current = tm_indices[i][2]

        # check before anything else to save computation time
        if desired_length <= length_current:

            # If there is no elongation to do, we skip to the next segment
            # and the coordinates of the ith segment are not modified
            continue
        
        # (i+1)th TM Segment
        start_next = tm_indices[i+1][0]


        # (i-1)th TM Segment
        end_previous = tm_indices[i-1][1]
        
        # Compute the number of residues that are required to elongate the current segment
        elongation_left_to_do = desired_length - length_current


        # Randomly choose the number of residues to elongate downstream ( toward the C-terminal )
        downstream = random.randint(0, elongation_left_to_do)

        lefts = None

        # The new end of this tm segment should not exceed the start of the next tm segment
        if downstream + end_current > start_next:

            # Hence take everyting that is between the end of the current tm segment and the start of the next one
            new_end_coordinates = start_next - 1

            # What is " left " from downstream that could not be taken cause of the next tm ? 
            lefts = downstream - (start_next - end_current)

        else:

            new_end_coordinates = end_current + downstream

        ## If there is elongation that was not taken from downstream, add it to the upstream
        upstream = elongation_left_to_do - downstream
        if lefts:

            upstream += lefts


        # The new start of this tm segment should not be lower than the end of the previous tm segment
        if start_current - upstream < end_previous:

            new_start_coordinates = end_previous + 1 

        else:

            new_start_coordinates = start_current - upstream


        tm_indices[i] = (new_start_coordinates, new_end_coordinates, new_end_coordinates - new_start_coordinates)

        


    ##### Treat the last TM Segment #####

    # Target size that the current tm should reach
    desired_length = size_picker_v2(min_size=min_length, max_size=max_length)


    # Last TM Segment
    start_current = tm_indices[-1][0]
    end_current = tm_indices[-1][1]
    length_current = tm_indices[-1][2]

    # check before anything else to save computation time
    if desired_length <= length_current:

        # If there is no elongation to do, we skip to the next segment
        # and the coordinates of the ith segment are not modified
        return 0

    # (i-1)th TM Segment
    end_previous = tm_indices[-2][1]

    # Compute the number of residues that are required to elongate the current segment
    elongation_left_to_do = desired_length - length_current



    # Randomly choose the number of residues to elongate downstream ( toward the C-terminal )
    downstream = random.randint(0, elongation_left_to_do)

    lefts = None

    # The new end of this final tm should not exceed the protein length
    if downstream + end_current > protein_length:

        # Hence take everyting that is between the end of the current tm segment and the start of the next tm segment
        new_end_coordinates = protein_length

        # What is " left " from downstream that could not be taken because the protein is too short after the last tm ? 
        lefts = downstream - (protein_length - end_current)


    else:

        new_end_coordinates = end_current + downstream


    upstream = elongation_left_to_do - downstream
    if lefts:

        upstream += lefts


    # The new start of this tm segment should not be lower than the end of the previous tm segment
    if start_current - upstream < end_previous:

        new_start_coordinates = end_previous + 1 

    else:

        new_start_coordinates = start_current - upstream     


    tm_indices[-1] =(new_start_coordinates, new_end_coordinates, new_end_coordinates - new_start_coordinates + 1)

    return 0

def extract_characters(binary_string, tuples_list):
    extracted_chunks = []
    for start, end, _ in tuples_list:
        extracted_chunks.append(binary_string[start:end-1])
    return extracted_chunks
 

# Extract the sequence 

In [61]:

pdb_struct = read_pdb(FILE_PATH)



membrane_binary, margin_binary = return_binaries(pdb_struct, lower_margin = 12)

tm_indices = extract_tm_segments_indices(membrane_binary)

print(len(tm_indices))

elongate_tm_segments(tm_indices, pdb_struct["protein_length"])

print(tm_indices)





6


IndexError: tuple index out of range

In [None]:
print(membrane_binary)
print(margin_binary)


tm_indices = extract_tm_segments_indices(membrane_binary)
test = tm_indices.copy()
print(tm_indices)
elongate_tm_segments(tm_indices, pdb_struct["protein_length"])
print(tm_indices)
print(extract_characters(membrane_binary, test))
print(extract_characters(membrane_binary, tm_indices))