In [2]:
import numpy as np
from collections import defaultdict
from collections import OrderedDict
from Bio import SeqIO
from Bio.Seq import Seq
from Bio import SeqRecord
import random

FILE_PATH = "1uaz.pdb"

def size_picker_v2(fasta_file = "Scer_NCBI_iORF.faa", min_size = 20, max_size = 70):

    sizes = []

    for record in SeqIO.parse(fasta_file, "fasta"):
        length = len(record.seq)
        if min_size <= length <= max_size:

            sizes.append(length)
    
    return random.choice(sizes)


def read_pdb(file_path):

    pdb_struct = {}

    pdb_struct["protein_name"] = file_path.split(".")[0]

    pdb_struct["full"] = defaultdict(OrderedDict)
    pdb_struct["CA"] = defaultdict(OrderedDict)
    pdb_struct["membrane_coord"] = []

    array = []

    protein_length = 0

    with open(file_path,"r") as f:

        line = f.readline()

        while line:

            line = line.split()

            if line[0] == "ATOM":

                # Line format : 
                # ATOM      2  CA  MET A   1      24.767  -2.102  13.513  1.00  0.00      A1A9 C  

                x = float(line[6])
                y = float(line[7])
                z = float(line[8])

                atom_name = line[2]
                atom_number = line[1]
                res_name = line[3]
                res_number = line[5]
                
                chain_id = line[4]

                pdb_struct["full"][res_number] = {

                    "coord" : [x,y,z],
                    "atom_name" : atom_name,
                    "chain_id" : chain_id,
                    "res_name" : res_name,
                    "res_number" : res_number,
                    "atom_number" : atom_number
                }
                
                if line[2] == "CA":

                    pdb_struct["CA"][res_number] = {

                        "coord" : [x,y,z],
                        "res_name" : res_name,
                        "res_number" : res_number,
                    }

                    protein_length += 1

            elif line[0] == "HETATM" and "DUM" in line:

                # Line format :
                # HETATM  643  O   DUM   643     -24.000  -6.000  14.200   
            
                x = float(line[5])
                y = float(line[6])    
                z = float(line[7])

                array.append([x,y,z])

            line = f.readline()

    pdb_struct["membrane_coord"] = np.array(array)

    pdb_struct["protein_length"] = protein_length

    return pdb_struct


def pdb_struct_to_fasta(pdb_struct):

    aa_dict = {
                    'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E',
                    'PHE': 'F', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
                    'LYS': 'K', 'LEU': 'L', 'MET': 'M', 'ASN': 'N',
                    'PRO': 'P', 'GLN': 'Q', 'ARG': 'R', 'SER': 'S',
                    'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y'
                }
    fasta = ""

    for res_number in pdb_struct["CA"]:

        res_name = pdb_struct["CA"][res_number]["res_name"]

        fasta += aa_dict[res_name]

    record = SeqRecord.SeqRecord(Seq(fasta), id="1uaz", description="1uaz")
    filename = f"{pdb_struct['protein_name']}.fasta"
    SeqIO.write(record, filename, "fasta")

    return 0


def return_binaries(pdb_struct : dict, margin = 5):

    bottom_membrane = np.min(pdb_struct["membrane_coord"][:,2])
    top_membrane = np.max(pdb_struct["membrane_coord"][:,2])


    for res_number in pdb_struct["CA"]:

        z = pdb_struct["CA"][res_number]["coord"][2]

        if bottom_membrane <= z <= top_membrane:

            pdb_struct["CA"][res_number].update({"in_membrane" : "1"})

        else:

            pdb_struct["CA"][res_number].update({"in_membrane" : "0"})

        if bottom_membrane - margin <= z <= top_membrane + margin:

            pdb_struct["CA"][res_number].update({"in_margin" : "1"})

        else:

            pdb_struct["CA"][res_number].update({"in_margin" : "0"})

    in_membrane = []
    in_margin = []
    for res_number in pdb_struct["CA"]:

        in_membrane.append(pdb_struct["CA"][res_number]["in_membrane"])
        in_margin.append(pdb_struct["CA"][res_number]["in_margin"])

    return "".join(in_membrane), "".join(in_margin)

def extract_tm_segments_indices(binary_sequence):

    segment_indices = []

    start_index = None

    for i, bit in enumerate(binary_sequence):
        if bit == "1":
            if start_index is None:
                start_index = i+1
        else:
            if start_index is not None:
                length = i-start_index+1
                if length >= 15: # minimum length of a TM segment, although 20 is the length of a typical alpha helical TM segment
                    
                    # python indices are 0-based, so we add 1 to match 
                    # the 1-based residue numbering in the PDB file
                    segment_indices.append((start_index + 1, i+1, length))
                start_index = None

    if start_index is not None:
        segment_indices.append((start_index, len(binary_sequence)))

    return segment_indices

def pdb_struct_to_fasta(pdb_struct):

    aa_dict = {
                    'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E',
                    'PHE': 'F', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
                    'LYS': 'K', 'LEU': 'L', 'MET': 'M', 'ASN': 'N',
                    'PRO': 'P', 'GLN': 'Q', 'ARG': 'R', 'SER': 'S',
                    'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y'
                }
    fasta = ""

    for res_number in pdb_struct["CA"]:

        res_name = pdb_struct["CA"][res_number]["res_name"]

        fasta += aa_dict[res_name]

    record = SeqRecord.SeqRecord(Seq(fasta), id="1uaz", description="1uaz")
    filename = f"{pdb_struct['protein_name']}.fasta"
    SeqIO.write(record, filename, "fasta")

    return 0


 
pdb_struct = read_pdb(FILE_PATH)
membrane_binary, margin_binary = return_binaries(pdb_struct)

print(membrane_binary)
print(margin_binary)
print(pdb_struct["protein_length"])
tm_indices = extract_tm_segments_indices(membrane_binary)

print(tm_indices)

# Extract the sequence 

00000000000000111111111111111111111111000000000111111111111111111111011000000000000011111111111111111111000000111111111111111111111110000011111111111111111111100000000000000000001111111111111111111101100010011111111111111111111111000000
00111111100011111111111111111111111111110000111111111111111111111111111110100000001111111111111111111111111111111111111111111111111111111111111111111111111111111110000000000001111111111111111111111111111111111111111111111111111111110010
236
[(16, 39, 24), (49, 69, 21), (86, 105, 20), (112, 134, 23), (140, 160, 21), (180, 199, 20), (209, 231, 23)]


In [12]:
import random


def elongate_tm_segments(tm_indices: list, protein_length : int, min_length=20, max_length=70):
    """
    This function takes a list of tuples containing the start and end indices of putative transmembrane (tm) segments
    Extracted for the same multiple-fragments transmembrane protein.
    For example, GPCR proteins have 7 transmembrane segments, they will end up in a list of 7 tuples.

    For each tm segment, the function will elongate the segment to a random size drawn from a given size distribution,
    given by the size_picker_v2 function.

    The function will elongate the segment only if the size of the segment is smaller than the size drawn from the distribution.
    The goal here is to "draw" from the parts of the sequence that are not transmembrane segments, and elongate the tm segments.
    One main goal is to avoid drawing twice from the same region to elongate two tm segments that are adjacent to each other.

    Input:

    tm_indices : list of tuples 
                # [ (12,26,15), (45, 60, 16), (80, 100, 21) ...]
                # [ (start, end, length), ... ]

    min_length : int
                # minimum length of the elongated segment

    max_length : int
                # maximum length of the elongated segment
    """
    
    ##### Treat first TM Segment separately ##### 


    print("Old coordinates : ")
    print(tm_indices[0])
    desired_length = size_picker_v2(min_size=min_length, max_size=max_length)
    print(f"Length drawn : {desired_length}")

    
    # First TM Segment
    start_current = tm_indices[0][0]
    end_current = tm_indices[0][1]
    length_current = tm_indices[0][2]


    if desired_length > length_current:


        # Second TM Segment
        start_next = tm_indices[1][0]

        elongation_left_to_do = desired_length - length_current
        print(f"Elongation left to do : {elongation_left_to_do}")

        downstream = random.randint(0, elongation_left_to_do)
        print(f"Downstream : {downstream}")
        
        lefts = None

        # The new end of this tm segment should not exceed the start of the next tm segment
        if downstream + end_current > start_next:

            new_end_coordinates = start_next - 1

            lefts = downstream - ( start_next - end_current )

            print(f"Downstream too long, lefts : {lefts}")

        else:

            new_end_coordinates = end_current + downstream


        upstream = elongation_left_to_do - downstream

        

        if lefts:

            upstream += lefts

        print(f"Upstream : {upstream}")

        if start_current - upstream < 1:

            new_start_coordinates = 1

            print(f"Start too low, new start : {new_start_coordinates}")

        else:

            new_start_coordinates = start_current - upstream

        tm_indices[0] = (new_start_coordinates, new_end_coordinates, new_end_coordinates - new_start_coordinates)

        print("New coordinates : ")

        print(tm_indices[0])

    else:

        print("No elongation to do")

    ##### Treat from the second TM Segment to the penultimate one ( n-1 ) #####

    for i in range(1, len(tm_indices) - 1):

        print("Old coordinates : ")
        print(tm_indices[i])


        # Target size that the current tm should reach
        desired_length = size_picker_v2(min_size=min_length, max_size=max_length)
        print(f"Length drawn : {desired_length}")

        # ith TM Segment
        start_current = tm_indices[i][0]
        end_current = tm_indices[i][1]
        length_current = tm_indices[i][2]

        # check before anything else to save computation time
        if desired_length <= length_current:

            # If there is no elongation to do, we skip to the next segment
            # and the coordinates of the ith segment are not modified

            print("No elongation to do")
            continue
        
        # (i+1)th TM Segment
        start_next = tm_indices[i+1][0]


        # (i-1)th TM Segment
        end_previous = tm_indices[i-1][1]
        
        # Compute the number of residues that are required to elongate the current segment
        elongation_left_to_do = desired_length - length_current

        print(f"Elongation left to do : {elongation_left_to_do}")

        # Randomly choose the number of residues to elongate downstream ( toward the C-terminal )
        downstream = random.randint(0, elongation_left_to_do)

        print(f"Downstream : {downstream}")

        lefts = None

        # The new end of this tm segment should not exceed the start of the next tm segment
        if downstream + end_current > start_next:

            # Hence take everyting that is between the end of the current tm segment and the start of the next one
            new_end_coordinates = start_next - 1

            # What is " left " from downstream that could not be taken cause of the next tm ? 
            lefts = downstream - (start_next - end_current)

            print(f"Downstream too long, lefts : {lefts}")

        else:

            new_end_coordinates = end_current + downstream

        ## If there is elongation that was not taken from downstream, add it to the upstream
        upstream = elongation_left_to_do - downstream
        if lefts:

            upstream += lefts

        print(f"Upstream : {upstream}")

        # The new start of this tm segment should not be lower than the end of the previous tm segment
        if start_current - upstream < end_previous:

            new_start_coordinates = end_previous + 1 

            print(f"Start too low, new start : {new_start_coordinates}")

        else:

            new_start_coordinates = start_current - upstream


        tm_indices[i] = (new_start_coordinates, new_end_coordinates, new_end_coordinates - new_start_coordinates)

        
        print("New coordinates : ")

        print(tm_indices[i])

    ##### Treat the last TM Segment #####


    print("Old coordinates : ")
    print(tm_indices[-1])

    # Target size that the current tm should reach
    desired_length = size_picker_v2(min_size=min_length, max_size=max_length)

    print(f"Length drawn : {desired_length}")

    # Last TM Segment
    start_current = tm_indices[-1][0]
    end_current = tm_indices[-1][1]
    length_current = tm_indices[-1][2]

    # check before anything else to save computation time
    if desired_length <= length_current:

        # If there is no elongation to do, we skip to the next segment
        # and the coordinates of the ith segment are not modified
        return 0

    # (i-1)th TM Segment
    end_previous = tm_indices[-2][1]

    # Compute the number of residues that are required to elongate the current segment
    elongation_left_to_do = desired_length - length_current

    print(f"Elongation left to do : {elongation_left_to_do}")


    # Randomly choose the number of residues to elongate downstream ( toward the C-terminal )
    downstream = random.randint(0, elongation_left_to_do)
    print(f"Downstream : {downstream}")

    lefts = None

    # The new end of this final tm should not exceed the protein length
    if downstream + end_current > protein_length:

        # Hence take everyting that is between the end of the current tm segment and the start of the next tm segment
        new_end_coordinates = protein_length

        # What is " left " from downstream that could not be taken because the protein is too short after the last tm ? 
        lefts = downstream - (protein_length - end_current)

        print(f"Downstream too long, lefts : {lefts}")

    else:

        new_end_coordinates = end_current + downstream


    upstream = elongation_left_to_do - downstream
    if lefts:

        upstream += lefts

    print(f"Upstream : {upstream}")

    # The new start of this tm segment should not be lower than the end of the previous tm segment
    if start_current - upstream < end_previous:

        new_start_coordinates = end_previous + 1 

        print(f"Start too low, new start : {new_start_coordinates}")

    else:

        new_start_coordinates = start_current - upstream     


    tm_indices[-1] =(new_start_coordinates, new_end_coordinates, new_end_coordinates - new_start_coordinates)


    
    print("New coordinates : ")
    print(tm_indices[-1])

    return 0


test = tm_indices.copy()
elongate_tm_segments(test, pdb_struct["protein_length"])

Old coordinates : 
(16, 39, 24)
Length drawn : 32
Elongation left to do : 8
Downstream : 2
Upstream : 6
New coordinates : 
(10, 41, 31)
Old coordinates : 
(49, 69, 21)
Length drawn : 30
Elongation left to do : 9
Downstream : 0
Upstream : 9
Start too low, new start : 42
New coordinates : 
(42, 69, 27)
Old coordinates : 
(86, 105, 20)
Length drawn : 42
Elongation left to do : 22
Downstream : 15
Downstream too long, lefts : 8
Upstream : 15
New coordinates : 
(71, 111, 40)
Old coordinates : 
(112, 134, 23)
Length drawn : 48
Elongation left to do : 25
Downstream : 18
Downstream too long, lefts : 12
Upstream : 19
Start too low, new start : 112
New coordinates : 
(112, 139, 27)
Old coordinates : 
(140, 160, 21)
Length drawn : 38
Elongation left to do : 17
Downstream : 17
Upstream : 0
New coordinates : 
(140, 177, 37)
Old coordinates : 
(180, 199, 20)
Length drawn : 34
Elongation left to do : 14
Downstream : 11
Downstream too long, lefts : 1
Upstream : 4
Start too low, new start : 178
New coor

0

In [26]:
a = [1,2,3]

a[-2]

2

In [4]:
from collections import defaultdict, OrderedDict
import numpy as np
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

class PDBProcessor:
    def __init__(self):
        self.pdb_struct = {
            "full": defaultdict(OrderedDict),
            "CA": defaultdict(OrderedDict),
            "membrane_coord": []
        }

        self.aa_dict = {
            'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E',
            'PHE': 'F', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
            'LYS': 'K', 'LEU': 'L', 'MET': 'M', 'ASN': 'N',
            'PRO': 'P', 'GLN': 'Q', 'ARG': 'R', 'SER': 'S',
            'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y'
        }

        self.protein_name = ""

    def read_pdb(self, file_path):


        self.protein_name = file_path.split(".")[0]

        array = []

        with open(file_path, "r") as f:
            line = f.readline()
            while line:
                line = line.split()
                if line[0] == "ATOM":
                    x = float(line[6])
                    y = float(line[7])
                    z = float(line[8])
                    atom_name = line[2]
                    atom_number = line[1]
                    res_name = line[3]
                    res_number = line[5]
                    chain_id = line[4]

                    self.pdb_struct["full"][res_number] = {
                        "coord": [x, y, z],
                        "atom_name": atom_name,
                        "chain_id": chain_id,
                        "res_name": res_name,
                        "res_number": res_number,
                        "atom_number": atom_number
                    }

                    if line[2] == "CA":
                        self.pdb_struct["CA"][res_number] = {
                            "coord": [x, y, z],
                            "res_name": res_name,
                            "res_number": res_number,
                        }
                elif line[0] == "HETATM" and "DUM" in line:
                    x = float(line[5])
                    y = float(line[6])
                    z = float(line[7])
                    array.append([x, y, z])
                line = f.readline()

        self.pdb_struct["membrane_coord"] = np.array(array)

    def write_pdb(self):

        fasta = ""

        for res_number in self.pdb_struct["CA"]:
            res_name = self.pdb_struct["CA"][res_number]["res_name"]
            fasta += self.aa_dict[res_name]

        record = SeqRecord(Seq(fasta), self.protein_name, description="")
        filename = f"{self.protein_name}.fasta"
        SeqIO.write(record, filename, "fasta")

# Example usage:
pdb_processor = PDBProcessor()
pdb_processor.read_pdb("1uaz.pdb")
pdb_processor.pdb_struct_to_fasta()

140