In [65]:
import numpy as np
import pandas as pd
from Bio.PDB import *

In [66]:
import os

# Directory that is scanned for structure files.
path = "/Users/mikep/OneDrive/Documents/StatML/Structures/"

# Build the filtered list in one pass instead of deleting from the list while
# enumerating it: in-place deletion shifts the remaining items, so entries were
# skipped, and a name failing both checks removed an unrelated neighbour too.
# Hidden entries (leading '.') and anything that is not a .pdb file are dropped.
# os.listdir returns names in arbitrary order, so sort for reproducible runs.
dir_list = sorted(
    name
    for name in os.listdir(path)
    if not name.startswith('.') and name.endswith('.pdb')
)

In [67]:
# Number of entries left in dir_list after filtering.
len(dir_list)

287

In [69]:
# Shared parser instance; its get_structure() is called once per PDB file below.
p = PDBParser()

# Lookup table from three-letter residue names to one-letter amino-acid codes
# (the 20 standard residues, listed alphabetically by three-letter name).
amino_acid_dict = dict(
    ALA='A', ARG='R', ASN='N', ASP='D',
    CYS='C', GLU='E', GLN='Q', GLY='G',
    HIS='H', ILE='I', LEU='L', LYS='K',
    MET='M', PHE='F', PRO='P', SER='S',
    THR='T', TRP='W', TYR='Y', VAL='V',
)

In [72]:
def distance_mat(chain):
    """Return the pairwise distances between residue centres of mass.

    Parameters
    ----------
    chain : iterable
        Iterable of residue-like objects, each exposing a ``center_of_mass()``
        method that returns a coordinate vector.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(n, n)`` where entry ``[i, j]`` is the
        Euclidean distance between the centres of mass of residues ``i`` and
        ``j``. An empty chain yields a ``(0, 0)`` array.
    """
    # Compute every centre of mass exactly once; the nested-loop version
    # recomputed both centres for each of the n*n pairs.
    centers = [residue.center_of_mass() for residue in chain]
    if not centers:
        return np.zeros((0, 0), dtype=np.float32)

    coords = np.asarray(centers, dtype=np.float64)
    # Broadcasting (n, 1, d) - (1, n, d) gives all pairwise difference vectors.
    deltas = coords[:, np.newaxis, :] - coords[np.newaxis, :, :]
    return np.linalg.norm(deltas, axis=-1).astype(np.float32)

In [73]:
def get_childname(object):
    """Return a list with the ``get_id()`` value of each child of ``object``.

    ``object`` is iterated once; the ids are returned in iteration order.
    """
    child_ids = []
    for entity in object:
        child_ids.append(entity.get_id())
    return child_ids

In [75]:
def export_fasta(filepath, seq_name, full_seq):
    """Write one sequence to ``filepath`` as a single-record FASTA file.

    Parameters
    ----------
    filepath : str
        Destination path; an existing file is overwritten.
    seq_name : str
        Record name, written after ``>`` on the header line.
    full_seq : str or sequence of str
        Sequence characters, written on one line directly after the header.
        No trailing newline is appended.
    """
    # The context manager closes (and flushes) the handle even if a write
    # fails; the handle was previously left open.
    with open(filepath, 'w') as fasta_file:
        fasta_file.write('>' + seq_name + '\n')
        fasta_file.write(''.join(full_seq))

In [78]:
# Keep the part of each file name before its first '.' as the structure id.
ids = [file_name.split('.')[0] for file_name in dir_list]

In [79]:
# Display the extracted ids.
ids

['MATRLLCCVVLCLLGEELIDARVTQTPRHKVTEMGQEVTMRCQPILGHNTVFWYRQTMMQGLELLAYFRNRAPLDDSGMPKDRFSAEMPDATLATLKIQPSEPRDSAVYFCASGPETYEQYFGPGTRLTVT_unrelaxed_rank_001_alphafold2_ptm_model_1_seed_000',
 'MATRLLCCVVLCLLGEELIDARVTQTPRHKVTEMGQEVTMRCQPILGHNTVFWYRQTMMQGLELLAYFRNRAPLDDSGMPKDRFSAEMPDATLATLKIQPSEPRDSAVYFCASGSTDSGWQETQYFGPGTRLLVL_unrelaxed_rank_001_alphafold2_ptm_model_1_seed_000',
 'MDTRVLCCAVICLLGAGLSNAGVMQNPRHLVRRRGQEARLRCSPMKGHSHVYWYRQLPEEGLKFMVYLQKENIIDESGMPKERFSAEFPKEGPSILRIQQVVRGDSAAYFCASSLDRQSRNQPQHFGDGTRLSIL_unrelaxed_rank_001_alphafold2_ptm_model_1_seed_000',
 'MDTRVLCCAVICLLGAGLSNAGVMQNPRHLVRRRGQEARLRCSPMKGHSHVYWYRQLPEEGLKFMVYLQKENIIDESGMPKERFSAEFPKEGPSILRIQQVVRGDSAAYFCASSPFRVGSGLAGRGRADTQYFGPGTRLTVL_unrelaxed_rank_001_alphafold2_ptm_model_1_seed_000',
 'MDTRVLCCAVICLLGAGLSNAGVMQNPRHLVRRRGQEARLRCSPMKGHSHVYWYRQLPEEGLKFMVYLQKENIIDESGMPKERFSAEFPKEGPSILRIQQVVRGDSAAYFCASSPPSASGYTIYFGEGSWLTVV_unrelaxed_rank_001_alphafold2_ptm_model_1_seed_000',
 'MDTRVLCCAVICLLGAGLSNAGVMQNPRHLVRRRGQEARLR

In [80]:
# Per-structure accumulators; entries are appended in the order of `ids`.
all_Sequences = [] #RUN UP TO THIS TO GET CSV  (not populated in this cell)
Beta = []          # one-letter sequence of each structure
CDR3_dists = []    # distance matrix of each structure's selected chain
CDR3_tors = []     # not populated in this cell
CDR3_starts = []   # start index stored per structure (always 0 here)
chains = []        # id of the chain each distance matrix was computed from

# `pdb_id` replaces the original loop variable `id`, which shadowed the builtin;
# pairing ids with dir_list via zip also removes the index `i` that the inner
# loop used to overwrite.
for pdb_id, file_name in zip(ids, dir_list):
    file = file_name.split(".")[0]
    structure = p.get_structure(file, path + pdb_id + '.pdb')
    start_idx = 0

    # Translate three-letter residue names to one-letter codes; names missing
    # from amino_acid_dict contribute an empty string.
    seq = [amino_acid_dict.get(residue.get_resname(), '')
           for residue in structure.get_residues()]
    full_seq = ''.join(seq)
    #uncomment if you are creating distance matrices for known structures
    #export_fasta('/Users/pressm/Documents/StatML/TCRProject/b_chain_fasta/'+file+'.fasta', file, full_seq)
    Beta.append(full_seq)

    # Only the first child of structure[0] is used for the distance matrix;
    # its id is looked up once and reused.
    model = structure[0]
    chain_id = get_childname(model)[0]
    chain = model[chain_id]

    CDR3_dist = distance_mat(chain)
    CDR3_dists.append(CDR3_dist)
    CDR3_starts.append(start_idx)
    chains.append(chain_id)

    # Persist the matrix as CSV, one file per structure id.
    dist_mat = pd.DataFrame(CDR3_dist)
    dist_mat.to_csv('/Users/mikep/OneDrive/Documents/StatML/Allo51/' + pdb_id + '.csv')