# Training with AF3 structures

## Pre-preprocessing

The new AF3 .pdb files do not have an opening 'MODEL 1' line. Will this affect the generate_fv_pdb() function?

### Loading database

In [3]:
# Test entry: one heavy chain, one light chain and one antigen sequence.

heavy_sequence = "QDQLQQSGAELVRPGASVKLSCKALGYIFTDYEIHWVKQTPVHGLEWIGGIHPGSSGTAYNQKFKGKATLTADKSSTTAFMELSSLTSEDSAVYYCTRKDYWGQGTLVTVSAAKTTAPSVYPLVPVCGGTTGSSVTLGCLVKGYFPEPVTLTWNSGSLSSGVHTFPALLQSGLYTLSSSVTVTSNTWPSQTITCNVAHPASSTKVDKKIEPRV"
light_sequence = "DIKMTQSPSSMYTSLGERVTITCKASQDINSFLTWFLQKPGKSPKTLIYRANRLMIGVPSRFSGSGSGQTYSLTISSLEYEDMGIYYCLQYDDFPLTFGAGTKLDLKRADAAPTVSIFPPSSEQLTSGGASVVCFLNNFYPKEINVKWKIDGSERQNGVLDSWTEQDSKDSTYSMSSTLTLTKDEYERHNSYTCEATHKTSTSPIVKSFNRNEC"
antigen_sequence = "DATPEDLGARL"

# Each entry is a single colon-separated string in the order
# heavy_chain:light_chain:antigen. `pdb_ids` holds the matching identifier
# for each entry, at the same index.
sequences = [":".join((heavy_sequence, light_sequence, antigen_sequence))]
pdb_ids = ["1hh6"]

### Producing summary dataframe

In [4]:
# save as .tsv 

### Getting lists of residues and producing AF3 inputs

In [2]:
from anarci import run_anarci
import numpy as np
import os

In [3]:
def _format_numbered_residues(numbering, chain_id):
    """Build the residue labels of one chain from an ANARCI numbering.

    Parameters
    ----------
    numbering : iterable of ((position, insertion_code), residue)
        Numbered positions as taken from ``run_anarci(...)[1][0][0][0]``.
    chain_id : str
        Chain letter written into every label.

    Returns
    -------
    list of str
        One label per non-gap position, laid out as
        ``<residue><chain_id><position right-justified to 3><insertion code or space>``.
    """
    labels = []
    for pos, residue in numbering:
        if residue == '-':
            # Gap in the numbering scheme: no residue at this position.
            continue
        insertion = pos[1].strip()
        labels.append(f'{residue}{chain_id}{str(pos[0]).rjust(3)}{insertion or " "}')
    return labels


# Maps each PDB id to its list of residue labels:
# ['START-Ab', <heavy labels>, <light labels>, 'END-Ab', <antigen labels>]
residues_dict_anarci = {}

for i, seq in enumerate(sequences):
    # Entries are 'heavy:light' or 'heavy:light:antigen'.
    split_seq = seq.split(':')
    if len(split_seq) == 2:
        heavy_seq, light_seq = split_seq
        ag_seq = None
    elif len(split_seq) == 3:
        heavy_seq, light_seq, ag_seq = split_seq
    else:
        raise ValueError("Unknown sequence")

    heavy_results = run_anarci([('heavy', heavy_seq)], scheme='chothia')
    if not heavy_results[2][0]:
        # A falsy entry here is treated as "chain could not be numbered".
        print(f'Skipping heavy chain {pdb_ids[i]}')
        continue
    heavy_residues = _format_numbered_residues(heavy_results[1][0][0][0], 'A')

    light_results = run_anarci([('light', light_seq)], scheme='chothia')
    if not light_results[2][0]:
        print(f'Skipping light chain {pdb_ids[i]}')
        continue
    light_residues = _format_numbered_residues(light_results[1][0][0][0], 'B')

    # The antigen is numbered sequentially from 1 with a blank insertion code.
    # `ag_seq` is None for antibody-only entries; iterating over it directly
    # would raise a TypeError, so fall back to an empty sequence.
    ag_residues = [
        f'{residue}C{str(pos).rjust(3)} '
        for pos, residue in enumerate(ag_seq or '', start=1)
    ]

    list_of_residues = ['START-Ab'] + heavy_residues + light_residues + ['END-Ab'] + ag_residues
    residues_dict_anarci[pdb_ids[i]] = list_of_residues

In [4]:
# Saving ANARCI residues: one '<pdb_id>.npy' file per numbered entry.

residues_path = '../data/lists_of_residues'
# np.save does not create missing directories.
os.makedirs(residues_path, exist_ok=True)

for entry in pdb_ids:
    if entry not in residues_dict_anarci:
        # No residue list was stored for this id (e.g. it was skipped during
        # numbering), so there is nothing to write.
        print(f'No residues available for {entry}, nothing saved')
        continue
    save_filename = f'{entry}.npy'
    np.save(os.path.join(residues_path, save_filename), residues_dict_anarci[entry])

#### Comparing to old data

In [60]:
import numpy as np
import os

# Load every previously saved residue list, keyed by file name without extension.
residues_path = '../OLD/data/lists_of_residues/'
residues_dict_old = {}

for entry_file in os.listdir(residues_path):
    entry_name, _ = os.path.splitext(entry_file)
    residues_dict_old[entry_name] = np.load(os.path.join(residues_path, entry_file))

# list_of_residues = list(residues_dict_old[pdb_ids[0]])[1:]

# chain_pos = 0
# if 'END-Ab' in list_of_residues:
#     list_of_residues = list_of_residues[:list_of_residues.index('END-Ab')]
#     chain_pos = 1
# else:
#     list_of_residues = residues_dict_old[pdb_ids[0]][:-1]

# h_chain = list_of_residues[0][chain_pos]
# l_chain = list_of_residues[-1][chain_pos]

# print(len([idx for idx in list_of_residues if idx[chain_pos] == h_chain]))
# if h_chain != l_chain:
#     print(len([idx for idx in list_of_residues if idx[chain_pos] == l_chain]))
# else:
#     print(0)




In [61]:
# print(residues_dict[pdb_ids[0]])

In [78]:
# selected_entries = np.load("../OLD/data/chain_lengths/selected_entries.npy")
# heavy_lengths = np.load("../OLD/data/chain_lengths/heavy_lengths.npy")
# light_lengths = np.load("../OLD/data/chain_lengths/light_lengths.npy")


# pdb_idx = np.where(selected_entries == pdb_ids[0])[0][0]
# print(heavy_lengths[pdb_idx], light_lengths[pdb_idx])

### Converting AF3 outputs to ANTIPASTI inputs

In [5]:
import pymol2
import os

In [6]:
# Input: one sub-directory per prediction, each holding '<name>_model.cif'.
cif_dir = '../af3/af_output/'
# Output: '<name>_af3.pdb' files.
# NOTE(review): the preprocessing cell reads from '../data/structures/' -
# confirm that the files written here end up where it expects them.
struc_dir = '../data/af3/structures/'

# Saving fails if the output directory is missing, so create it up front.
os.makedirs(struc_dir, exist_ok=True)

# Convert every prediction that has a model file; other entries are ignored.
for protein_name in os.listdir(cif_dir):
    cifname = os.path.join(cif_dir, protein_name, protein_name+'_model.cif')
    if os.path.isfile(cifname):
        # A fresh PyMOL session per file keeps objects from leaking between structures.
        with pymol2.PyMOL() as pymol:
            pymol.cmd.load(cifname,'myprotein')
            pymol.cmd.save(os.path.join(struc_dir, protein_name+'_af3.pdb'), selection='myprotein')

## Preprocessing

In [1]:
from antipasti.preprocessing.preprocessing import Preprocessing

In [2]:
# Locations of the inputs
data_path = '../data/'
structures_path = '../data/structures/'
df_file = 'sabdab_1hh6.tsv'

# Normal mode settings
modes = 'all'  # Number of normal modes to consider. Relevant if renew_maps is True
renew_maps = True  # True to compute again all the normal mode correlation maps
renew_residues = True  # True to retrieve again all the chain lengths

# Exclusion lists kept for reference (currently unused):
# pathological = ['5omm', '5i5k', '1uwx', '1mj7', '1qfw', '1qyg', '4ffz', '3ifl', '3lrh', '3pp4', '3ru8', '3t0w', '3t0x', '4fqr', '4gxu', '4jfx', '4k3h', '4jfz', '4jg0', '4jg1', '4jn2', '4o4y', '4qxt', '4r3s', '4w6y', '4w6y', '5ies', '5ivn', '5j57', '5kvd', '5kzp', '5mes', '5nmv', '5sy8', '5t29', '5t5b', '5vag', '3etb', '3gkz', '3uze', '3uzq', '4f9l', '4gqp', '4r2g', '5c6t', '3fku', '1oau', '1oay']
# scfv = ['4gqp', '3etb', '3gkz', '3uze', '3uzq', '3gm0', '4f9l', '6ejg', '6ejm', '1h8s', '5dfw', '6cbp', '4f9p', '5kov', '1dzb', '5j74', '5aaw', '3uzv', '5aam', '3ux9', '5a2j', '5a2k', '5a2i', '3fku', '5yy4', '3uyp', '5jyl', '1y0l', '1p4b', '3kdm', '4lar', '4ffy', '2ybr', '1mfa', '5xj3', '5xj4', '4kv5', '5vyf']
# pathological += scfv

# Run the preprocessing with the AlphaFold flag switched on.
preprocessed_data = Preprocessing(
    data_path=data_path,
    structures_path=structures_path,
    df=df_file,
    modes=modes,
    renew_maps=renew_maps,
    renew_residues=renew_residues,
    alphafold=True,
)

In read.pdb(args[1]) : duplicated element numbers ('eleno') detected


Map 1 out of 1 processed.
