In [None]:
import pandas as pd                                       #### Importing some libraries, probably not all of them are important
import numpy as np
import torch
from rdkit import Chem
from rdkit.Chem import rdmolfiles, rdmolops
from torch_geometric.data import Data
from torch_geometric.utils import to_undirected
import numpy as np
import pickle 

In [None]:
# Read the NR-DBIND table; fields in this CSV are separated by ';'.
df = pd.read_csv('/srv/scratch/ALL/DATA/NR-DBIND.csv', delimiter=';')

In [None]:
# Cast IDs through int so their text form carries no decimal part.
df['ID'] = df['ID'].astype(int).astype(str)
# Keep only the measurements whose p_binding_type is pIC50 or pKi.
df = df.loc[df['p_binding_type'].isin(['pIC50', 'pKi'])]

In [None]:
# Drop every column except these five.
df = df.loc[:, ['ID', 'accession', 'smiles', 'CHEMBLID', 'p_binding_value']]

In [None]:
# Collapse repeated (accession, smiles) measurements to their median affinity.
# Only p_binding_value is aggregated: 'ID' and 'CHEMBLID' hold strings, and
# asking pandas >= 2.0 for the median of a string column raises TypeError
# (older versions silently dropped such columns, giving this same result).
df = df.groupby(['accession', 'smiles'])['p_binding_value'].median().reset_index()

In [None]:
# Discard rows whose p_binding_value is missing (NaN).
df = df.loc[df['p_binding_value'].notna()]

In [None]:
def get_accession(name):
    '''
    Extract the accession token from a '/'-separated string.

    Takes the third '/'-separated component of ``name`` and returns the second
    '-'-separated piece of that component, e.g. 'a/b/xx-P12345-yy' -> 'P12345'.
    Raises IndexError when ``name`` has fewer than three '/'-components or the
    third component contains no '-'.
    '''
    third_component = name.split('/')[2]
    return third_component.split('-')[1]

# Load the prepared protein table and replace its 'sif' column with an
# 'accession' column computed from it by get_accession.
# NOTE(review): read_pickle unpickles arbitrary objects -- only load files
# produced by a trusted step of this pipeline.
prot = pd.read_pickle('results/prepare_proteins/protein_data_label_label.pkl')
prot['accession'] = prot['sif'].apply(get_accession)
# The original first called prot.drop('sif', axis=1) without keeping the
# result (a no-op); the column is dropped exactly once here, after it was used.
prot = prot.drop(columns='sif')

In [None]:
### Drug graph construction: nodes (atoms), edges (bonds), edge attributes (bond types)

def smiles_to_torch(smiles: str):
    '''
    Convert a SMILES string into tensors describing the molecular graph.

    Example input: CC(C)c1onc(c1COc2ccc(cc2)c3ccc4c(cccc4c3)C(=O)O)c5c(Cl)cccc5Cl

    Returns np.nan when ``smiles`` is not a string, when RDKit cannot parse it,
    or when the molecule has no bonds (single atom / ion). Otherwise returns a
    plain dict (not a torch_geometric ``Data`` object) with:
        x              -- long tensor holding the atomic number of every atom
        edge_index     -- tensor of shape (2, n_bonds); column k is the
                          (begin, end) atom-index pair of bond k
        edge_atributes -- long tensor with one bond-type code per bond:
                          0 single, 1 double, 2 aromatic, 3 any other type
    '''
    if not isinstance(smiles, str):  # e.g. a NaN cell; nothing to parse
        return np.nan

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:  # RDKit returns None for SMILES it cannot read
        return np.nan

    # NOTE(review): CanonicalRankAtoms yields one rank per atom, while
    # RenumberAtoms expects, for each new position, the old atom index --
    # confirm whether the ranks should be inverted (argsort) before this call.
    new_order = rdmolfiles.CanonicalRankAtoms(mol)
    mol = rdmolops.RenumberAtoms(mol, new_order)

    bond_codes = {'SINGLE': 0, 'DOUBLE': 1, 'AROMATIC': 2}
    other_bond_code = 3  # shared code for every bond type not listed above

    # Each bond is recorded once, as (begin, end); no reverse edge is added.
    edges = []
    edge_atributes = []
    for bond in mol.GetBonds():
        edges.append([bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()])
        edge_atributes.append(bond_codes.get(str(bond.GetBondType()), other_bond_code))

    if not edges:  # no bonds at all (single atom / ion): nothing to build a graph from
        return np.nan

    # The atomic number itself is the node feature. (The original also looked it
    # up in an undefined `atom_num_dict` and discarded the result, which raised
    # NameError; that dead lookup is removed.)
    atom_features = [atom.GetAtomicNum() for atom in mol.GetAtoms()]

    x = torch.tensor(atom_features, dtype=torch.long)
    edge_index = torch.tensor(edges).t().contiguous()  # (n_bonds, 2) -> (2, n_bonds)
    edge_atributes = torch.tensor(edge_atributes, dtype=torch.long)

    return dict(x=x, edge_index=edge_index, edge_atributes=edge_atributes)
        
# Convert every SMILES string; each entry is whatever smiles_to_torch returns for that row.
df['data'] = [smiles_to_torch(smiles_text) for smiles_text in df['smiles']]

In [None]:
# Lookup table accession -> protein 'data' entry; if an accession occurs more
# than once, the last row wins.
prot_dict = dict(zip(prot['accession'], prot['data']))

In [None]:
# Assemble one record per row of df, pairing the drug tensors with the tensors
# of the protein the row refers to.
list_for_model = []
for _, row in df.iterrows():
    data = row['data']
    # smiles_to_torch returns np.nan for molecules it could not convert; a float
    # cannot be indexed like a dict, so those rows are skipped instead of
    # aborting the whole loop with a TypeError.
    if not isinstance(data, dict):
        continue
    # A KeyError here means the protein table has no entry for this accession.
    protein = prot_dict[row['accession']]
    list_for_model.append({
        'drug_x': data['x'],
        'drug_edge_index': data['edge_index'],
        'label': row['p_binding_value'],
        'protein_x': protein['x'],
        'protein_edge_index': protein['edge_index'],
    })

In [None]:
# Write the assembled records to disk for the model. The context manager
# closes the file even if pickling fails, so the output is always flushed.
with open('final_list.pkl', 'wb') as final_list:
    pickle.dump(list_for_model, final_list)