# Load the shrunken data

In [1]:
import numpy as np
import pandas as pd 
import pickle
from pathlib import Path

In [2]:
# Compact dtypes for the shrunken columns: the three building-block code
# columns as int16 and the three binding labels as single bytes.
# NOTE(review): `dtypes` is not referenced by any cell visible here - confirm it is still needed.
dtypes = dict.fromkeys(
    ('buildingblock1_smiles', 'buildingblock2_smiles', 'buildingblock3_smiles'),
    np.int16,
)
dtypes.update(dict.fromkeys(('binds_BRD4', 'binds_HSA', 'binds_sEH'), np.byte))

# Directory the parquet file and the pickled dictionaries are read from.
directory = Path("../data/shrunken/")

In [3]:
# Read the training table from the shrunken-data directory.
train_path = directory / "train.parquet"
train = pd.read_parquet(train_path)

In [4]:
# Preview the first five rows of the training table.
train.head(n=5)

In [5]:
# Rows whose three binding labels (BRD4, HSA and sEH) are all equal to 1
target_cols = ['binds_BRD4', 'binds_HSA', 'binds_sEH']
train[train[target_cols].eq(1).all(axis=1)]

In [6]:
# Pairwise overlap between the binders of the three targets
# (BRD4 vs sEH, HSA vs sEH, BRD4 vs HSA).
def _pct(part, whole):
    """Format len(part) / len(whole) as a percentage string.

    Returns "n/a" when `whole` is empty so a target without any binder
    does not abort the cell with a ZeroDivisionError.
    """
    return f"{len(part) / len(whole):.2%}" if len(whole) else "n/a"


# Binders of each individual target.
seh_binds = train[train['binds_sEH'] == 1]
brd4_binds = train[train['binds_BRD4'] == 1]
hsa_binds = train[train['binds_HSA'] == 1]

# Rows binding two targets at once. Filtering on the label column directly
# (rather than matching index labels) stays correct even if the index
# contains duplicate labels.
subset_brd4_seh = brd4_binds[brd4_binds['binds_sEH'] == 1]
subset_hsa_seh = hsa_binds[hsa_binds['binds_sEH'] == 1]
subset_brd4_hsa = brd4_binds[brd4_binds['binds_HSA'] == 1]

# The ratios are rendered as real percentages (e.g. "1.23%") to match their labels.
print(f"Count seh_binds: {len(seh_binds)}, Count brd4_binds: {len(brd4_binds)}, Count subset: {len(subset_brd4_seh)}, percentage: {_pct(subset_brd4_seh, brd4_binds)}")
print(f"Count hsa_binds: {len(hsa_binds)}, Count subset_hsa_seh: {len(subset_hsa_seh)}, percentage: {_pct(subset_hsa_seh, hsa_binds)}")
print(f"Count subset_brd4_hsa: {len(subset_brd4_hsa)}, percentage: {_pct(subset_brd4_hsa, brd4_binds)}")
print(f"Percentage of seh in brd4: {_pct(subset_brd4_seh, seh_binds)}")
print(f"Percentage of seh in hsa: {_pct(subset_hsa_seh, seh_binds)}")

In [7]:
# Display the BRD4-binder subset (rows of `train` with binds_BRD4 == 1).
brd4_binds

In [74]:
# First molecule_smiles value among the BRD4 binders. Positional access
# avoids copying the whole column into a Python list just to read one entry.
brd4_binds.molecule_smiles.iloc[0]

In [8]:
def _load_pickle(path):
    """Unpickle the object stored at `path`, closing the file afterwards.

    SECURITY: unpickling can execute arbitrary code - only load files that
    were produced by a trusted source.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Reverse lookup tables, one per building-block position; the third one is
# used below to map the codes in train.buildingblock3_smiles back to SMILES.
BBs_dict_reverse_1 = _load_pickle(directory / 'train_dicts/BBs_dict_reverse_1.p')
BBs_dict_reverse_2 = _load_pickle(directory / 'train_dicts/BBs_dict_reverse_2.p')
BBs_dict_reverse_3 = _load_pickle(directory / 'train_dicts/BBs_dict_reverse_3.p')

In [9]:
# Translate the first 1000 building-block-3 codes through the reverse
# dictionary and show the first decoded entry.
first_bb3_codes = train.buildingblock3_smiles[:1000]
buildingblock3_smiles_original = list(map(BBs_dict_reverse_3.__getitem__, first_bb3_codes))
print(buildingblock3_smiles_original[0])

In [13]:
import rdkit

In [46]:
# Import the submodules explicitly before first use: a bare `import rdkit`
# is not guaranteed to make `rdkit.Chem` available as an attribute.
from rdkit import Chem
from rdkit.Chem import Draw

# Parse every decoded building-block-3 SMILES into an RDKit molecule.
mols = [Chem.MolFromSmiles(x) for x in buildingblock3_smiles_original]

# Show the second parsed molecule. It has to be the last expression of the
# cell, otherwise the notebook silently discards it.
mols[1]

## Chem.rdchem.Mol

In [54]:
# Use the first parsed building block as the example molecule for the cells below.
mol = mols[0]


#### Descriptors

In [58]:
from IPython.display import display, HTML
from rdkit.Chem import Descriptors

# A single descriptor first: the molecular weight
mol_weight = Descriptors.MolWt(mol)
print(f"Molecular Weight: {mol_weight}")

# Then every (name, function) pair listed in Descriptors.descList, keyed by name
all_descriptors = {name: compute(mol) for name, compute in Descriptors.descList}
print(len(all_descriptors))

# Render the name/value pairs as a two-column HTML table, one descriptor per row
table_rows = "".join(
    f"<tr><td>{key}</td><td>{value}</td></tr>"
    for key, value in all_descriptors.items()
)
html = "<table>" + table_rows + "</table>"
display(HTML(html))

In [59]:
# AllChem is kept imported here because the conformer cell further down uses it.
from rdkit.Chem import AllChem
from rdkit.Chem import rdFingerprintGenerator

# Example: Morgan fingerprint (radius 2) as a 1024-bit vector.
# Built with the generator API; AllChem.GetMorganFingerprintAsBitVect is
# deprecated in recent RDKit releases and emits a warning on every call.
morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)
fingerprint = morgan_generator.GetFingerprint(mol)
print(list(fingerprint))


In [60]:
# Number of rings. Read the count from the molecule's ring info: the return
# type of Chem.GetSSSR differs between RDKit versions (a count in older
# releases, the ring collection itself in newer ones).
num_rings = mol.GetRingInfo().NumRings()
print(f"Number of Rings: {num_rings}")

# Bond types. Stored under its own name so it cannot be confused with the
# (begin, end) atom-index pairs that a later cell keeps in `bonds`.
bond_types = [bond.GetBondType() for bond in mol.GetBonds()]
print(f"Bond Types: {bond_types}")


In [63]:
# Generate a 3D conformation.
# Work on a copy with explicit hydrogens: embedding is meant to be run with
# hydrogens present, and `mol` itself stays free of conformers (depiction
# code may otherwise pick up the 3D coordinates instead of a 2D layout).
mol_3d = Chem.AddHs(mol)

# A fixed seed makes the coordinates reproducible; EmbedMolecule returns -1
# when no conformer could be generated.
if AllChem.EmbedMolecule(mol_3d, randomSeed=42) < 0:
    raise RuntimeError("RDKit could not embed a 3D conformer for this molecule")

# Drop the hydrogens again so only the original atoms are listed.
mol_3d = Chem.RemoveHs(mol_3d)
conformer = mol_3d.GetConformer()
for atom in mol_3d.GetAtoms():
    pos = conformer.GetAtomPosition(atom.GetIdx())
    print(f"Atom {atom.GetSymbol()}, Position: {pos.x}, {pos.y}, {pos.z}")


## Graph representation using RDKit

In [64]:
# Node labels: the element symbol of every atom, in atom-index order
atoms = [mol.GetAtomWithIdx(idx).GetSymbol() for idx in range(mol.GetNumAtoms())]

# Edges: one (begin atom index, end atom index) pair per bond
bonds = []
for b in mol.GetBonds():
    bonds.append((b.GetBeginAtomIdx(), b.GetEndAtomIdx()))

print("Atoms:", atoms)
print("Bonds:", bonds)


In [66]:
# Render the example molecule as an image (300x300 is MolToImage's default size)
Draw.MolToImage(mol, size=(300, 300))

In [67]:
import networkx as nx
import matplotlib.pyplot as plt

# Build an undirected graph: one node per atom index, one edge per bond.
G = nx.Graph()
G.add_nodes_from(range(len(atoms)))
G.add_edges_from(bonds)

# Position nodes using spring layout. The layout starts from random
# positions, so a fixed seed keeps the figure identical across re-runs.
pos = nx.spring_layout(G, seed=42)

# Draw the graph, labelling each node with its element symbol.
nx.draw(G, pos, with_labels=True, labels=dict(enumerate(atoms)), node_color='skyblue')
plt.show()
