In [2]:
import torch
import networkx as nx
import os
from rdkit import Chem

Crear grafos

In [None]:
#Leer archivos
archivos = os.listdir('..\\gdb11')
print(archivos)

#Crear carpeta para guardar grafos
"""if not os.path.exists('grafos'):
    os.makedirs('grafos')"""

data = []
#Crear grafos
for archivo in archivos:
    #if archivo == 'gdb11_size10.smi':
        #break
    with open('..\\gdb11\\'+archivo, 'r') as f:
        lines = f.readlines()
        #Crear grafo
        for line in lines:
            mol = Chem.MolFromSmiles(line.split()[0])
            smile = line.split()[0]
            adj = Chem.GetAdjacencyMatrix(mol)
            nodesym = [atom.GetSymbol() for atom in mol.GetAtoms()]
            bond_type = [bond.GetBondType() for bond in mol.GetBonds()]
            G = nx.Graph(adj, symbol=nodesym, bond_type=bond_type, smile=smile)
            pos = nx.spring_layout(G)

            data.append(nx.node_link_data(G))
            #nx.draw(G, pos, labels={i: G.graph['symbol'][i] for i in range(len(G.graph['symbol']))}, with_labels=True)"""

#Guardar json
import json
with open('grafos.json', 'w') as f:
    json.dump(data, f)

['gdb11_size01.smi', 'gdb11_size02.smi', 'gdb11_size03.smi', 'gdb11_size04.smi', 'gdb11_size05.smi', 'gdb11_size06.smi', 'gdb11_size07.smi', 'gdb11_size08.smi', 'gdb11_size09.smi', 'gdb11_size10.smi', 'gdb11_size11.smi']


Crear Dataset

In [None]:
from torch_geometric.utils import from_networkx

import torch
from torch_geometric.data import Data
import networkx as nx
import json

with open('grafos.json', 'r') as f:
    graph_data = json.load(f)


def atom_features(atom):
    return [
        atom.GetAtomicNum(),  # Atomic number
        atom.GetDegree(),      # Number of bonded neighbors
        atom.GetHybridization(),
        atom.GetIsAromatic()
    ]

# No supervisado
def nx_to_pyg(graph):
    # Extract node features (symbols)
    node_features = torch.tensor([[ord(symbol)] for symbol in graph.graph['symbol']], dtype=torch.float)
    #node_features = torch.tensor([atom_features(atom) for atom in Chem.MolFromSmiles(graph.graph['smile']).GetAtoms()], dtype=torch.float)
    # Extract edge indices
    edge_index = torch.tensor(list(graph.edges), dtype=torch.long).t().contiguous()

    # Extract edge features (bond types)
    edge_attr = torch.tensor([bond_type for bond_type in graph.graph['bond_type']], dtype=torch.float).view(-1, 1)
    
    # Create PyG Data object
    return Data(x=node_features, edge_index=edge_index, edge_attr=edge_attr)

# Convert all graphs to PyG format
pyg_data_list = [nx_to_pyg(nx.node_link_graph(graph)) for graph in graph_data]
print(pyg_data_list[200])
print(pyg_data_list[200].x)
print(pyg_data_list[200].edge_index)
print(pyg_data_list[200].edge_attr)
print(len(pyg_data_list))

# Show the first graph
from torch_geometric.utils import to_networkx

#G = to_networkx(pyg_data_list[200])
#print(G)
#pos = nx.spring_layout(G)
#nx.draw(G, pos, labels={i: chr(int(pyg_data_list[200].x[i])) for i in range(len(G))}, with_labels=True)

from torch_geometric.data import Dataset

class CustomGraphDataset(Dataset):
    def __init__(self, data_list):
        super(CustomGraphDataset, self).__init__()
        self.data_list = data_list

    def len(self):
        return len(self.data_list)

    def get(self, idx):
        return self.data_list[idx]

# Create the dataset
dataset = CustomGraphDataset(pyg_data_list)

from torch_geometric.loader import DataLoader

dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Save the dataset
torch.save(dataset, 'dataset.pt')



Data(x=[5, 1], edge_index=[2, 4], edge_attr=[4, 1])
tensor([[67.],
        [78.],
        [67.],
        [78.],
        [79.]])
tensor([[0, 1, 2, 2],
        [1, 2, 3, 4]])
tensor([[1.],
        [1.],
        [1.],
        [2.]])
523902


Crear Red

In [4]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, TopKPooling, global_mean_pool
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp

embedding_size = 64
class GCN(torch.nn.Module):
    def __init__(self):
        # Init
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        # GCN layers
        self.initial_conv = GCNConv(dataset.num_node_features, embedding_size)
        self.conv1 = GCNConv(embedding_size, embedding_size)
        self.conv2 = GCNConv(embedding_size, embedding_size)
        self.conv3 = GCNConv(embedding_size, embedding_size)
        # Output layer
        self.out = Linear(embedding_size*2, dataset.num_classes)

    def forward(self, x, edge_index, batch_index):
        # First Conv layer
        hidden = self.initial_conv(x, edge_index)
        hidden = F.tanh(hidden)

        # Others
        hidden = self.conv1(hidden, edge_index)
        hidden = F.tanh(hidden)
        hidden = self.conv2(hidden, edge_index)
        hidden = F.tanh(hidden)
        hidden = self.conv3(hidden, edge_index)
        hidden = F.tanh(hidden)

        # Global Pooling (stack different aggregations)
        hidden = torch.cat([gmp(hidden, batch_index), gap(hidden, batch_index)], dim=1)

        # Final
        out = self.out(hidden)
        return out, hidden
    
model = GCN()
print(model)
print("Number of parameters: ", sum(p.numel() for p in model.parameters()))


GCN(
  (initial_conv): GCNConv(1, 64)
  (conv1): GCNConv(64, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (out): Linear(in_features=128, out_features=0, bias=True)
)
Number of parameters:  12608




Entrenar

In [6]:
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

data_size = len(dataset)
batch_size = 256

loader = DataLoader(dataset[:int(data_size * 0.8)], batch_size=batch_size, shuffle=True)
print(len(loader))
test_loader = DataLoader(dataset[int(data_size * 0.8):], batch_size=batch_size, shuffle=False)

def train(data):
    for batch in loader:
        batch.to(device)
        optimizer.zero_grad()
        pred, embedding = model(batch.x, batch.edge_index, batch.batch)
        optimizer.step()
    return embedding

print("Comienzo del entrenamiento")
losses = []
embeddings = []
for epoch in range(100):
    embedding = train(dataset)
    #losses.append(loss)
    embeddings.append(embedding)
    print(f'Epoch {epoch}')

# Visualize the embeddings
# The embeddings are the output of the global pooling layer
# The embeddings are the concatenation of the global max pooling and global mean pooling
embedding = embeddings[-1].detach().cpu().numpy()
print(embedding)
print(embedding.shape)
print(len(embedding))

def embedding_to_csv(embeddings, filename):
    with open(filename, 'w') as f:
        for embedding in embeddings:
            f.write(','.join([str(x) for x in embedding.tolist()]) + '\n')
    
embedding_to_csv(embeddings, 'embeddings.csv')


# Save the model
torch.save(model.state_dict(), 'model.pth')






1638
Comienzo del entrenamiento
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Epoch 48
Epoch 49
Epoch 50
Epoch 51
Epoch 52
Epoch 53
Epoch 54
Epoch 55
Epoch 56
Epoch 57
Epoch 58
Epoch 59
Epoch 60
Epoch 61
Epoch 62
Epoch 63
Epoch 64
Epoch 65
Epoch 66
Epoch 67
Epoch 68
Epoch 69
Epoch 70
Epoch 71
Epoch 72
Epoch 73
Epoch 74
Epoch 75
Epoch 76
Epoch 77
Epoch 78
Epoch 79
Epoch 80
Epoch 81
Epoch 82
Epoch 83
Epoch 84
Epoch 85
Epoch 86
Epoch 87
Epoch 88
Epoch 89
Epoch 90
Epoch 91
Epoch 92
Epoch 93
Epoch 94
Epoch 95
Epoch 96
Epoch 97
Epoch 98
Epoch 99
[[ 0.5106915  -0.45644715 -0.50678694 ... -0.36097744 -0.16058762
  -0.0743839

Entrenar Con Perdida

In [13]:
from torch_geometric.nn import GATConv
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, TopKPooling, global_mean_pool
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp

class ContrastiveGNN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, embedding_dim):
        super(ContrastiveGNN, self).__init__()
        self.conv1 = GATConv(input_dim, hidden_dim)
        self.conv2 = GATConv(hidden_dim, hidden_dim)
        self.conv3 = GATConv(hidden_dim, hidden_dim)
        self.fc = torch.nn.Linear(hidden_dim, embedding_dim)
    
    def forward(self, x, edge_index, batch):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.conv3(x, edge_index)
        x = F.relu(x)
        x = global_mean_pool(x, batch)
        x = self.fc(x)
        return x

# Contrastive loss function
def contrastive_loss(embeddings, temperature=0.1):
    # Normalize embeddings
    embeddings = F.normalize(embeddings, p=2, dim=1)
    
    # Compute similarity matrix
    sim_matrix = torch.matmul(embeddings, embeddings.T) / temperature
    
    # Contrastive loss (example: NT-Xent)
    # Replace this with your actual contrastive loss implementation
    loss = -torch.log(torch.diag(sim_matrix).exp() / sim_matrix.exp().sum(dim=1))
    return loss.mean()

# Training loop for contrastive learning
def train_contrastive(epochs):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in dataloader:
            batch = batch.to(device)
            optimizer.zero_grad()
            embeddings = model(batch.x, batch.edge_index, batch.batch)
            loss = contrastive_loss(embeddings)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(dataloader)}")

# Initialize the model
model = ContrastiveGNN(input_dim=dataset.num_node_features, hidden_dim=128, embedding_dim=64)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

print(torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Train the model
train_contrastive(epochs=100)

embeddings = []
for batch in dataloader:
    model.eval()
    with torch.no_grad():
        embeddings.append(model(batch.x, batch.edge_index, batch.batch))

True
Epoch 1, Loss: 3.394988247490791


KeyboardInterrupt: 

Featurizar

In [43]:
embedding = embeddings[-1].detach().cpu().numpy()
print(embedding)
print(embedding.shape)
print(len(embedding))

[[ 0.5106915  -0.49164522 -0.5399986  ... -0.3614224  -0.16016841
  -0.07859194]
 [ 0.5106915  -0.45544335 -0.50530016 ... -0.35572392 -0.15966657
  -0.07829054]
 [ 0.5297204  -0.49081612 -0.5399988  ... -0.37471935 -0.16792496
  -0.07916652]
 ...
 [ 0.5106915  -0.48980203 -0.5399988  ... -0.35558358 -0.1618756
  -0.07869917]
 [ 0.5106915  -0.49159074 -0.5399986  ... -0.36136273 -0.16039062
  -0.07865348]
 [ 0.5106915  -0.4914373  -0.5399986  ... -0.3569494  -0.15863296
  -0.07605761]]
(17, 128)
17


In [47]:
# Cargar el modelo
model = GCN()
model.load_state_dict(torch.load('model.pth'))
model.eval()

# Cargar los embeddings
#embedding = np.loadtxt('embeddings.csv', delimiter=',')

# Sentence2vec
mol = Chem.MolFromSmiles('CC(C)C')
adj = Chem.GetAdjacencyMatrix(mol)

nodesym = [atom.GetSymbol() for atom in mol.GetAtoms()]
bond_type = [bond.GetBondType() for bond in mol.GetBonds()]
G = nx.Graph(adj, symbol=nodesym, bond_type=bond_type)

def get_vector(G):
    data = nx_to_pyg(G)
    data.to(device)
    _, embedding = model(data.x, data.edge_index, data.batch)
    return embedding.detach().cpu().numpy()

vector = get_vector(G)
print(vector)
print(vector.shape)

# Calcular la distancia euclidiana
from scipy.spatial.distance import euclidean

mol2 = Chem.MolFromSmiles('CC(C)O')
adj2 = Chem.GetAdjacencyMatrix(mol2)

nodesym2 = [atom.GetSymbol() for atom in mol2.GetAtoms()]
bond_type2 = [bond.GetBondType() for bond in mol2.GetBonds()]
G2 = nx.Graph(adj2, symbol=nodesym2, bond_type=bond_type2)

vector2 = get_vector(G2)
print(vector2)
print(vector2.shape)

print(euclidean(vector.flatten(), vector2.flatten()))


[[ 0.5106915  -0.4920931  -0.5399988  -0.23681097  0.69906086  0.378033
   0.6728528  -0.3410698  -0.02140541 -0.3838586  -0.16989408 -0.48199406
   0.59261876  0.39275134  0.82839155  0.46965823  0.70105994 -0.03312314
   0.6600193  -0.53970134  0.5308237  -0.6937653  -0.3765128  -0.15563856
  -0.24933942  0.12511554  0.09804676  0.68223536  0.47348642  0.72101986
  -0.19723928  0.10990268 -0.5461353   0.7248185   0.35445288 -0.22667696
   0.50009793  0.22784697 -0.03144305 -0.11254678  0.40345314  0.8560601
  -0.2608003   0.43231446  0.72672033 -0.73081625  0.7488088  -0.3922557
  -0.20873761 -0.38060647 -0.425526   -0.01969332  0.6490324   0.75536466
  -0.7369295   0.4101855   0.9153064   0.23333922 -0.37043187 -0.3937288
   0.44654912 -0.34494868 -0.15267359 -0.06420745  0.45360258 -0.5310152
  -0.5906396  -0.2729106   0.6348447   0.33583236  0.6066788  -0.3874403
  -0.02549218 -0.41605377 -0.2046188  -0.5271914   0.53462243  0.34657347
   0.77175945  0.41269475  0.63477343 -0.0390

  model.load_state_dict(torch.load('model.pth'))
