# Test

In [None]:
import py3Dmol

text = 'xyz_files/homopoly_109997_chain_1.pdb'
viewer = py3Dmol.view(width=300, height=300)

with open(text, 'r') as pdb_file:
    pdb_data = pdb_file.read()

viewer.addModel(pdb_data, 'pdb')
viewer.setStyle({}, {'sphere': {'scale': 0.25}, 'stick': {}})
viewer.zoomTo()
viewer.show()

# 2D images of connectivity and node features

In [2]:
import pandas as pd
import torch
from tqdm import tqdm
from torch_geometric.data import Data

def read_train_data(data: pd.DataFrame):
    training_dataset = []

    print(f'Reading training data.')
    for row in tqdm(data.itertuples()):
        try:
            training_dataset.append(torch.load(f'../polygraphpy/data/training_input_data/{row.id}_{row.chain_size}.pt', weights_only=False))
        except Exception as e:
            pass
    
    input_dim : int = training_dataset[0].x.shape[1] if training_dataset else 0

    return training_dataset, input_dim

def pad_graphs(dataset):
    max_nodes = max(data.x.shape[0] for data in dataset)
    feature_dim = dataset[0].x.shape[1]
    max_edges = max(data.edge_index.shape[1] for data in dataset)

    padded_dataset = []
    for data in dataset:
        num_nodes = data.x.shape[0]
        num_edges = data.edge_index.shape[1]

        x_padded = torch.zeros((max_nodes, feature_dim))
        x_padded[:num_nodes] = data.x

        edge_index_padded = torch.full((2, max_edges), fill_value=-1, dtype=torch.long)  # use -1 as invalid edge
        edge_index_padded[:, :num_edges] = data.edge_index

        node_mask = torch.zeros(max_nodes, dtype=torch.bool)
        node_mask[:num_nodes] = 1

        edge_mask = torch.zeros(max_edges, dtype=torch.bool)
        edge_mask[:num_edges] = 1

        padded_data = Data(
            x=x_padded,
            edge_index=edge_index_padded,
            y=getattr(data, 'y', None),
            node_mask=node_mask,
            edge_mask=edge_mask,
            id = getattr(data, 'mol_id', None),
            chain_size = getattr(data, 'chain_size', None),
        )

        padded_dataset.append(padded_data)

    return padded_dataset

def get_adjacency_matrix(edge_index: torch.Tensor, num_nodes: int, edge_mask: torch.Tensor = None):
    adj = torch.zeros((num_nodes, num_nodes), dtype=torch.float)

    if edge_mask is not None:
        edge_index = edge_index[:, edge_mask]

    valid = (edge_index[0] >= 0) & (edge_index[1] >= 0)
    edge_index = edge_index[:, valid]

    adj[edge_index[0], edge_index[1]] = 1.0

    return adj

df = pd.read_csv('../polygraphpy/data/polarizability_data.csv')
df = df[df['chain_size'] == 2]
training_dataset, NODE_FEATURE_DIM = read_train_data(df)
padded_dataset = pad_graphs(training_dataset)

node_size = [data.x.shape[0] for data in training_dataset]
max_node = max(node_size)
max_node_idx = node_size.index(max_node)

Reading training data.


8665it [00:02, 3162.52it/s]


In [3]:
import matplotlib.pyplot as plt

id = max_node_idx

df = pd.DataFrame()

for data in tqdm(padded_dataset):
    padded_data = data
    adj = get_adjacency_matrix(
        edge_index=padded_data.edge_index,
        num_nodes=padded_data.x.shape[0],
        edge_mask=getattr(padded_data, 'edge_mask', None)
    )
    x = padded_data.x.numpy()

    desired_width_pixels = adj.shape[0]
    desired_height_pixels = adj.shape[0]
    dpi = 100

    figsize_width = desired_width_pixels / dpi
    figsize_height = desired_height_pixels / dpi

    string = f'adj_{int(padded_data.id.numpy()[0])}_chainsize_{int(padded_data.chain_size.numpy()[0])}'
    y = padded_data.y.numpy()[0]

    df = pd.concat([df, pd.DataFrame({'string': string, 'polarizability': y}, index=[0])]).reset_index(drop=True)

    plt.figure(figsize=(figsize_width, figsize_height))
    plt.imshow(adj, cmap='Greys')
    plt.axis('off')
    plt.savefig(f'images/{string}.png', bbox_inches='tight', pad_inches=0, dpi=dpi)
    plt.close()

df.to_csv('input.csv', index=False)

100%|██████████| 7360/7360 [01:01<00:00, 119.75it/s]
