In [1]:
# =========================
# Complete GNN pipeline
# =========================

import pandas as pd
import torch
from torch_geometric.data import Data, Dataset, DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from rdkit import Chem
from rdkit.Chem import AllChem

def is_alkaneetc(inchi):
    """Return True when *inchi* describes a molecule made only of C and H atoms.

    Args:
        inchi: InChI string of the molecule. Values that are not strings
            (for example ``None`` or a float NaN) and blank strings are
            treated as unparseable.

    Returns:
        bool: False when the input is not a usable string, when RDKit cannot
        build a molecule from it, or when any atom has a symbol other than
        "C" or "H"; True otherwise.
    """
    # Reject missing/blank values up front so they are never handed to RDKit.
    if not isinstance(inchi, str) or not inchi.strip():
        return False
    mol = Chem.MolFromInchi(inchi)
    if mol is None:
        return False
    return all(atom.GetSymbol() in ("C", "H") for atom in mol.GetAtoms())

# -------------------------
# 1. Load & clean CSV
# -------------------------
df = pd.read_csv("C:\\Users\\satya\\OneDrive\\Desktop\\vapour_pressure_data_final.csv")
# The hydrocarbon filter below parses the InChI column, so rows missing it are
# dropped here together with rows missing the SMILES or the target value.
df = (
    df.drop_duplicates()
    .dropna(subset=['SMILES', 'InChI', 'VapourPressure_kPa'])
    .reset_index(drop=True)
)

# Keep only the rows whose InChI passes the carbon/hydrogen-only check.
df = df[df['InChI'].apply(is_alkaneetc)]
print(f"Filtered {len(df)} hydrocarbon records with vapor pressure data.")
# -------------------------
# 2. Function: SMILES -> graph
# -------------------------
def mol_to_graph(smiles, target):
    """Convert a SMILES string and its target value into a ``Data`` graph.

    Every atom becomes a node with four features (atomic number, degree,
    hybridization enum value, aromatic flag). Every bond becomes two directed
    edges, one per direction, so the graph is effectively undirected.

    Args:
        smiles: SMILES string of the molecule.
        target: Regression target stored on the graph as ``y``.

    Returns:
        A ``Data`` object holding ``x`` (float, one row per atom),
        ``edge_index`` (long, two rows, one column per directed edge) and
        ``y`` (float, one element), or ``None`` when *smiles* is not a
        non-blank string, RDKit cannot parse it, or the parsed molecule has
        no atoms.
    """
    # Missing or blank values cannot describe a molecule; skip them before
    # they reach RDKit.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    mol = Chem.MolFromSmiles(smiles)
    # A molecule without atoms would yield an empty 1-D feature tensor rather
    # than a (num_atoms, 4) matrix, so it is treated as unusable too.
    if mol is None or mol.GetNumAtoms() == 0:
        return None

    # Nodes: one feature row per atom.
    atom_features = [
        [
            atom.GetAtomicNum(),            # atomic number
            atom.GetDegree(),               # number of bonded neighbors
            atom.GetHybridization().real,   # hybridization type as a number
            atom.GetIsAromatic(),           # aromaticity flag
        ]
        for atom in mol.GetAtoms()
    ]
    x = torch.tensor(atom_features, dtype=torch.float)

    # Edges: each bond is stored in both directions.
    edge_pairs = []
    for bond in mol.GetBonds():
        start = bond.GetBeginAtomIdx()
        end = bond.GetEndAtomIdx()
        edge_pairs.append([start, end])
        edge_pairs.append([end, start])

    if edge_pairs:
        edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()
    else:
        # No bonds (e.g. a single atom): keep the two-row layout with zero columns.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    # Target
    y = torch.tensor([target], dtype=torch.float)

    return Data(x=x, edge_index=edge_index, y=y)

# -------------------------
# 3. Create dataset
# -------------------------
# Turn each (SMILES, vapour pressure) pair into a graph and keep only the
# rows for which a graph was actually produced.
candidate_graphs = (
    mol_to_graph(smiles, pressure)
    for smiles, pressure in zip(df['SMILES'], df['VapourPressure_kPa'])
)
data_list = [graph for graph in candidate_graphs if graph is not None]

class VaporPressureDataset(Dataset):
    """Dataset that serves graphs from a list already held in memory."""

    def __init__(self, data_list):
        """Initialise the base ``Dataset`` and keep a reference to *data_list*."""
        super().__init__()
        self.data_list = data_list

    def len(self):
        """Return how many graphs the dataset holds."""
        graphs = self.data_list
        return len(graphs)

    def get(self, idx):
        """Return the graph stored at position *idx*."""
        graphs = self.data_list
        return graphs[idx]

# Wrap the graphs in the dataset class and serve them in shuffled mini-batches.
BATCH_SIZE = 32
dataset = VaporPressureDataset(data_list)
loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# -------------------------
# 4. Define GNN model
# -------------------------
class GNN(torch.nn.Module):
    """Two-layer GCN followed by mean pooling and a linear read-out.

    Args:
        hidden_channels: Width of both graph-convolution layers and of the
            input to the final linear layer.
        in_channels: Number of features per node. Defaults to 4, the number
            of per-atom features this pipeline builds.
    """

    def __init__(self, hidden_channels=64, in_channels=4):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.lin = torch.nn.Linear(hidden_channels, 1)

    def forward(self, x, edge_index, batch):
        """Run the network and return the output of the final linear layer.

        Args:
            x: Node features fed to the first convolution.
            edge_index: Graph connectivity passed to both convolutions.
            batch: Per-node graph assignment used by the mean pooling.

        Returns:
            The linear layer's output for the pooled graph representations
            (last dimension of size 1).
        """
        x = torch.relu(self.conv1(x, edge_index))
        x = torch.relu(self.conv2(x, edge_index))
        x = global_mean_pool(x, batch)  # graph-level pooling
        return self.lin(x)

# Model with its default sizes, mean-squared-error loss, and Adam over all
# of the model's parameters.
LEARNING_RATE = 0.001
model = GNN()
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# -------------------------
# 5. Training loop
# -------------------------
# Use CUDA when torch reports it as available, otherwise stay on the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model = model.to(device)

NUM_EPOCHS = 50

for epoch in range(1, NUM_EPOCHS + 1):
    model.train()
    running_loss = 0
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        out = model(batch.x, batch.edge_index, batch.batch)
        loss = criterion(out.view(-1), batch.y.view(-1))
        loss.backward()
        optimizer.step()
        # Weight the batch loss by its graph count so the epoch figure is a
        # per-graph average rather than a per-batch one.
        running_loss += loss.item() * batch.num_graphs

    total_loss = running_loss / len(dataset)
    print(f'Epoch {epoch}, Loss: {total_loss:.4f}')


ModuleNotFoundError: No module named 'torch'