In [1]:
# =========================
# Complete GNN pipeline
# =========================

import pandas as pd
import torch
from torch_geometric.data import Data, Dataset, DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from rdkit import Chem
from rdkit.Chem import AllChem

def is_alkaneetc(inchi):
    """Return True when ``inchi`` parses to a molecule containing only C and H atoms.

    Despite the name, saturation is not checked: any hydrocarbon whose atoms
    are all carbon or hydrogen passes.

    Parameters
    ----------
    inchi : str
        InChI identifier handed to RDKit for parsing.

    Returns
    -------
    bool
        False when ``inchi`` is not a string, when RDKit cannot parse it, or
        when any atom has a symbol other than "C" or "H"; True otherwise.
    """
    # Blank CSV cells arrive as float NaN (or None); RDKit's parser only
    # accepts strings, so treat anything else as "not a hydrocarbon".
    if not isinstance(inchi, str):
        return False
    mol = Chem.MolFromInchi(inchi)
    if not mol:
        return False
    return all(atom.GetSymbol() in ("C", "H") for atom in mol.GetAtoms())

# -------------------------
# 1. Load & clean CSV
# -------------------------
import os  # local import: only this step needs it, for the optional path override

# The CSV location is machine-specific. Point the VAPOUR_PRESSURE_CSV
# environment variable at your copy; the original path remains the fallback.
DATA_CSV = os.environ.get(
    "VAPOUR_PRESSURE_CSV",
    "C:\\Users\\matrix\\Documents\\Projects\\Exploratory_Project\\vapour_pressure_data_final.csv",
)
df = pd.read_csv(DATA_CSV)

# InChI is part of the required columns because the hydrocarbon filter below
# passes every InChI value to RDKit, which cannot parse missing entries.
df = (
    df.drop_duplicates()
    .dropna(subset=['SMILES', 'VapourPressure_kPa', 'InChI'])
    .reset_index(drop=True)
)

# Filter hydrocarbons; astype(bool) keeps the mask boolean even if no rows remain.
hydrocarbon_mask = df['InChI'].apply(is_alkaneetc).astype(bool)
df = df[hydrocarbon_mask].reset_index(drop=True)
print(f"Filtered {len(df)} hydrocarbon records with vapor pressure data.")
# -------------------------
# 2. Function: SMILES → graph
# -------------------------
def mol_to_graph(smiles, target):
    """Convert a SMILES string and its target value into a PyG ``Data`` graph.

    Every atom becomes a node with four features (atomic number, degree,
    hybridization enum value, aromatic flag). Every bond is stored as two
    directed edges so that the graph is effectively undirected.

    Parameters
    ----------
    smiles : str
        SMILES string parsed with RDKit.
    target : float
        Regression target stored on the graph as ``y``.

    Returns
    -------
    Data or None
        Graph with ``x`` (one row of 4 floats per atom), ``edge_index``
        (2 rows, two columns per bond) and ``y`` (single value). None when
        ``smiles`` is not a string, cannot be parsed, or has no atoms.
    """
    # Blank CSV cells arrive as float NaN, which RDKit's parser rejects.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    # A molecule without atoms would produce a 1-D (0,) feature tensor and a
    # graph with no nodes to pool over, so it is rejected like a parse failure.
    if mol is None or mol.GetNumAtoms() == 0:
        return None

    # Nodes: atom features (explicit int casts keep the list purely numeric)
    atom_features = [
        [
            atom.GetAtomicNum(),               # atomic number
            atom.GetDegree(),                  # number of bonded neighbors
            int(atom.GetHybridization()),      # hybridization type (enum value)
            int(atom.GetIsAromatic()),         # aromaticity as 0/1
        ]
        for atom in mol.GetAtoms()
    ]
    x = torch.tensor(atom_features, dtype=torch.float)

    # Edges: each bond is listed in both directions
    pairs = []
    for bond in mol.GetBonds():
        begin = bond.GetBeginAtomIdx()
        end = bond.GetEndAtomIdx()
        pairs.append((begin, end))
        pairs.append((end, begin))
    if pairs:
        edge_index = torch.tensor(pairs, dtype=torch.long).t().contiguous()
    else:
        # Single-atom molecule: keep the (2, 0) layout expected for edge lists
        edge_index = torch.empty((2, 0), dtype=torch.long)

    # Target
    y = torch.tensor([target], dtype=torch.float)

    return Data(x=x, edge_index=edge_index, y=y)

# -------------------------
# 3. Create dataset
# -------------------------
# Convert each (SMILES, vapour pressure) pair into a graph. mol_to_graph
# returns None for SMILES it cannot convert, and those entries are left out.
candidate_graphs = (
    mol_to_graph(smiles, pressure)
    for smiles, pressure in zip(df['SMILES'], df['VapourPressure_kPa'])
)
data_list = [graph for graph in candidate_graphs if graph is not None]

class VaporPressureDataset(Dataset):
    """Dataset wrapper around a list of graphs that already sits in memory.

    Implements ``len`` and ``get`` directly on top of the stored list.
    """

    def __init__(self, data_list):
        """Run the base-class initializer with its defaults, then keep ``data_list``."""
        super(VaporPressureDataset, self).__init__()
        self.data_list = data_list

    def len(self):
        """Return the number of stored graphs."""
        graphs = self.data_list
        return len(graphs)

    def get(self, idx):
        """Return the graph stored at position ``idx``."""
        graphs = self.data_list
        return graphs[idx]

# Wrap the graph list in the dataset class, then iterate it in shuffled
# mini-batches of 32 graphs.
dataset = VaporPressureDataset(data_list=data_list)
loader = DataLoader(dataset, shuffle=True, batch_size=32)

# -------------------------
# 4. Define GNN model
# -------------------------
class GNN(torch.nn.Module):
    """Two graph-convolution layers, mean pooling, and a linear regression head.

    Parameters
    ----------
    hidden_channels : int, default 64
        Width of both convolution layers and of the pooled graph embedding.
    in_channels : int, default 4
        Number of features per node. The default matches the four atom
        features assembled in ``mol_to_graph``.
    """

    def __init__(self, hidden_channels=64, in_channels=4):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.lin = torch.nn.Linear(hidden_channels, 1)

    def forward(self, x, edge_index, batch):
        """Compute the model output for a batch of graphs.

        Parameters
        ----------
        x : torch.Tensor
            Node feature matrix.
        edge_index : torch.Tensor
            Edge list passed to both convolution layers.
        batch : torch.Tensor
            Per-node graph assignment passed to ``global_mean_pool``.

        Returns
        -------
        torch.Tensor
            Output of the final linear layer (one feature) applied to the
            mean-pooled node embeddings.
        """
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        x = torch.relu(x)
        x = global_mean_pool(x, batch)  # graph-level pooling
        x = self.lin(x)
        return x

# Choose the device first and place the network on it, then build the loss and
# the optimizer over the network's parameters.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GNN().to(device)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# -------------------------
# 5. Training loop
# -------------------------
for epoch in range(1, 51):
    model.train()
    total_loss = 0
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        out = model(batch.x, batch.edge_index, batch.batch)
        # Flatten predictions and targets so MSELoss compares matching 1-D tensors
        loss = criterion(out.view(-1), batch.y.view(-1))
        loss.backward()
        optimizer.step()
        # Scale the batch loss by the number of graphs so the epoch value
        # below is an average per graph
        total_loss += loss.item() * batch.num_graphs

    total_loss /= len(dataset)
    print(f'Epoch {epoch}, Loss: {total_loss:.4f}')


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\matrix\\Documents\\Projects\\Exploratory_Projects\\vapour_pressure_data_final.csv'

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data, Dataset
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from rdkit import Chem
import numpy as np
from sklearn.model_selection import train_test_split

# ---------------------------
# 1. Load and preprocess data
# ---------------------------
import os  # local import: only this step needs it, for the optional path override

# The CSV location is machine-specific. Point the VAPOUR_PRESSURE_CSV
# environment variable at your copy; the original path remains the fallback.
DATA_CSV = os.environ.get(
    "VAPOUR_PRESSURE_CSV",
    "C:\\Users\\satya\\OneDrive\\Desktop\\vapour_pressure_data_final.csv",
)
df = pd.read_csv(DATA_CSV)

# Drop rows that lack a SMILES string or a vapour-pressure value
df = df.dropna(subset=['SMILES', 'VapourPressure_kPa'])

# Log scaling of target: store log(1 + p) in a new column. assign() returns a
# fresh frame instead of writing a column onto the dropna() result.
df = df.assign(VapourPressure_log=np.log1p(df['VapourPressure_kPa']))

# ---------------------------
# 2. Convert SMILES to PyG graphs
# ---------------------------
def mol_to_graph(smiles, target):
    """Build a PyG ``Data`` graph from a SMILES string and a target value.

    Node features per atom: atomic number, degree, hybridization enum value
    and aromatic flag (0/1). Each bond contributes two directed edges.

    Parameters
    ----------
    smiles : str
        SMILES string parsed with RDKit.
    target : float
        Regression target stored on the graph as ``y``.

    Returns
    -------
    Data or None
        The graph, or None when ``smiles`` is not a string, cannot be parsed,
        or describes a molecule without atoms.
    """
    # Blank CSV cells arrive as float NaN, which RDKit's parser rejects.
    if not isinstance(smiles, str):
        return None
    mol = Chem.MolFromSmiles(smiles)
    # A molecule without atoms would give a 1-D (0,) feature tensor and a graph
    # with no nodes to pool over; treat it like a parse failure.
    if mol is None or mol.GetNumAtoms() == 0:
        return None

    # Atom features: atomic number, degree, hybridization, aromaticity
    atom_features = [[atom.GetAtomicNum(),
                      atom.GetDegree(),
                      int(atom.GetHybridization()),
                      int(atom.GetIsAromatic())]
                     for atom in mol.GetAtoms()]
    x = torch.tensor(atom_features, dtype=torch.float)

    # Bonds / edges: list every bond in both directions
    pairs = []
    for bond in mol.GetBonds():
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        pairs.extend(((begin, end), (end, begin)))
    if pairs:
        edge_index = torch.tensor(pairs, dtype=torch.long).t().contiguous()
    else:
        # No bonds (single atom): keep the two-row edge-list layout
        edge_index = torch.empty((2, 0), dtype=torch.long)

    y = torch.tensor([target], dtype=torch.float)
    return Data(x=x, edge_index=edge_index, y=y)

# Build list of Data objects. mol_to_graph signals an unusable SMILES with
# None, so test for None explicitly instead of relying on the truthiness of a
# Data object.
data_list = [
    graph
    for graph in (
        mol_to_graph(smiles, log_pressure)
        for smiles, log_pressure in zip(df['SMILES'], df['VapourPressure_log'])
    )
    if graph is not None
]

# ---------------------------
# 3. Train/test split
# ---------------------------
# Fixed random_state keeps the 80/20 partition the same on every run.
train_data, test_data = train_test_split(data_list, test_size=0.2, random_state=42)

# Only the training batches are shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

# ---------------------------
# 4. Define a simple GCN
# ---------------------------
class GCN(nn.Module):
    """Graph-level regressor: two GCNConv layers, mean pooling, one linear output."""

    def __init__(self, in_channels, hidden_channels):
        """Create the layers.

        ``in_channels`` is the per-node feature count fed to the first
        convolution; ``hidden_channels`` is the width of both convolutions and
        of the input to the final linear layer.
        """
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.lin = nn.Linear(hidden_channels, 1)

    def forward(self, x, edge_index, batch):
        """Run both convolutions with ReLU, mean-pool with ``batch``, apply the linear layer."""
        hidden = self.conv1(x, edge_index)
        hidden = F.relu(hidden)
        hidden = self.conv2(hidden, edge_index)
        hidden = F.relu(hidden)
        # Collapse node embeddings into one vector per graph
        pooled = global_mean_pool(hidden, batch)
        return self.lin(pooled)

# Input features = number of atom features
model = GCN(in_channels=4, hidden_channels=64)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# ---------------------------
# 5. Training loop
# ---------------------------
num_epochs = 50

for epoch in range(1, num_epochs + 1):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        # Flatten prediction and target so the loss compares matching 1-D tensors
        out = model(batch.x, batch.edge_index, batch.batch).view(-1)
        target = batch.y.view(-1)
        loss = criterion(out, target)
        loss.backward()
        optimizer.step()
        # Scale by the number of graphs so the epoch figure is a per-graph average
        total_loss += loss.item() * batch.num_graphs

    avg_loss = total_loss / len(train_loader.dataset)
    print(f"Epoch {epoch}, Loss: {avg_loss:.4f}")

# ---------------------------
# 6. Evaluation
# ---------------------------
model.eval()
preds = []
targets = []
with torch.no_grad():
    for batch in test_loader:
        out = model(batch.x, batch.edge_index, batch.batch).view(-1)
        preds += out.tolist()
        targets += batch.y.view(-1).tolist()

# Undo the log1p transform applied to the target so errors are in kPa
preds_original = np.expm1(preds)
targets_original = np.expm1(targets)

# Root-mean-square error on the original scale
rmse = np.sqrt(np.mean(np.square(preds_original - targets_original)))
print(f"Test RMSE (kPa): {rmse:.4f}")
