In [1]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset
data = load_breast_cancer()

# Convert to pandas DataFrame
df = pd.DataFrame(data.data, columns=data.feature_names)
# df['target'] = data.target


KeyboardInterrupt: 

In [2]:
from causalnex.structure.notears import from_pandas

sm = from_pandas(df)
sm.remove_edges_below_threshold(0.8)

In [4]:
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE

viz = plot_structure(
    sm,
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK,
)

viz.toggle_physics(False)
viz.show("01_connected.html")

01_fully_connected.html


In [6]:
import networkx as nx

adj_matrix = nx.to_pandas_adjacency(sm)
adj_matrix = adj_matrix.applymap(lambda x: 1 if x != 0 else 0)
adj_matrix.to_csv('graph.csv')

In [10]:
len(df)

569

In [39]:
import torch
import numpy as np
import torch_geometric.utils
from torch_geometric.data import Data, DataLoader
from tqdm.notebook import tqdm


x_tensor = torch.from_numpy(np.array(df)).float()
target_tensor = torch.Tensor(data.target).long()
adj_tensor = torch.from_numpy(np.array(adj_matrix))
sm_convert = torch_geometric.utils.from_networkx(sm)

positional_encoder = torch.rand(30, 6).float()

split_idx = {
    'train': torch.tensor(np.arange(0, 400)),
    'valid': torch.tensor(np.arange(400, 500)),
    'test': torch.tensor(np.arange(500, 569)),
}

train_list = []
test_list = []
valid_list = []
for i in range(len(target_tensor)):
  x_yeet = x_tensor[i,:]
  x_scalar = torch.t(torch.reshape(x_yeet, (1, len(x_yeet)))).float()
  x = torch.cat((x_scalar, positional_encoder), 1)
  y = target_tensor[i]
  if i in split_idx['train']:
    train_list.append(Data(x=x, y=y, edge_index=sm_convert.edge_index, edge_attr=sm_convert.weight))
  elif i in split_idx['valid']:
    valid_list.append(Data(x=x, y=y, edge_index=sm_convert.edge_index, edge_attr=sm_convert.weight))
  elif i in split_idx['test']:
    test_list.append(Data(x=x, y=y, edge_index=sm_convert.edge_index, edge_attr=sm_convert.weight))

device = 'cuda' if torch.cuda.is_available() else 'cpu'

train_loader = DataLoader(train_list, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_list, batch_size=8, shuffle=False)
test_loader = DataLoader(test_list, batch_size=8, shuffle=False)

In [45]:
args = {
    'device': device,
    'num_layers': 5,
    'hidden_dim': 256,
    'dropout': 0.3,
    'lr': 1e-3,
    'epochs': 50,
}

{'device': 'cuda',
 'num_layers': 5,
 'hidden_dim': 256,
 'dropout': 0,
 'lr': 0.001,
 'epochs': 50}

In [46]:
import torch.nn.functional as F


class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers,
                 dropout, return_embeds=False):
        super(GCN, self).__init__()

        self.convs = None
        self.bns = None
        self.softmax = None

        self.convs = torch.nn.ModuleList()
        self.bns = torch.nn.ModuleList()

        for l in range(num_layers):
          if l==0:
            self.convs.append(GCNConv(input_dim, hidden_dim))
          elif l == num_layers-1:
            self.convs.append(GCNConv(hidden_dim, output_dim))
          else:
            self.convs.append(GCNConv(hidden_dim, hidden_dim))
          if l < num_layers-1:
            self.bns.append(torch.nn.BatchNorm1d(hidden_dim))

        self.last_conv = GCNConv(hidden_dim, output_dim)
        self.log_soft = torch.nn.LogSoftmax()

        self.dropout = dropout
        self.return_embeds = return_embeds

    def reset_parameters(self):
        for conv in self.convs:
            conv.reset_parameters()
        for bn in self.bns:
            bn.reset_parameters()

    def forward(self, x, adj_t, edge_weight):
        out = None
        for l in range(len(self.convs)-1):
          x = self.convs[l](x, adj_t, edge_weight)
          x = self.bns[l](x)
          x = F.relu(x)
          x = F.dropout(x, training=self.training)

        x = self.last_conv(x, adj_t, edge_weight)
        if self.return_embeds is True:
          out = x
        else:
          out = self.log_soft(x)

        return out

In [53]:
from ogb.graphproppred.mol_encoder import AtomEncoder
from torch_geometric.nn import global_add_pool, global_mean_pool


class GCN_Graph(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout):
        super(GCN_Graph, self).__init__()

        self.gnn_node = GCN(input_dim, hidden_dim,
            hidden_dim, num_layers, dropout, return_embeds=True)

        self.gnn_node_2 = GCN(hidden_dim, hidden_dim,
        hidden_dim, num_layers, dropout, return_embeds=True)

        self.asap = torch_geometric.nn.pool.ASAPooling(in_channels=256, ratio=0.5, dropout=0.1, negative_slope=0.2, add_self_loops=False)

        self.pool = global_mean_pool

        self.linear = torch.nn.Linear(hidden_dim, output_dim)

    def reset_parameters(self):
      self.gnn_node.reset_parameters()
      self.linear.reset_parameters()

    def forward(self, batched_data):
        x, edge_index, batch, edge_weight = batched_data.x, batched_data.edge_index, batched_data.batch, batched_data.edge_attr
        embed = x
        out = None

        num_graphs = int(len(batch)/30)
        post_GCN_1 = self.gnn_node(embed, edge_index, edge_weight)
        post_pool_1 = self.asap(post_GCN_1, edge_index)
        post_GCN_2 = self.gnn_node_2(post_pool_1[0], post_pool_1[1], post_pool_1[2])
        post_pool_2 = self.asap(post_GCN_2, post_pool_1[1])
        ultimate_gcn = self.gnn_node_2(post_pool_2[0], post_pool_2[1], post_pool_2[2])

        glob_pool = self.pool(ultimate_gcn, post_pool_2[3], num_graphs)
        out = self.linear(glob_pool)

        return out


def train(model, device, data_loader, optimizer, loss_fn):
    model.train()
    loss = 0

    for step, batch in enumerate(tqdm(data_loader, desc="Iteration")):
        batch = batch.to(device)
        if batch.x.shape[0] == 1 or batch.batch[-1] == 0:
            pass
        else:
            is_labeled = batch.y == batch.y
            optimizer.zero_grad()
            out = model(batch)
            loss = loss_fn(out[is_labeled].squeeze(), batch.y[is_labeled].to(torch.float32).squeeze())
            loss.backward()
            optimizer.step()

    return loss.item()


def eval(model, device, loader, evaluator, save_model_results=False, save_file=None):
    model.eval()
    y_true = []
    y_pred = []

    for step, batch in enumerate(tqdm(loader, desc="Iteration")):
        batch = batch.to(device)

        if batch.x.shape[0] == 1:
            pass
        else:
            with torch.no_grad():
                pred = model(batch)

            y_true.append(batch.y.view(pred.shape).detach().cpu())
            y_pred.append(pred.detach().cpu())

    y_true = torch.cat(y_true, dim=0).numpy()
    y_pred = torch.cat(y_pred, dim=0).numpy()

    input_dict = {"y_true": y_true, "y_pred": y_pred}

    if save_model_results:
        # print("Saving Model Predictions")
        data = {}
        data['y_pred'] = y_pred.reshape(-1)
        data['y_true'] = y_true.reshape(-1)

        df = pd.DataFrame(data=data)
        df.to_csv('ogbg-molhiv_graph_' + save_file + '.csv', sep=',', index=False)

    return evaluator.eval(input_dict)

In [54]:
import os
from torch_geometric.nn import GCNConv
from ogb.graphproppred import PygGraphPropPredDataset, Evaluator


if 'IS_GRADESCOPE_ENV' not in os.environ:
  model = GCN_Graph(7, args['hidden_dim'],
              1, args['num_layers'],
              args['dropout']).to(device)
  evaluator = Evaluator(name='ogbg-molhiv')

  dataset = PygGraphPropPredDataset(name='ogbg-molhiv')

In [55]:
import copy
from pprint import pprint
import torch_geometric.transforms as T


if 'IS_GRADESCOPE_ENV' not in os.environ:
  model.reset_parameters()

  optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])
  loss_fn = torch.nn.BCEWithLogitsLoss()

  best_model = None
  best_valid_acc = 0

  for epoch in range(1, 1 + args["epochs"]):
    pprint('Training...')
    loss = train(model, device, train_loader, optimizer, loss_fn)

    pprint('Evaluating...')
    train_result = eval(model, device, train_loader, evaluator)
    val_result = eval(model, device, valid_loader, evaluator)
    test_result = eval(model, device, test_loader, evaluator)

    train_acc, valid_acc, test_acc = train_result[dataset.eval_metric], val_result[dataset.eval_metric], test_result[dataset.eval_metric]
    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        best_model = copy.deepcopy(model)
    print(f'Epoch: {epoch:02d}, '
          f'Loss: {loss:.4f}, '
          f'Train: {100 * train_acc:.2f}%, '
          f'Valid: {100 * valid_acc:.2f}% '
          f'Test: {100 * test_acc:.2f}%')

'Training...'


Iteration:   0%|          | 0/25 [00:00<?, ?it/s]

'Evaluating...'


Iteration:   0%|          | 0/25 [00:00<?, ?it/s]

ValueError: Input contains NaN.

In [None]:
if 'IS_GRADESCOPE_ENV' not in os.environ:
  train_acc = eval(best_model, device, train_loader, evaluator)[dataset.eval_metric]
  valid_acc = eval(best_model, device, valid_loader, evaluator, save_model_results=True, save_file="valid")[dataset.eval_metric]
  test_acc  = eval(best_model, device, test_loader, evaluator, save_model_results=True, save_file="test")[dataset.eval_metric]

  print(f'Best model: '
      f'Train: {100 * train_acc:.2f}%, '
      f'Valid: {100 * valid_acc:.2f}% '
      f'Test: {100 * test_acc:.2f}%')