In [2]:
# !pip3 install statsmodels
# !pip3 install gmpy2
# !pip3 install cvxpy
# !pip install Mosek
# !pip install ipywidgets



In [2]:
from easydict import EasyDict
import numpy as np
import gmpy2
from tqdm import tqdm
from statsmodels.stats.proportion import proportion_confint
from itertools import product
from collections import defaultdict


import os
import pathlib
import sys

import pandas as pd
import torch
import torchvision
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Subset
import seaborn as sns
import numpy as np

import torch_geometric
from torch_geometric.data import Data as GraphData
import torch_geometric.datasets as pyg_datasets

import matplotlib.pyplot as plt
# Pick the compute device once: use CUDA when torch reports it as available,
# otherwise fall back to the CPU. The choice is echoed so it shows in the cell output.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"device = {device}")

from tgnnu.data_utils.splits import SplitManager
from tgnnu.networks.node_classif_models import GCN
from tgnnu.networks.node_classif_lightner import NodeLevelGNN

import gnn_cp.cp.transformations as cp_t
import gnn_cp.cp.graph_transformations as cp_gt
from gnn_cp.cp.graph_cp import GraphCP

from graph_split import GraphSplit

# import regions_binary
import cvxpy as convex


from scipy.stats import norm

from utils import ModelManager
from utils import standard_l2_norm

# Filesystem locations used by the rest of the notebook.
#   datasets_folder  -> passed as `root=` to the torch_geometric dataset constructors.
#   models_direction -> directory where model weights, split masks, smoothed logits,
#                       upper bounds and result CSVs are read from / written to.
# NOTE(review): both values look like placeholders — point them at real directories before running.
datasets_folder = "path_to_datasets"
models_direction = "path_to_model"
from certify_utils import *
import pickle

def save_pkl(obj, path):
    """Pickle ``obj`` into the file at ``path`` (overwriting it).

    The file is opened in binary write mode and the object is serialized with
    ``pickle.HIGHEST_PROTOCOL``.
    """
    with open(path, 'wb') as sink:
        pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL).dump(obj)
def load_pkl(path):
    """Read the file at ``path`` in binary mode and return the unpickled object.

    Unpickling can execute arbitrary code, so only use this on files that this
    project wrote itself (e.g. via ``save_pkl``).
    """
    with open(path, 'rb') as source:
        return pickle.Unpickler(source).load()


# Training

In [4]:
# ---- Experiment configuration -------------------------------------------
dataset_key = "cora_ml"
model_key = "GCN"

# Smoothing probabilities (values used for cora_ml).
# p_add / p_del are later recorded as the attribute add/delete probabilities,
# p_add_edge / p_del_edge as the adjacency (edge) add/delete probabilities.
p_add = 0.01
p_del = 0.6
p_add_edge = 0.0
p_del_edge = 0.0

# ---- Load the graph dataset ----------------------------------------------
# Exactly one dataset family can match, so use an elif chain and fail loudly on
# an unknown key instead of leaving `dataset` undefined (or stale from an
# earlier kernel run).
if dataset_key in ["cora_ml", "pubmed", "citeseer"]:
    dataset = pyg_datasets.CitationFull(root=datasets_folder, name=dataset_key).data.to(device)
elif dataset_key in ["Coauth-CS", "Coauth-Physics"]:
    # The "Coauth-" prefix is notebook-local; the constructor receives the bare name.
    dataset = pyg_datasets.Coauthor(root=datasets_folder, name=dataset_key.replace("Coauth-", "")).data.to(device)
elif dataset_key in ["Amz-Computers", "Amz-Photo"]:
    # Same idea for the "Amz-" prefix.
    dataset = pyg_datasets.Amazon(root=datasets_folder, name=dataset_key.replace("Amz-", "")).data.to(device)
else:
    raise ValueError(f"Unsupported dataset_key: {dataset_key!r}")

# ---- Baseline model / optimizer parameters -------------------------------
if model_key == "GCN":
    model_params = {
        "n_features": dataset.x.shape[1],
        "n_hidden": 64,
        # Labels are assumed to be 0..K-1, so K = max label + 1.
        "n_classes": dataset.y.max().item() + 1,
        "p_dropout": 0.6
    }
    optimizer_params = {"weight_decay": 1e-2}
else:
    # Only GCN is instantiated below; any other key would otherwise hit a
    # NameError on `model_params`.
    raise ValueError(f"Unsupported model_key: {model_key!r}")

model_r = GCN(**model_params)

In [5]:
# Wrap the raw GCN in the training/evaluation helper and configure its optimizer
# with the keyword arguments chosen in the configuration cell.
model = NodeLevelGNN(model_r)
model.set_optimizer(optimizer_keyargs=optimizer_params)


# Node budgets. `training_budget` is the count passed to each stratified
# `sample_nodes` call below (presumably nodes per class — TODO confirm in GraphSplit).
# The calibration budget is that count multiplied by the number of classes.
training_budget = 20
calibration_budget = training_budget * (dataset.y.max().item() + 1)


# Draw the training mask first and the validation mask second from the same
# GraphSplit object. The order matters: sampling is random, so swapping the two
# calls would produce different masks.
# NOTE(review): no random seed is set in this notebook, so the masks differ between runs.
main_split = GraphSplit.from_dataset(dataset)
training_mask = main_split.sample_nodes(training_budget, stratified=True)
validation_mask = main_split.sample_nodes(training_budget, stratified=True)
# Every node that is in neither the training nor the validation mask is a test node.
test_mask = ~(training_mask | validation_mask)

# Attach the masks to the dataset object, which is later handed to `model.fit`.
dataset.train_mask = training_mask
dataset.val_mask = validation_mask

In [6]:

def param_to_str(p):
    """Render a parameter for use in a file name: ``str(p)`` with '.' replaced by '_'."""
    return str(p).replace(".", "_")


# All artefacts of one configuration share this file-name stem.
checkpoint_stem = (
    f"{model_key}-{dataset_key}-{param_to_str(p_add)}-{param_to_str(p_del)}"
    f"-{param_to_str(p_add_edge)}-{param_to_str(p_del_edge)}"
)
weights_path = os.path.join(models_direction, f"{checkpoint_stem}.pth")
masks_path = os.path.join(models_direction, f"{checkpoint_stem}-masks.pth")

s_model_loaded = False
try:
    # Read BOTH files before touching the model or the dataset, so a missing or
    # broken masks file cannot leave cached weights combined with fresh masks.
    # map_location=device lets a checkpoint written on a GPU machine load on a
    # CPU-only one (and puts the masks on the same device as the dataset).
    weights_state_dict = torch.load(weights_path, map_location=device)
    masks_state_dict = torch.load(masks_path, map_location=device)
    cached_train_mask = masks_state_dict["train_mask"]
    cached_val_mask = masks_state_dict["val_mask"]

    model.model.load_state_dict(weights_state_dict)
    s_model_loaded = True

    # Replace the freshly sampled masks with the ones the cached model was trained on.
    dataset.train_mask = cached_train_mask
    training_mask = dataset.train_mask
    dataset.val_mask = cached_val_mask
    validation_mask = dataset.val_mask
    test_mask = ~(training_mask | validation_mask)
    print("Model loaded")
except Exception as exc:
    # `Exception` (not a bare except) keeps Ctrl-C / SystemExit working, and the
    # reason is printed so a corrupt checkpoint is distinguishable from a missing one.
    print(f"Unable to load model: {exc!r}")
    tloss, vloss = model.fit(dataset, epochs=1000, patience=20, 
        smoothing_lambda=lambda d: standard_sparse_smoothing_concat(d, n_samples=4, p_add=p_add, p_del=p_del, p_add_edge=p_add_edge, p_del_edge=p_del_edge), smoothing_attrs=True)
    # parents=True so a nested models directory can be created in one go.
    pathlib.Path(models_direction).mkdir(parents=True, exist_ok=True)
    torch.save(model.model.state_dict(), weights_path)
    # Save the train and validation masks next to the weights so a later run
    # evaluates on the same split.
    masks_state_dict = {
        "train_mask": dataset.train_mask,
        "val_mask": dataset.val_mask
    }
    torch.save(masks_state_dict, masks_path)

# Single (non-smoothed) forward-pass accuracy on the test nodes.
accuracy = model.evaluate(dataset, dataset.y, test_mask)
print(f"Accuracy of one-time prediction = {accuracy}")

In [7]:
# Collect the model's outputs under the sparse smoothing distribution:
# `smooth_predict` is given n_samples=10000 and a callable that applies
# `standard_sparse_smoothing` with the notebook's flip probabilities.
# mask=None is passed through unchanged (presumably "all nodes" — TODO confirm).
smooth_logits = model.smooth_predict(
    dataset,
    n_samples=10000,
    smoothing_function=lambda input: standard_sparse_smoothing(
        input=input,
        p_add=p_add,
        p_del=p_del,
        p_add_edge=p_add_edge,
        p_del_edge=p_del_edge,
    ),
    mask=None,
)

# Persist the result next to the model checkpoint so later cells can reload it
# instead of recomputing.
torch.save(
    smooth_logits,
    pathlib.Path(models_direction)
    / f"{model_key}-{dataset_key}-{param_to_str(p_add)}-{param_to_str(p_del)}-{param_to_str(p_add_edge)}-{param_to_str(p_del_edge)}-smooth_logits.pth",
)

# Robust Conformal

In [13]:
# Reload the cached smoothed logits for the current configuration, mapping every
# tensor onto the CPU regardless of the device it was saved from.
smooth_logits = torch.load(
    pathlib.Path(models_direction)
    / f"{model_key}-{dataset_key}-{param_to_str(p_add)}-{param_to_str(p_del)}-{param_to_str(p_add_edge)}-{param_to_str(p_del_edge)}-smooth_logits.pth",
    map_location=torch.device('cpu'),
)


In [None]:
def _upper_bound_matrix(offset_fn, scores, node_ids, n_classes, **offset_kwargs):
    """Evaluate ``offset_fn`` for every (node, class) pair and stack the results.

    Calls ``offset_fn(scores, node_i, class_i, **offset_kwargs)`` for each
    ``node_i`` in ``node_ids`` and each ``class_i`` in ``range(n_classes)``.

    Returns a 2-D tensor with one row per node and one column per class.
    """
    return torch.stack([
        torch.tensor([
            offset_fn(scores, node_i, class_i, **offset_kwargs)
            for class_i in range(n_classes)
        ])
        for node_i in node_ids
    ])


# ---- Perturbation radii ---------------------------------------------------
# r_del is swept by the loop below, so it is not assigned here.
r_add = 0
r_add_edge = 0
r_del_edge = 0

# Nominal coverage levels 0.70, 0.75, ..., 0.95.
# np.linspace is used instead of np.arange(0.7, 1, 0.05): with a float step,
# arange's length is not numerically stable and that call yields a 7th value
# of ~1.0, i.e. an unintended coverage target at or above 1.
coverage_guarantees = np.linspace(0.7, 0.95, 6)

scoreing = 'APS'

# ---- Work that does not depend on r_del (hoisted out of the sweep) --------
n_nodes = dataset.x.shape[0]
n_classes = dataset.y.max().item() + 1
file_stem = (
    f"{model_key}-{dataset_key}-{param_to_str(p_add)}-{param_to_str(p_del)}"
    f"-{param_to_str(p_add_edge)}-{param_to_str(p_del_edge)}"
)

smooth_logits = torch.load(
    os.path.join(models_direction, f"{file_stem}-smooth_logits.pth"),
    map_location=torch.device('cpu'))

y_true_mask = F.one_hot(dataset.y).bool()

# Conformity scores for every slice along axis 1 of the smoothed logits
# (presumably one slice per smoothing sample — TODO confirm). After the
# permute, that axis is last, so `mean(axis=2)` averages over it.
if scoreing == 'TPS':
    score_cp = GraphCP(transformation_sequence=[cp_t.TPSTransformation(softmax=True)], coverage_guarantee=0.9)
else:
    score_cp = GraphCP(transformation_sequence=[cp_t.APSTransformation(softmax=True)], coverage_guarantee=0.9)
sc_scores = torch.stack([
    score_cp.get_scores_from_logits(smooth_logits[:, i, :])
    for i in range(smooth_logits.shape[1])
]).permute(1, 2, 0)
if scoreing != 'TPS':
    # The APS branch shifts all scores by +1 (kept exactly as in the original code).
    sc_scores = sc_scores + 1

# Mean score per (node, class). Moved to `device` because it is indexed with
# calibration masks created on `device`; leaving it on the CPU breaks on CUDA hosts.
esc_scores = sc_scores.mean(axis=2).to(device)

# ---- Sweep over the deletion radius ----------------------------------------
for r_del in range(4):
    print('fine')

    radius_kwargs = dict(pf_plus_att=p_add, pf_minus_att=p_del, ra=r_add, rd=r_del)

    # Per-(node, class) score upper bounds from the two certificates.
    dkw_score_upperbound = _upper_bound_matrix(
        dkw_offset, sc_scores, tqdm(range(n_nodes)), n_classes,
        num_s=2000, alpha=0.1, **radius_kwargs).to(device)

    np_score_upperbound = _upper_bound_matrix(
        np_offset, sc_scores, range(n_nodes), n_classes,
        alpha=0.1, **radius_kwargs).to(device)

    args = EasyDict(model_key= model_key, dataset_key=dataset_key, p_add=p_add, p_del=p_del, p_add_edge= p_add_edge,
                p_del_edge=p_del_edge, r_add=r_add, r_del=r_del, r_add_edge=r_add_edge, r_del_edge=r_del_edge)

    # Cache the bounds so they can be re-used without recomputation.
    save_pkl((args, np_score_upperbound.cpu(), dkw_score_upperbound.cpu()),
         path=os.path.join(models_direction,
                        f"{file_stem}-rs-{r_add}-{r_del}-{r_add_edge}-{r_del_edge}-upper_bouds_new.pth"))

    records = []

    for coverage_guarantee in coverage_guarantees:

        # NOTE(review): this always builds an APSTransformation, even when
        # scoreing == 'TPS'. Only calibrate_from_scores / coverage /
        # average_set_size are called on it with precomputed scores, so it is
        # probably irrelevant — confirm before running with TPS.
        cp = GraphCP(transformation_sequence=[cp_t.APSTransformation(softmax=True)], coverage_guarantee=coverage_guarantee)
        for iter_i in range(100):
            split = GraphSplit(n_nodes, n_edges=dataset.edge_index.shape[1], edge_index=dataset.edge_index, device=device)

            # Training / validation nodes must not be drawn as calibration nodes.
            split._vertices_budget[training_mask | validation_mask] = False
            cal_mask = split.sample_nodes(calibration_budget, stratified=False)

            threshold = cp.calibrate_from_scores(esc_scores[cal_mask], y_true_mask[cal_mask])

            # NOTE(review): as in the original, every non-calibration node is
            # evaluated, including training and validation nodes.
            eval_mask = ~cal_mask

            # A class is in the prediction set when its upper-bounded score exceeds the threshold.
            pred_sets = (
                ("NP", np_score_upperbound > threshold),
                ("DKW", dkw_score_upperbound > threshold),
            )
            for method_name, pred_set in pred_sets:
                records.append({
                    "p_add_attr": p_add,
                    "p_del_attr": p_del,
                    "p_add_adj": p_add_edge,
                    "p_del_adj": p_del_edge,
                    "r_add_attr": r_add,
                    "r_del_attr": r_del,
                    "r_add_adj": r_add_edge,
                    "r_del_adj": r_del_edge,
                    "iter": iter_i,
                    "method": method_name,
                    "$1-\\alpha$": coverage_guarantee,
                    "coverage": cp.coverage(pred_set[eval_mask], y_true_mask[eval_mask]),
                    "set_size": cp.average_set_size(pred_set[eval_mask]),
                    "singleton_hits": singleton_hit(pred_set[eval_mask], y_true_mask[eval_mask]),
                })

    # NOTE(review): the CSV name hard-codes "cora" rather than using dataset_key.
    result = pd.DataFrame(records)
    result.to_csv(os.path.join(models_direction, f"{scoreing}-results-cora_{r_del}.csv"), index=False)


fine



100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 2995/2995 [20:59<00:00,  2.38it/s]
