In [None]:
import pickle
from typing import Dict, Tuple, List
import os
import numpy as np
import json
import logging
import pandas as pd
import sys

import glob

import torch
from torch.utils.data import DataLoader

# from evaluation import evaluation
import evaluation
from model import Distmult, Complex, Conve, Transe
import utils

In [None]:
'''
Pseudocode:
    - Load the poisoned dataset: test.txt holds the target triples and influential_triples.txt holds the influential triples
    - (The target triples must also be loaded from the target dataset to get the correct to_skip_eval; otherwise, regenerate the dicts)
    - Load the original model and compute ranks on the target triples
    - Load the poisoned model and compute ranks on the target triples
    - Compute the difference between the original and poisoned ranks
    - Sort the indices of the target triples by the difference in their ranks
    - Identify the influential triple for the highest and the lowest rank difference
'''

In [None]:
# Configure root logging for the notebook: timestamped records at INFO level.
# (Pass `filename=...` to basicConfig to log to a file instead of the console.)
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)

# Logger used by every cell below.
logger = logging.getLogger(__name__)

In [None]:
## Build the arguments object that is later passed to the model constructor.
parser = utils.get_argument_parser()

# Replace the Jupyter kernel's own argv with a bare program name so that the
# parser sees an empty command line and options fall back to their defaults.
sys.argv = ['prog.py']
args = parser.parse_args(sys.argv[1:])

In [None]:
# Which experiment to inspect: attack method, model type and base dataset.
attack_method = 'if'
args.model = 'distmult'
args.original_data = 'FB15k-237'

# Name of the poisoned dataset directory, derived from the three choices above.
args.data = '{}_del_{}_{}_0_100_1_1_1'.format(
    attack_method,
    args.model,
    args.original_data,
)

In [None]:
## Fill in the hyperparameters on `args` via the project helper.
args = utils.set_hyperparams(args)

## Select the compute device - legacy code so that functions from utils can be re-used.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Record the experiment configuration in the log, one line per setting.
config_lines = (
    ('Model name: {}\n', args.model),
    ('Dataset name: {} \n', args.data),
    ('Original dataset name: {} \n', args.original_data),
)
for template, value in config_lines:
    logger.info(template.format(value))

In [None]:
## Load the target dataset and corresponding eval dictionaries
logger.info('------------ Load the target dataset ----------')
data_path = 'data/target_{}_{}_0_100_1'.format(args.model, args.original_data)

# Presumably entity/relation counts plus name -> id maps for this dataset
# (see utils.generate_dicts to confirm).
n_ent, n_rel, ent_to_id, rel_to_id = utils.generate_dicts(data_path)

data = utils.load_data(data_path)
train_data, valid_data, test_data = data['train'], data['valid'], data['test']

# Use a context manager so the file handle is closed even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code -- only load files that
# were produced by this project's own preprocessing.
with open(os.path.join(data_path, 'to_skip_eval.pickle'), 'rb') as inp_f:
    to_skip_eval: Dict[str, Dict[Tuple[int, int], List[int]]] = pickle.load(inp_f)

# Normalise the keys of both dictionaries to tuples of plain Python ints.
for side in ('lhs', 'rhs'):
    to_skip_eval[side] = {(int(k[0]), int(k[1])): v for k, v in to_skip_eval[side].items()}

In [None]:
## example name of original model
## FB15k-237_distmult_200_0.2_0.3_0.3.model

## example name of poisoned model
## cos_del_distmult_FB15k-237_0_100_1_1_1_distmult_200_0.2_0.3_0.3.model


In [None]:
logger.info('-------- Load the original model -----------')
## set the model path without hyperparam arguments
model_dir = 'saved_models/{}_{}_*.model'.format(args.original_data, args.model)

# glob returns matches in arbitrary order, so sort them to make the choice
# deterministic, and fail loudly instead of leaving `model_path` undefined
# when nothing matches.
original_model_paths = sorted(glob.glob(model_dir))
if not original_model_paths:
    raise FileNotFoundError('No saved model matches pattern: {}'.format(model_dir))
if len(original_model_paths) > 1:
    logger.warning('Multiple saved models match {}; using the last of {}'.format(
        model_dir, original_model_paths))
model_path = original_model_paths[-1]

# add a model and load the pre-trained params
original_model = utils.load_model(model_path, args, n_ent, n_rel, device)

In [None]:
logger.info('------- Ranks on target dataset from original model ----------')

### legacy code: the relation offset passed to the ranking routine is n_rel
### only when reciprocal relations are enabled.
num_rel = n_rel if args.add_reciprocals else 0

# Move the target triples onto the device as an int64 tensor.
test_data = torch.from_numpy(test_data.astype('int64')).to(device)

# Rank every target triple on both sides with the original model.
ranks_lhs, ranks_rhs = evaluation.get_ranking(
    original_model, test_data, num_rel, to_skip_eval, device
)
ranks_lhs = np.array(ranks_lhs)
ranks_rhs = np.array(ranks_rhs)

# Per-triple rank: element-wise mean of the lhs and rhs ranks.
ranks = np.array([ranks_lhs, ranks_rhs]).mean(axis=0)

In [None]:
# Mean rank over all target triples for each side and for the averaged ranks.
mr_lhs, mr_rhs, mr = (
    np.mean(rank_values, dtype=np.float64)
    for rank_values in (ranks_lhs, ranks_rhs, ranks)
)

### these should match the mean values from log files
logger.info('Original mean ranks. Lhs:{}, Rhs:{}, Mean:{}\n'.format(mr_lhs, mr_rhs, mr))

In [None]:
## Load the poisoned dataset and corresponding eval dictionaries
logger.info('------------ Load the poisoned dataset ----------')
data_path = 'data/{}'.format(args.data)

# NOTE: this rebinds n_ent, n_rel and the id maps that an earlier cell built
# from the target dataset.
n_ent, n_rel, ent_to_id, rel_to_id = utils.generate_dicts(data_path)

data = utils.load_data(data_path)
train_data, valid_data, test_data = data['train'], data['valid'], data['test']

# Use a context manager so the file handle is closed even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code -- only load files that
# were produced by this project's own preprocessing.
with open(os.path.join(data_path, 'to_skip_eval.pickle'), 'rb') as inp_f:
    to_skip_eval: Dict[str, Dict[Tuple[int, int], List[int]]] = pickle.load(inp_f)

# Normalise the keys of both dictionaries to tuples of plain Python ints.
for side in ('lhs', 'rhs'):
    to_skip_eval[side] = {(int(k[0]), int(k[1])): v for k, v in to_skip_eval[side].items()}

In [None]:
# influential triples
inf_df = pd.read_csv(os.path.join(data_path, 'influential_triples.txt'), sep='\t', header=None, names=None, dtype=int)
inf_data = inf_df.values
del inf_df

In [None]:
logger.info('-------- Load the poisoned model -----------')
## set the model path without hyperparam arguments
model_dir = 'saved_models/{}_{}_*.model'.format(args.data, args.model)

# Fail loudly when no poisoned model is found. Without this check `model_path`
# would keep the value assigned when the original model was loaded, so the
# original model would silently be evaluated as the "poisoned" one.
# glob order is arbitrary, hence the sort for a deterministic choice.
poisoned_model_paths = sorted(glob.glob(model_dir))
if not poisoned_model_paths:
    raise FileNotFoundError('No saved model matches pattern: {}'.format(model_dir))
if len(poisoned_model_paths) > 1:
    logger.warning('Multiple saved models match {}; using the last of {}'.format(
        model_dir, poisoned_model_paths))
model_path = poisoned_model_paths[-1]

# add a model and load the pre-trained params
poisoned_model = utils.load_model(model_path, args, n_ent, n_rel, device)

In [None]:
logger.info('------- Ranks on target dataset from poisoned model ----------')
logger.info('(using eval dicts from poisoned data)')

### legacy code: the relation offset passed to the ranking routine is n_rel
### only when reciprocal relations are enabled.
num_rel = n_rel if args.add_reciprocals else 0

# Move the target triples onto the device as an int64 tensor.
test_data = torch.from_numpy(test_data.astype('int64')).to(device)

# Rank every target triple on both sides with the poisoned model.
pos_ranks_lhs, pos_ranks_rhs = evaluation.get_ranking(
    poisoned_model, test_data, num_rel, to_skip_eval, device
)
pos_ranks_lhs = np.array(pos_ranks_lhs)
pos_ranks_rhs = np.array(pos_ranks_rhs)

# Per-triple rank: element-wise mean of the lhs and rhs ranks.
pos_ranks = np.array([pos_ranks_lhs, pos_ranks_rhs]).mean(axis=0)

In [None]:
# Mean rank over all target triples for each side and for the averaged ranks.
pos_mr_lhs, pos_mr_rhs, pos_mr = (
    np.mean(rank_values, dtype=np.float64)
    for rank_values in (pos_ranks_lhs, pos_ranks_rhs, pos_ranks)
)

### these should match the mean values from log files
logger.info('Poisoned mean ranks. Lhs:{}, Rhs:{}, Mean:{}\n'.format(pos_mr_lhs, pos_mr_rhs, pos_mr))

In [None]:
# Per-triple change in rank: poisoned rank minus original rank.
ranks_diff = np.subtract(pos_ranks, ranks)

# Positions of the target triples ordered from smallest to largest change,
# and the changes themselves in that same order.
sorted_idx = ranks_diff.argsort()
sorted_diffs = ranks_diff[sorted_idx]

In [None]:
# Convert the target triples back to a NumPy array regardless of device.
# A CPU tensor must be converted too: indexing a tensor yields 0-d tensors,
# which hash by identity and therefore cannot be used as keys into the
# id -> name dictionaries built from plain integers.
if torch.is_tensor(test_data):
    test_data = test_data.cpu().numpy()  # remove the torch tensor
else:
    test_data = np.asarray(test_data)

In [None]:
# get the entities from IDs
id_to_ent = {ent_to_id[k]:k for k in ent_to_id.keys()}
id_to_rel = {rel_to_id[k]:k for k in rel_to_id.keys()}


In [None]:
# Positions of the target triples with the largest and the smallest rank change.
max_idx, min_idx = sorted_idx[-1], sorted_idx[0]

# Target triple and the influential triple stored at the same position
# (assumes inf_data rows are aligned with test_data rows -- TODO confirm).
max_s, max_p, max_o = test_data[max_idx]
max_h, max_r, max_t = inf_data[max_idx]

min_s, min_p, min_o = test_data[min_idx]
min_h, min_r, min_t = inf_data[min_idx]

In [None]:
def _triple_names(head, rel, tail):
    """Return [head, relation, tail] translated from ids via id_to_ent / id_to_rel."""
    return [id_to_ent[head], id_to_rel[rel], id_to_ent[tail]]


# Human-readable versions of the extreme target triples and their influential triples.
max_target = _triple_names(max_s, max_p, max_o)
max_inf = _triple_names(max_h, max_r, max_t)

min_target = _triple_names(min_s, min_p, min_o)
min_inf = _triple_names(min_h, min_r, min_t)

In [None]:
# Header line identifying the attack, model and dataset being reported.
logger.info('---- For {} on {} {}\n'.format(attack_method, args.model, args.original_data))

# Report the largest and then the smallest rank change, each with its target
# triple and the influential triple at the same position.
report_lines = [
    ('Maximum change in ranks: {}\n', sorted_diffs[-1]),
    ('Target triple with maximum change: {}\n', max_target),
    ('Corresponding influential triple: {}\n', max_inf),
    ('Minimum change in ranks: {}\n', sorted_diffs[0]),
    ('Target triple with minimum change: {}\n', min_target),
    ('Corresponding influential triple: {}\n', min_inf),
]
for template, value in report_lines:
    logger.info(template.format(value))


Use the following tool to convert Freebase IDs to their values.

Link - https://freebase.toolforge.org/

Another method is to use the Google Knowledge Graph Search API

Link - https://developers.google.com/knowledge-graph/reference/rest/v1/

Original WN18RR dataset with definition files (to get entity values from IDs) - 
- Link1 - https://figshare.com/articles/dataset/WN18/11869548/2
- Link2 - https://everest.hds.utc.fr/doku.php?id=en:smemlj12