# Neural Theorem Prover using pandas and Pytorch

## 1. Symbolic Unification using pandas DataFrame
- Load Files(Config, KG, Rule Template, ...)
- Generate Meta Tables(Rule Structure, KG index, ...)
- Run Backward Chaining and generate batch 

## 2. NTP Model Training with PyTorch
- Define Model Structure using PyTorch
- Define Forward Function
- Training Model

## 3. Extract Rules from Trained Embedding Vectors
- Matching Rule templates with Embedding vectors 
- Extract Induced Rules

### import packages

In [1]:
#custom functions
from util.fileUtils import load_conf, load_from_file, create_directory
from ntp.prover import backward_chaining
from preprocess.dataPreprocessing import data_filter, padding

import random
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from pprint import pprint
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from datetime import datetime, timedelta

# to print pandas dataframe
from IPython.display import display
pd.set_option('display.max_columns', 50)
random.seed(1337)
torch.manual_seed(1337)
torch.cuda.manual_seed_all(1337)

- dataname options:
    - example_8 / kinship / umls / nations

In [2]:
data_name = 'example_8'

In [3]:
config = load_conf(f"./config/{data_name}.conf")
config

{'data': {'kg': './data/example_8.txt', 'templates': './data/example_8.nlt'},
 'meta': {'result_dir': './out/example_8',
  'result_file': '/example_8_rule.tsv'},
 'training': {'num_epochs': 100,
  'report_interval': 10,
  'batch_size': 8,
  'neg_per_pos': 2,
  'learning_rate': 0.01,
  'shuffle': True},
 'model': {'embedding_size': 100,
  'l2': 0.0001,
  'drop_prob': 0.05,
  'init': True}}

In [4]:
# Unpack the loaded configuration into notebook-level hyperparameters,
# grouped by the config section they come from.
_model_cfg = config['model']
_training_cfg = config['training']
_meta_cfg = config['meta']

# model section
embedding_size = _model_cfg['embedding_size']
drop_prob = _model_cfg['drop_prob']
weight_decay = _model_cfg['l2']
init = _model_cfg['init']

# training section
epochs = _training_cfg['num_epochs']
report_interver_epoch = _training_cfg['report_interval']
learning_rate = _training_cfg['learning_rate']
neg_per_pos = _training_cfg['neg_per_pos']
batch_size = _training_cfg['batch_size']
shuffle = _training_cfg['shuffle']  # when True, the data is reshuffled every epoch

# meta section
result_dir = _meta_cfg['result_dir']
result_file = _meta_cfg['result_file']

### Load Data Files using pandas
- KG : Knowledge Graph file with triple form
- Query : query with triple form

In [5]:
# Both frames are read from the same tab-separated file (config['data']['kg']),
# so Query starts out with exactly the same triples as KG.
# The file has no header row; columns are named subj / pred / obj on load.
KG = pd.read_csv(config['data']['kg'], sep='\t', names=['subj','pred','obj'])
Query = pd.read_csv(config['data']['kg'], sep='\t', names=['subj','pred','obj'])

In [6]:
KG = KG[['pred', 'subj', 'obj']]
KG.head()

Unnamed: 0,pred,subj,obj
0,nationality,BART,USA
1,birthPlace,BART,NEWYORK
2,locatedIn,NEWYORK,USA
3,hasFather,BART,HOMER
4,hasGrandfather,BART,ABE


In [7]:
Query = Query[['pred', 'subj', 'obj']]
Query.head()

Unnamed: 0,pred,subj,obj
0,nationality,BART,USA
1,birthPlace,BART,NEWYORK
2,locatedIn,NEWYORK,USA
3,hasFather,BART,HOMER
4,hasGrandfather,BART,ABE


In [8]:
entity_list = sorted(set(KG.subj.values).union(set(KG.obj.values)))
len(entity_list)

6

In [9]:
start = datetime.now()

# KG index: for every entity, the positional row numbers of the KG triples in
# which it occurs as subject ('subj') and as object ('obj').
KG_index = {entity: {'subj': [], 'obj': []} for entity in entity_list}

subj_entities = KG['subj'].tolist()
obj_entities = KG['obj'].tolist()

# Fill the index with in-place appends. Rebuilding each list with
# `list + [i]` copies the whole list on every row, which is quadratic for
# entities that occur in many triples.
for i, (subj, obj) in enumerate(zip(subj_entities, obj_entities)):
    KG_index[subj]['subj'].append(i)
    KG_index[obj]['obj'].append(i)

end = datetime.now()
print('converting time : ', end-start)

converting time :  0:00:00.000332


In [10]:
KG_index

{'ABE': {'subj': [], 'obj': [4, 5]},
 'BART': {'subj': [0, 1, 3, 4, 7], 'obj': [6]},
 'HOMER': {'subj': [5], 'obj': [3]},
 'LISA': {'subj': [6], 'obj': [7]},
 'NEWYORK': {'subj': [2], 'obj': [1]},
 'USA': {'subj': [], 'obj': [0, 2]}}

### Load Rule template

In [11]:
rules, max_atom = load_from_file(config['data']['templates'])
rules

[[('#1', 'X', 'Y'), ('#2', 'X', 'Z'), ('#3', 'Z', 'Y'), 2],
 [('#1', 'X', 'Y'), ('#2', 'Y', 'X'), 2]]

In [12]:
# One row per rule template, one column per atom. Each cell maps the atom's
# two variables to the argument position they occupy, e.g.
# ('#1', 'X', 'Y') -> {'X': 'subj', 'Y': 'obj'}.
# The last element of a template is not an atom and is left out.
atom_roles_per_rule = [
    [{atom[1]: 'subj', atom[2]: 'obj'} for atom in template[:-1]]
    for template in rules
]
rule_structure = pd.DataFrame(atom_roles_per_rule)
rule_structure['rule_number'] = list(range(len(rules)))
rule_structure

Unnamed: 0,0,1,2,rule_number
0,"{'X': 'subj', 'Y': 'obj'}","{'X': 'subj', 'Z': 'obj'}","{'Z': 'subj', 'Y': 'obj'}",0
1,"{'X': 'subj', 'Y': 'obj'}","{'Y': 'subj', 'X': 'obj'}",,1


### Generate Dictionary from KG & Query data

In [13]:
# Every predicate that occurs in the KG or in the queries.
KG_predicate_list = sorted(set(KG.pred.values) | set(Query.pred.values))

# One symbol per (template predicate, rule index, augmentation index), named
# '<predicate>_<rule index>_<augmentation index>'. The last element of each
# rule gives the number of augmentations.
rule_pred_list = [
    atom[0] + '_' + str(rule_idx) + '_' + str(aug_idx)
    for rule_idx, rule in enumerate(rules)
    for atom in rule[:-1]
    for aug_idx in range(rule[-1])
]

predicate_list = sorted(set(KG_predicate_list) | set(rule_pred_list))
# print('predicates : ',predicate_list)

In [14]:
# Ids 0 and 1 are reserved for the special symbols UNK and PAD; the
# predicates follow from id 2 in the order of predicate_list.
id2sym_dict = {0: 'UNK', 1: 'PAD'}
sym2id_dict = {'UNK': 0, 'PAD': 1}

for sym_id, predicate in enumerate(predicate_list, start=2):
    sym2id_dict[predicate] = sym_id
    id2sym_dict[sym_id] = predicate

In [15]:
sym2id_dict

{'UNK': 0,
 'PAD': 1,
 '#1_0_0': 2,
 '#1_0_1': 3,
 '#1_1_0': 4,
 '#1_1_1': 5,
 '#2_0_0': 6,
 '#2_0_1': 7,
 '#2_1_0': 8,
 '#2_1_1': 9,
 '#3_0_0': 10,
 '#3_0_1': 11,
 'birthPlace': 12,
 'hasFather': 13,
 'hasGrandfather': 14,
 'hasParent': 15,
 'locatedIn': 16,
 'nationality': 17,
 'sibling': 18}

### Run Backward Chaining

#### unification
- goal: query (e.g. nationality BART USA)
- rule: rule template (e.g. #1(X,Y) :- #2(X,Z), #3(Z,Y))

- 주어진 rule template의 conclusion과 query를 unify
    - unify된 트리플은 rule component substitution에 key를 rule component(e.g. #1(X,Y))로  
        value를 unified triples(dataframe)으로 저장   
    
        #1(X, Y) :
    
            |     pred    | subj | obj |
            |-------------|------|-----|
            | nationality | BART | USA |
    
    - conclusion의 X,Y와 같은 variable에 대하여 unify된 트리플을 참조하여 variable substitution에  
    X : [BART], Y: [USA] 와 같이 binding




- 앞서 binding된 variable을 참조하여 각 rule body에 맞는 트리플을 unify
    - #1(X,Y)를 통해 binding된 X에 대한 variable substitution을 참조하여 #2(X,Z)와 같은 body에 트리플을 unify하는 작업을 수행
        - 위 경우에는 variable substitution을 참조하여 X가 subject인 트리플을 찾아 unify   
    - unify된 트리플은 rule component substitution에 key를 rule component(e.g. #2(X,Z))로  
        value를 unified triples(dataframe)으로 저장   
           
       #2(X, Z) :

            |    pred    | subj | obj     |
            |------------|------|---------|
            | birthPlace | BART | NEWYORK |
            | hasFather  | BART | HOMER   |
        
    - 규칙 body의 X,Z와 같은 variable에 대하여 unify된 트리플을 참조하여 variable substitution에  
    Z : [NEWYORK, HOMER] 와 같이 binding
    
#### proof path completion
- rule template을 분석하여 인접한 rule component간의 common variable 도출 
- common variable을 기준으로 unified triple을 join하여 proof path를 생성

In [16]:
relation_path, rule_temp_path, max_path = backward_chaining(Query, KG, KG_index, 
                                                            rules, rule_structure, sym2id_dict, neg_per_pos)

complete generating proof paths! : 8/8


### data filtering
- proof path가 없는 데이터 제거

In [17]:
relation_path = list(filter(data_filter, relation_path))
rule_temp_path = list(filter(data_filter, rule_temp_path))

### padding
- rule의 최대 구성 요소 수와 최대 proof path 수로 모든 proof path padding

In [18]:
relation_path, rule_temp_path = padding(relation_path, rule_temp_path, rules, max_path, 
                                        max_atom, neg_per_pos, sym2id_dict)

### Create Batch Generator

In [19]:
def convert_list_to_tensor(path_data):
    """Flatten ``path_data`` by one nesting level and return it as a tensor.

    Args:
        path_data: an iterable of iterables; the inner items are concatenated
            in order and handed to ``torch.tensor``.

    Returns:
        A tensor built from the concatenated inner items.
    """
    flattened = [item for group in path_data for item in group]
    return torch.tensor(flattened)

relation_tensor = convert_list_to_tensor(relation_path)
rule_temp_tensor = convert_list_to_tensor(rule_temp_path)

# Label vector: a single 1 followed by neg_per_pos zeros, stored as float32.
answer = torch.tensor([1] + [0] * neg_per_pos, dtype=torch.float32)

In [20]:
class proof_path_dataset(Dataset):
    """Dataset that pairs each relation path with its rule-template path.

    Every item returns the same ``label`` object that was given to the
    constructor; the label is not indexed per sample.
    """

    def __init__(self, relation_tensor, rule_temp_tensor, label):
        self.relation_tensor, self.rule_temp_tensor = relation_tensor, rule_temp_tensor
        self.label = label

    def __len__(self):
        # The sample count is taken from the relation tensor alone.
        return len(self.relation_tensor)

    def __getitem__(self, idx):
        # (relation path, rule-template path, shared label)
        return self.relation_tensor[idx], self.rule_temp_tensor[idx], self.label
    

dataset = proof_path_dataset(relation_tensor, rule_temp_tensor, answer)
batch_generator = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

### Define Model Architecture

In [21]:
class NTP(nn.Module):
    """Scores proof paths by embedding similarity.

    Rule-template symbols and relation symbols share one embedding table.
    Their similarity is ``exp(-||a - b||_2 / 2)``.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_size: dimensionality of each embedding vector.
        max_path: number of per-path scores folded into one output score;
            ``forward`` reshapes the per-path scores to ``(-1, max_path)``.
        dropout: dropout probability applied to the looked-up embeddings.
        padding_idx: id of the padding symbol. When omitted, falls back to
            the notebook-level ``sym2id_dict['PAD']`` (the original
            behaviour), so existing four-argument calls keep working.
    """

    def __init__(self, vocab_size, embedding_size, max_path, dropout, padding_idx=None):
        super().__init__()
        if padding_idx is None:
            # Backward-compatible default: read the PAD id from the global
            # symbol table defined earlier in the notebook.
            padding_idx = sym2id_dict['PAD']
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.embedding_matrix = nn.Embedding(
            self.vocab_size, self.embedding_size, padding_idx=padding_idx
        )
        self.max_path = max_path
        self.dropout = nn.Dropout(dropout)

    def RBF_kernel(self, embed_aug_rule_temp_path, embed_aug_rel_path):
        """Return ``exp(-d / 2)``, with ``d`` the L2 distance taken over dim 4."""
        L2_norm = torch.sqrt((embed_aug_rel_path - embed_aug_rule_temp_path).pow(2).sum(4))
        return torch.exp(-L2_norm / 2)

    def calculate_sim_avg(self, aug_rule_temp_path, aug_rel_path):
        """Embed both index tensors, apply dropout, and average the
        similarities over dim 3.

        Both arguments are integer index tensors. The embedding lookup appends
        one dimension, which ``RBF_kernel`` reduces again.
        NOTE(review): the index axes look like (batch, path, augmentation,
        atom) — confirm against the padding step that builds them.
        """
        embed_aug_rule_temp_path = self.dropout(self.embedding_matrix(aug_rule_temp_path))
        embed_aug_rel_path = self.dropout(self.embedding_matrix(aug_rel_path))

        sims = self.RBF_kernel(embed_aug_rule_temp_path, embed_aug_rel_path)
        return torch.mean(sims, 3)

    def forward(self, aug_rule_temp_path, aug_rel_path):
        """Return one score per group of ``max_path`` paths.

        The averaged similarities are reduced with a max over dim 2, then
        regrouped into rows of ``max_path`` entries and reduced with a min.
        """
        avg_sims = self.calculate_sim_avg(aug_rule_temp_path, aug_rel_path)
        max_sims = torch.max(avg_sims, dim=2).values
        min_sims = torch.min(max_sims.view(-1, self.max_path), dim=1).values

        return min_sims

In [22]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
vocab_size = len(sym2id_dict)
ntp = NTP(vocab_size, embedding_size, max_path, drop_prob)
ntp.to(device)

NTP(
  (embedding_matrix): Embedding(19, 100, padding_idx=1)
  (dropout): Dropout(p=0.05, inplace=False)
)

In [23]:
def initialize_weights(m):
    """Xavier-uniform initialise ``m.weight`` if it has more than one dimension.

    Written for use with ``nn.Module.apply``. Modules that have no ``weight``
    attribute, or whose ``weight`` is ``None``, are left untouched instead of
    raising ``AttributeError``.

    NOTE(review): the whole weight tensor is overwritten, so for an
    ``nn.Embedding`` built with ``padding_idx`` the padding row is
    re-initialised as well.
    """
    weight = getattr(m, 'weight', None)
    if weight is not None and weight.dim() > 1:
        nn.init.xavier_uniform_(weight.data)
if init:
    ntp.apply(initialize_weights); 

### Train Relation Embedding

In [24]:
# Adam with L2 regularisation (weight_decay) and binary cross-entropy on the
# model's output scores.
optimizer = torch.optim.Adam(ntp.parameters(), lr = learning_rate, weight_decay = weight_decay)
criterion = torch.nn.BCELoss()

ntp.train()
epoch_loss = 0  # running sum of batch losses over ALL epochs; never reset
start_time = datetime.now()
estart_time = datetime.now()  # start of the current reporting interval
for epoch in range(1, epochs+1):
    for aug_rel_path, aug_rule_temp_path, label in batch_generator:
        optimizer.zero_grad()

        # Flatten the batched label vectors into one target per output score.
        label = torch.flatten(label).to(device)

        # Call the module itself rather than .forward() so that any
        # registered forward hooks run as well.
        y_hat = ntp(aug_rule_temp_path.to(device), aug_rel_path.to(device))

        loss = criterion(y_hat, label)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    if epoch%report_interver_epoch == 0:
        end_time = datetime.now()
        # 'Total Loss' is the cumulative loss sum divided by the number of
        # epochs so far; 'Current Loss' is the last batch of this epoch.
        print(f'Epoch Time: {end_time-estart_time} \tEpoch: {epoch}', end='')
        print(f'\tTotal Loss: {epoch_loss/epoch:.3f} \tCurrent Loss: {loss.item():.3f}')
        estart_time = end_time

end_time = datetime.now()
print('\nTotal Training Time : ', end_time-start_time)

Epoch Time: 0:00:00.180884 	Epoch: 10	Total Loss: 0.621 	Current Loss: 0.541
Epoch Time: 0:00:00.011256 	Epoch: 20	Total Loss: 0.552 	Current Loss: 0.440
Epoch Time: 0:00:00.010340 	Epoch: 30	Total Loss: 0.499 	Current Loss: 0.368
Epoch Time: 0:00:00.010247 	Epoch: 40	Total Loss: 0.461 	Current Loss: 0.338
Epoch Time: 0:00:00.009947 	Epoch: 50	Total Loss: 0.435 	Current Loss: 0.326
Epoch Time: 0:00:00.013355 	Epoch: 60	Total Loss: 0.416 	Current Loss: 0.305
Epoch Time: 0:00:00.012854 	Epoch: 70	Total Loss: 0.403 	Current Loss: 0.309
Epoch Time: 0:00:00.012741 	Epoch: 80	Total Loss: 0.393 	Current Loss: 0.322
Epoch Time: 0:00:00.010659 	Epoch: 90	Total Loss: 0.384 	Current Loss: 0.317
Epoch Time: 0:00:00.012505 	Epoch: 100	Total Loss: 0.377 	Current Loss: 0.331

Total Training Time :  0:00:00.285095


## write rule file

In [25]:
def representation_match(x, emb):
    """Return ``exp(-d)`` for the pairwise distance ``d`` between ``x`` and ``emb``.

    ``d`` comes from ``torch.nn.functional.pairwise_distance`` with its
    default settings, so ``x`` is broadcast against the rows of ``emb`` and
    the result has one similarity per row.

    Args:
        x: query embedding(s).
        emb: embeddings to compare against.

    Returns:
        Tensor of similarities in ``(0, 1]``; larger means closer.
    """
    # Use the documented path; `torch.torch` only resolves through an
    # accidental self-reference of the package.
    dist = torch.nn.functional.pairwise_distance(x, emb)
    return torch.exp(-dist)

In [26]:
# Get the trained embedding matrix.
# Read it from the embedding layer directly rather than keeping whichever
# parameter happens to come last when iterating ntp.parameters().
embeddings = ntp.embedding_matrix.weight
print(embeddings)

Parameter containing:
tensor([[-2.8460e-04, -6.1410e-04, -4.3693e-04,  ..., -2.5244e-04,
         -2.8932e-05,  1.3444e-04],
        [-3.1353e-04, -4.0521e-05, -7.6841e-05,  ..., -1.0415e-03,
          2.9141e-05,  1.2764e-04],
        [-1.4191e-01,  2.1010e-01, -5.4320e-01,  ..., -2.3453e-02,
          1.4475e-01, -1.3630e-01],
        ...,
        [ 3.1612e-01,  1.8139e-01, -5.5350e-01,  ..., -4.6141e-01,
          3.3765e-01, -6.6033e-02],
        [-2.1487e-01,  4.9005e-02, -3.7651e-01,  ..., -2.8724e-01,
          7.4786e-02, -7.3228e-02],
        [-2.8372e-01, -6.2879e-02,  3.2541e-01,  ...,  2.3129e-01,
         -1.8233e-01, -7.9255e-02]], requires_grad=True)


In [27]:
# Build the parameterised rule templates.
# idx_rule_templates maps a template key — a tuple of
# ('p<k>_<rule number>', var1, var2) atoms, where <k> is the template
# predicate number minus one — to one list per augmentation. Each of those
# lists holds [symbol id, var1, var2] for every atom of the template.
rule_templates = {}
idx_rule_templates = {}
for rule_number, template in enumerate(rules):
    atoms = template[:-1]   # the last element is the augmentation count
    n_aug = template[-1]

    # Parse the full number after '#', so that '#10' and above do not
    # collapse onto their first digit.
    result_template_key = tuple(
        (f'p{int(atom[0][1:]) - 1}_{rule_number}', atom[1], atom[2])
        for atom in atoms
    )

    ids_result_template_values = [
        [
            [sym2id_dict[atom[0] + '_' + str(rule_number) + '_' + str(aug)], atom[1], atom[2]]
            for atom in atoms
        ]
        for aug in range(n_aug)
    ]
    idx_rule_templates[result_template_key] = ids_result_template_values

In [28]:
# Get rule instances and write the rule file.

# Ids that must never be chosen as a match: the rule-template symbols
# themselves (so a template symbol cannot match itself or another template
# symbol) plus ids 0 and 1 (UNK and PAD in sym2id_dict). Built once; the
# loop below does not extend it.
masking_index = sorted(
    {
        element[0]
        for aug_rules in idx_rule_templates.values()
        for rule in aug_rules
        for element in rule
    }
    | {0, 1}
)

create_directory(result_dir)
# no_grad: this is pure inference, so no autograd graph is needed.
with open(result_dir + result_file, 'w') as f, torch.no_grad():
    # `aug_rules` (not `rules`) so the global rule-template list is not clobbered.
    for key, aug_rules in idx_rule_templates.items():
        result = []
        f.write(str(key)+'\n')
        for rule in aug_rules:
            relation_similarities = []
            rule_result = []
            for element in rule:
                x = ntp.embedding_matrix(torch.tensor([element[0]]).to(device))
                match = representation_match(x, embeddings)
                match[masking_index] = 0
                # Best remaining symbol for this template slot.
                top_k = torch.topk(match, 1)
                rule_result.append(id2sym_dict[top_k.indices.item()]+'('+element[1]+','+element[2]+')')
                relation_similarities.append(top_k.values.item())

            head = rule_result[0]
            body = rule_result[1:]

            # The rule's score is its weakest atom match.
            score = round(min(relation_similarities), 6)
            result.append((score, head + ' :- ' +", ".join(body)+'\n'))

        # Highest score first; sorting once after collecting is enough.
        result.sort(reverse = True)
        for score, rule in result:
            f.write(str(score)+ '\t' + rule)
        f.write('\n')