In [1]:
from datasets.get_datasets import *
from revision import *
from boostsrl import boostsrl
import random
from sklearn.model_selection import KFold

## Parameters

In [3]:
target = 'hasacademicadvisor'
#test_size = 0.3
small_train_size = 0.3
validation_size = 0.2
revision_threshold = 0.8
max_revision_iterations = 10

## Dataset

In [4]:
[facts, pos, neg] = get_yago2s_dataset(target, acceptedPredicates=[
'hascurrency',
'hascapital',
'hasacademicadvisor',
'participatedin',
'haswonprize',
'participatedin',
'owns',
'isinterestedin',
'livesin',
'happenedin',
'holdspoliticalposition',
'diedin',
'actedin',
'iscitizenof',
'worksat',
'directed',
'dealswith',
'wasbornin',
'created',
'isleaderof',
'haschild',
'ismarriedto',
'imports',
'hasmusicalrole',
'influences',
'isaffiliatedto',
'isknownfor',
'ispoliticianof',
'graduatedfrom',
'exports',
'edited',
'wrotemusicfor'
])

## Background  configuration

In [5]:
bk = ['hascurrency(+place,+currency).',
'hascurrency(+place,-currency).',
'hascurrency(-place,+currency).',
'hascapital(+place,+place).',
'hascapital(+place,-place).',
'hascapital(-place,+place).',
'hasacademicadvisor(+person,+person).',
'hasacademicadvisor(+person,-person).',
'hasacademicadvisor(-person,+person).',
'haswonprize(+person,+prize).',
'haswonprize(+person,-prize).',
'haswonprize(-person,+prize).',
'participatedin(+place,+event).',
'participatedin(+place,-event).',
'participatedin(-place,+event).',
'owns(+institution,+institution).',
'owns(+institution,-institution).',
'owns(-institution,+institution).',
'isinterestedin(+person,+concept).',
'isinterestedin(+person,-concept).',
'isinterestedin(-person,+concept).',
'livesin(+person,+place).',
'livesin(+person,-place).',
'livesin(-person,+place).',
'happenedin(+event,+place).',
'happenedin(+event,-place).',
'happenedin(-event,+place).',
'holdspoliticalposition(+person,+politicalposition).',
'holdspoliticalposition(+person,-politicalposition).',
'holdspoliticalposition(-person,+politicalposition).',
'diedin(+person,+place).',
'diedin(+person,-place).',
'diedin(-person,+place).',
'actedin(+person,+media).',
'actedin(+person,-media).',
'actedin(-person,+media).',
'iscitizenof(+person,+place).',
'iscitizenof(+person,-place).',
'iscitizenof(-person,+place).',
'worksat(+person,+institution).',
'worksat(+person,-institution).',
'worksat(-person,+institution).',
'directed(+person,+media).',
'directed(+person,-media).',
'directed(-person,+media).',
'dealswith(+place,+place).',
'dealswith(+place,-place).',
'dealswith(-place,+place).',
'wasbornin(+person,+place).',
'wasbornin(+person,-place).',
'wasbornin(-person,+place).',
'created(+person,+media).',
'created(+person,-media).',
'created(-person,+media).',
'isleaderof(+person,+place).',
'isleaderof(+person,-place).',
'isleaderof(-person,+place).',
'haschild(+person,+person).',
'haschild(+person,-person).',
'haschild(-person,+person).',
'ismarriedto(+person,+person).',
'ismarriedto(+person,-person).',
'ismarriedto(-person,+person).',
'imports(+person,+material).',
'imports(+person,-material).',
'imports(-person,+material).',
'hasmusicalrole(+person,+musicalrole).',
'hasmusicalrole(+person,-musicalrole).',
'hasmusicalrole(-person,+musicalrole).',
'influences(+person,+person).',
'influences(+person,-person).',
'influences(-person,+person).',
'isaffiliatedto(+person,+team).',
'isaffiliatedto(+person,-team).',
'isaffiliatedto(-person,+team).',
'isknownfor(+person,+theory).',
'isknownfor(+person,-theory).',
'isknownfor(-person,+theory).',
'ispoliticianof(+person,+place).',
'ispoliticianof(+person,-place).',
'ispoliticianof(-person,+place).',
'graduatedfrom(+person,+institution).',
'graduatedfrom(+person,-institution).',
'graduatedfrom(-person,+institution).',
'exports(+place,+material).',
'exports(+place,-material).',
'exports(-place,+material).',
'edited(+person,+media).',
'edited(+person,-media).',
'edited(-person,+media).',
'wrotemusicfor(+person,+media).',
'wrotemusicfor(+person,-media).',
'wrotemusicfor(-person,+media).']

background = boostsrl.modes(bk, [target], useStdLogicVariables=False, treeDepth=8, nodeSize=3, numOfClauses=8)

## Sampling examples

In [6]:
# shuffle all examples
random.shuffle(pos)
random.shuffle(neg)

neg = neg[:len(pos)] # balanced

import numpy as np
pos = np.array(pos)
neg = np.array(neg)
small_dataset_aucroc = []
complete_dataset_aucroc = []
revision_dataset_aucroc = []


# separate train and test
kf = KFold(n_splits=10)
for train_index, test_index in kf.split(pos):
    train_pos, test_pos = pos[train_index], pos[test_index]
    train_neg, test_neg = neg[train_index], neg[test_index]

    # learn from scratch in a small dataset
    s_train_pos = train_pos[:int(small_train_size*len(train_pos))]
    s_train_neg = train_neg[:int(small_train_size*len(train_neg))]

    # shuffle all train examples
    random.shuffle(train_pos)
    random.shuffle(train_neg)

    # train set used in revision and validation set
    r_train_pos = train_pos[int(validation_size*len(train_pos)):]
    r_train_neg = train_neg[int(validation_size*len(train_neg)):]
    validation_pos = train_pos[:int(validation_size*len(train_pos))]
    validation_neg = train_neg[:int(validation_size*len(train_neg))]

    print('Total examples')
    print('Positive examples: %s' % len(pos))
    print('Negative examples: %s' % len(neg))
    print('\n')
    print('Train and test sets from total')
    print('Train Positive examples: %s' % len(train_pos))
    print('Train Negative examples: %s' % len(train_neg))
    print('Test Positive examples: %s' % len(test_pos))
    print('Test Negative examples: %s' % len(test_neg))
    print('\n')
    print('Small dataset')
    print('Train Positive examples: %s' % len(s_train_pos))
    print('Train Negative examples: %s' % len(s_train_neg))
    print('\n')
    print('Revision theory train dataset and validation')
    print('Train Positive examples: %s' % len(r_train_pos))
    print('Train Negative examples: %s' % len(r_train_neg))
    print('Validation Positive examples: %s' % len(validation_pos))
    print('Validation Negative examples: %s' % len(validation_neg))
    
    # learning from small dataset
    delete_model_files()
    model = boostsrl.train(background, s_train_pos, s_train_neg, facts)
    learning_time = model.traintime()
    will = model.get_will_produced_tree()
    structured = model.get_structured_tree().copy()
    results = boostsrl.test(model, test_pos, test_neg, facts)
    inference_time = results.testtime()
    t_results = results.summarize_results()
    print('WILL-Produced Tree:')
    print_will_produced_tree(will)
    print('\n')
    print('Results:')
    print(t_results)
    print('\n')
    print('Total learning time: %s seconds' % learning_time)
    print('Total inference time: %s seconds' % inference_time)
    print('AUC ROC: %s' % t_results['AUC ROC'])
    print('\n')
    #print('Tree:')
    #model.tree(0, target, image=True)
    small_dataset_aucroc.append(t_results['AUC ROC'])
    
    # learning from complete dataset
    delete_model_files()
    model = boostsrl.train(background, train_pos, train_neg, facts)
    learning_time = model.traintime()
    will = model.get_will_produced_tree()
    #structured = model.get_structured_tree().copy()
    results = boostsrl.test(model, test_pos, test_neg, facts)
    inference_time = results.testtime()
    t_results = results.summarize_results()
    print('WILL-Produced Tree:')
    print_will_produced_tree(will)
    print('\n')
    print('Results:')
    print(t_results)
    print('\n')
    print('Total learning time: %s seconds' % learning_time)
    print('Total inference time: %s seconds' % inference_time)
    print('AUC ROC: %s' % t_results['AUC ROC'])
    print('\n')
    #print('Tree:')
    #model.tree(0, target, image=True)
    complete_dataset_aucroc.append(t_results['AUC ROC'])
    
    # theory revision
    total_revision_time = 0
    best_aucroc = 0
    best_structured = None

    print('Performing parameter learning')
    print('******************************************')
    delete_model_files()
    model = boostsrl.train(background, r_train_pos, r_train_neg, facts, refine=get_refine_file(structured))
    learning_time = model.traintime()
    will = model.get_will_produced_tree()
    structured = model.get_structured_tree().copy()
    results = boostsrl.test(model, test_pos, test_neg, facts)
    inference_time = results.testtime()
    t_results = results.summarize_results()
    total_revision_time += learning_time + inference_time
    print('WILL-Produced Tree:')
    print_will_produced_tree(will)
    print('\n')
    print('Results:')
    print(t_results)
    print('\n')
    print('Total learning time: %s seconds' % learning_time)
    print('Total inference time: %s seconds' % inference_time)
    print('AUC ROC: %s' % t_results['AUC ROC'])
    print('\n')
    #print('Tree:')
    #model.tree(0, target, image=True)

    best_aucroc = t_results['AUC ROC']
    best_structured = structured.copy()
    save_model_files()

    for i in range(max_revision_iterations):
        print('Refining iteration %s' % str(i+1))
        print('******************************************')
        found_better = False
        for candidate in get_cantidates(best_structured, revision_threshold):
            print('Refining node candidate')
            print('******************************************')
            delete_model_files()
            model = boostsrl.train(background, r_train_pos, r_train_neg, facts, refine=candidate)
            learning_time = model.traintime()
            will = model.get_will_produced_tree()
            results = boostsrl.test(model, test_pos, test_neg, facts)
            inference_time = results.testtime()
            t_results = results.summarize_results()
            total_revision_time += learning_time + inference_time
            print('WILL-Produced Tree:')
            print_will_produced_tree(will)
            print('\n')
            print('Results:')
            print(t_results)
            print('\n')
            print('Total learning time: %s seconds' % learning_time)
            print('Total inference time: %s seconds' % inference_time)
            print('AUC ROC: %s' % t_results['AUC ROC'])
            print('\n')
            #print('Tree:')
            #model.tree(0, target, image=True)
            print('******************************************')

            if t_results['AUC ROC'] > best_aucroc:
                found_better = True
                best_aucroc = t_results['AUC ROC']
                best_structured = model.get_structured_tree().copy()
                save_model_files()
        print('Best Tree AUC ROC so far: %s' % best_aucroc)
        print('******************************************\n')
        if found_better == False:
            break

    print('******************************************')
    delete_model_files()
    get_saved_model_files()
    delete_test_files()
    print('Total revision time: %s' % total_revision_time)
    print('Best validation AUC ROC: %s' % best_aucroc)
    will = model.get_will_produced_tree()
    results = boostsrl.test(model, test_pos, test_neg, facts)
    inference_time = results.testtime()
    t_results = results.summarize_results()
    print('WILL-Produced Tree:')
    print_will_produced_tree(will)
    print('\n')
    print('Results:')
    print(t_results)
    print('\n')
    print('Total inference time: %s seconds' % inference_time)
    print('AUC ROC: %s' % t_results['AUC ROC'])
    print('\n')
    #print('Tree:')
    #model.tree(0, target, image=True)
    revision_dataset_aucroc.append(t_results['AUC ROC'])

Total examples
Positive examples: 2895
Negative examples: 2895


Train and test sets from total
Train Positive examples: 2605
Train Negative examples: 2605
Test Positive examples: 290
Test Negative examples: 290


Small dataset
Train Positive examples: 781
Train Negative examples: 781


Revision theory train dataset and validation
Train Positive examples: 2084
Train Negative examples: 2084
Validation Positive examples: 521
Validation Negative examples: 521
WILL-Produced Tree:
% FOR hasacademicadvisor(A, B):
%   if ( diedin(B, C), diedin(A, D), livesin(E, D) )
%   then return 0.5378364350995104;  // std dev = 0,467, 128,000 (wgt'ed) examples reached here.  /* #neg=41 #pos=87 */
%   else if ( diedin(A, F), diedin(G, F), influences(G, H) )
%   | then if ( iscitizenof(B, I), iscitizenof(A, I), livesin(J, F) )
%   | | then return 0.7914822684328456;  // std dev = 0,249, 15,000 (wgt'ed) examples reached here.  /* #neg=1 #pos=14 */
%   | | else if ( diedin(H, F), influences(H, K), wasbornin(H

In [7]:
small_dataset_aucroc

[0.596873,
 0.669346,
 0.568424,
 0.625612,
 0.586504,
 0.646951,
 0.54882,
 0.614265,
 0.612092,
 0.613486]

In [8]:
complete_dataset_aucroc

[0.658936,
 0.602218,
 0.667592,
 0.592426,
 0.673716,
 0.650321,
 0.59967,
 0.59503,
 0.653997,
 0.66263]

In [9]:
revision_dataset_aucroc

[0.620559,
 0.670963,
 0.582741,
 0.626635,
 0.609328,
 0.647634,
 0.565666,
 0.619982,
 0.61718,
 0.63123]

In [10]:
print('Small dataset AUC ROC: %s +/- %s' % ((np.array(small_dataset_aucroc)).mean(), (np.array(small_dataset_aucroc)).std()))
print('Complete dataset AUC ROC: %s +/- %s' % ((np.array(complete_dataset_aucroc)).mean(), (np.array(complete_dataset_aucroc)).std()))
print('Theory revision dataset AUC ROC: %s +/- %s' % ((np.array(revision_dataset_aucroc)).mean(), (np.array(revision_dataset_aucroc)).std()))


Small dataset AUC ROC: 0.6082373 +/- 0.03366102011838025
Complete dataset AUC ROC: 0.6356536 +/- 0.03197167189309936
Theory revision dataset AUC ROC: 0.6191918000000001 +/- 0.02826135701554333
