In [1]:
from datasets.get_datasets import *
from revision import *
from boostsrl import boostsrl
import random
from sklearn.model_selection import KFold

## Parameters

In [2]:
# Predicate the models are trained to predict.
target = 'athleteplaysforteam'

# Fractions of each fold's training examples: the share used for the
# "small dataset" model, and the share held out as a validation set.
small_train_size, validation_size = 0.3, 0.2

# Settings for the theory revision step: the threshold handed to
# theory_revision, and the intended cap on revision iterations
# (theory_revision takes a keyword argument of the same name).
revision_threshold, max_revision_iterations = 0.95, 10

## Dataset

In [3]:
# Fetch the NELL data for the target predicate; the helper returns three
# values that are used below as background facts, positive examples and
# negative examples.
facts, pos, neg = get_nell_dataset(target)

## Background configuration

In [4]:
# Every relation available to the learner, as
# (predicate name, type of first argument, type of second argument).
_RELATIONS = (
    ('athleteledsportsteam', 'athlete', 'sportsteam'),
    ('athleteplaysforteam', 'athlete', 'sportsteam'),
    ('athleteplaysinleague', 'athlete', 'sportsleague'),
    ('athleteplayssport', 'athlete', 'sport'),
    ('teamalsoknownas', 'sportsteam', 'sportsteam'),
    ('teamplaysagainstteam', 'sportsteam', 'sportsteam'),
    ('teamplaysinleague', 'sportsteam', 'sportsleague'),
    ('teamplayssport', 'sportsteam', 'sport'),
)

# The three +/- marker combinations declared for each relation, in the
# order they must appear in the mode list.
_MODE_MARKERS = (('+', '+'), ('+', '-'), ('-', '+'))

# Mode declarations: one string per (relation, marker combination), grouped
# by relation.
bk = [
    '{0}({1}{2},{3}{4}).'.format(name, first_mark, first_type, second_mark, second_type)
    for name, first_type, second_type in _RELATIONS
    for first_mark, second_mark in _MODE_MARKERS
]

# Build the background/modes object from the mode declarations, with the
# target predicate and the tree-shape settings given here.
background = boostsrl.modes(
    bk,
    [target],
    useStdLogicVariables=False,
    treeDepth=8,
    nodeSize=3,
    numOfClauses=8,
)

## Sampling examples

In [5]:
import numpy as np

# NOTE(review): no random seed is set in this notebook, so the shuffles
# below (and every reported metric) change from run to run.

# Shuffle all examples, then balance the classes by keeping the same number
# of positives and negatives. Truncating to the smaller class keeps
# `neg[train_index]` in range even when there are fewer negatives than
# positives.
random.shuffle(pos)
random.shuffle(neg)

n_balanced = min(len(pos), len(neg))
pos = np.array(pos[:n_balanced])
neg = np.array(neg[:n_balanced])

# Per-fold results for the three settings compared in this notebook.
small_dataset_aucroc = []
complete_dataset_aucroc = []
revision_dataset_aucroc = []
small_learning_time = []
small_inference_time = []
complete_learning_time = []
complete_inference_time = []
revision_learning_time = []
revision_inference_time = []

# separate train and test
kf = KFold(n_splits=10)
for train_index, test_index in kf.split(pos):
    # The same indices are applied to both classes, so every fold stays
    # balanced. Indexing with an index array returns fresh arrays.
    train_pos, test_pos = pos[train_index], pos[test_index]
    train_neg, test_neg = neg[train_index], neg[test_index]

    # Small dataset used to learn the initial model from scratch.
    # A plain slice of an ndarray is a view: without .copy() the in-place
    # shuffle below would silently rewrite these subsets after they were
    # selected.
    s_train_pos = train_pos[:int(small_train_size*len(train_pos))].copy()
    s_train_neg = train_neg[:int(small_train_size*len(train_neg))].copy()

    # Shuffle all train examples. np.random.shuffle permutes along the
    # first axis and is safe for arrays of any dimensionality, whereas
    # random.shuffle swaps items through indexing and can duplicate rows of
    # multi-dimensional arrays.
    np.random.shuffle(train_pos)
    np.random.shuffle(train_neg)

    # train set used in revision and validation set
    n_validation_pos = int(validation_size*len(train_pos))
    n_validation_neg = int(validation_size*len(train_neg))
    r_train_pos = train_pos[n_validation_pos:]
    r_train_neg = train_neg[n_validation_neg:]
    validation_pos = train_pos[:n_validation_pos]
    validation_neg = train_neg[:n_validation_neg]

    print('Total examples')
    print('Positive examples: %s' % len(pos))
    print('Negative examples: %s' % len(neg))
    print('\n')
    print('Train and test sets from total')
    print('Train Positive examples: %s' % len(train_pos))
    print('Train Negative examples: %s' % len(train_neg))
    print('Test Positive examples: %s' % len(test_pos))
    print('Test Negative examples: %s' % len(test_neg))
    print('\n')
    print('Small dataset')
    print('Train Positive examples: %s' % len(s_train_pos))
    print('Train Negative examples: %s' % len(s_train_neg))
    print('\n')
    print('Revision theory train dataset and validation')
    print('Train Positive examples: %s' % len(r_train_pos))
    print('Train Negative examples: %s' % len(r_train_neg))
    print('Validation Positive examples: %s' % len(validation_pos))
    print('Validation Negative examples: %s' % len(validation_neg))

    # learning from small dataset
    [model, learning_time, inference_time, t_results, small_structured] = learn_test_model(
        background, boostsrl, target, s_train_pos, s_train_neg, facts,
        test_pos, test_neg, trees=10)
    small_dataset_aucroc.append(t_results['AUC ROC'])
    small_learning_time.append(learning_time)
    small_inference_time.append(inference_time)

    # learning from complete dataset
    [model, learning_time, inference_time, t_results, structured] = learn_test_model(
        background, boostsrl, target, train_pos, train_neg, facts,
        test_pos, test_neg, trees=10)
    complete_dataset_aucroc.append(t_results['AUC ROC'])
    complete_learning_time.append(learning_time)
    complete_inference_time.append(inference_time)

    # Theory revision, starting from the structure learned on the small
    # dataset. The iteration cap comes from the Parameters cell instead of
    # a literal, so changing the parameter actually takes effect.
    [model, total_revision_time, inference_time, t_results, structured] = theory_revision(
        background, boostsrl, target, r_train_pos, r_train_neg, facts,
        validation_pos, validation_neg, test_pos, test_neg,
        revision_threshold, small_structured,
        max_revision_iterations=max_revision_iterations)
    revision_dataset_aucroc.append(t_results['AUC ROC'])
    revision_learning_time.append(total_revision_time)
    revision_inference_time.append(inference_time)

Total examples
Positive examples: 1458
Negative examples: 1458


Train and test sets from total
Train Positive examples: 1312
Train Negative examples: 1312
Test Positive examples: 146
Test Negative examples: 146


Small dataset
Train Positive examples: 393
Train Negative examples: 393


Revision theory train dataset and validation
Train Positive examples: 1050
Train Negative examples: 1050
Validation Positive examples: 262
Validation Negative examples: 262
WILL-Produced Tree:
% FOR athleteplaysforteam(A, B):
%   if ( athleteledsportsteam(C, B), teamplaysinleague(B, D), athleteplaysinleague(E, D) )
%   then if ( athleteledsportsteam(A, B) )
%   | then return 4.206277228909364;  // std dev = 0,285, 680,000 (wgt'ed) examples reached here.  /* #pos=680 */
%   | else return 1.782291565861202;  // std dev = 4,403, 3.780,000 (wgt'ed) examples reached here.  /* #neg=1.680 #pos=2.100 */
%   else if ( teamalsoknownas(B, F), teamplaysinleague(B, G) )
%   | then return -0.38626876130962284;  // st

In [6]:
# AUC ROC collected in each fold for the model learned from the small dataset.
small_dataset_aucroc

[0.741743,
 0.74559,
 0.749601,
 0.732619,
 0.730015,
 0.802871,
 0.74803,
 0.800737,
 0.783686,
 0.804209]

In [7]:
# AUC ROC collected in each fold for the model learned from the complete training set.
complete_dataset_aucroc

[0.802003,
 0.691101,
 0.687254,
 0.760251,
 0.765106,
 0.796983,
 0.767639,
 0.798743,
 0.817218,
 0.789536]

In [8]:
# AUC ROC collected in each fold for the model produced by theory revision.
revision_dataset_aucroc

[0.796585,
 0.751126,
 0.772213,
 0.760931,
 0.784082,
 0.805264,
 0.802918,
 0.804583,
 0.783639,
 0.799073]

In [9]:
# Mean and (population) standard deviation across folds for every metric,
# printed in a fixed order as "<label>: <mean> +/- <std>".
summary_rows = [
    ('Small dataset AUC ROC', small_dataset_aucroc),
    ('Complete dataset AUC ROC', complete_dataset_aucroc),
    ('Theory revision dataset AUC ROC', revision_dataset_aucroc),
    ('Small dataset learning time', small_learning_time),
    ('Small dataset inference time', small_inference_time),
    ('Complete dataset learning time', complete_learning_time),
    ('Complete dataset inference time', complete_inference_time),
    ('Revision time', revision_learning_time),
    ('Revision model inference time', revision_inference_time),
]

for metric_label, fold_values in summary_rows:
    fold_array = np.array(fold_values)
    print('%s: %s +/- %s' % (metric_label, fold_array.mean(), fold_array.std()))

Small dataset AUC ROC: 0.7639101000000001 +/- 0.028811811645399876
Complete dataset AUC ROC: 0.7675834 +/- 0.04281092253899698
Theory revision dataset AUC ROC: 0.7860414 +/- 0.01824708682064072
Small dataset learning time: 629.8982 +/- 310.957204002673
Small dataset inference time: 1.0429 +/- 0.2432021587075246
Complete dataset learning time: 1495.6859000000002 +/- 665.3716977738759
Complete dataset inference time: 0.9577000000000002 +/- 0.10197847812161154
Revision time: 126.03519999999999 +/- 257.9179714113772
Revision model inference time: 1.1384 +/- 0.4437907615081684
