In [1]:
# Star imports stay first so the explicit imports below win any name clash.
from datasets.get_datasets import *
from revision import *
from boostsrl import boostsrl
import random
import numpy as np
from sklearn.model_selection import KFold

## Parameters

In [2]:
# Predicate to learn; it is passed to the dataset loader, to boostsrl.modes and
# to every learning / revision call in this notebook.
target = 'advisedby'
# test_size is unused: the test split comes from the K-fold cross-validation.
#test_size = 0.3
# Fraction of each training fold used to learn the initial ("small") model.
small_train_size = 0.3
# Fraction of each training fold held out as the validation set for revision.
validation_size = 0.2
# Threshold forwarded to theory_revision().
# NOTE(review): its exact meaning is defined in revision.py - confirm there.
revision_threshold = 0.8
# Intended upper bound on the number of theory-revision iterations.
max_revision_iterations = 10

## Dataset

The `sameproject`, `sameperson` and `samecourse` predicates are not used: they are left out of the accepted predicates and of the background modes below.

In [4]:
# Load the UW-CSE data restricted to the predicates listed here.
# The loader returns the background facts plus the positive and negative
# examples for `target`.
facts, pos, neg = get_uwcse_dataset(
    target,
    acceptedPredicates=[
        'professor',
        'student',
        'advisedby',
        'tempadvisedby',
        'ta',
        'hasposition',
        'publication',
        'inphase',
        'courselevel',
        'yearsinprogram',
        'projectmember',
    ],
)

## Background configuration

In [5]:
# Mode declarations handed to boostsrl.modes, grouped by predicate.
# NOTE(review): in BoostSRL mode syntax '+' is presumably an input (already
# bound) argument and '-' an output (new) one - confirm against the BoostSRL docs.
bk = [
    # unary type predicates
    'professor(+person).',
    'student(+person).',
    # advisedby / tempadvisedby
    'advisedby(+person,+person).',
    'advisedby(+person,-person).',
    'advisedby(-person,+person).',
    'tempadvisedby(+person,+person).',
    'tempadvisedby(+person,-person).',
    'tempadvisedby(-person,+person).',
    # ta
    'ta(+course,+person,+quarter).',
    'ta(-course,+person,+quarter).',
    'ta(+course,-person,+quarter).',
    'ta(+course,+person,-quarter).',
    'ta(-course,+person,-quarter).',
    'ta(+course,-person,-quarter).',
    # hasposition
    'hasposition(+person,+faculty).',
    'hasposition(+person,-faculty).',
    'hasposition(-person,+faculty).',
    # publication
    'publication(+title,+person).',
    'publication(+title,-person).',
    'publication(-title,+person).',
    # inphase
    'inphase(+person,+prequals).',
    'inphase(+person,-prequals).',
    'inphase(-person,+prequals).',
    # courselevel
    'courselevel(+course,+level).',
    'courselevel(+course,-level).',
    'courselevel(-course,+level).',
    # yearsinprogram
    'yearsinprogram(+person,+year).',
    'yearsinprogram(-person,+year).',
    'yearsinprogram(+person,-year).',
    # projectmember
    'projectmember(+project,+person).',
    'projectmember(+project,-person).',
    'projectmember(-project,+person).',
]
# Modes deliberately left out (these predicates are not used):
#   'sameproject(project, project).'
#   'samecourse(course, course).'
#   'sameperson(person, person).'

# Build the background/mode configuration for the target predicate.
background = boostsrl.modes(
    bk,
    [target],
    useStdLogicVariables=False,
    treeDepth=8,
    nodeSize=3,
    numOfClauses=8,
)

## Sampling examples

In [7]:
# 10-fold cross-validation comparing three learners on the same test folds:
#   * "small":    learned from scratch on a fraction of the training fold,
#   * "complete": learned from scratch on the whole training fold,
#   * "revision": the small model's theory revised with the rest of the fold.
# Per-fold AUC ROC and timings are accumulated in the lists defined below.

# Seed the RNG so the sampled negatives and all splits are reproducible.
random_seed = 42
random.seed(random_seed)

# shuffle all examples
random.shuffle(pos)
random.shuffle(neg)

neg = neg[:len(pos)]  # balanced: keep as many negatives as positives

pos = np.array(pos)
neg = np.array(neg)
# The fold indices computed on `pos` are reused to index `neg` below.
assert len(pos) == len(neg), (
    'need at least as many negative as positive examples (%s pos, %s neg)'
    % (len(pos), len(neg)))

small_dataset_aucroc = []
complete_dataset_aucroc = []
revision_dataset_aucroc = []
small_learning_time = []
small_inference_time = []
complete_learning_time = []
complete_inference_time = []
revision_learning_time = []
revision_inference_time = []

# separate train and test
kf = KFold(n_splits=10)
for train_index, test_index in kf.split(pos):
    train_pos, test_pos = pos[train_index], pos[test_index]
    train_neg, test_neg = neg[train_index], neg[test_index]

    # learn from scratch in a small dataset
    # .copy() matters: a plain slice is a view of train_pos/train_neg, and the
    # small set must not follow any later reordering of the training fold.
    s_train_pos = train_pos[:int(small_train_size*len(train_pos))].copy()
    s_train_neg = train_neg[:int(small_train_size*len(train_neg))].copy()

    # shuffle all train examples
    # Reorder through a permutation index (fancy indexing returns a new array)
    # instead of shuffling the ndarray in place: nothing aliasing the fold is
    # mutated, and it is also safe should the examples ever be multi-dimensional.
    train_pos = train_pos[random.sample(range(len(train_pos)), len(train_pos))]
    train_neg = train_neg[random.sample(range(len(train_neg)), len(train_neg))]

    # train set used in revision and validation set
    n_validation_pos = int(validation_size*len(train_pos))
    n_validation_neg = int(validation_size*len(train_neg))
    r_train_pos = train_pos[n_validation_pos:]
    r_train_neg = train_neg[n_validation_neg:]
    validation_pos = train_pos[:n_validation_pos]
    validation_neg = train_neg[:n_validation_neg]

    print('Total examples')
    print('Positive examples: %s' % len(pos))
    print('Negative examples: %s' % len(neg))
    print('\n')
    print('Train and test sets from total')
    print('Train Positive examples: %s' % len(train_pos))
    print('Train Negative examples: %s' % len(train_neg))
    print('Test Positive examples: %s' % len(test_pos))
    print('Test Negative examples: %s' % len(test_neg))
    print('\n')
    print('Small dataset')
    print('Train Positive examples: %s' % len(s_train_pos))
    print('Train Negative examples: %s' % len(s_train_neg))
    print('\n')
    print('Revision theory train dataset and validation')
    print('Train Positive examples: %s' % len(r_train_pos))
    print('Train Negative examples: %s' % len(r_train_neg))
    print('Validation Positive examples: %s' % len(validation_pos))
    print('Validation Negative examples: %s' % len(validation_neg))

    # learning from small dataset
    [model, learning_time, inference_time, t_results, small_structured] = learn_test_model(
        background, boostsrl, target, s_train_pos, s_train_neg, facts,
        test_pos, test_neg, trees=10)
    small_dataset_aucroc.append(t_results['AUC ROC'])
    small_learning_time.append(learning_time)
    small_inference_time.append(inference_time)

    # learning from complete dataset
    [model, learning_time, inference_time, t_results, structured] = learn_test_model(
        background, boostsrl, target, train_pos, train_neg, facts,
        test_pos, test_neg, trees=10)
    complete_dataset_aucroc.append(t_results['AUC ROC'])
    complete_learning_time.append(learning_time)
    complete_inference_time.append(inference_time)

    # theory revision, starting from the structure learned on the small dataset
    # The iteration cap comes from the notebook parameter, not a literal.
    [model, total_revision_time, inference_time, t_results, structured] = theory_revision(
        background, boostsrl, target, r_train_pos, r_train_neg, facts,
        validation_pos, validation_neg, test_pos, test_neg,
        revision_threshold, small_structured,
        max_revision_iterations=max_revision_iterations)
    revision_dataset_aucroc.append(t_results['AUC ROC'])
    revision_learning_time.append(total_revision_time)
    revision_inference_time.append(inference_time)

Total examples
Positive examples: 113
Negative examples: 113


Train and test sets from total
Train Positive examples: 101
Train Negative examples: 101
Test Positive examples: 12
Test Negative examples: 12


Small dataset
Train Positive examples: 30
Train Negative examples: 30


Revision theory train dataset and validation
Train Positive examples: 81
Train Negative examples: 81
Validation Positive examples: 20
Validation Negative examples: 20
WILL-Produced Tree:
% FOR advisedby(A, B):
%   if ( professor(B), student(A) )
%   then if ( ta(C, A, D) )
%   | then if ( publication(E, B), publication(F, A), tempadvisedby(G, B) )
%   | | then if ( publication(E, A) )
%   | | | then return 4.27215511084593;  // std dev = 0,269, 30,000 (wgt'ed) examples reached here.  /* #pos=30 */
%   | | | else return 1.3246924694611444;  // std dev = 4,448, 20,000 (wgt'ed) examples reached here.  /* #neg=10 #pos=10 */
%   | | else return 3.885309388258713;  // std dev = 0,247, 110,000 (wgt'ed) examples reache

In [8]:
# Per-fold test AUC ROC of the model learned from the small training subset.
small_dataset_aucroc

[0.916667,
 0.763889,
 0.920139,
 0.88843,
 0.847107,
 0.661157,
 0.884298,
 0.834711,
 0.764463,
 0.913223]

In [9]:
# Per-fold test AUC ROC of the model learned from the complete training fold.
complete_dataset_aucroc

[1.0,
 0.923611,
 0.972222,
 0.805785,
 0.921488,
 0.979339,
 0.863636,
 0.96281,
 1.0,
 0.983471]

In [10]:
# Per-fold test AUC ROC of the model returned by theory revision.
revision_dataset_aucroc

[1.0,
 0.875,
 0.996528,
 0.863636,
 0.847107,
 0.983471,
 0.900826,
 0.921488,
 0.96281,
 0.958678]

In [11]:
def _print_mean_std(label, values):
    """Print ``label: mean +/- std`` for a sequence of per-fold measurements."""
    samples = np.array(values)
    print('%s: %s +/- %s' % (label, samples.mean(), samples.std()))


# (label, per-fold values) pairs, in the order they are reported.
_summary_rows = (
    ('Small dataset AUC ROC', small_dataset_aucroc),
    ('Complete dataset AUC ROC', complete_dataset_aucroc),
    ('Theory revision dataset AUC ROC', revision_dataset_aucroc),
    ('Small dataset learning time', small_learning_time),
    ('Small dataset inference time', small_inference_time),
    ('Complete dataset learning time', complete_learning_time),
    ('Complete dataset inference time', complete_inference_time),
    ('Revision time', revision_learning_time),
    ('Revision model inference time', revision_inference_time),
)

for _label, _values in _summary_rows:
    _print_mean_std(_label, _values)

Small dataset AUC ROC: 0.8394084000000002 +/- 0.08094936568398792
Complete dataset AUC ROC: 0.9412362 +/- 0.06052848620905698
Theory revision dataset AUC ROC: 0.9309544000000001 +/- 0.05412849894501047
Small dataset learning time: 9.930499999999999 +/- 3.448080255736516
Small dataset inference time: 0.5179 +/- 0.047632866804340056
Complete dataset learning time: 15.666300000000001 +/- 6.107232794809774
Complete dataset inference time: 0.48810000000000003 +/- 0.05266963071828016
Revision time: 4.848800000000001 +/- 2.4179522658646513
Revision model inference time: 0.48410000000000003 +/- 0.0461810567224268


In [12]:
# Reference numbers kept as a bare string so the cell displays them.
# "sem boosting" is Portuguese for "without boosting".
# NOTE(review): presumably pasted from an earlier run of this notebook with
# boosting disabled - confirm the exact configuration that produced them.
'''sem boosting

Small dataset AUC ROC: 0.9278783 +/- 0.03755699045197845
Complete dataset AUC ROC: 0.9256486 +/- 0.0576748788211991
Theory revision dataset AUC ROC: 0.9243744000000002 +/- 0.042361955547873385
Small dataset learning time: 2.9913999999999996 +/- 1.7272083950699173
Small dataset inference time: 0.5094000000000001 +/- 0.027868261517360583
Complete dataset learning time: 4.038400000000001 +/- 0.3932762896488931
Complete dataset inference time: 0.5464999999999999 +/- 0.06962650357442918
Revision time: 2.3796 +/- 1.6737868562036204
Revision model inference time: 0.5252000000000001 +/- 0.03326499661806686
'''

'sem boosting\n\nSmall dataset AUC ROC: 0.9278783 +/- 0.03755699045197845\nComplete dataset AUC ROC: 0.9256486 +/- 0.0576748788211991\nTheory revision dataset AUC ROC: 0.9243744000000002 +/- 0.042361955547873385\nSmall dataset learning time: 2.9913999999999996 +/- 1.7272083950699173\nSmall dataset inference time: 0.5094000000000001 +/- 0.027868261517360583\nComplete dataset learning time: 4.038400000000001 +/- 0.3932762896488931\nComplete dataset inference time: 0.5464999999999999 +/- 0.06962650357442918\nRevision time: 2.3796 +/- 1.6737868562036204\nRevision model inference time: 0.5252000000000001 +/- 0.03326499661806686\n'