In [1]:
from datasets.get_datasets import *
from revision import *
from boostsrl import boostsrl
import random

In [2]:
target = 'advisedby'
test_size = 0.1
validation_size = 0.1
revision_threshold = 0.95

In [3]:
[facts, pos, neg] = get_uwcse_dataset(target, acceptedPredicates=[
'professor',
'student',
'advisedby',
'tempadvisedby',
'ta',
'hasposition',
'publication',
'inphase',
'courselevel',
'yearsinprogram',
'projectmember',
])

In [4]:
bk = ['professor(+person).',
'student(+person).',
'advisedby(+person,+person).',
'advisedby(+person,-person).',
'advisedby(-person,+person).',
'tempadvisedby(+person,+person).',
'tempadvisedby(+person,-person).',
'tempadvisedby(-person,+person).',
'ta(+course,+person,+quarter).',
'ta(-course,+person,+quarter).',
'ta(+course,-person,+quarter).',
'ta(+course,+person,-quarter).',
'ta(-course,+person,-quarter).',
'ta(+course,-person,-quarter).',
'hasposition(+person,+faculty).',
'hasposition(+person,-faculty).',
'hasposition(-person,+faculty).',
'publication(+title,+person).',
'publication(+title,-person).',
'publication(-title,+person).',
'inphase(+person,+prequals).',
'inphase(+person,-prequals).',
'inphase(-person,+prequals).',
'courselevel(+course,+level).',
'courselevel(+course,-level).',
'courselevel(-course,+level).',
'yearsinprogram(+person,+year).',
'yearsinprogram(-person,+year).',
'yearsinprogram(+person,-year).',
'projectmember(+project,+person).',
'projectmember(+project,-person).',
'projectmember(-project,+person).']
#'sameproject(project, project).',
#'samecourse(course, course).',
#'sameperson(person, person).',]

background = boostsrl.modes(bk, [target], useStdLogicVariables=False, treeDepth=8, nodeSize=3, numOfClauses=8)

In [5]:
# shuffle all examples
random.shuffle(pos)
random.shuffle(neg)
neg = neg[:len(pos)]

# separate train and test
test_pos = pos[:int(test_size*len(pos))]
test_neg = neg[:int(test_size*len(neg))]
train_pos = pos[int(test_size*len(pos)):]
train_neg = neg[int(test_size*len(neg)):]

train_pos = train_pos[int(validation_size*len(train_pos)):]
train_neg = train_neg[int(validation_size*len(train_neg)):]
validation_pos = train_pos[:int(validation_size*len(train_pos))]
validation_neg = train_neg[:int(validation_size*len(train_neg))]

print('Train pos size: %s' % len(train_pos))
print('Train neg size: %s' % len(train_neg))
print('Validation pos size: %s' % len(validation_pos))
print('Validation neg size: %s' % len(validation_neg))
print('Test pos size: %s' % len(test_pos))
print('Test neg size: %s' % len(test_neg))

Train pos size: 92
Train neg size: 92
Validation pos size: 9
Validation neg size: 9
Test pos size: 11
Test neg size: 11


## Learning 1 tree

In [6]:
delete_model_files()
model = boostsrl.train(background, train_pos[:int(0.1*len(train_pos))], train_neg[:int(0.1*len(train_pos))], facts, trees=1)
learning_time = model.traintime()
will = model.get_will_produced_tree()
structured = model.get_structured_tree().copy()
results = boostsrl.test(model, test_pos, test_neg, facts, trees=1)
inference_time = results.testtime()
t_results = results.summarize_results()
print('WILL-Produced Tree:')
print_will_produced_tree(will)
print('\n')
print('Results:')
print(t_results)
print('\n')
print('Total learning time: %s seconds' % learning_time)
print('Total inference time: %s seconds' % inference_time)
print('AUC ROC: %s' % t_results['AUC ROC'])

WILL-Produced Tree:
% FOR advisedby(A, B):
%   if ( professor(B), student(A) )
%   then if ( publication(C, A) )
%   | then return 0.5248156017661788;  // std dev = 0,816, 3,000 (wgt'ed) examples reached here.  /* #neg=1 #pos=2 */
%   | else return 0.8581489350995123;  // std dev = 0,000, 7,000 (wgt'ed) examples reached here.  /* #pos=7 */
%   else return -0.14185106490048777;  // std dev = 0,000, 8,000 (wgt'ed) examples reached here.  /* #neg=8 */


Results:
{'F1': nan, 'Recall': 0.0, 'AUC ROC': 0.842975, 'AUC PR': 0.738113, 'CLL': -0.824114, 'Precision': [nan, 0.5]}


Total learning time: 1.069 seconds
Total inference time: 0.534 seconds
AUC ROC: 0.842975


## Performing parameter

In [7]:
delete_model_files()
model = boostsrl.train(background, train_pos, train_neg, facts, refine=get_refine_file(structured), trees=1)
learning_time = model.traintime()
will = model.get_will_produced_tree()
#structured = model.get_structured_tree().copy()
results = boostsrl.test(model, test_pos, test_neg, facts, trees=1)
inference_time = results.testtime()
t_results = results.summarize_results()
print('WILL-Produced Tree:')
print_will_produced_tree(will)
print('\n')
print('Results:')
print(t_results)
print('\n')
print('Total learning time: %s seconds' % learning_time)
print('Total inference time: %s seconds' % inference_time)
print('AUC ROC: %s' % t_results['AUC ROC'])

WILL-Produced Tree:
% FOR advisedby(A, B):
%   if ( professor(B), student(A) )
%   then if ( publication(C, A) )
%   | then return 0.7601097194132377;  // std dev = 2,124, 51,000 (wgt'ed) examples reached here.  /* #neg=5 #pos=46 */
%   | else return 0.7260734634013991;  // std dev = 2,465, 53,000 (wgt'ed) examples reached here.  /* #neg=7 #pos=46 */
%   else return -0.14185106490048766;  // std dev = 6,99e-08, 80,000 (wgt'ed) examples reached here.  /* #neg=80 */


Results:
{'F1': nan, 'Recall': 0.0, 'AUC ROC': 0.884298, 'AUC PR': 0.815053, 'CLL': -0.763204, 'Precision': [nan, 0.5]}


Total learning time: 2.175 seconds
Total inference time: 0.48 seconds
AUC ROC: 0.884298


## Theory Revision

In [8]:
#background = boostsrl.modes(bk, [target], useStdLogicVariables=False, treeDepth=12, nodeSize=3, numOfClauses=12)
[model, total_revision_time, inference_time, t_results, structured] = theory_revision(background, boostsrl, target, train_pos, train_neg, facts, validation_pos, validation_neg, test_pos, test_neg, revision_threshold, structured, max_revision_iterations=10, verbose=True)

******************************************
Performing Parameter Learning
******************************************
WILL-Produced Tree:
% FOR advisedby(A, B):
%   if ( professor(B), student(A) )
%   then if ( publication(C, A) )
%   | then return 0.7601097194132377;  // std dev = 2,124, 51,000 (wgt'ed) examples reached here.  /* #neg=5 #pos=46 */
%   | else return 0.7260734634013991;  // std dev = 2,465, 53,000 (wgt'ed) examples reached here.  /* #neg=7 #pos=46 */
%   else return -0.14185106490048766;  // std dev = 6,99e-08, 80,000 (wgt'ed) examples reached here.  /* #neg=80 */


Results
   AUC ROC   = 0.901235
   AUC PR    = 0.791226
   CLL	      = -0.757501
   Precision = nan at threshold = 0.5
   Recall    = 0.0
   F1        = nan


Total learning time: 2.594 seconds
Total inference time: 0.499 seconds
AUC ROC: 0.901235


******************************************
Performing Theory Revision
******************************************
Refining iteration 1
*****************************

## Experiment (10 runs of 10-fold cross-validation)

In [35]:
from sklearn.model_selection import KFold
import numpy as np
import sys
import time

inference_10boosted = []
inference_combined = []
inference_pl = []

start = time.time()
[facts, positives, negatives] = get_uwcse_dataset(target, acceptedPredicates=[
'professor',
'student',
'advisedby',
'tempadvisedby',
'ta',
'hasposition',
'publication',
'inphase',
'courselevel',
'yearsinprogram',
'projectmember',
])

pos = list(positives)
neg = list(negatives)

for run in range(1,11):
    # shuffle all examples
    random.shuffle(pos)
    random.shuffle(neg)
    neg = neg[:len(pos)]

    pos = np.array(pos)
    neg = np.array(neg)

    fold = 0
    kf = KFold(n_splits=10)
    for train_index, test_index in kf.split(pos):
        train_pos, test_pos = pos[train_index], pos[test_index]
        train_neg, test_neg = neg[train_index], neg[test_index]
        fold += 1

        # learning 10 boosted trees
        delete_model_files()
        model = boostsrl.train(background, train_pos, train_neg, facts, trees=10)
        structured = model.get_structured_tree().copy()
        results = boostsrl.test(model, test_pos, test_neg, facts, trees=10)
        t_results = results.summarize_results()
        inference_10boosted.append(t_results['AUC ROC'])

        # do inference with the combined tree
        os.rename('boostsrl/train/models/bRDNs/Trees/'+ target +'Tree0.tree', 'boostsrl/train/models/bRDNs/Trees/'+ target +'Tree0_temp.tree')
        os.rename('boostsrl/train/models/bRDNs/Trees/CombinedTreesTreeFile'+ target +'.tree', 'boostsrl/train/models/bRDNs/Trees/'+ target +'Tree0.tree')
        results = boostsrl.test(model, test_pos, test_neg, facts, trees=1)
        t_results = results.summarize_results()
        inference_combined.append(t_results['AUC ROC'])

        # parameter learning
        delete_model_files()
        model = boostsrl.train(background, train_pos, train_neg, facts, refine=get_refine_file(structured), trees=1)
        results = boostsrl.test(model, test_pos, test_neg, facts, trees=1)
        t_results = results.summarize_results()
        inference_pl.append(t_results['AUC ROC'])

        sys.stdout.write('\rRun: %s, Fold: %s, 10_boosted_trees: %s, combined: %s, pl: %s, time: %s' % (run, fold, inference_10boosted[-1], inference_combined[-1], inference_pl[-1], time.strftime('%H:%M:%S', time.gmtime(time.time()-start))))
        sys.stdout.flush()

Run: 10, Fold: 10, 10_boosted_trees: 0.954545, combined: 0.92562, pl: 0.92562, time: 00:42:50

In [42]:
import pandas as pd

inference_10boosted = np.array(inference_10boosted)
inference_combined = np.array(inference_combined)
inference_pl = np.array(inference_pl)

data = [['10 boosted trees', '%s +/- %.4f' % (inference_10boosted.mean(), inference_10boosted.std())],
       ['Combined 10 boosted trees', '%s +/- %.4f' % (inference_combined.mean(), inference_combined.std())],
       ['Parameter learning', '%s +/- %.4f' % (inference_pl.mean(), inference_pl.std())]]
pd.DataFrame(data, columns=['', 'AUC ROC'])

Unnamed: 0,Unnamed: 1,AUC ROC
0,10 boosted trees,0.95455213 +/- 0.0456
1,Combined 10 boosted trees,0.9153463299999999 +/- 0.0747
2,Parameter learning,0.91537445 +/- 0.0781


In [40]:
inference_10boosted

array([0.993056, 0.986111, 0.944444, 0.921488, 0.859504, 0.991736,
       0.938017, 0.958678, 1.      , 0.991736, 0.878472, 1.      ,
       1.      , 1.      , 0.975207, 0.909091, 0.88843 , 0.909091,
       0.975207, 0.942149, 0.993056, 0.986111, 0.993056, 1.      ,
       0.942149, 1.      , 0.921488, 0.966942, 0.892562, 0.966942,
       0.923611, 0.989583, 0.909722, 0.933884, 0.966942, 1.      ,
       1.      , 0.917355, 0.793388, 1.      , 0.875   , 0.916667,
       0.9375  , 0.983471, 0.900826, 0.991736, 1.      , 0.991736,
       1.      , 0.942149, 0.951389, 0.857639, 0.951389, 1.      ,
       1.      , 0.834711, 0.966942, 0.85124 , 1.      , 0.942149,
       0.895833, 0.965278, 0.958333, 1.      , 0.954545, 0.987603,
       1.      , 0.966942, 0.88843 , 1.      , 0.916667, 0.986111,
       0.951389, 0.942149, 0.904959, 0.983471, 1.      , 0.938017,
       0.96281 , 1.      , 0.96875 , 1.      , 0.916667, 0.991736,
       0.991736, 0.884298, 0.991736, 1.      , 0.971074, 0.938

In [43]:
inference_combined

array([0.979167, 0.979167, 0.944444, 0.942149, 0.863636, 0.88843 ,
       0.876033, 0.971074, 0.909091, 0.913223, 0.850694, 1.      ,
       1.      , 1.      , 0.991736, 0.859504, 0.743802, 0.913223,
       0.904959, 0.971074, 0.954861, 0.895833, 0.927083, 0.991736,
       0.789256, 0.991736, 0.933884, 0.842975, 0.669421, 0.921488,
       0.861111, 0.899306, 0.798611, 0.958678, 0.975207, 1.      ,
       0.991736, 0.942149, 0.876033, 1.      , 0.829861, 0.951389,
       0.736111, 0.971074, 0.801653, 0.917355, 1.      , 1.      ,
       0.909091, 0.950413, 0.909722, 0.954861, 0.881944, 1.      ,
       1.      , 0.867769, 0.913223, 0.880165, 0.917355, 0.96281 ,
       0.784722, 0.895833, 0.979167, 0.909091, 0.979339, 0.966942,
       1.      , 0.760331, 0.933884, 1.      , 0.90625 , 0.975694,
       0.833333, 0.917355, 0.772727, 0.946281, 0.909091, 0.933884,
       0.880165, 0.921488, 0.9375  , 1.      , 0.927083, 0.900826,
       0.979339, 0.863636, 0.834711, 0.909091, 0.884298, 0.863

In [44]:
inference_pl

array([0.979167, 0.979167, 0.944444, 0.942149, 0.863636, 0.904959,
       0.876033, 0.971074, 0.909091, 0.913223, 0.847222, 1.      ,
       1.      , 1.      , 0.991736, 0.859504, 0.743802, 0.913223,
       0.904959, 0.971074, 0.954861, 0.895833, 0.927083, 0.991736,
       0.789256, 0.991736, 0.933884, 0.826446, 0.669421, 0.921488,
       0.861111, 0.899306, 0.798611, 0.958678, 0.975207, 1.      ,
       1.      , 0.942149, 0.710744, 1.      , 0.829861, 0.958333,
       0.736111, 0.971074, 0.797521, 0.913223, 1.      , 1.      ,
       0.909091, 0.950413, 0.909722, 0.954861, 0.881944, 1.      ,
       1.      , 0.867769, 0.979339, 0.880165, 0.913223, 0.96281 ,
       0.788194, 0.895833, 0.979167, 0.909091, 0.971074, 0.966942,
       1.      , 0.760331, 0.933884, 1.      , 0.90625 , 0.975694,
       0.833333, 0.917355, 0.772727, 0.946281, 0.909091, 0.933884,
       0.880165, 0.96281 , 0.9375  , 1.      , 0.927083, 0.904959,
       0.979339, 0.863636, 0.830579, 0.909091, 0.950413, 0.863