In [1]:
from datasets.get_datasets import *
from boostedrevision import *
from boostsrl import boostsrl
import random

## Dataset

In [2]:
target = 'accounttype'

bk = ['accounttype(+account,+type).',
      'accounttype(+account,-type).',
      'accounttype(-account,+type).',
      'tweets(+account,+word).',
      'tweets(+account,-word).',
      'tweets(-account,+word).',
      'follows(+account,+account).',
      'follows(+account,-account).',
      'follows(-account,+account).']

[facts, pos, neg] = datasets.load('twitter', bk, target=target)

## Background  configuration

In [3]:
background = boostsrl.modes(bk, [target], useStdLogicVariables=False, treeDepth=8, nodeSize=3, numOfClauses=8)

## Learning

In [4]:
for i in range(len(pos)):
    [train_facts, test_facts] = datasets.get_kfold_small(i, facts)
    [train_pos, test_pos] = datasets.get_kfold_small(i, pos)
    [train_neg, test_neg] = datasets.get_kfold_small(i, neg)
    [model, learning_time, inference_time, t_results, structured, will] = learn_test_model(background, boostsrl, target, train_pos, train_neg, train_facts, test_pos, test_neg, test_facts, refine=None, trees=10, verbose=True)

Exception: ('Encountered problems while running process: ', '(cd boostsrl; java -jar v1-0.jar -l -combine -train train/ -target accounttype -trees 10 > train_output.txt 2>&1)')

## Tweets

In [21]:
target = 'tweets'

bk = ['accounttype(+account,+type).',
      'accounttype(+account,-type).',
      'accounttype(-account,+type).',
      'tweets(+account,+word).',
      'tweets(+account,-word).',
      'tweets(-account,+word).',
      'follows(+account,+account).',
      'follows(+account,-account).',
      'follows(-account,+account).']

[facts, pos, neg] = datasets.load('twitter', bk, target=target)


In [23]:
background = boostsrl.modes(bk, [target], useStdLogicVariables=False, treeDepth=8, nodeSize=3, numOfClauses=8)

In [24]:
for i in range(len(pos)):
    [train_facts, test_facts] = datasets.get_kfold_small(i, facts)
    [train_pos, test_pos] = datasets.get_kfold_small(i, pos)
    [train_neg, test_neg] = datasets.get_kfold_small(i, neg)
    [model, learning_time, inference_time, t_results, structured, will] = learn_test_model(background, boostsrl, target, train_pos, train_neg, train_facts, test_pos, test_neg, test_facts, refine=None, trees=10, verbose=True)

WILL Produced-Tree #1
% FOR tweets(A, B):
%   if ( follows(A, C) )
%   then if ( follows(C, A) )
%   | then if ( follows(A, D), follows(D, C) )
%   | | then return 0.35814893509950757;  // std dev = 0,500, 924,000 (wgt'ed) examples reached here.  /* #neg=462 #pos=462 */
%   | | else if ( follows(A, E), follows(C, E) )
%   | | | then return 0.3581489350995117;  // std dev = 0,500, 42,000 (wgt'ed) examples reached here.  /* #neg=21 #pos=21 */
%   | | | else return 0.3581489350995113;  // std dev = 0,500, 48,000 (wgt'ed) examples reached here.  /* #neg=24 #pos=24 */
%   | else if ( follows(C, F) )
%   | | then if ( follows(F, A) )
%   | | | then return 0.3581489350995123;  // std dev = 0,500, 8,000 (wgt'ed) examples reached here.  /* #neg=4 #pos=4 */
%   | | | else if ( follows(A, F) )
%   | | | | then if ( follows(F, C) )
%   | | | | | then return 0.35814893509951135;  // std dev = 3,391, 46,000 (wgt'ed) examples reached here.  /* #neg=23 #pos=23 */
%   | | | | | else return 0.3581489350

## Follows

In [25]:
target = 'follows'

bk = ['accounttype(+account,+type).',
      'accounttype(+account,-type).',
      'accounttype(-account,+type).',
      'tweets(+account,+word).',
      'tweets(+account,-word).',
      'tweets(-account,+word).',
      'follows(+account,+account).',
      'follows(+account,-account).',
      'follows(-account,+account).']

[facts, pos, neg] = datasets.load('twitter', bk, target=target)

background = boostsrl.modes(bk, [target], useStdLogicVariables=False, treeDepth=8, nodeSize=3, numOfClauses=8)

for i in range(len(pos)):
    [train_facts, test_facts] = datasets.get_kfold_small(i, facts)
    [train_pos, test_pos] = datasets.get_kfold_small(i, pos)
    [train_neg, test_neg] = datasets.get_kfold_small(i, neg)
    [model, learning_time, inference_time, t_results, structured, will] = learn_test_model(background, boostsrl, target, train_pos, train_neg, train_facts, test_pos, test_neg, test_facts, refine=None, trees=10, verbose=True)

WILL Produced-Tree #1
% FOR follows(A, B):
%   if ( tweets(B, C), tweets(A, C) )
%   then if ( accounttype(A, D), accounttype(B, D) )
%   | then return 0.32332999359533;  // std dev = 0,499, 359,000 (wgt'ed) examples reached here.  /* #neg=192 #pos=167 */
%   | else return 0.6353886445426075;  // std dev = 0,416, 413,000 (wgt'ed) examples reached here.  /* #neg=92 #pos=321 */
%   else if ( tweets(A, E) )
%   | then if ( accounttype(A, F), accounttype(B, F) )
%   | | then return 0.12527222277074612;  // std dev = 0,442, 292,000 (wgt'ed) examples reached here.  /* #neg=214 #pos=78 */
%   | | else if ( tweets(B, G) )
%   | | | then return 0.2537533307039092;  // std dev = 0,489, 182,000 (wgt'ed) examples reached here.  /* #neg=110 #pos=72 */
%   | | | else return 0.14981560176617864;  // std dev = 0,455, 72,000 (wgt'ed) examples reached here.  /* #neg=51 #pos=21 */
%   | else if ( accounttype(A, H), accounttype(B, H) )
%   | | then if ( tweets(B, I) )
%   | | | then return 0.2893415956499

In [2]:
def load(dataset, bk, target=None, seed=None):
    '''Load dataset from json and accept only predicates presented in bk'''
    pattern = '^(\w+)\(.*\).$'
    accepted = set()
    for line in bk:
        m = re.search(pattern, line)
        if m:
            relation = re.sub('[ _]', '', m.group(1))
            accepted.add(relation)
    data = datasets.get_json_dataset(dataset)
    facts = []
    pos = []
    neg = []
    for i in range(len(data[0])): #positives
        facts.append([])
        pos.append([])
        neg.append([])
        for relation, value in data[0][i].items():
            if relation in accepted or relation == 'accounttype':
                if relation == target or (target in ['fan', 'news', 'club'] and relation == 'accounttype'):
                    for example in value:
                        if relation == 'accounttype':
                            if example[1] == target:
                                pos[i].append(example[1] + '(' + example[0] + ').')
                            else:
                                facts[i].append(example[1] + '(' + example[0] + ').')
                        else:
                            pos[i].append(relation + '(' + ','.join(example)+ ').')
                else:
                    for example in value:
                        if relation == 'accounttype':
                            facts[i].append(example[1] + '(' + example[0] + ').')
                        else:
                            facts[i].append(relation + '(' + ','.join(example)+ ').')
    if target:
        for i in range(len(data[1])): #negatives
            if target in ['fan', 'news', 'club']:
                for j in data[0][i]['accounttype']:
                    if j[1] != target:
                        neg[i].append(target + '(' + j[0] + ').')
    return [facts, pos, neg]

In [151]:
target = 'fan'

bk = ['fan(+account).',
      'news(+account).',
      'club(+account).',
      'tweets(+account,+word).',
      'tweets(+account,-word).',
      'tweets(-account,+word).',
      'follows(+account,+account).',
      'follows(+account,-account).',
      'follows(-account,+account).']

[facts, pos, neg] = load('twitter', bk, target='fan')

background = boostsrl.modes(bk, [target], useStdLogicVariables=False, treeDepth=8, nodeSize=3, numOfClauses=8)

for i in range(len(pos)):
    [train_facts, test_facts] = datasets.get_kfold_small(i, facts)
    [train_pos, test_pos] = datasets.get_kfold_small(i, pos)
    [train_neg, test_neg] = datasets.get_kfold_small(i, neg)
    [model, learning_time, inference_time, t_results, structured, will] = learn_test_model(background, boostsrl, target, train_pos, train_neg, train_facts, test_pos, test_neg, test_facts, refine=None, trees=10, verbose=True)

WILL Produced-Tree #1
% FOR fan(A):
%   if ( follows(B, A), news(B), tweets(A, C) )
%   then if ( news(A) )
%   | then return -0.1418510649004878;  // std dev = 0,000, 10,000 (wgt'ed) examples reached here.  /* #neg=10 */
%   | else if ( club(A) )
%   | | then return -0.1418510649004878;  // std dev = 0,000, 9,000 (wgt'ed) examples reached here.  /* #neg=9 */
%   | | else return 0.8581489350995118;  // std dev = 1,46e-07, 26,000 (wgt'ed) examples reached here.  /* #pos=26 */
%   else if ( follows(D, A), follows(A, D), tweets(A, E) )
%   | then return 0.8581489350995115;  // std dev = 2,38e-07, 35,000 (wgt'ed) examples reached here.  /* #pos=35 */
%   | else if ( follows(F, A), follows(A, G), news(G) )
%   | | then if ( news(A) )
%   | | | then return -0.14185106490048777;  // std dev = 0,000, 3,000 (wgt'ed) examples reached here.  /* #neg=3 */
%   | | | else return 0.8581489350995123;  // std dev = 0,000, 15,000 (wgt'ed) examples reached here.  /* #pos=15 */
%   | | else if ( tweets(A,

In [152]:
target = 'news'

bk = ['fan(+account).',
      'news(+account).',
      'club(+account).',
      'tweets(+account,+word).',
      'tweets(+account,-word).',
      'tweets(-account,+word).',
      'follows(+account,+account).',
      'follows(+account,-account).',
      'follows(-account,+account).']

[facts, pos, neg] = load('twitter', bk, target=target)

background = boostsrl.modes(bk, [target], useStdLogicVariables=False, treeDepth=8, nodeSize=3, numOfClauses=8)

for i in range(len(pos)):
    [train_facts, test_facts] = datasets.get_kfold_small(i, facts)
    [train_pos, test_pos] = datasets.get_kfold_small(i, pos)
    [train_neg, test_neg] = datasets.get_kfold_small(i, neg)
    [model, learning_time, inference_time, t_results, structured, will] = learn_test_model(background, boostsrl, target, train_pos, train_neg, train_facts, test_pos, test_neg, test_facts, refine=None, trees=10, verbose=True)

WILL Produced-Tree #1
% FOR news(A):
%   if ( fan(A) )
%   then return -0.14185106490048774;  // std dev = 1,49e-08, 24,000 (wgt'ed) examples reached here.  /* #neg=24 */
%   else if ( club(A) )
%   | then return -0.14185106490048777;  // std dev = 0,000, 3,000 (wgt'ed) examples reached here.  /* #neg=3 */
%   | else return 0.8581489350995123;  // std dev = 0,000, 14,000 (wgt'ed) examples reached here.  /* #pos=14 */
WILL Produced-Tree #2
% FOR news(A):
%   if ( fan(A) )
%   then return -0.12544463852839138;  // std dev = 0,000, 32,000 (wgt'ed) examples reached here.  /* #neg=32 */
%   else if ( follows(A, B) )
%   | then return 0.7194734122109543;  // std dev = 4,21e-08, 10,000 (wgt'ed) examples reached here.  /* #pos=10 */
%   | else return 0.4378340619645058;  // std dev = 0,976, 6,000 (wgt'ed) examples reached here.  /* #neg=2 #pos=4 */
WILL Produced-Tree #3
% FOR news(A):
%   if ( fan(A) )
%   then return -0.11231637819360639;  // std dev = 0,000, 25,000 (wgt'ed) examples reached 

In [153]:
target = 'fan'

bk = ['fan(+account).',
      'news(+account).',
      'club(+account).',
      'tweets(+account,#word).',
      'tweets(+account,#word).',
      'tweets(-account,#word).',
      'follows(+account,+account).',
      'follows(+account,-account).',
      'follows(-account,+account).']

[facts, pos, neg] = load('twitter', bk, target='fan')

background = boostsrl.modes(bk, [target], useStdLogicVariables=False, treeDepth=8, nodeSize=3, numOfClauses=8)

for i in range(len(pos)):
    [train_facts, test_facts] = datasets.get_kfold_small(i, facts)
    [train_pos, test_pos] = datasets.get_kfold_small(i, pos)
    [train_neg, test_neg] = datasets.get_kfold_small(i, neg)
    [model, learning_time, inference_time, t_results, structured, will] = learn_test_model(background, boostsrl, target, train_pos, train_neg, train_facts, test_pos, test_neg, test_facts, refine=None, trees=10, verbose=True)

WILL Produced-Tree #1
% FOR fan(A):
%   if ( follows(A, B), tweets(C, hashwearebruges), follows(C, A) )
%   then if ( tweets(C, hashbeekvm) )
%   | then return -0.1418510649004878;  // std dev = 0,000, 10,000 (wgt'ed) examples reached here.  /* #neg=10 */
%   | else if ( tweets(B, hashcobw) )
%   | | then return 0.5248156017661788;  // std dev = 0,816, 3,000 (wgt'ed) examples reached here.  /* #neg=1 #pos=2 */
%   | | else return 0.8581489350995122;  // std dev = 0,000, 4,000 (wgt'ed) examples reached here.  /* #pos=4 */
%   else if ( follows(A, D) )
%   | then if ( tweets(D, hashcoym) )
%   | | then return 0.858148935099512;  // std dev = 2,23e-07, 45,000 (wgt'ed) examples reached here.  /* #pos=45 */
%   | | else if ( tweets(D, ezekiel) )
%   | | | then return 0.8303711573217337;  // std dev = 0,164, 36,000 (wgt'ed) examples reached here.  /* #neg=1 #pos=35 */
%   | | | else if ( tweets(D, genk), tweets(D, hashcerwbe) )
%   | | | | then return 0.28672036367094084;  // std dev = 0,495

In [4]:
target = 'follows'

bk = ['fan(+account).',
      'news(+account).',
      'club(+account).',
      'tweets(+account,+word).',
      'tweets(+account,-word).',
      'tweets(-account,+word).',
      'follows(+account,+account).',
      'follows(+account,-account).',
      'follows(-account,+account).']

[facts, pos, neg] = load('twitter', bk, target='fan')

background = boostsrl.modes(bk, [target], useStdLogicVariables=False, treeDepth=8, nodeSize=3, numOfClauses=8)

for i in range(len(pos)):
    [train_facts, test_facts] = datasets.get_kfold_small(i, facts)
    [train_pos, test_pos] = datasets.get_kfold_small(i, pos)
    [train_neg, test_neg] = datasets.get_kfold_small(i, neg)
    [model, learning_time, inference_time, t_results, structured, will] = learn_test_model(background, boostsrl, target, train_pos, train_neg, train_facts, test_pos, test_neg, test_facts, refine=None, trees=10, verbose=True)

WILL Produced-Tree #1
% FOR follows(A, B):
%   if ( fan(B) )
%   then return 0.06071752821100763;  // std dev = 0,402, 1.713,000 (wgt'ed) examples reached here.  /* #neg=1.366 #pos=347 */
%   else if ( tweets(B, C), tweets(D, C), club(D) )
%   | then if ( tweets(A, C) )
%   | | then if ( club(B), fan(A) )
%   | | | then return 0.6566563977860774;  // std dev = 0,401, 134,000 (wgt'ed) examples reached here.  /* #neg=27 #pos=107 */
%   | | | else return 0.5693558316512337;  // std dev = 0,453, 232,000 (wgt'ed) examples reached here.  /* #neg=67 #pos=165 */
%   | | else if ( club(A), club(B) )
%   | | | then return 0.762910839861417;  // std dev = 0,294, 21,000 (wgt'ed) examples reached here.  /* #neg=2 #pos=19 */
%   | | | else if ( tweets(A, E) )
%   | | | | then if ( club(B), tweets(F, E), club(F) )
%   | | | | | then return 0.3179190500420405;  // std dev = 4,649, 87,000 (wgt'ed) examples reached here.  /* #neg=47 #pos=40 */
%   | | | | | else return 0.24276431971489598;  // std dev =