# POC-English-NoAmb Generalization `2018-05-23`
static html: [http://88.99.210.144/data/clustering_2018/html/POC-English-NoAmb-Generalization.html](http://88.99.210.144/data/clustering_2018/html/POC-English-NoAmb-Generalization.html)  
data: [http://88.99.210.144/data/clustering_2018/Generalization/POC-English-NoAmb/](http://88.99.210.144/data/clustering_2018/Generalization/POC-English-NoAmb/)

## Basic settings

In [1]:
import os, sys, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
# Repository root = parent of the directory the notebook runs in; it is
# always put on sys.path so that the `src.*` packages can be imported.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

src_path = module_path + '/src'
lg_path = '/home/oleg/miniconda3/envs/ull4/lib/python3.6/site-packages/linkgrammar'
# These two locations are optional: each is appended only when it exists on
# this machine and is not on sys.path yet.
for extra_path in (src_path, lg_path):
    if os.path.exists(extra_path) and extra_path not in sys.path:
        sys.path.append(extra_path)
from src.utl.utl import UTC
from src.utl.read_files import check_dir
from src.utl.turtle import html_table
from src.grammar_learner.poc04 import learn_grammar, params, run_learn_grammar
# Prefix string; initialised empty and not referenced again in the cells shown here.
prefix = ''
# Scratch directory under the repository root; passed to the learner as kwargs['tmpath'].
tmpath = module_path + '/tmp/'
# NOTE(review): presumably creates tmpath when it is missing -- confirm the
# meaning of the True / 'none' arguments in src.utl.read_files.check_dir.
check_dir(tmpath, True, 'none')
# Log the current UTC() value together with the resolved repository root.
print(UTC(), ':: module_path =', module_path)

2018-05-23 13:56:30 UTC :: module_path = /home/oleg/language-learning


## Grammar Learner parameters

In [2]:
# Results go to a directory whose suffix is the first 10 characters of
# str(UTC()) -- presumably the YYYY-MM-DD date of the run.
out_dir = module_path + '/output/Generalization-Tests-' + str(UTC())[:10]
corpus = 'POC-English-NoAmb'
dataset = 'MST_fixed_manually'

# Baseline Grammar Learner settings. Later cells modify entries of this dict
# in place, so its content depends on which cells have already been run.
kwargs = dict(
    left_wall='LEFT-WALL',
    period=True,
    context=2,                          # 2: disjuncts
    word_space='discrete',
    clustering='group',
    grammar_rules=2,                    # 2: disjuncts
    verbose='min',
    tmpath=tmpath,
    categories_generalization='off',
    rules_generalization='off',
    # Parameters for parse_metrics:
    test_corpus=module_path + '/data/POC-English-NoAmb/poc_english_noamb_corpus.txt',
    reference_path=module_path + '/data/POC-English-NoAmb/poc-english_noAmb-parses-gold.txt',
    template_path='poc-turtle',
    linkage_limit=1,
)

## Integration test: 'MST_fixed_manually', no generalization

In [3]:
%%capture
# Baseline run with the kwargs as configured in the parameters cell (both
# generalization settings 'off' when the notebook is run top to bottom).
# The result is indexed below by 'parse_ability', 'parse_quality' and 'grammar_rules'.
r22 = run_learn_grammar(corpus, dataset, module_path, out_dir, **kwargs)

In [4]:
# One-line summary of the baseline run: parse-ability, parse-quality, rule count.
quality_summary = ''.join([str(r22['parse_ability']), '% / ',
                           str(r22['parse_quality']), '%; '])
print('Parse-ability / parse-quality = ', quality_summary,
      r22['grammar_rules'], 'Link Grammar rules')

Parse-ability / parse-quality =  0% / 0%;  15 Link Grammar rules


## Generalization: jaccard index, both word categories and rules

In [5]:
%%capture
kwargs['categories_generalization'] = 'jaccard'
kwargs['rules_generalization'] = 'jaccard'
r23 = run_learn_grammar(corpus, dataset, module_path, out_dir, **kwargs)

In [6]:
# One-line summary of the jaccard run: parse-ability, parse-quality, rule count.
quality_summary = ''.join([str(r23['parse_ability']), '% / ',
                           str(r23['parse_quality']), '%; '])
print('Parse-ability / parse-quality = ', quality_summary,
      r23['grammar_rules'], 'Link Grammar rules')

Parse-ability / parse-quality =  0% / 0%;  10 Link Grammar rules


# Disjuncts-ILE-Disjuncts

## 'MST_fixed_manually' -- 'poc-english_noAmb-parses-gold.txt'

In [7]:
%%capture
spaces = 'dILEd'
eng_noamb1 = []
for dataset in ['MST_fixed_manually']: #, 'R=6_distance=6:R', 'R=6_distance=1', 'LG_English']:
    for kwargs['left_wall'] in ['LEFT-WALL', '']:
        if kwargs['left_wall'] == 'LEFT-WALL': lw = 'LW'
        else: lw = ' -- '
        for kwargs['period'] in [True, False]:
            if kwargs['period']: dot = ' + '
            else: dot = ' -- '
            for g in [('',''), ('jaccard',''), ('','jaccard'), ('jaccard','jaccard')]:
                kwargs['rules_generalization'] = g[0]
                kwargs['categories_generalization'] = g[1]
                if g[0] == '' and g[1] == '': gen = 'none'
                elif g[0] == 'jaccard' and g[1] == '': gen = 'rules'
                elif g[0] == '' and g[1] == 'jaccard': gen = 'categories'
                else: gen = 'both'
                r = run_learn_grammar(corpus, dataset, module_path, out_dir, **kwargs)
                line = [corpus, dataset, lw, dot, gen, spaces, r['grammar_rules'], \
                        str(r['parse_ability'])+'%', str(r['parse_quality'])+'%']
                eng_noamb1.append(line)

In [8]:
# Column titles shared by all result tables in this notebook.
header = ['Corpus', 'Parsing', 'LW', '"."', 'Generalization', 'Space', 'Rules', 'PA', 'PQ']
table_rows = [header] + eng_noamb1
display(html_table(table_rows))

0,1,2,3,4,5,6,7,8
Corpus,Parsing,LW,""".""",Generalization,Space,Rules,PA,PQ
POC-English-NoAmb,MST_fixed_manually,LW,+,none,dILEd,15,0%,0%
POC-English-NoAmb,MST_fixed_manually,LW,+,rules,dILEd,11,0%,0%
POC-English-NoAmb,MST_fixed_manually,LW,+,categories,dILEd,11,0%,0%
POC-English-NoAmb,MST_fixed_manually,LW,+,both,dILEd,10,0%,0%
POC-English-NoAmb,MST_fixed_manually,LW,--,none,dILEd,14,56%,20%
POC-English-NoAmb,MST_fixed_manually,LW,--,rules,dILEd,10,83%,31%
POC-English-NoAmb,MST_fixed_manually,LW,--,categories,dILEd,10,83%,31%
POC-English-NoAmb,MST_fixed_manually,LW,--,both,dILEd,9,89%,34%
POC-English-NoAmb,MST_fixed_manually,--,+,none,dILEd,13,82%,81%


## LG "English" -- 'poc-english_noAmb-parses-silver.txt'

In [9]:
%%capture
eng_noamb2 = []
for dataset in ['LG_English']:
    for kwargs['left_wall'] in ['LEFT-WALL', '']:
        if kwargs['left_wall'] == 'LEFT-WALL': lw = 'LW'
        else: lw = ' -- '
        for kwargs['period'] in [True, False]:
            if kwargs['period']: dot = ' + '
            else: dot = ' -- '
            for g in [('',''), ('jaccard',''), ('','jaccard'), ('jaccard','jaccard')]:
                kwargs['rules_generalization'] = g[0]
                kwargs['categories_generalization'] = g[1]
                if g[0] == '' and g[1] == '': gen = 'none'
                elif g[0] == 'jaccard' and g[1] == '': gen = 'rules'
                elif g[0] == '' and g[1] == 'jaccard': gen = 'categories'
                else: gen = 'both'
                r = run_learn_grammar(corpus, dataset, module_path, out_dir, **kwargs)
                line = [corpus, dataset, lw, dot, gen, spaces, r['grammar_rules'], \
                        str(r['parse_ability'])+'%', str(r['parse_quality'])+'%']
                eng_noamb2.append(line)

In [10]:
# Render the LG_English sweep; `header` is defined in the first table cell.
table_rows = [header] + eng_noamb2
display(html_table(table_rows))

0,1,2,3,4,5,6,7,8
Corpus,Parsing,LW,""".""",Generalization,Space,Rules,PA,PQ
POC-English-NoAmb,LG_English,LW,+,none,dILEd,15,0%,0%
POC-English-NoAmb,LG_English,LW,+,rules,dILEd,12,0%,0%
POC-English-NoAmb,LG_English,LW,+,categories,dILEd,12,0%,0%
POC-English-NoAmb,LG_English,LW,+,both,dILEd,12,0%,0%
POC-English-NoAmb,LG_English,LW,--,none,dILEd,14,62%,22%
POC-English-NoAmb,LG_English,LW,--,rules,dILEd,11,89%,33%
POC-English-NoAmb,LG_English,LW,--,categories,dILEd,11,89%,33%
POC-English-NoAmb,LG_English,LW,--,both,dILEd,11,89%,33%
POC-English-NoAmb,LG_English,--,+,none,dILEd,13,93%,89%


## 'R=6_Weight=6:R_mst-weight=1:R'

In [11]:
%%capture
eng_noamb3 = []
for dataset in ['R=6_Weight=6:R_mst-weight=1:R']:
    for kwargs['left_wall'] in ['LEFT-WALL', '']:
        if kwargs['left_wall'] == 'LEFT-WALL': lw = 'LW'
        else: lw = ' -- '
        for kwargs['period'] in [True, False]:
            if kwargs['period']: dot = ' + '
            else: dot = ' -- '
            for g in [('',''), ('jaccard',''), ('','jaccard'), ('jaccard','jaccard')]:
                kwargs['rules_generalization'] = g[0]
                kwargs['categories_generalization'] = g[1]
                if g[0] == '' and g[1] == '': gen = 'none'
                elif g[0] == 'jaccard' and g[1] == '': gen = 'rules'
                elif g[0] == '' and g[1] == 'jaccard': gen = 'categories'
                else: gen = 'both'
                r = run_learn_grammar(corpus, dataset, module_path, out_dir, **kwargs)
                line = [corpus, dataset, lw, dot, gen, spaces, r['grammar_rules'], \
                        str(r['parse_ability'])+'%', str(r['parse_quality'])+'%']
                eng_noamb3.append(line)

In [12]:
# Render the 'R=6_Weight=6:R_mst-weight=1:R' sweep; `header` is defined in the first table cell.
table_rows = [header] + eng_noamb3
display(html_table(table_rows))

0,1,2,3,4,5,6,7,8
Corpus,Parsing,LW,""".""",Generalization,Space,Rules,PA,PQ
POC-English-NoAmb,R=6_Weight=6:R_mst-weight=1:R,LW,+,none,dILEd,16,89%,50%
POC-English-NoAmb,R=6_Weight=6:R_mst-weight=1:R,LW,+,rules,dILEd,10,95%,54%
POC-English-NoAmb,R=6_Weight=6:R_mst-weight=1:R,LW,+,categories,dILEd,10,95%,54%
POC-English-NoAmb,R=6_Weight=6:R_mst-weight=1:R,LW,+,both,dILEd,9,95%,54%
POC-English-NoAmb,R=6_Weight=6:R_mst-weight=1:R,LW,--,none,dILEd,15,89%,50%
POC-English-NoAmb,R=6_Weight=6:R_mst-weight=1:R,LW,--,rules,dILEd,9,95%,54%
POC-English-NoAmb,R=6_Weight=6:R_mst-weight=1:R,LW,--,categories,dILEd,9,95%,54%
POC-English-NoAmb,R=6_Weight=6:R_mst-weight=1:R,LW,--,both,dILEd,8,95%,54%
POC-English-NoAmb,R=6_Weight=6:R_mst-weight=1:R,--,+,none,dILEd,15,89%,58%


# Test 'Connectors-DRK-Disjuncts'

## MST_fixed_manually, with/without rules generalization

In [13]:
%%capture
spaces = 'cDRKd'  # "Connectors-DRK-Disjuncts"
kwargs['context'] = 1
kwargs['word_space'] = 'vectors'
kwargs['clustering'] = 'kmeans'
kwargs['categories_generalization'] = 'off'
kwargs['grammar_rules'] = 2
eng_noamb4 = []
for dataset in ['MST_fixed_manually']: #, 'R=6_distance=6:R', 'R=6_distance=1', 'LG_English']:
    for kwargs['left_wall'] in ['LEFT-WALL', '']:
        if kwargs['left_wall'] == 'LEFT-WALL': lw = 'LW'
        else: lw = ' -- '
        for kwargs['period'] in [True, False]:
            if kwargs['period']: dot = ' + '
            else: dot = ' -- '
            for g in [('',''), ('jaccard','')]:
                kwargs['rules_generalization'] = g[0]
                kwargs['categories_generalization'] = g[1]
                if g[0] == '' and g[1] == '': gen = 'none'
                elif g[0] == 'jaccard' and g[1] == '': gen = 'rules'
                elif g[0] == '' and g[1] == 'jaccard': gen = 'categories'
                else: gen = 'both'
                r = run_learn_grammar(corpus, dataset, module_path, out_dir, **kwargs)
                line = [corpus, dataset, lw, dot, gen, spaces, r['grammar_rules'], \
                        str(r['parse_ability'])+'%', str(r['parse_quality'])+'%']
                eng_noamb4.append(line)

In [14]:
# Render the cDRKd sweep; `header` is defined in the first table cell.
table_rows = [header] + eng_noamb4
display(html_table(table_rows))

0,1,2,3,4,5,6,7,8
Corpus,Parsing,LW,""".""",Generalization,Space,Rules,PA,PQ
POC-English-NoAmb,MST_fixed_manually,LW,+,none,cDRKd,9,0%,0%
POC-English-NoAmb,MST_fixed_manually,LW,+,rules,cDRKd,5,0%,0%
POC-English-NoAmb,MST_fixed_manually,LW,--,none,cDRKd,9,69%,27%
POC-English-NoAmb,MST_fixed_manually,LW,--,rules,cDRKd,7,80%,33%
POC-English-NoAmb,MST_fixed_manually,--,+,none,cDRKd,7,92%,91%
POC-English-NoAmb,MST_fixed_manually,--,+,rules,cDRKd,2,100%,98%
POC-English-NoAmb,MST_fixed_manually,--,--,none,cDRKd,6,92%,91%
POC-English-NoAmb,MST_fixed_manually,--,--,rules,cDRKd,2,100%,98%


## No LW, no period, generalization of rules ⇒ 2/3/4 rules

In [27]:
%%capture
kwargs['rules_generalization'] = 'jaccard'
kwargs['left_wall'] = ''
kwargs['period'] = False
dataset = 'MST_fixed_manually'
r42 = run_learn_grammar(corpus, dataset, module_path, out_dir, **kwargs)

In [28]:
# Headline metrics of run r42, followed by the content of its parse file.
quality_summary = ''.join([str(r42['parse_ability']), '% / ',
                           str(r42['parse_quality']), '%; '])
print('Parse-ability / parse-quality = ', quality_summary,
      r42['grammar_rules'], 'Link Grammar rules\n')
with open(r42['lg_parse_file'], 'r') as parse_file:
    lines = parse_file.read().splitlines()
# Print the file without its first three and last two lines.
for line in lines[3:-2]:
    print(line)

Parse-ability / parse-quality =  100% / 98%;  2 Link Grammar rules

A mom is a human.

              +-C01C02-+
+C01C02+C02C01+ +C01C02+
|      |      | |      |
a     mom    is a    human [.]

A dad is a human.

              +-C01C02-+
+C01C02+C02C01+ +C01C02+
|      |      | |      |
a     dad    is a    human [.]

A mom is a parent.

              +-C01C02-+
+C01C02+C02C01+ +C01C02+
|      |      | |      |
a     mom    is a   parent [.]

A dad is a parent.

              +-C01C02-+
+C01C02+C02C01+ +C01C02+
|      |      | |      |
a     dad    is a   parent [.]

A son is a child.

              +-C01C02-+
+C01C02+C02C01+ +C01C02+
|      |      | |      |
a     son    is a    child [.]

A daughter is a child.

              +-C01C02-+
+C01C02+C02C01+ +C01C02+
|      |      | |      |
a  daughter  is a    child [.]

A son is a human.

              +-C01C02-+
+C01C02+C02C01+ +C01C02+
|      |      | |      |
a     son    is a    human [.]

A daughter is a human.

              +-C01

In [23]:
%%capture
# Repeat the run with the current kwargs and dataset; this cell changes no setting.
# NOTE(review): the saved outputs of r42 / r43 / r44 report 2, 3 and 4 rules for
# otherwise identical calls, presumably because the kmeans clustering is not
# seeded -- confirm. The execution counts of these cells are also not sequential.
r43 = run_learn_grammar(corpus, dataset, module_path, out_dir, **kwargs)

In [24]:
# Headline metrics of run r43, followed by the content of its parse file.
quality_summary = ''.join([str(r43['parse_ability']), '% / ',
                           str(r43['parse_quality']), '%; '])
print('Parse-ability / parse-quality = ', quality_summary,
      r43['grammar_rules'], 'Link Grammar rules\n')
with open(r43['lg_parse_file'], 'r') as parse_file:
    lines = parse_file.read().splitlines()
# Print the file without its first three and last two lines.
for line in lines[3:-2]:
    print(line)

Parse-ability / parse-quality =  100% / 98%;  3 Link Grammar rules

A mom is a human.

              +-C01C03-+
+C01C03+C03C01+ +C01C03+
|      |      | |      |
a     mom    is a    human [.]

A dad is a human.

              +-C01C03-+
+C01C03+C03C01+ +C01C03+
|      |      | |      |
a     dad    is a    human [.]

A mom is a parent.

              +-C01C03-+
+C01C03+C03C01+ +C01C03+
|      |      | |      |
a     mom    is a   parent [.]

A dad is a parent.

              +-C01C03-+
+C01C03+C03C01+ +C01C03+
|      |      | |      |
a     dad    is a   parent [.]

A son is a child.

              +-C01C03-+
+C01C03+C03C01+ +C01C03+
|      |      | |      |
a     son    is a    child [.]

A daughter is a child.

              +-C01C03-+
+C01C03+C03C01+ +C01C03+
|      |      | |      |
a  daughter  is a    child [.]

A son is a human.

              +-C01C03-+
+C01C03+C03C01+ +C01C03+
|      |      | |      |
a     son    is a    human [.]

A daughter is a human.

              +-C01

In [19]:
%%capture
# Repeat the run once more with the current kwargs and dataset; this cell changes no setting.
# NOTE(review): results differ between repeated runs in the saved outputs,
# presumably due to unseeded kmeans clustering -- confirm.
r44 = run_learn_grammar(corpus, dataset, module_path, out_dir, **kwargs)

In [20]:
# Headline metrics of run r44, followed by the content of its parse file.
quality_summary = ''.join([str(r44['parse_ability']), '% / ',
                           str(r44['parse_quality']), '%; '])
print('Parse-ability / parse-quality = ', quality_summary,
      r44['grammar_rules'], 'Link Grammar rules\n')
with open(r44['lg_parse_file'], 'r') as parse_file:
    lines = parse_file.read().splitlines()
# Print the file without its first three and last two lines.
for line in lines[3:-2]:
    print(line)

Parse-ability / parse-quality =  100% / 99%;  4 Link Grammar rules

A mom is a human.

              +-C01C03-+
+C01C04+C04C01+ +C01C03+
|      |      | |      |
a     mom    is a    human [.]

A dad is a human.

              +-C01C03-+
+C01C04+C04C01+ +C01C03+
|      |      | |      |
a     dad    is a    human [.]

A mom is a parent.

              +-C01C03-+
+C01C04+C04C01+ +C01C03+
|      |      | |      |
a     mom    is a   parent [.]

A dad is a parent.

              +-C01C03-+
+C01C04+C04C01+ +C01C03+
|      |      | |      |
a     dad    is a   parent [.]

A son is a child.

              +-C01C03-+
+C01C04+C04C01+ +C01C03+
|      |      | |      |
a     son    is a    child [.]

A daughter is a child.

              +-C01C03-+
+C01C04+C04C01+ +C01C03+
|      |      | |      |
a  daughter  is a    child [.]

A son is a human.

              +-C01C03-+
+C01C04+C04C01+ +C01C03+
|      |      | |      |
a     son    is a    human [.]

A daughter is a human.

              +-C01