In [1]:
import configparser
from bilbo.importer import Importer
from bilbo.components.shape_data.shape_data import ShapeSection
from bilbo.components.features.features import FeatureHandler
from bilbo.components.crf.crf import Crf
from bilbo.bilbo import Bilbo

## Bilbo in a shell

### Construct Data Structure
First, import your XML document. You can import either a string or a file. For every subsequent action (machine-learning prediction, feature extraction, setting new XML properties), you will work with this document object.


In [2]:
# Earlier sample input, left commented out.
#xml_str = '<xml>Oustide<bibl><pubPlace>Marseille</pubPlace>, <sponsor>OpenEdition is "! inside </sponsor>>a bibl</bibl></xml>'

# TEI sample: a single <bibl> reference, with some text outside of it.
xml_str = """<TEI xmlns="http://www.tei-c.org/ns/1.0"> Outside 
<bibl>Hillier B., 1996, <hi>Space is the Machine</hi>, Cambridge University Press, <pubPlace>Cambridge.</pubPlace>
</bibl></TEI>"""

# The input is an in-memory string, hence is_file=False.
# 'bibl' presumably names the tag whose content is extracted -- TODO confirm.
imp = Importer(xml_str)
doc = imp.parse_xml('bibl', is_file=False)

### Tokenize, extract and wrap XML information

First, load parameters. 

In [3]:
dic = """                                                      
[shaper]                        
tokenizerOption = fine          
tagsOptions = {                                                                                 
    "pubPlace": "place",
    "sponsor": "publisher",
    } 
verbose = True
"""
# Parse the INI-formatted string above into a ConfigParser object.
# Parameters can be provided in other ways as well (ini file, ...); see
# https://docs.python.org/3/library/configparser.html#quick-start
config = configparser.ConfigParser(allow_no_value=True)
config.read_string(dic)

Use the `ShapeSection` class.
Note that at any moment you can call `help()` to see the parameters of a function:

In [4]:
# Print the signature and docstring of the ShapeSection constructor.
help(ShapeSection.__init__)

Help on function __init__ in module bilbo.components.shape_data.shape_data:

__init__(self, cfg_file, type_config='ini', lang='fr')
    Initialize self.  See help(type(self)) for accurate signature.



In [5]:
# Build the shaper from the ConfigParser object loaded earlier.
# type_config='Dict' is passed explicitly; the default shown by help() is 'ini'.
sh = ShapeSection(config, type_config='Dict')
# transform() returns a Document, which is displayed as the cell output.
# NOTE(review): `doc` appears to be updated in place (later cells read token labels from it) -- confirm.
sh.transform(doc)

<bilbo.storage.document.Document at 0x7fc3740d7390>

To see an overview of your document:

In [6]:
# Print one line per token: its string value followed by its `label` attribute.
row_template = 'Token:{0}\t\t Label:{1}'
for sec in doc.sections:
    for tok in sec.tokens:
        print(row_template.format(tok.str_value, tok.label))

Token:Hillier		 Label:bibl
Token:B.		 Label:bibl
Token:,		 Label:c
Token:1996		 Label:bibl
Token:,		 Label:c
Token:Space		 Label:hi
Token:is		 Label:hi
Token:the		 Label:hi
Token:Machine		 Label:hi
Token:,		 Label:c
Token:Cambridge		 Label:bibl
Token:University		 Label:bibl
Token:Press		 Label:bibl
Token:,		 Label:c
Token:Cambridge		 Label:place
Token:.		 Label:c


### Features

Set the features that you need. For external features, you need to give the **Right Path** to the external lists...

In [7]:
dic = """                                                      
[features]
listFeatures = numbersMixed, cap, dash, biblPosition, initial
listFeaturesRegex = ('UNIVERSITY', '^Uni.*ty$')
listFeaturesExternes = ('surname', 'surname_list.txt', 'simple'),
listFeaturesXML = italic
output = output.txt 
verbose = False 
"""
# Parse the [features] section above into a fresh ConfigParser object.
# Note: this rebinds `config`, replacing the parser object created earlier.
config = configparser.ConfigParser(allow_no_value=True)
config.read_string(dic)

For convenience, the features are printed in CRF++ format.

In [8]:
# Build the feature handler from the ConfigParser object currently bound to `config`.
feat = FeatureHandler(config, type_config='Dict')
# Presumably loads the feature functions named in the configuration -- TODO confirm.
feat.loadFonctionsFeatures()
# transform() returns a document, which is rebound to `doc`.
doc = feat.transform(doc)
# Print the features of each token (one token per line in the output below).
feat.print_features(doc)

Hillier NONUMBERS FIRSTCAP NODASH BIBL_START NOINITIAL NOUNIVERSITY SURNAME NOITALIC bibl

B. NONUMBERS ALLCAP NODASH BIBL_START INITIAL NOUNIVERSITY NOSURNAME NOITALIC bibl

, NONUMBERS NONIMPCAP NODASH BIBL_START NOINITIAL NOUNIVERSITY NOSURNAME NOITALIC c

1996 NUMBERS NONIMPCAP NODASH BIBL_START NOINITIAL NOUNIVERSITY NOSURNAME NOITALIC bibl

, NONUMBERS NONIMPCAP NODASH BIBL_START NOINITIAL NOUNIVERSITY NOSURNAME NOITALIC c

Space NONUMBERS FIRSTCAP NODASH BIBL_START NOINITIAL NOUNIVERSITY NOSURNAME ITALIC hi

is NONUMBERS ALLSMALL NODASH BIBL_IN NOINITIAL NOUNIVERSITY NOSURNAME ITALIC hi

the NONUMBERS ALLSMALL NODASH BIBL_IN NOINITIAL NOUNIVERSITY NOSURNAME ITALIC hi

Machine NONUMBERS FIRSTCAP NODASH BIBL_IN NOINITIAL NOUNIVERSITY NOSURNAME ITALIC hi

, NONUMBERS NONIMPCAP NODASH BIBL_IN NOINITIAL NOUNIVERSITY NOSURNAME NOITALIC c

Cambridge NONUMBERS FIRSTCAP NODASH BIBL_IN NOINITIAL NOUNIVERSITY NOSURNAME NOITALIC bibl

University NONUMBERS FIRSTCAP NODASH BIBL_END NOINITIAL 

### Make predictions 
First, build a Document storage object that makes sense (unlike the one above, which was only for demonstration). Remember to load the parameters with the **Right Path** to the configuration file ('pipeline_bibl.cfg').

In [9]:
# Quick recap of the tokenizer and feature steps explained above.
# They are run again here, this time with the appropriate parameters
# (the path to pipeline_bibl.cfg).
imp = Importer(xml_str)
doc = imp.parse_xml('bibl', is_file=False)

bbo = Bilbo(doc, 'pipeline_bibl.cfg')
bbo.shape_data(doc)
# Last expression of the cell: its return value is displayed as the output.
bbo.features(doc)

<bilbo.storage.document.Document at 0x7fc3740ac828>

We now have a Document storage object which contains all the needed information.

In [10]:
# Run the CRF tagger on the prepared document.
tagger = Crf(bbo.config, type_config='Dict')
labels = tagger.predict(doc)

# `labels` is iterated as a sequence of sequences; the first two fields of
# every inner item are printed (token and predicted label, per the output).
for sequence in labels:
    for item in sequence:
        print(item[0], item[1])

Hillier surname
B. forename
, c
1996 date
, c
Space title
is title
the title
Machine title
, c
Cambridge publisher
University publisher
Press publisher
, c
Cambridge pubPlace
. c


### Add predictions to the data structure
Always use the `transform()` function to add predictions to the Document storage object. Note that for an estimator component, three options are available: 'tag', 'train', 'evaluate'.

In [11]:
# Run the estimator in 'tag' mode on the document.
# NOTE(review): this presumably sets `predict_label` on each token, which is read afterwards -- confirm.
tagger.transform(doc, 'tag')

In [12]:
# Print one line per token: its string value followed by its `predict_label` attribute.
row_template = 'Token:{0}\t\t Label:{1}'
for sec in doc.sections:
    for tok in sec.tokens:
        print(row_template.format(tok.str_value, tok.predict_label))

Token:Hillier		 Label:surname
Token:B.		 Label:forename
Token:,		 Label:c
Token:1996		 Label:date
Token:,		 Label:c
Token:Space		 Label:title
Token:is		 Label:title
Token:the		 Label:title
Token:Machine		 Label:title
Token:,		 Label:c
Token:Cambridge		 Label:publisher
Token:University		 Label:publisher
Token:Press		 Label:publisher
Token:,		 Label:c
Token:Cambridge		 Label:pubPlace
Token:.		 Label:c


## Annotator bilbo usage

### For bibliography

In [13]:
# Parse the bibliography test corpus from a file.
imp = Importer('resources/corpus/bibl/test_bibl.xml')
doc = imp.parse_xml('bibl')

# Run the pipeline in 'tag' mode, passing '/tmp/output.xml' as the output argument.
bilbo = Bilbo(doc, 'pipeline_bibl.cfg')
bilbo.run_pipeline('tag', 'bibl', '/tmp/output.xml', format_=None)

### For note

In [None]:
# Parse the note test corpus from a file.
imp = Importer('resources/corpus/note/test_note.xml')
doc = imp.parse_xml('note')

# Run the pipeline in 'tag' mode for notes.
# Note: '/tmp/output.xml' is the same output path used by the bibliography example.
bilbo = Bilbo(doc, 'pipeline_note.cfg')
bilbo.run_pipeline('tag', 'note', '/tmp/output.xml', format_=None)

### Train
Just change the 'tag' parameter to 'train'! Note: the output may be binary files for the constructed models (they must be specified in pipeline_bibl.cfg, not as parameters of the run_pipeline() function).

### Evaluation (end to end)
To evaluate the models, just run bilbo on your annotated test data, as follows:

In [14]:
# Parse the annotated test data from a file.
imp = Importer('resources/corpus/bibl/data_test.xml')
doc = imp.parse_xml('bibl')
bilbo = Bilbo(doc, 'pipeline_bibl.cfg')
# Run the pipeline in 'evaluate' mode; the last two arguments are None
# (in the tagging examples the third argument is an output path and `format_` is None).
bilbo.run_pipeline('evaluate', 'bibl', None, None)

-----------------------------------------------------------
         label  precision     rappel  f-measure  occurences
-----------------------------------------------------------
          abbr      0.874      0.765      0.816        452
     biblScope      0.887      0.571      0.695        594
     booktitle      0.903      0.629      0.742         89
          date      0.716      0.915      0.803        614
       edition      0.690      0.460      0.552        126
          emph      1.000      1.000      1.000          2
        extent      1.000      0.979      0.989         48
      forename      0.929      0.956      0.942        942
       genName      1.000      1.000      1.000          1
       journal      0.823      0.732      0.774        514
      nameLink      0.282      1.000      0.440         11
       orgName      0.902      0.836      0.868        110
         place      0.824      0.933      0.875         15
      pubPlace      0.962      0.934      0.948      

### Evaluation by component
You can evaluate each component separately. In this case, bilbo is used as a toolkit. Load your annotated data: the format of the annotated data depends on the component used, so you always have to generate this data first.
Then just launch the evaluation (for the SVM, for instance):

In [None]:
# Illustrative call only: `svm` and `input_svm_data_format` are not defined in the
# cells shown in this notebook and must be created beforehand.
svm.evaluate(input_svm_data_format)