In [8]:
# import chemdataextractor as cde

from chemdataextractor.relex import Snowball, ChemicalRelationship
from chemdataextractor.model import BaseModel, StringType, ListType, ModelType, Compound
from chemdataextractor.parse import R, I, W, Optional, merge, join, OneOrMore, Any, ZeroOrMore, Start
from chemdataextractor.parse.cem import chemical_name, chemical_label
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.parse.common import lrb, rrb, delim
from chemdataextractor.utils import first
from chemdataextractor.doc import Paragraph, Heading, Sentence
from lxml import etree
import re


# Abbreviations 

#### Resources:
* https://en.wikiversity.org/wiki/Python_Concepts/Regular_Expressions#\d_and_\D
* http://dev.lexalytics.com/wiki/pmwiki.php?n=Main.POSTags

W = Word (case sensitive)

I = IWord (case insensitive)

R = Regex (regular expression)

T = Tag (match tag exactly)

H = Hide

lrb = left parenthesis

rrb = right parenthesis

^ = beginning of the string

$ = end of the string

| = or

\* = zero or more of the preceding element

\+ = one or more of the preceding element

\? = zero or one of the preceding element

\[ \] = any one of the characters listed within the brackets

\{ \} = repeat the preceding element a number of times within the listed range

\( \) = parentheses define a group

\d = any numeric character

\D = any non-numeric character

In [9]:
class CurieTemperature(BaseModel):
    """Model for a single Curie temperature record.

    Holds three string fields. Their names match the result tags
    ('specifier', 'value', 'units') given to the parse elements defined
    later in this notebook.
    """

    # The wording that names the property in the text
    specifier = StringType()
    # The numeric part, kept as a string rather than converted to a number
    value = StringType()
    # The unit token that accompanies the value
    units = StringType()


# Attach a list-of-CurieTemperature field to the existing Compound model
Compound.curie_temperatures = ListType(ModelType(CurieTemperature))

In [10]:
# Define a very basic entity tagger.
# Every pattern is a raw string so that regex escapes such as \d and \. are
# passed to the regex engine untouched instead of being treated as (invalid)
# Python string escapes.

# Specifier: either the words "curie temperature", optionally followed by a
# bracket/delimiter, a T_C-style symbol and a closing bracket, or the symbol alone.
# NOTE(review): the symbol pattern has no trailing `$`, so presumably any token
# that merely starts with "TC"/"Tc" qualifies — confirm that is intended.
specifier = (
    I('curie') + I('temperature') + Optional(lrb | delim) + Optional(R(r'^T(C|c)(urie)?')) + Optional(rrb)
    | R(r'^T(C|c)(urie)?')
)('specifier').add_action(join)

# Units: a single C, F or K, optionally followed by a full stop.
units = (R(r'^[CFK]\.?$'))('units').add_action(merge)

# Value: an integer, optionally followed by a decimal separator (either '.'
# or ',') and more digits. The separator is a character class; writing it as
# `\.\,` would demand the literal two-character sequence ".," and reject
# ordinary decimals such as 300.5.
value = (R(r'^\d+([.,]\d+)?$'))('value')

In [11]:

# An entity is a chemical name, a specifier, or a value directly followed by its units
entities = chemical_name | specifier | (value + units)

# Deliberately generic phrase: one entity, then one or more further entities
# or arbitrary tokens, with the whole match tagged 'curie_temperature'
curie_temperature_phrase = (
    entities + OneOrMore(entities | Any())
)('curie_temperature')

# The individual entity parsers, collected in one list
curie_temp_entities = [
    chemical_name,
    specifier,
    value,
    units,
]
