# 选取化学合成描述片段，使用自然语言理解抽取化学反应合成条件

In [8]:
#调整段落中用量紧接在原料后，括号分隔
text = "In the synthesis of ferrous chloride, slowly, was added iron (12 g) powder to a solution containing HCl (37%, 100 mL, 1.2 mol) until the released hydrogen gas was finished. Then green-dark crystals of iron(II) chloride were prepared after evaporation of the water solution at room temperature. Then the green-dark product was filtered and dried in desiccator. In the preparation of iron aluminide Fe3Al (0.60 g, 3 mmol) dark green crystals of iron(II) chloride were transferred to the mortar. Then, by adding aluminum nanoparticles (0.08 g, 3 mmol) and ground the mixture for 10 min after the reaction was complete, ferromagnetic product absorbed intensively by magnet. The product was transferred to an ethanol solution (100ml), magnetic nanoparticles were collected by a magnetic field and then by-products were decanted and product was dried in a desiccator. Iron aluminide Fe3Al are formed according to the reaction equation."
text

'In the synthesis of ferrous chloride, slowly, was added iron (12 g) powder to a solution containing HCl (37%, 100 mL, 1.2 mol) until the released hydrogen gas was finished. Then green-dark crystals of iron(II) chloride were prepared after evaporation of the water solution at room temperature. Then the green-dark product was filtered and dried in desiccator. In the preparation of iron aluminide Fe3Al (0.60 g, 3 mmol) dark green crystals of iron(II) chloride were transferred to the mortar. Then, by adding aluminum nanoparticles (0.08 g, 3 mmol) and ground the mixture for 10 min after the reaction was complete, ferromagnetic product absorbed intensively by magnet. The product was transferred to an ethanol solution (100ml), magnetic nanoparticles were collected by a magnetic field and then by-products were decanted and product was dried in a desiccator. Iron aluminide Fe3Al are formed according to the reaction equation.'

## 0. 首先导入依赖

In [9]:
from chemdataextractor.doc import Paragraph

## 1. 分句与分词

### 1.1 创建Paragraph

In [10]:
paragraph = Paragraph(text)
paragraph.__repr__

<bound method BaseText.__repr__ of Paragraph(id=None, references=[], text='In the synthesis of ferrous chloride, slowly, was added iron (12 g) powder to a solution containing HCl (37%, 100 mL, 1.2 mol) until the released hydrogen gas was finished. Then green-dark crystals of iron(II) chloride were prepared after evaporation of the water solution at room temperature. Then the green-dark product was filtered and dried in desiccator. In the preparation of iron aluminide Fe3Al (0.60 g, 3 mmol) dark green crystals of iron(II) chloride were transferred to the mortar. Then, by adding aluminum nanoparticles (0.08 g, 3 mmol) and ground the mixture for 10 min after the reaction was complete, ferromagnetic product absorbed intensively by magnet. The product was transferred to an ethanol solution (100ml), magnetic nanoparticles were collected by a magnetic field and then by-products were decanted and product was dried in a desiccator. Iron aluminide Fe3Al are formed according to the reaction equat

### 1.2 分句

In [11]:
paragraph.sentences

[Sentence('In the synthesis of ferrous chloride, slowly, was added iron (12 g) powder to a solution containing HCl (37%, 100 mL, 1.2 mol) until the released hydrogen gas was finished.', 0, 172),
 Sentence('Then green-dark crystals of iron(II) chloride were prepared after evaporation of the water solution at room temperature.', 173, 293),
 Sentence('Then the green-dark product was filtered and dried in desiccator.', 294, 359),
 Sentence('In the preparation of iron aluminide Fe3Al (0.60 g, 3 mmol) dark green crystals of iron(II) chloride were transferred to the mortar.', 360, 492),
 Sentence('Then, by adding aluminum nanoparticles (0.08 g, 3 mmol) and ground the mixture for 10 min after the reaction was complete, ferromagnetic product absorbed intensively by magnet.', 493, 669),
 Sentence('The product was transferred to an ethanol solution (100ml), magnetic nanoparticles were collected by a magnetic field and then by-products were decanted and product was dried in a desiccator.', 670, 86

## 3. 命名实体识别（NER）

### 3.1 化学命名实体识别（CNER）

In [13]:
# 将CNER（化学命名实体识别）与POS（词性标注）的结果合并
def get_tagged_pos_chem_tokens(sentence):
    _tagged_pos_chem_tokens = sentence.pos_tagged_tokens.copy()
    for i, chem_tag in enumerate(sentence.ner_tags):
        if chem_tag is not None:
            _tagged_pos_chem_tokens[i] = (_tagged_pos_chem_tokens[i][0], chem_tag)
    return _tagged_pos_chem_tokens

tagged_pos_chem_tokens = get_tagged_pos_chem_tokens(paragraph.sentences[1])
tagged_pos_chem_tokens

[('Then', 'RB'),
 ('green', 'JJ'),
 ('-', 'HYPH'),
 ('dark', 'JJ'),
 ('crystals', 'NNS'),
 ('of', 'IN'),
 ('iron(II)', 'B-CM'),
 ('chloride', 'I-CM'),
 ('were', 'VBD'),
 ('prepared', 'VBN'),
 ('after', 'IN'),
 ('evaporation', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('water', 'NN'),
 ('solution', 'NN'),
 ('at', 'IN'),
 ('room', 'NN'),
 ('temperature', 'NN'),
 ('.', '.')]

### 3.2 识别化合物性质的单位 以及 合成操作

In [14]:
regex = {
    "UNIT-TIME": "^period[s]?$|^min(utes|s|\.)?$|^hour[s]?|^month[s]?|^week(s|end)?$|^sec(ond)?s$|^d$|^(?-i:h)$|^day[s]?$|^hr[s]?$|^time$",
    "UNIT-MASS": "^([mkµu]?(g|gram[m]?[e]?)|(pico|nano|micro|milli|centi|deci|kilo|mega)gram[m]?[e]?)[s]?$",
    "UNIT-MOLAR": "^([mnµμu]?(?-i:M)|(?-i:N)|(pico|nano|micro|milli|[mnµμu])?molar)$",
    "UNIT-AMOUNT": "^([mnµμu]|pico|nano|micro|milli)?mol[e]?[s]?$",
    "UNIT-VOL": "^([mkµu]?l|[cd]?m[\^]?3|(µ|pico|nano|micro|milli|centi|deci|kilo|mega)?lit(re|er|\.)[s]?)$",
    "TIME": "^(fort|over)?night(s|falls|long)?$",
    "VB-ADD": "^(pre|re)?(add|mix|pour|dispers|introduc)(e|s|ed|ing)?$",
    "VB-CHARGE": "^charge(d)?$",
    "VB-STIR": "(pre|re)?(stir)(red|ring)?",
    "VB-YIELD": "^((afford|furnish|obtain|result|yield)(s|es|ed|ing)?|(isolat|leav|provid)(e|s|es|ed|ing)?|(giv|produc)(e|es|ed|ing)|form(ed|ing)|gave|get)$",
    "B-CM": "^mixture$|^suspension$|^solution$"
}

#### 3.3.2 标记 性质单位 以及 合成操作

In [15]:
import re
def get_tagged_pos_chem_unit_operate_tokens(sentence):
    _tagged_pos_chem_tokens = get_tagged_pos_chem_tokens(sentence)
    _tagged_pos_chem_unit_operate_tokens = _tagged_pos_chem_tokens.copy()
    for i, (word, tag) in enumerate(_tagged_pos_chem_tokens):
        for regex_tag, pattern in regex.items():
            if re.match(pattern, word):
                _tagged_pos_chem_unit_operate_tokens[i] = (word, regex_tag)
                break
    return _tagged_pos_chem_unit_operate_tokens

tagged_pos_chem_unit_operate_tokens = get_tagged_pos_chem_unit_operate_tokens(paragraph.sentences[1])
tagged_pos_chem_unit_operate_tokens

[('Then', 'RB'),
 ('green', 'JJ'),
 ('-', 'HYPH'),
 ('dark', 'JJ'),
 ('crystals', 'NNS'),
 ('of', 'IN'),
 ('iron(II)', 'B-CM'),
 ('chloride', 'I-CM'),
 ('were', 'VBD'),
 ('prepared', 'VBN'),
 ('after', 'IN'),
 ('evaporation', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('water', 'NN'),
 ('solution', 'B-CM'),
 ('at', 'IN'),
 ('room', 'NN'),
 ('temperature', 'NN'),
 ('.', '.')]

## 4. 语法分析 

### 4.1 语法树

In [16]:
import nltk
grammar = "CM: {<B-CM><I-CM>*}\n"\
          "NN-UNIT: {<UNIT-TIME|UNIT-MASS|UNIT-MOLAR|UNIT-AMOUNT|UNIT-VOL>}\n"\
          "PROP: {<CD><NN-UNIT>}\n"\
          "PROP: {<TIME>}\n"\
          "PROPS: {<-LRB-><PROP>(<,><PROP>)*<-RRB->}\n"\
          "CHEM: {<CM><PROP|PROPS>?}\n"
# grammar = "CM: {<B-CM><I-CM>*}\n"\
#           "NN-UNIT: {<UNIT-TIME|UNIT-MASS|UNIT-MOLAR|UNIT-AMOUNT|UNIT-VOL>}\n"\
#           "PROP: {<CD><NN-UNIT>}\n"\
#           "PROP: {<TIME>}\n"\
#           "PROPS: {<-LRB-><PROP>(<,><PROP>)*<-RRB->}\n"\
#           "CHEM: {<CM><PROP|PROPS>?}\n"\
#           "OPERATE: {<VB-ADD|VB-CHARGE|VB-STIR><PROP|PROPS>?}\n"\
#           "NounPhrase: {<DT>?<JJ|RB>*(<NN|NNS|CHEM>)+}\n"\
#           "PrepPhrase: {<IN><NounPhrase>}\n"\
#           "VerbPhrase: {(<NounPhrase|PrepPhrase>?<VBD>?<RB>?<OPERATE>)|(<RB>?<OPERATE><NounPhrase|PrepPhrase>)}\n"\
#           "VerbPhrase: {<IN><RB>?<OPERATE>}\n"\
#           "VerbPhrase: {<VerbPhrase><PrepPhrase><IN><PROP>}"
def grammar_parse(sentence):
    _tagged_pos_chem_unit_operate_tokens = get_tagged_pos_chem_unit_operate_tokens(sentence)
    cp = nltk.RegexpParser(grammar)
    return cp.parse(_tagged_pos_chem_unit_operate_tokens)
    
grammar_tree2 = grammar_parse(paragraph.sentences[1])
grammar_tree2.draw()

# 此代码块运行后，会弹出一个画着语法树的窗口，但是偶尔窗口不会自动弹出，请检查电脑的任务栏
# 关掉弹出的窗口后，notebook才会继续运行

In [17]:
for sentence in paragraph.sentences:
    grammar_tree = grammar_parse(sentence)
    grammar_tree.draw()
# 此代码块运行后，会弹出一个画着语法树的窗口，但是偶尔窗口不会自动弹出，请检查电脑的任务栏
# 关掉弹出的窗口后，notebook才会继续运行

### 4.2 解析语法树，获得合成条件

**1. 首先，定义一个函数(方法)，寻找一个node的具有特定label的descend_nodes**

In [18]:
from typing import Tuple

from nltk.tree import Tree

def get_descend_nodes_with_label(node, label):
    if node.label() == label:
        yield node
    for child_node in node:
        if isinstance(child_node, Tuple):
            if child_node[-1] == label:
                yield child_node
        if isinstance(child_node, Tree):
            if child_node.label() == label:
                yield child_node
            else:
                for res in get_descend_nodes_with_label(child_node, label):
                    yield res

下面尝试一下获得句子中的所有CHEN node

In [19]:
# 尝试一下，可以改一下label，对照下图，看下结果是否正确，比如改成 'PROP'
for chem_node in get_descend_nodes_with_label(node=grammar_tree2, label='PROP'):
    chem_node.draw()
# 此代码块运行后，会弹出一个画着语法树的窗口，但是偶尔窗口不会自动弹出，请检查电脑的任务栏
# 关掉弹出的窗口后，notebook才会继续运行

**2. 然后，定义一个函数(方法)，将CM node转化为一个化合物的名字**

In [20]:
def get_mol_name(cm_node):
    name_fragments = []
    for child_node in cm_node:
        name_fragments.append(child_node[0])
    return ' '.join(name_fragments)

下面尝试一下获得句子中的所有化学物的名字

In [21]:
# 尝试一下，对照下图，结果是否正确
for cm_node in get_descend_nodes_with_label(node=grammar_tree2, label='CM'):
    print("名字："+get_mol_name(cm_node))
    cm_node.draw()
# 此代码块运行后，会弹出一个画着语法树的窗口，但是偶尔窗口不会自动弹出，请检查电脑的任务栏
# 关掉弹出的窗口后，notebook才会继续运行

名字：iron(II) chloride
名字：solution


**3. 定义一个函数(方法)，将prop node转化为属性的值、单位以及类型**

In [22]:
def get_prop_value_unit_type(prop_node):
    child_nodes = list(prop_node)
    if len(child_nodes) == 1:
        prop_value = child_nodes[0][0]
        prop_unit = None
        prop_type = child_nodes[0][1]
    elif len(child_nodes) == 2:
        prop_value = child_nodes[0][0]
        prop_unit = child_nodes[1][0][0]
        prop_type = child_nodes[1][0][1].split('-')[-1]
    else:
        raise ValueError(f"prop node应该只有1个或2个child_nodes, 但却遇到了{len(child_nodes)}个")
    return prop_value, prop_unit, prop_type

下面尝试一下获得所有的属性

In [23]:
# 尝试一下，对照下图，结果是否正确
for prop_node in get_descend_nodes_with_label(node=grammar_tree2, label='PROP'):
    print(get_prop_value_unit_type(prop_node))

**4. 最后，运用刚刚定义的函数(方法)，将段落中的所有合成条件提取出来**

In [24]:
import pandas as pd

result_df = pd.DataFrame(columns=['molecule', 'TIME value', 'TIME unit', 'MASS value', 'MASS unit', 'MOLAR value', 'MOLAR unit', 'AMOUNT value', 'AMOUNT unit', 'VOL value', 'VOL unit'])
for sentence in paragraph.sentences:
    grammar_tree = grammar_parse(sentence)
    for chem_node in get_descend_nodes_with_label(node=grammar_tree, label='CHEM'):
        cm_node = list(get_descend_nodes_with_label(node=chem_node, label='CM'))[0]
        mol_name = get_mol_name(cm_node)
        mol_data = {'molecule': mol_name}
        for prop_node in get_descend_nodes_with_label(node=chem_node, label='PROP'):
            prop_value, prop_unit, prop_type = get_prop_value_unit_type(prop_node)
            mol_data[prop_type+' value']= prop_value
            mol_data[prop_type+' unit']= prop_unit
        result_df = result_df.append(mol_data, ignore_index=True)
result_df

Unnamed: 0,molecule,TIME value,TIME unit,MASS value,MASS unit,MOLAR value,MOLAR unit,AMOUNT value,AMOUNT unit,VOL value,VOL unit
0,ferrous chloride,,,,,,,,,,
1,iron,,,12.0,g,,,,,,
2,solution,,,,,,,,,,
3,HCl,,,,,,,,,,
4,hydrogen,,,,,,,,,,
5,iron(II) chloride,,,,,,,,,,
6,solution,,,,,,,,,,
7,iron aluminide Fe3Al,,,0.6,g,,,3.0,mmol,,
8,iron(II) chloride,,,,,,,,,,
9,aluminum,,,,,,,,,,


## 5. 检查并修改

In [25]:
result_df

Unnamed: 0,molecule,TIME value,TIME unit,MASS value,MASS unit,MOLAR value,MOLAR unit,AMOUNT value,AMOUNT unit,VOL value,VOL unit
0,ferrous chloride,,,,,,,,,,
1,iron,,,12.0,g,,,,,,
2,solution,,,,,,,,,,
3,HCl,,,,,,,,,,
4,hydrogen,,,,,,,,,,
5,iron(II) chloride,,,,,,,,,,
6,solution,,,,,,,,,,
7,iron aluminide Fe3Al,,,0.6,g,,,3.0,mmol,,
8,iron(II) chloride,,,,,,,,,,
9,aluminum,,,,,,,,,,


#### 修改

修改方式就是在regex的最后一项中添加了DCM;体积单位中增加了大写L以及质量体积分数%

In [26]:
regex = {
    "UNIT-TIME": "^period[s]?$|^min(utes|s|\.)?$|^hour[s]?|^month[s]?|^week(s|end)?$|^sec(ond)?s$|^d$|^(?-i:h)$|^day[s]?$|^hr[s]?$|^time$",
    "UNIT-MASS": "^([mkµu]?(g|gram[m]?[e]?)|(pico|nano|micro|milli|centi|deci|kilo|mega)gram[m]?[e]?)[s]?$",
    "UNIT-MOLAR": "^([mnµμu]?(?-i:M)|(?-i:N)|(pico|nano|micro|milli|[mnµμu])?molar)$",
    "UNIT-AMOUNT": "^([mnµμu]|pico|nano|micro|milli)?mol[e]?[s]?$",
    "UNIT-VOL": "^([mkµu]?(l|L)|[cd]?m[\^]?3|(µ|pico|nano|micro|milli|centi|deci|kilo|mega)?lit(re|er|\.)[s]?)$|\%", #增加l的大小写，%
    "TIME": "^(fort|over)?night(s|falls|long)?$",
    "VB-ADD": "^(pre|re)?(add|mix|pour|dispers|introduc)(e|s|ed|ing)?$",
    "VB-CHARGE": "^charge(d)?$",
    "VB-STIR": "(pre|re)?(stir)(red|ring)?",
    "VB-YIELD": "^((afford|furnish|obtain|result|yield)(s|es|ed|ing)?|(isolat|leav|provid)(e|s|es|ed|ing)?|(giv|produc)(e|es|ed|ing)|form(ed|ing)|gave|get)$",
    "B-CM": "^mixture$|^suspension$|^solution$|nanoparticles$"  # 我们在这里添加了nanoparticles
}

### 重新运行程序抽取

In [28]:
result_df = pd.DataFrame(columns=['molecule', 'TIME value', 'TIME unit', 'MASS value', 'MASS unit', 'MOLAR value', 'MOLAR unit', 'AMOUNT value', 'AMOUNT unit', 'VOL value', 'VOL unit'])
for sentence in paragraph.sentences:
    grammar_tree = grammar_parse(sentence)
    for chem_node in get_descend_nodes_with_label(node=grammar_tree, label='CHEM'):
        cm_node = list(get_descend_nodes_with_label(node=chem_node, label='CM'))[0]
        mol_name = get_mol_name(cm_node)
        mol_data = {'molecule': mol_name}
        for prop_node in get_descend_nodes_with_label(node=chem_node, label='PROP'):
            prop_value, prop_unit, prop_type = get_prop_value_unit_type(prop_node)
            mol_data[prop_type+' value']= prop_value
            mol_data[prop_type+' unit']= prop_unit
        result_df = result_df.append(mol_data, ignore_index=True)
result_df

Unnamed: 0,molecule,TIME value,TIME unit,MASS value,MASS unit,MOLAR value,MOLAR unit,AMOUNT value,AMOUNT unit,VOL value,VOL unit
0,ferrous chloride,,,,,,,,,,
1,iron,,,12.0,g,,,,,,
2,solution,,,,,,,,,,
3,HCl,,,,,,,1.2,mol,100.0,mL
4,hydrogen,,,,,,,,,,
5,iron(II) chloride,,,,,,,,,,
6,solution,,,,,,,,,,
7,iron aluminide Fe3Al,,,0.6,g,,,3.0,mmol,,
8,iron(II) chloride,,,,,,,,,,
9,aluminum,,,,,,,,,,
