### Getting started
#### Week 24/01/2022

In [1]:
from batterydataextractor.doc.document import Document, Paragraph



In [53]:
# Minimal one-sentence document; its single string shows up as one Paragraph
# element in the `doc1.elements` output further down.
doc1 = Document('graphite is 372 mAhg-1.')
# Document built from two positional strings; inspected later via `doc.cems`.
doc = Document('UV-vis spectrum of 5,10,15,20-Tetra(4-carboxyphenyl)porphyrin in Tetrahydrofuran (THF).',
              'The mechanism of lithium intercalation in the so-called ‘soft’ anodes, i.e. graphite or graphitable carbons, is well known: it develops through well-identified, reversible stages, corresponding to progressive intercalation within discrete graphene layers, to reach the formation of LiC6 with a maximum theoretical capacity of 372 ± 2.4 mAh g−1.')

In [54]:
# Display the elements that Document created from the input string.
doc1.elements

[Paragraph(id=None, references=[], text='graphite is 372 mAhg-1.')]

In [55]:
# Keep the first element of doc1 (the saved output above lists exactly one
# Paragraph) for the sentence/token inspection in the following cells.
para = doc1.elements[0]
para

In [56]:
# Sentences found in the paragraph.
para.sentences

[Sentence('graphite is 372 mAhg-1.', 0, 23)]

In [57]:
# Tokens of the paragraph; the saved output is a list of lists (one inner list
# for the single sentence).
para.tokens

[[Token('graphite', 0, 8),
  Token('is', 9, 11),
  Token('372', 12, 15),
  Token('mAhg-1', 16, 22),
  Token('.', 22, 23)]]

In [7]:
# Spans that `doc` reports as cems (presumably chemical entity mentions).
# NOTE(review): in the saved output the offsets are relative to each input
# string, not to the whole document (e.g. 'lithium' at 17-24 is in the second
# string), and the spans are not listed in positional order.
doc.cems

[Span('5,10,15,20-Tetra(4-carboxyphenyl)porphyrin', 19, 61),
 Span('graphite', 76, 84),
 Span('lithium', 17, 24),
 Span('graphene', 239, 247),
 Span('Tetrahydrofuran', 65, 80),
 Span('carbons', 100, 107),
 Span('LiC6', 282, 286)]

In [8]:
# Sentence containing two abbreviation definitions: "... (DSSCs)" and "ZnTPP = ...".
p = Paragraph(u'Dye-sensitized solar cells (DSSCs) with ZnTPP = Zinc tetraphenylporphyrin.')
# TODO: fix abbreviation detection. The saved output below is a single tuple of
# (all tokens of the sentence, [], None) rather than separate entries for
# DSSCs and ZnTPP.
p.abbreviation_definitions

[(['Dye',
   '-',
   'sensitized',
   'solar',
   'cells',
   '(',
   'DSSCs',
   ')',
   'with',
   'ZnTPP',
   '=',
   'Zinc',
   'tetraphenylporphyrin',
   '.'],
  [],
  None)]

In [9]:
# First sentence of the abbreviation paragraph; its tokens are inspected next.
s = p.sentences[0]

In [10]:
# First token of the sentence.
s.tokens[0]

Token('Dye', 0, 3)

In [11]:
# `lex.normalized` attribute of the first token ('Dye' in the saved output).
s.tokens[0].lex.normalized

'Dye'

In [12]:
# `lex.is_hyphenated` flag of the first token (False for 'Dye' in the saved output).
s.tokens[0].lex.is_hyphenated

False

### Parser

In [9]:
from batterydataextractor.nlp import BertCemTagger, CemTagger
from transformers import pipeline
# Instantiate both taggers once for the whole notebook.
# NOTE(review): `dt` is not referenced by any other cell visible here; `ct` is
# used by new_property_parser_ner.
dt = BertCemTagger()
ct = CemTagger()

In [10]:
# Question-answering pipeline built with the batterybert-cased-squad-v1 model.
# This global is called by every new_*parser helper defined below, so this
# cell must run before any of them is invoked.
question_answerer = pipeline("question-answering", "batterydata/batterybert-cased-squad-v1")

In [11]:
def new_parser(name, context):
    """Ask the QA pipeline "What is the <name>?" about ``context`` and print the answer.

    Args:
        name: Name of the item to ask about. It is interpolated into the
            question and capitalised for the printed label.
        context: Text passage the question is answered from.

    Returns:
        None. The answer and the pipeline's confidence score are printed.
    """
    result = question_answerer(question="What is the {}?".format(name), context=context)
    # Bind `answer` on every path: previously it was only assigned when the
    # score was positive, so a zero score raised UnboundLocalError at the print.
    answer = result['answer'] if result['score'] > 0 else None
    print("{}:".format(name.capitalize()), answer, " Confidence score:", result['score'])

In [12]:
def new_property_only_parser(name, context):
    """Ask the QA pipeline for the value of property ``name`` and print it.

    Only the property value is extracted; no material is looked up.

    Args:
        name: Property name interpolated into "What is the value of <name>?".
        context: Text passage the question is answered from.

    Returns:
        None. The value and the pipeline's confidence score are printed.
    """
    result = question_answerer(question="What is the value of {}?".format(name), context=context)
    # Bind `answer` on every path: previously it was only assigned when the
    # score was positive, so a zero score raised UnboundLocalError at the print.
    answer = result['answer'] if result['score'] > 0 else None
    print("Property value:", answer, " Confidence score:", result['score'])

In [15]:
def new_property_parser_ner(name, context):
    """Print the value of property ``name`` plus a material found by the CEM tagger.

    The property value comes from the QA pipeline; the material comes from
    running the module-level ``ct`` tagger over the space-split context.

    Args:
        name: Property name interpolated into "What is the value of <name>?".
        context: Text passage the question is answered from and that is tagged.

    Returns:
        None. Material, value and confidence score are printed. Material and
        value are printed as ``None`` when nothing was found.
    """
    result = question_answerer(question="What is the value of {}?".format(name), context=context)
    # Bind `answer` on every path: previously a zero score left it unbound.
    answer = result['answer'] if result['score'] > 0 else None

    # Naive whitespace tokenisation; every token gets the dummy POS tag 'NN'
    # because the tagger is fed (token, pos) pairs.
    tagged_input = [(token, 'NN') for token in context.split(" ")]

    # Default to None so a context with no 'MAT' tag no longer raises
    # UnboundLocalError at the print below.
    material = None
    for tagged in ct.tag(tagged_input):
        if tagged[-1] == 'MAT':
            # Later matches overwrite earlier ones, so the last tagged
            # material wins (same as the original behaviour).
            material = tagged[0][0]
    print("Material:", material, " Property value:", answer, " Confidence score:", result['score'])

In [26]:
def new_property_parser(name, context):
    """Print the value of property ``name`` and the material it belongs to.

    Two QA calls are made against the same context: first
    "What is the value of <name>?", then
    "Which material has a <name> of <value>?" using the first answer.

    Args:
        name: Property name interpolated into both questions.
        context: Text passage both questions are answered from.

    Returns:
        None. Material, value and the confidence score of the *value* question
        are printed. Material and value are printed as ``None`` when the value
        question yields no positive-score answer.
    """
    result = question_answerer(question="What is the value of {}?".format(name), context=context)
    if result['score'] > 0:
        answer = result['answer']
        material = question_answerer(
            question="Which material has a {} of {}?".format(name, answer), context=context)
        material_name = material['answer']
    else:
        # Previously `answer` was unbound here and the follow-up question
        # raised UnboundLocalError; skip the second query when there is no
        # value to ask about.
        answer = None
        material_name = None
    print("Material:", material_name, " Property value:", answer, " Confidence score:", result['score'])

In [35]:
# Melting-point example: ask for the value of "mp", then for its material.
property_name = "mp"
context = 'NaCl: mp 146-147 °C.'
new_property_parser(property_name, context)

Material: NaCl  Property value: 146-147 °C  Confidence score: 0.8226007223129272


In [36]:
# Glass-transition example. `property_name` set here is reused by the next cell.
property_name = "glass transition temperature"
context = 'The poly(azide) shows a glass transition temperature of 282.6 °C.'
new_property_parser(property_name, context)

# For comparison, the CDE result (presumably ChemDataExtractor) for this
# sentence was just 282.6, with no material.

Material: poly(azide)  Property value: 282.6 °C  Confidence score: 0.9389880299568176


In [37]:
# Abbreviated-abstract example. `property_name` is NOT set in this cell; it
# relies on "glass transition temperature" left over from the previous cell.
# NOTE(review): the saved output's material answer keeps a stray trailing ')'.
context = 'The four-armed compd. (ANTH-OXA6t-OC12) with the dodecyloxy surface group is a high glass transition temp. (Tg:  211°) material and exhibits good soly.'
new_property_parser(property_name, context)

Material: ANTH-OXA6t-OC12)  Property value: 211°  Confidence score: 0.9640427231788635


In [38]:
# UV-vis peak list, queried for the value only (no material in the text).
# NOTE(review): the saved output returns 'λabs' with a score below 0.01, i.e.
# the label rather than any of the listed wavelengths.
property_name = 'uvvis'
context = 'λabs/nm 320, 380, 475, 529;'
new_property_only_parser(property_name, context)

Property value: λabs  Confidence score: 0.006887961644679308


In [39]:
# Long sentence mentioning several materials; only one carries the capacity value.
property_name = 'capacity'
context = 'The mechanism of lithium intercalation in the so-called ‘soft’ anodes, i.e. graphite or graphitable carbons, is well known: it develops through well-identified, reversible stages, corresponding to progressive intercalation within discrete graphene layers, to reach the formation of LiC6 with a maximum theoretical capacity of 372 ± 2.4 mAh g−1.'
new_property_parser(property_name, context)

Material: LiC6  Property value: 372 ± 2.4 mAh g−1  Confidence score: 0.597428560256958


TODO: just definition, material, etc.

In [40]:
# Non-property question: ask "What is the apparatus?" for two experimental
# sentences using the generic new_parser helper.
property_name = 'apparatus'
context1 = 'The photoluminescence quantum yield (PLQY) was measured using a HORIBA Jobin Yvon FluoroMax-4 spectrofluorimeter'
context2 = '1H NMR spectra were recorded on a Varian MR-400 MHz instrument.'
new_parser(property_name, context1)
new_parser(property_name, context2)

Apparatus: HORIBA Jobin Yvon FluoroMax-4  Confidence score: 0.10037080198526382
Apparatus: Varian MR-400 MHz instrument  Confidence score: 0.5064557194709778


### Springer reader, Parser, Model in BDE
#### Week 09/02/2022


In [41]:
from batterydataextractor.doc import Document
# Load a Springer XML test paper and show its elements. The path is relative,
# so the notebook must be run from a directory that contains
# tests/testpapers/.
spr = Document.from_file(r"tests/testpapers/spr_test2.xml")
spr.elements
# Record extraction for this paper is left disabled:
# records = spr.records
# print(spr.records.serialize())

[{'title': 'Double-walled core-shell structured Si@SiO2@C nanocomposite as anode for lithium-ion batteries', 'authors': ['Tao', 'Yang', 'Zhang', 'Ni'], 'publisher': 'Springer Berlin Heidelberg', 'journal': 'Ionics', 'date': '20140504', 'volume': '20', 'issue': '11', 'firstpage': '1547', 'lastpage': '1552', 'doi': '10.1007/s11581-014-1138-8', 'abstract': 'Double-walled core-shell structured Si@SiO2@C nanocomposite has been prepared by calcination of silicon nanoparticles in air and subsequent carbon coating. The obtained Si@SiO2@C nanocomposite demonstrates a reversible specific capacity of about 786 mAh g−1 after 100\xa0cycles at a current density of 100\xa0mA\xa0g−1 with a capacity fading of 0.13\xa0% per cycle. The enhanced electrochemical performance can be due to that the double walls of carbon and SiO2 improve the electronic conductivity and enhance the compatibility of electrode materials and electrolyte as a result of accommodating the significant volumetric change during cycles

**Property parser**

In [4]:
from batterydataextractor.doc import Document
# Two-property example. This rebinds `doc`, replacing the two-string document
# created near the top of the notebook.
doc = Document("The theoretical capacity of graphite is 372 mAh/g... In the case of LiFePO4 chemistry, the absolute maximum voltage is 4.2V per cell,")
# Register one property model per name before reading `records`.
doc.add_models_by_names(["capacity", "voltage"])
record = doc.records
# print() shows only the record type names; see the next cell for the contents.
print(record)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[<PropertyData>, <PropertyData>]


In [5]:
# Print each extracted record as a plain dict.
for r in record:
    print(r.serialize())

{'PropertyData': {'value': [372.0], 'units': 'mAh / g', 'raw_value': '372 mAh / g', 'specifier': 'capacity', 'material': 'graphite'}}
{'PropertyData': {'value': [4.2], 'units': 'V', 'raw_value': '4.2 V', 'specifier': 'voltage', 'material': 'LiFePO4'}}


In [9]:
# Spans reported as cems for the capacity/voltage document; in the saved
# output they match the `material` fields of the two records above.
print(doc.cems)

[Span('graphite', 28, 36), Span('LiFePO4', 68, 75)]


In [18]:
# Same melting-point query as earlier, now through the Document/records API.
property_name = "mp"
context = 'The melting point (mp) of Aspirin (C9H8O4): 146-147 °C.'
doc2 = Document(context)
doc2.add_models_by_names([property_name])
for r in doc2.records:
    print(r.serialize())

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'PropertyData': {'value': [146.0, 147.0], 'units': '° C', 'raw_value': '146-147 ° C', 'specifier': 'mp', 'material': 'Aspirin'}}


In [17]:
# NOTE(review): the saved output lists only 'C9H8O4' here, while the record
# above reports the material as 'Aspirin'.
doc2.cems

[Span('C9H8O4', 35, 41)]

In [11]:
# Glass-transition example through the Document/records API.
property_name = "glass transition temperature"
context = 'The poly(azide) shows a glass transition temperature of 282.6 °C.'
doc3 = Document(context)
doc3.add_models_by_names([property_name])
# Serialize the whole record list in one call (other cells loop over records).
print(doc3.records.serialize())
# For comparison, the CDE result (presumably ChemDataExtractor) for this
# sentence was just 282.6, with no material.

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'PropertyData': {'value': [282.6], 'units': '° C', 'raw_value': '282.6 ° C', 'specifier': 'glass transition temperature', 'material': 'poly(azide)'}}]


In [12]:
# Long multi-material sentence through the Document/records API. Unlike the
# earlier QA-only cell, the value here is written as "372 mAh/g".
property_name = 'capacity'
context = 'The mechanism of lithium intercalation in the so-called ‘soft’ anodes, i.e. graphite or graphitable carbons, is well known: it develops through well-identified, reversible stages, corresponding to progressive intercalation within discrete graphene layers, to reach the formation of LiC6 with a maximum theoretical capacity of 372 mAh/g.'
doc4 = Document(context)
doc4.add_models_by_names([property_name])
print(doc4.records.serialize())

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'PropertyData': {'value': [372.0], 'units': 'mAh / g', 'raw_value': '372 mAh / g', 'specifier': 'capacity', 'material': 'LiC6'}}]


**General Parser**

In [6]:
from batterydataextractor.doc.text import Paragraph
# General (non-property) parser example: ask for the anode and cathode.
# This rebinds `s`, `p` and `record`, which earlier cells used for other objects.
s = 'The lithium iron phosphate battery (LiFePO4 battery) or LFP battery (lithium ferrophosphate), is a type ' \
            'of lithium-ion battery using lithium iron phosphate (LiFePO4) as the cathode material, and a graphitic ' \
            'carbon electrode with a metallic backing as the anode.'
p = Paragraph(s)
p.add_general_models(["anode", "cathode"])
record = p.records
# The saved output contains Compound records as well as the two GeneralInfo answers.
for r in record:
    print(r.serialize())

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'Compound': {'names': ['lithium']}}
{'Compound': {'names': ['iron']}}
{'Compound': {'names': ['phosphate']}}
{'Compound': {'names': ['LiFePO4']}}
{'Compound': {'names': ['LFP']}}
{'Compound': {'names': ['ferrophosphate']}}
{'Compound': {'names': ['carbon']}}
{'GeneralInfo': {'answer': 'graphitic carbon', 'specifier': 'anode'}}
{'GeneralInfo': {'answer': 'lithium iron phosphate', 'specifier': 'cathode'}}


In [7]:
from batterydataextractor.doc.text import Paragraph
# General parser asked for the "apparatus" of an NMR sentence.
# NOTE(review): the saved output also reports '1H' as a Compound.
s = '1H NMR spectra were recorded on a Varian MR-400 MHz instrument.'
p = Paragraph(s)
p.add_general_models(["apparatus"])
record = p.records
for r in record:
    print(r.serialize())

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'Compound': {'names': ['1H']}}
{'GeneralInfo': {'answer': 'Varian MR - 400 MHz instrument', 'specifier': 'apparatus'}}


In [8]:
# Same sentence as the previous cell, asked with the specifier "instrument"
# instead of "apparatus"; the saved answer drops the trailing word 'instrument'.
s = '1H NMR spectra were recorded on a Varian MR-400 MHz instrument.'
p = Paragraph(s)
p.add_general_models(["instrument"])
record = p.records
for r in record:
    print(r.serialize())

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'Compound': {'names': ['1H']}}
{'GeneralInfo': {'answer': 'Varian MR - 400 MHz', 'specifier': 'instrument'}}


TODO:

- More complicated cases (e.g. a long paragraph that includes multiple properties or materials)
- Option to save results to a database
- Option to save the original text
- Debug: Compound, Number, Units
- Documentation

TODO (NER):
- Sort out training/test set
- Debug: CRF; LSTMCRF

Done:

- Define relations between compound and property.
- General Parser (Non-property parser): Anode/Cathode/Electrolyte; Apparatus
- Evaluation dataset

In [19]:
from batterydataextractor.doc import Document
# Compact version of the capacity/voltage example (rebinds `doc` and `record`).
text = "The theoretical capacity of graphite is 372 mAh/g... In the case of LiFePO4 chemistry, the absolute maximum voltage is 4.2V per cell."
doc = Document(text)

property_names = ["capacity", "voltage"]
doc.add_models_by_names(property_names)
for record in doc.records:
    print(record.serialize())
    
# Expected output, kept as comments (it matches the saved cell output):
# ==============================================================
# {'PropertyData': {'value': [372.0], 'units': 'mAh / g', 'raw_value': '372 mAh / g', 'specifier': 'capacity', 'material': 'graphite'}}
# {'PropertyData': {'value': [4.2], 'units': 'V', 'raw_value': '4.2 V', 'specifier': 'voltage', 'material': 'LiFePO4'}}


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'PropertyData': {'value': [372.0], 'units': 'mAh / g', 'raw_value': '372 mAh / g', 'specifier': 'capacity', 'material': 'graphite'}}
{'PropertyData': {'value': [4.2], 'units': 'V', 'raw_value': '4.2 V', 'specifier': 'voltage', 'material': 'LiFePO4'}}


In [22]:
# NOTE(review): the saved output below is an 'mp' record for Aspirin, which
# does not match the capacity/voltage `doc` built in the preceding cell. It
# looks like this cell was executed while `doc` was bound to a different
# document; re-run with Restart & Run All to refresh it.
doc.records.serialize()

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'PropertyData': {'value': [146.0, 147.0],
   'units': '° C',
   'raw_value': '146-147 ° C',
   'specifier': 'mp',
   'material': 'Aspirin'}}]

In [1]:
from batterydataextractor.doc import Document
# Melting-point example with an explicit confidence threshold (rebinds `doc`).
text = 'The melting point (mp) of Aspirin (C9H8O4): 146-147 °C.'
doc = Document(text)

property_names = ["mp"]
# confidence_threshold is passed explicitly here; earlier cells call
# add_models_by_names without it.
doc.add_models_by_names(property_names, confidence_threshold=0.01)
for record in doc.records:
    print(record.serialize())

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'PropertyData': {'value': [146.0, 147.0], 'units': '° C', 'raw_value': '146-147 ° C', 'specifier': 'mp', 'material': 'Aspirin'}}


In [None]:
from batterydataextractor.doc.text import Paragraph
# Anode/cathode general-parser example with an explicit confidence threshold.
# This cell has no execution count or saved output in the notebook.
s = 'The lithium iron phosphate battery (LiFePO4 battery) or LFP battery (lithium ferrophosphate), is a type ' \
            'of lithium-ion battery using lithium iron phosphate (LiFePO4) as the cathode material, and a graphitic ' \
            'carbon electrode with a metallic backing as the anode.'
p = Paragraph(s)
p.add_general_models(["anode", "cathode"], confidence_threshold=0.1)
record = p.records
for r in record:
    print(r.serialize())