In [1]:
# Set the working directory to the project root so that the settings.yaml
# file can be loaded by the cells that follow.
import os

# The checkout location is machine-specific. It can be overridden with the
# PROMPT_DATA_EXTRACTION_ROOT environment variable; when the variable is not
# set, the original location is used, so existing setups keep working.
PROJECT_ROOT = os.environ.get(
    "PROMPT_DATA_EXTRACTION_ROOT", "/data/akhlak/PromptDataExtraction/"
)
os.chdir(PROJECT_ROOT)

# Sanity check: the path of the working directory is still expected to
# contain the project name; on failure the actual directory is reported.
assert "PromptDataExtraction" in os.getcwd(), os.getcwd()

In [2]:
from backend import postgres, sett
from backend.postgres.orm import PaperTexts
from backend.data.dataset_pranav import GroundDataset
from backend.record_extraction import bert_model, record_extractor

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the project settings, then the Postgres settings, and finally open the
# database connection, which is kept in `db`.
# NOTE(review): the recorded output reports an SSH tunnel being established by
# the postgres module, so this cell presumably needs network access to the
# database host — confirm before running offline.
sett.load_settings()
postgres.load_settings()
db = postgres.connect()

Load OK: settings.yaml
[1;36mNOTE  --[0m postgres_ SSH tunnel established.


In [4]:
from backend.record_extraction.utils import LoadNormalizationDataset, load_property_metadata

In [6]:
# Create the MaterialsBERT wrapper from the model configured under
# sett.NERPipeline, then initialise the local model on the configured
# PyTorch device.
bert = bert_model.MaterialsBERT(sett.NERPipeline.model)
bert.init_local_model(device=sett.NERPipeline.pytorch_device)

[1;36m      --[0m bert_ Loaded materials bert.


In [8]:
# Development aid: re-import backend.data.mongodb in the running kernel.
# NOTE(review): `mongodb` is not referenced by any other cell visible here;
# consider dropping this cell (or using `%autoreload 2`) once the module is
# stable — confirm nothing further down relies on it first.
from importlib import reload
from backend.data import mongodb
# reload() returns the module object, which is why its repr is echoed as the
# cell output.
reload(mongodb)

<module 'backend.data.mongodb' from '/data/akhlak/PromptDataExtraction/backend/data/mongodb.py'>

In [9]:
# Instantiate GroundDataset and build its dataset. create_dataset() returns a
# pair, unpacked here as `tg_gnd` and `tg_nlp`; later cells iterate both with
# .items(), using DOIs as keys and an iterable of record dicts as values.
gnd = GroundDataset()
tg_gnd, tg_nlp = gnd.create_dataset()

[1;36m      --[0m dataset_ Number of DOI's in dataset: 499
[1;36m      --[0m dataset_ Number of records in dataset: 714


In [10]:
# Load the inputs needed by the relation extractor:
#   - norm_dataset: the processed normalization data built from the file
#     referenced by sett.DataFiles.polymer_nen_json
#   - prop_metadata: the property metadata loaded from the file referenced by
#     sett.DataFiles.properties_json
normdata = LoadNormalizationDataset(sett.DataFiles.polymer_nen_json)
norm_dataset = normdata.process_normalization_files()
prop_metadata = load_property_metadata(sett.DataFiles.properties_json)

In [11]:
# Select one abstract to run the extraction pipeline on: the first non-empty
# abstract of the first DOI in `tg_gnd` that has one.
abstract = None

for doi, value in tg_gnd.items():
    # next() stops at the first non-empty abstract instead of building the
    # whole list just to take element 0, and returns None (rather than raising
    # IndexError) when this DOI has no usable abstract.
    abstract = next((item['abstract'] for item in value if item['abstract']), None)
    if abstract is not None:
        # Report which DOI the selected abstract belongs to.
        print(doi)
        break

# Fail here with a clear message instead of passing None to the NER model in
# a later cell.
assert abstract is not None, "No non-empty abstract found in tg_gnd"

abstract

10.1039/c6gc03238a


'The general and efficient copolymerization of lactones with hydroxy-acid bioaromatics was accomplished via a concurrent ring-opening polymerization (ROP) and polycondensation methodology. Suitable lactones were L-lactide or ε-caprolactone and four hydroxy-acid comonomers were prepared as hydroxyethyl variants of the bioaromatics syringic acid, vanillic acid, ferulic acid, and p-coumaric acid. Copolymerization conditions were optimized on a paradigm system with a 20:80 feed ratio of caprolactone:hydroxyethylsyringic acid. Among six investigated catalysts, polymer yield was optimized with 1 mol % of Sb_{2}O_{3}, affording eight copolymer series in good yields (32-95 % for lactide; 80-95 % for caprolactone). Half of the polymers were soluble in the GPC solvent hexafluoroisopropanol and analyzed to high molecular weight, with M_{n} = 10500-60700 Da. Mass spectrometry and ^{1}H NMR analysis revealed an initial ring-opening formation of oligolactones, followed by polycondensation of these w

In [12]:
# Run the MaterialsBERT wrapper on the selected abstract to obtain its NER
# tags, which are passed to the relation extractor afterwards.
ner_tags = bert.get_tags(abstract)

In [14]:
# Build the relation extractor from the abstract, its NER tags, the
# normalization data and the property metadata, then process the document.
# process_document() returns a pair, unpacked as `output_para` (indexed by
# keys such as 'monomers' and 'material_records' in later cells) and `timings`.
relation_extractor = record_extractor.RelationExtraction(abstract, ner_tags, norm_dataset, prop_metadata)
output_para, timings = relation_extractor.process_document()

In [18]:
# Inspect the monomer entities extracted from the abstract.
# NOTE(review): in the recorded output the first entry has entity_name 'p'
# with coreferents 'caprolactone' and 'ε-caprolactone' — possibly a fragment
# of "p-coumaric acid" that was grouped with the wrong entity; worth checking
# in the extractor.
output_para['monomers']

[{'entity_name': 'p',
  'material_class': 'MONOMER',
  'role': '',
  'polymer_type': '',
  'normalized_material_name': '',
  'coreferents': ['p', 'caprolactone', 'ε-caprolactone'],
  'components': []},
 {'entity_name': 'hydroxyethylsyringic acid',
  'material_class': 'MONOMER',
  'role': '',
  'polymer_type': '',
  'normalized_material_name': '',
  'coreferents': ['hydroxyethylsyringic acid'],
  'components': []},
 {'entity_name': 'lactide',
  'material_class': 'MONOMER',
  'role': '',
  'polymer_type': '',
  'normalized_material_name': '',
  'coreferents': ['lactide', 'L-lactide'],
  'components': []}]

In [12]:
# Keep the extracted material records for the printing loop in the next cell.
# NOTE(review): this cell's execution count (12) is lower than that of the
# preceding cell (18), so the saved outputs come from out-of-order runs;
# re-run with "Restart Kernel -> Run All" before trusting them.
records = output_para['material_records']

In [13]:
# Print each extracted record: its material(s), the material amount when the
# record has one ("N/A" otherwise), and its property record, followed by a
# dashed divider.
for r in records:
    print(f"Material: {r['material_name']}")
    print(f"Material amount: {r.get('material_amount', 'N/A')}")
    print(f"Properties: {r['property_record']}")
    # Fifty dashes, a space and a newline, plus print's own newline.
    print("-" * 50 + " \n")


Material: [{'entity_name': 'polylactic acid', 'material_class': 'POLYMER', 'role': '', 'polymer_type': 'homopolymer', 'normalized_material_name': 'PLA', 'coreferents': ['polylactic acid', 'PLA'], 'components': []}, {'entity_name': 'polycaprolactone', 'material_class': 'POLYMER', 'role': '', 'polymer_type': 'homopolymer', 'normalized_material_name': 'PCL', 'coreferents': ['polycaprolactone', 'PCL'], 'components': []}, {'entity_name': 'polyethylene terephthalate', 'material_class': 'POLYMER', 'role': '', 'polymer_type': 'homopolymer', 'normalized_material_name': 'PET', 'coreferents': ['polyethylene terephthalate'], 'components': []}, {'entity_name': 'polystyrene', 'material_class': 'POLYMER', 'role': '', 'polymer_type': 'homopolymer', 'normalized_material_name': 'polystyrene', 'coreferents': ['polystyrene'], 'components': []}]
Material amount: [{'entity_name': 'caprolactone', 'material_amount': '20:80'}, {'entity_name': 'lactide', 'material_amount': '32'}, {'entity_name': 'lactide', 'mat

In [None]:
# Print every entry of `tg_nlp`: the DOI on its own line, then each of its
# records, then a dashed divider.
for doi, value in tg_nlp.items():
    print(doi)
    for item in value:
        print(item)
    # Fifty dashes, a space and a newline, plus print's own newline.
    print("-" * 50 + " \n")


10.1039/c6gc03238a
{'material': 'polylactic acid', 'material_coreferents': ['polylactic acid', 'PLA'], 'property_value': '50 ° C'}
{'material': 'polylactic acid', 'material_coreferents': ['polylactic acid', 'PLA'], 'property_value': '62-107 ° C'}
{'material': 'polycaprolactone', 'material_coreferents': ['polycaprolactone', 'PCL'], 'property_value': '-60 ° C'}
{'material': 'polycaprolactone', 'material_coreferents': ['polycaprolactone', 'PCL'], 'property_value': '-48 to 105 ° C'}
{'material': 'polyethylene terephthalate', 'material_coreferents': ['polyethylene terephthalate'], 'property_value': '67 ° C'}
{'material': 'polystyrene', 'material_coreferents': ['polystyrene'], 'property_value': '95 ° C'}
-------------------------------------------------- 

10.1007/s11998-015-9705-0
{'material': 'Novolac', 'material_coreferents': ['Novolac'], 'property_value': '52 ± 1'}
{'material': 'Novolac', 'material_coreferents': ['Novolac'], 'property_value': '79 ± 1 ° C'}
{'material': 'bisphenol A', 'ma