### clean and tokenize your text

In [7]:
import stanza
stanza.download('en', package='craft', processors='tokenize')
import json
import logging
import pandas as pd
import pytorch_lightning as pl
from copy import deepcopy
from model import *
from utils import *


# Sample input text for the tokenizer/tagger demo.
# The line breaks inside the literal are part of the string; it is passed
# through cleanup_text() further down in this cell before tokenization.
test_txt = '''
Rh-based bimetallic catalysts are promising ligand-free heterogeneous catalysts for hydroformylation reactions. It is
important to understand the mechanism of this bimetallic
promotion for designing highly selective and active heterogenous
catalysts. In this work, the RhCo bimetallic catalyst was
investigated focusing on the promotion effect of Co for the gasphase hydroformylation of ethene. Adding Co to Rh increased both
the catalytic productivity and selectivity to oxygenates. In situ
diffuse reflectance infrared Fourier transform spectroscopy and
CO-temperature programmed desorption were used to characterize
CO adsorption. The results showed that the addition of Co to Rh
changed the CO adsorption modes and strength for the Rh-based
catalyst. Modulated CO adsorption strength was important to
enhance selectivity. Density functional theory calculations were
carried out to reveal the reaction mechanism. A reaction pathway was proposed to clarify the reason for enhanced selectivity on a
RhCo bimetallic catalyst and show that the ratio between CO migration and desorption played a great role in this reaction. '''
# Build the stanza pipeline: English, 'craft' package, tokenizer only, no GPU.
nlp = stanza.Pipeline('en', package='craft', processors='tokenize', use_gpu=False)

test_sents = []
idx = 0  # running token id; keeps counting across sentence boundaries
test_txt = cleanup_text(test_txt)
for sent in nlp(test_txt).sentences:
    # Every token gets the placeholder label 'O'; that is fine because this
    # data is only used for prediction, not for training.
    sent_token = [
        {
            'text': token.text,
            'label': 'O',
            'id': idx + position,
            'start': token.start_char,
            'end': token.end_char,
        }
        for position, token in enumerate(sent.tokens)
    ]
    idx += len(sent_token)
    test_sents.append((sent.text, sent_token))
# Post-process the (sentence text, token dicts) pairs with the project helper.
test_sents = stanza_fix(test_sents)


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 17.9MB/s]                    
2023-06-07 15:50:47 INFO: Downloading these customized packages for language: en (English)...
| Processor | Package |
-----------------------
| tokenize  | craft   |

2023-06-07 15:50:47 INFO: File exists: C:\Users\chern\stanza_resources\en\tokenize\craft.pt.
2023-06-07 15:50:47 INFO: Finished downloading models and saved to C:\Users\chern\stanza_resources.
2023-06-07 15:50:47 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | craft   |

2023-06-07 15:50:47 INFO: Use device: cpu
2023-06-07 15:50:47 INFO: Loading: tokenize
2023-06-07 15:50:47 INFO: Done loading processors!


### predict using model checkpoint

In [8]:
import json
import logging
import pandas as pd
import pytorch_lightning as pl
from copy import deepcopy
from model import *
from utils import *

# Use the checkpoint trained on the first fold.
ckpt_name = 'checkpoint/CV_0.ckpt'
bert_name = 'pretrained/scibert_domain_adaption'
# The dataset arguments are passed as empty lists; this cell only runs prediction.
model = BERTSpan.load_from_checkpoint(
    ckpt_name,
    model_name=bert_name,
    train_dataset=[],
    val_dataset=[],
    test_dataset=[],
)

def pred_model_dataset(model, sent):
    """Run ``model`` over ``sent`` and return the collected predictions.

    Args:
        model: Object providing ``gen_pred_dataloader``, ``setup``, ``cuda``,
            ``eval``, ``batch_cuda`` and ``pred_dataset_step`` (in this
            notebook, the ``BERTSpan`` instance loaded from a checkpoint).
        sent: Input handed unchanged to ``model.gen_pred_dataloader``.
            Presumably the list of (sentence text, token dicts) pairs built
            in the tokenization cell -- TODO confirm against ``model.py``.

    Returns:
        Whatever ``pred_dataset.output_pred()`` returns after every batch
        has been processed.

    Notes:
        * ``torch`` and ``tqdm`` are not imported explicitly in this cell;
          presumably they come from the ``model`` / ``utils`` star imports
          -- verify.
        * NOTE(review): ``model.cuda()`` is called unconditionally, so this
          presumably needs a CUDA-capable device even though the tokenizer
          pipeline is created with ``use_gpu=False``.
    """
    pred_dataset, pred_dataloader = model.gen_pred_dataloader(sent)

    model.setup('test')
    model = model.cuda()
    model.eval()
    # No gradients are needed for prediction.
    with torch.no_grad():
        offset = 0
        for batch in tqdm(pred_dataloader):
            batch = model.batch_cuda(batch)
            model.pred_dataset_step(offset, batch, pred_dataset)
            # Advance by the length of the batch's first element so the next
            # call receives the offset that follows this batch.
            offset += len(batch[0])
    return pred_dataset.output_pred()


# Tag the tokenized sentences, then print each sentence followed by one line
# per predicted span ("<span text> <label>") and a blank separator.
output_sents = pred_model_dataset(model, test_sents)
for sent in output_sents:
    sent_tag = [tok['pred'] for tok in sent]
    print(assemble_token_text(sent))
    # Each span is (first index, last index, label); the last index is
    # inclusive, hence the +1 when slicing.
    for first, last, label in get_bio_spans(sent_tag):
        span_tokens = sent[first:last + 1]
        print(assemble_token_text(span_tokens), label)
    print('\n\n')

Lightning automatically upgraded your loaded checkpoint from v1.3.4 to v1.9.3. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file C:\Users\chern\Documents\GitHub\CatalysisIE\checkpoint\CV_0.ckpt`
Global seed set to 12345
Some weights of the model checkpoint at pretrained/scibert_domain_adaption were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect 

Rh-based bimetallic catalysts are promising ligand-free heterogeneous catalysts for hydroformylation reactions.
Rh-based bimetallic catalysts Catalyst
hydroformylation Reaction



It is important to understand the mechanism of this bimetallic promotion for designing highly selective and active heterogenous catalysts.



In this work, the RhCo bimetallic catalyst was investigated focusing on the promotion effect of Co for the gasphase hydroformylation of ethene.
RhCo bimetallic catalyst Catalyst
Co Catalyst
gasphase hydroformylation Reaction
ethene Reactant



Adding Co to Rh increased both the catalytic productivity and selectivity to oxygenates.
Co Catalyst
Rh Catalyst
oxygenates Product



In situ diffuse reflectance infrared Fourier transform spectroscopy and CO-temperature programmed desorption were used to characterize CO adsorption.
In situ diffuse reflectance infrared Fourier transform spectroscopy Characterization
CO-temperature programmed desorption Characterization



The res




In [1]:
from quantulum3 import parser
# Try quantulum3 on a sentence containing catalysis-style quantities.
# NOTE(review): the saved output shows "0.6Rh" read as "south african rand hour"
# and "h−1" as "per hour ampere-turn", so the units are misparsed for this text.
sample_text = 'For 0.6Rh/SiO2, the catalyst productivity was 44 mol/(molRh·h) and the TOF was 2511 h−1 at the time on stream (TOS) of 5 h'
quants = parser.parse(sample_text)
quants

[Quantity(0.6, "Unit(name="south african rand hour", entity=Entity("unknown"), uri=None)"),
 Quantity(44, "Unit(name="mole", entity=Entity("amount of substance"), uri=Mole_(unit))"),
 Quantity(2511, "Unit(name="per hour ampere-turn", entity=Entity("unknown"), uri=None)"),
 Quantity(5, "Unit(name="hour", entity=Entity("time"), uri=Hour)")]

In [32]:
# Useful code lines: split a formula-like string into element-like symbols.
# `re` is imported here because no earlier cell imports it explicitly; without
# this the cell fails on a fresh kernel when the notebook is run top to bottom.
import re

string = 'RhCoO'
# One uppercase letter, optionally followed by a single lowercase letter.
str_pattern = r'([A-Z](?:[a-z])?)'
pattern = re.compile(str_pattern)
result = pattern.findall(string)
result


['Rh', 'Co', 'O']

In [2]:
import re
def chemical_prep(chem_list):
    """Sort chemical entity strings into two dictionaries.

    For each entity the whitespace-split word list is computed. Then:

    * If the entity has two or more words, or starts with a letter followed
      by at least three lowercase letters, every "uppercase letter plus
      optional single lowercase letter" match in the entity is collected and
      stored in ``comp_dict[entity]`` (the list may be empty).
    * Otherwise the word list is stored in ``chem_dict[entity]``.

    Args:
        chem_list: Iterable of entity strings.

    Returns:
        Tuple ``(chem_dict, comp_dict)`` as described above.

    NOTE(review): the original body referenced the undefined names
    ``string_list``, ``string`` and ``chem_dict``; they are mapped here to the
    per-entity values (``entity_split``, ``entity``) and a local dict. The
    branch-to-dict assignment is kept as written -- confirm it is not
    inverted relative to the intended "long name vs. formula" split.
    """
    chem_dict = {}
    comp_dict = {}
    for entity in chem_list:
        entity_split = entity.split()
        is_multi_word = len(entity_split) >= 2
        # re.match anchors at the start of the entity only.
        looks_like_word = re.match(r'[A-Za-z]([a-z]){3,}', entity) is not None
        if is_multi_word or looks_like_word:
            comp_dict[entity] = re.findall(r'([A-Z](?:[a-z])?)', entity)
        else:
            chem_dict[entity] = entity_split
    return chem_dict, comp_dict


def ont_comp_lookup(comp_dict, onto):
    """Placeholder: iterate over every component listed in ``comp_dict``.

    Args:
        comp_dict: Mapping whose values are iterables of components.
        onto: Currently unused. Presumably the ontology to look components
            up in -- TODO confirm once the lookup is written.

    Returns:
        None. The lookup itself has not been implemented yet.
    """
    for k, v in comp_dict.items():
        for comp in v:
            # TODO(review): the original loop had no body, which made the whole
            # cell a syntax error. The actual lookup of `comp` in `onto` still
            # has to be written; `pass` only keeps the cell runnable.
            pass

2

In [None]:
import json
import logging
import pandas as pd
import pytorch_lightning as pl
from copy import deepcopy
from model import *
from utils import *

# NOTE(review): this cell repeats the earlier "predict using model checkpoint"
# cell verbatim and has no execution count; consider deleting it.
# Use the checkpoint trained on the first fold.
ckpt_name = 'checkpoint/CV_0.ckpt'
bert_name = 'pretrained/scibert_domain_adaption'
# The dataset arguments are passed as empty lists; this cell only runs prediction.
model = BERTSpan.load_from_checkpoint(
    ckpt_name,
    model_name=bert_name,
    train_dataset=[],
    val_dataset=[],
    test_dataset=[],
)

def pred_model_dataset(model, sent):
    """Run ``model`` over ``sent`` and return the collected predictions.

    NOTE(review): a function with this name and body is already defined in
    the earlier "predict using model checkpoint" cell; this later definition
    shadows it. Consider keeping only one.

    Args:
        model: Object providing ``gen_pred_dataloader``, ``setup``, ``cuda``,
            ``eval``, ``batch_cuda`` and ``pred_dataset_step`` (in this
            notebook, the ``BERTSpan`` instance loaded from a checkpoint).
        sent: Input handed unchanged to ``model.gen_pred_dataloader``.
            Presumably the list of (sentence text, token dicts) pairs built
            in the tokenization cell -- TODO confirm against ``model.py``.

    Returns:
        Whatever ``pred_dataset.output_pred()`` returns after every batch
        has been processed.

    Notes:
        * ``torch`` and ``tqdm`` are not imported explicitly in this cell;
          presumably they come from the ``model`` / ``utils`` star imports
          -- verify.
        * NOTE(review): ``model.cuda()`` is called unconditionally, so this
          presumably needs a CUDA-capable device.
    """
    pred_dataset, pred_dataloader = model.gen_pred_dataloader(sent)

    model.setup('test')
    model = model.cuda()
    model.eval()
    # No gradients are needed for prediction.
    with torch.no_grad():
        offset = 0
        for batch in tqdm(pred_dataloader):
            batch = model.batch_cuda(batch)
            model.pred_dataset_step(offset, batch, pred_dataset)
            # Advance by the length of the batch's first element so the next
            # call receives the offset that follows this batch.
            offset += len(batch[0])
    return pred_dataset.output_pred()


# NOTE(review): duplicate of the printing code in the earlier
# "predict using model checkpoint" cell.
# Tag the tokenized sentences, then print each sentence followed by one line
# per predicted span ("<span text> <label>") and a blank separator.
output_sents = pred_model_dataset(model, test_sents)
for sent in output_sents:
    sent_tag = [tok['pred'] for tok in sent]
    print(assemble_token_text(sent))
    # Each span is (first index, last index, label); the last index is
    # inclusive, hence the +1 when slicing.
    for first, last, label in get_bio_spans(sent_tag):
        span_tokens = sent[first:last + 1]
        print(assemble_token_text(span_tokens), label)
    print('\n\n')