<h1>Importing libraries and utilities</h1>

In [1]:
import os
import pickle
import sys
import multiprocessing as mp
sys.path.append('../')

from argparse import ArgumentParser
import numpy as np
import pandas as pd
from tqdm import tqdm
from chemdataextractor.doc import Paragraph

import torch
from torch import nn

import ner_datasets
from models import BERT_CRF
from normalize_text import normalize

from transformers import (
    AutoConfig,
    AutoModelForTokenClassification,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    set_seed,
    AdamW,
)

import chemdataextractor as cde

In [2]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

print('using device:', device)

def apply_parallel(func_name, l):
    """Apply ``func_name`` to every item of ``l`` in a process pool, with a progress bar.

    Args:
        func_name: Callable taking one item of ``l``.
        l: Sized sequence of inputs.

    Returns:
        A list with one result per input, in input order (``Pool.imap`` yields results
        in the order of its inputs). An empty input returns an empty list.
    """
    # mp.Pool(processes=0) raises ValueError, so an empty input must not reach the pool.
    if len(l) == 0:
        return []
    # Never start more workers than there are items to process.
    n_procs = min(len(l), mp.cpu_count())
    with mp.Pool(processes=n_procs) as pool:
        return list(tqdm(pool.imap(func_name, l), total=len(l)))

using device: cuda


In [3]:
# All paths below are built relative to root_dir (the parent of the working directory).
root_dir = '../'
# Passed as cache_dir when loading the tokenizer, config and model.
cache_dir = os.path.join(root_dir, '.cache')
# Passed to TrainingArguments as output_dir.
output_dir = os.path.join(root_dir, 'ner/output_matscibert_matscholar')
# NOTE(review): not referenced by any cell visible here; the checkpoint is later located via
# os.getcwd() instead — confirm which of the two is intended.
model_name = os.path.join(root_dir, 'ner/models/matscholar')
# Forwarded as norm= to ner_datasets.get_ner_data.
to_normalize = True

In [4]:
# Input texts to extract entities from. Each item is normalized and split into
# tokenized sentences later on, so one item may yield several sentences.

captions= ['Glasses are emerging as promising and efficient solid electrolytes for all-solid-state sodium-ion batteries.',
           'The current study shows a significant enhancement in crack resistance (from 11.3 N to 32.9 N) for Na3Al1.8Si1.65P1.8O12 glass (Ag-0 glass) upon Na+-Ag+ ion-exchange (IE) due to compressive stresses generated in the glass surface while the ionic conductivity values (∼10−5 S/cm at 473 K) were retained. ',
           'In this study, magic angle spinning-nuclear magnetic resonance (MAS-NMR), molecular dynamics (MD) simulations, Vickers micro hardness, and impedance spectroscopic techniques were used to evaluate the intermediate-range structure, atomic structure, crack resistance and conductivity of the glass.',
           'Selected beam geometry allows us to suppress the bulk contribution to sum-frequency generation from crystalline quartz and use sum-frequency vibrational spectroscopy to study water/α-quartz interfaces with different bulk pH values.',
           'XRD patterns of glass-ceramics sintered at different holding times; identifying rutile TiO2 crystal grains.']
           
           

In [5]:
# Keep only string entries so that normalize()/Paragraph() are never handed a non-string.
# isinstance() is the idiomatic type check and also accepts str subclasses.
captions = [c for c in captions if isinstance(c, str)]

def tokenize_caption(c):
    """Normalize one caption and split it into sentences of token strings.

    Returns a list with one entry per element of ``Paragraph(...).tokens``; each entry is
    the list of ``.text`` values of that element's tokens.
    """
    paragraph = Paragraph(normalize(c))
    return [[token.text for token in sentence] for sentence in paragraph.tokens]

# One result per caption: a list of sentences, each a list of token strings.
tok_captions = apply_parallel(tokenize_caption, captions)

100%|██████████| 5/5 [00:00<00:00,  7.72it/s]


In [6]:
# Flatten the per-caption sentence lists into one flat list of tokenized sentences.
# NOTE: not idempotent — running this cell twice flattens the already-flat list again.
sum_tok_captions = [sent_tokens for caption_sents in tok_captions for sent_tokens in caption_sents]
tok_captions = sum_tok_captions

# Only the first two items returned by get_ner_data are kept.
train_X, train_y = ner_datasets.get_ner_data('matscholar', norm=to_normalize)[:2]
print(len(train_X))

4401


In [7]:
# Every distinct tag that occurs in the training labels, in sorted order.
unique_labels = {label for sent in train_y for label in sent}
label_list = sorted(unique_labels)
print(label_list)
# Bidirectional tag <-> integer id maps; ids follow the sorted order of label_list.
tag2id = {tag: idx for idx, tag in enumerate(label_list)}
id2tag = dict(enumerate(label_list))
num_labels = len(label_list)
print(num_labels)

['B-APL', 'B-CMT', 'B-DSC', 'B-MAT', 'B-PRO', 'B-SMT', 'B-SPL', 'I-APL', 'I-CMT', 'I-DSC', 'I-MAT', 'I-PRO', 'I-SMT', 'I-SPL', 'O']
15


In [8]:
# Count how often each entity type occurs in the training labels
# (B-/I- prefixes stripped, every other label skipped).
cnt = dict()
for sent in train_y:
    for label in sent:
        if label[0] not in ('I', 'B'):
            continue
        tag = label[2:]
        cnt[tag] = cnt.get(tag, 0) + 1

# Entity types to report on, sorted; 'experiment_evoking_word' is excluded if present.
eval_labels = sorted(l for l in cnt if l != 'experiment_evoking_word')
print(len(eval_labels))

7


In [9]:
# Keyword arguments forwarded to AutoTokenizer.from_pretrained.
tokenizer_kwargs = {
    'cache_dir': cache_dir,
    'use_fast': True,
    'revision': 'main',
    'use_auth_token': None,
    'model_max_length': 512  # same limit as the 512-token filter/padding applied to the sentences
}
# Load the tokenizer for the 'm3rg-iitd/matscibert' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('m3rg-iitd/matscibert', **tokenizer_kwargs)

In [10]:
# Placeholder labels: one 'O' per token of every sentence (no gold labels exist for the captions).
labels = [['O'] * len(sent) for sent in tok_captions]

In [11]:
def tokenize(c):
    """First pass: encode one pre-split sentence; used below only to measure its length."""
    return tokenizer(c, is_split_into_words=True, return_offsets_mapping=True, padding=True)
captions_encodings = apply_parallel(tokenize, tok_captions)

# Keep only sentences whose encoding has at most 512 input ids; labels are filtered with the same mask.
fits = [len(enc['input_ids']) <= 512 for enc in captions_encodings]
tok_captions = [t for t, ok in zip(tok_captions, fits) if ok]
labels = [l for l, ok in zip(labels, fits) if ok]
def tokenize2(c):
    """Second pass: encode one pre-split sentence, padded to exactly 512 positions."""
    return tokenizer(c, is_split_into_words=True, return_offsets_mapping=True, max_length=512, padding='max_length')
captions_encodings = apply_parallel(tokenize2, tok_captions)
# Turn the list of per-sentence encodings into one dict of lists keyed by field name.
encoding_keys = captions_encodings[0].keys()
captions_encodings = {k: [enc[k] for enc in captions_encodings] for k in encoding_keys}


def encode_tags(tags, encodings):
    """Align per-word tags with the sub-token positions of each encoded sentence.

    Args:
        tags: One list of tag strings per sentence; each tag is looked up in ``tag2id``.
        encodings: Mapping with an ``'offset_mapping'`` entry holding, per sentence, the
            (start, end) offsets of every encoded position.

    Returns:
        One list of ints per sentence, as long as that sentence's offset list. Positions
        whose offset starts at 0 and has a non-zero end receive the tag ids in order;
        all other positions are -100.

    If the assignment fails for a sentence, its index is printed and the error re-raised.
    """
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    pairs = zip(labels, encodings['offset_mapping'])
    for i, (doc_labels, doc_offset) in enumerate(tqdm(pairs)):
        doc_enc_labels = np.full(len(doc_offset), -100, dtype=int)
        arr_offset = np.array(doc_offset)
        try:
            starts_at_zero = arr_offset[:, 0] == 0
            has_length = arr_offset[:, 1] != 0
            doc_enc_labels[starts_at_zero & has_length] = doc_labels
        except:
            # Report which sentence could not be aligned, then propagate the error unchanged.
            print(i)
            raise
        encoded_labels.append(doc_enc_labels.tolist())
    return encoded_labels

100%|██████████| 5/5 [00:00<00:00, 747.81it/s]
100%|██████████| 5/5 [00:00<00:00, 512.89it/s]


In [12]:
# Per-sentence label ids aligned to the encoded positions (-100 where no tag applies).
lsbs = encode_tags(labels, captions_encodings)

# encode_tags was the only visible consumer of the offsets, so remove them before the
# encodings are wrapped in the dataset. NOTE: `del` raises KeyError if this cell is re-run.
del captions_encodings['offset_mapping']

5it [00:00, 1152.91it/s]


In [13]:
# Wrap the padded encodings and aligned label ids in the project's dataset class;
# this object is what gets handed to the Trainer for prediction.
caption_dataset = ner_datasets.MyDataset(captions_encodings, lsbs)

In [14]:
# Keyword arguments forwarded to AutoConfig.from_pretrained; num_labels is the number of
# distinct tags found in the training labels.
config_kwargs = {
    'num_labels': num_labels,
    'cache_dir': cache_dir,
    'revision': 'main',
    'use_auth_token': None,
}
config = AutoConfig.from_pretrained('m3rg-iitd/matscibert', **config_kwargs)

In [15]:
# Build the project's BERT_CRF model from the 'm3rg-iitd/matscibert' checkpoint name.
# The fine-tuned NER weights are loaded separately via load_state_dict.
model = BERT_CRF('m3rg-iitd/matscibert', device, config, cache_dir)

Some weights of the model checkpoint at m3rg-iitd/matscibert were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at m3rg-iitd/matscibert and are newly i

In [16]:
# Checkpoint directory, resolved against the current working directory.
# NOTE(review): the configured `model_name` path is not used here — confirm both point to the same place.
ner_model_path = os.path.join(os.getcwd(), 'models/matscholar')
# Load the weights onto the CPU first, then move the whole model to the selected device.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
model.load_state_dict(torch.load(os.path.join(ner_model_path, 'pytorch_model.bin'), map_location='cpu'))
model = model.to(device)

In [17]:
# The Trainer is used for batched prediction only; no training is started in this cell.
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_eval_batch_size=64,
    seed=0
)

trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=caption_dataset,
    tokenizer=tokenizer,
)

# Highest-scoring label id for every position of every sentence.
prediction_output = trainer.predict(caption_dataset)
preds = np.argmax(prediction_output.predictions, axis=2)

# Keep predictions only at positions that carry a label (label id != -100) and map ids to tags.
true_preds = []
for prediction, label in zip(preds, lsbs):
    true_preds.append([id2tag[p] for (p, l) in zip(prediction, label) if l != -100])

# Sanity checks: one prediction list per sentence and one tag per token.
assert len(true_preds) == len(tok_captions)
assert all(len(p) == len(a) for p, a in zip(true_preds, tok_captions))

# Pair each token with its predicted tag.
res = [list(zip(t, p)) for t, p in zip(tok_captions, true_preds)]

  score = torch.where(mask[i].unsqueeze(1), next_score, score)


In [18]:
# Alias: per-sentence lists of (token, predicted tag) pairs.
caption_preds = res

In [19]:
from collections import defaultdict, Counter

In [20]:
# For every sentence of every caption, record the index of the caption it came from.
# The captions are re-tokenized here only to count their sentences.
idxs = []
for i, c in tqdm(enumerate(captions)):
    n_sentences = len(Paragraph(normalize(c)).tokens)
    idxs.extend([i] * n_sentences)

5it [00:00, 13.65it/s]


In [22]:
# idxs has one entry per sentence of every caption, while caption_preds only covers the
# sentences that passed the 512-token length filter. zip() would silently truncate and
# attribute entities to the wrong caption if any sentence had been dropped, so fail loudly.
assert len(idxs) == len(caption_preds), (
    'sentence/caption index mismatch: %d sentence indices vs %d predicted sentences'
    % (len(idxs), len(caption_preds))
)

# tags[caption index][entity type] -> list of extracted entity strings.
tags = dict()
for idx, c in zip(idxs, caption_preds):
    if idx not in tags:
        tags[idx] = defaultdict(list)
    for i, t in enumerate(c):
        # Only a B- tag opens an entity; I- and O tokens are handled by the scan below or skipped.
        if t[1][0] in ['I', 'O']:
            continue
        s = t[0]
        j = i + 1
        # Append the following I- tokens, joined by single spaces.
        while j < len(c) and c[j][1][0] == 'I':
            # Continuation tokens must carry the same entity type as the opening B- tag.
            assert t[1][2:] == c[j][1][2:]
            s += ' ' + c[j][0]
            j += 1
        tags[idx][t[1][2:]].append(s)

In [23]:
# d[entity type] -> every extracted entity string across all sentences.
d = defaultdict(list)
for c in caption_preds:
    for i, (token, tag) in enumerate(c):
        # Only a B- tag opens an entity.
        if tag[0] in ['I', 'O']:
            continue
        parts = [token]
        j = i + 1
        # Collect the following I- tokens; they must share the entity type of the B- tag.
        while j < len(c) and c[j][1][0] == 'I':
            assert tag[2:] == c[j][1][2:]
            parts.append(c[j][0])
            j += 1
        d[tag[2:]].append(' '.join(parts))

In [24]:
# One row per caption; the entity columns are added afterwards.
df = pd.DataFrame(data=captions, columns=['Text'])

In [25]:
# Add one column per extracted entity type, each cell holding its own fresh empty list.
for k in d:
    df[k] = [[] for _ in range(len(df))]

In [26]:
df.head()

Unnamed: 0,Text,DSC,APL,PRO,MAT,CMT,SMT,SPL
0,Glasses are emerging as promising and efficien...,[],[],[],[],[],[],[]
1,The current study shows a significant enhancem...,[],[],[],[],[],[],[]
2,"In this study, magic angle spinning-nuclear ma...",[],[],[],[],[],[],[]
3,Selected beam geometry allows us to suppress t...,[],[],[],[],[],[],[]
4,XRD patterns of glass-ceramics sintered at dif...,[],[],[],[],[],[],[]


In [27]:
# Copy each caption's extracted entities into its row; the cell lists are extended in place.
for row, found in tags.items():
    for k, spans in found.items():
        assert k in d
        df.loc[row, k].extend(spans)

In [28]:
df

Unnamed: 0,Text,DSC,APL,PRO,MAT,CMT,SMT,SPL
0,Glasses are emerging as promising and efficien...,[Glasses],"[solid electrolytes, all-solid-state sodium - ...",[],[],[],[],[]
1,The current study shows a significant enhancem...,"[glass, glass, glass surface]",[],"[crack resistance, compressive stresses, ionic...","[Na3Al1.8Si1.65P1.8O12, Ag-0]",[],[],[]
2,"In this study, magic angle spinning-nuclear ma...",[glass],[],"[intermediate - range structure, atomic struct...",[],[magic angle spinning - nuclear magnetic reson...,[],[]
3,Selected beam geometry allows us to suppress t...,"[bulk, crystalline, interfaces]",[],[pH],"[quartz, α-quartz]",[sum - frequency vibrational spectroscopy],[],[]
4,XRD patterns of glass-ceramics sintered at dif...,"[glass - ceramics, crystal grains]",[],[],[TiO2],[XRD],[sintered],[rutile]


In [3]:
def string_to_list(s):
    """Return the whitespace-separated pieces of ``s`` that contain at least one digit.

    Pieces are returned unchanged and in their original order, so any attached
    punctuation is kept (e.g. ``'68°C,'``).
    """
    return [piece for piece in s.split() if any(ch.isdigit() for ch in piece)]

In [1]:
# Example passages mentioning glass-transition temperatures (Tg); input for the digit-token extraction.
sentences = ['Theoretically, when the application temperature is above Tg, the polymer behaves like a rubber and when below Tg, the polymer is a plastic (applicable to amorphous polymers only in this respect). For the two polymers discussed, PE has a Tg of around −128°C while PVC has a Tg of around 68°C, which is a significant difference.'
             ,'Substituting NE by m-aminostyrene and the subsequent curing yielded a polyimide with Tg values in excess of 325°C.',
             ' The PMR-15 composite cured at 316°C exhibits a Tg of 270°C. Curing at 274°C results in a decrease in Tg to 190°C.'
             ]

In [13]:
# Column-oriented record: each passage next to the digit-containing tokens found in it.
tg_df = {
    'sentences': list(sentences),
    'NUM': [string_to_list(text) for text in sentences],
}
    

In [14]:
tg_df

{'sentences': ['Theoretically, when the application temperature is above Tg, the polymer behaves like a rubber and when below Tg, the polymer is a plastic (applicable to amorphous polymers only in this respect). For the two polymers discussed, PE has a Tg of around −128°C while PVC has a Tg of around 68°C, which is a significant difference.',
  'Substituting NE by m-aminostyrene and the subsequent curing yielded a polyimide with Tg values in excess of 325°C.',
  ' The PMR-15 composite cured at 316°C exhibits a Tg of 270°C. Curing at 274°C results in a decrease in Tg to 190°C.'],
 'NUM': [['−128°C', '68°C,'],
  ['325°C.'],
  ['PMR-15', '316°C', '270°C.', '274°C', '190°C.']]}

In [15]:
import pandas as pd

In [16]:
# Rebinds tg_df from the dict built earlier to a DataFrame, so this cell is not idempotent:
# on a second run the input is already a DataFrame rather than the original dict.
tg_df = pd.DataFrame.from_dict(tg_df)

In [17]:
tg_df

Unnamed: 0,sentences,NUM
0,"Theoretically, when the application temperatur...","[−128°C, 68°C,]"
1,Substituting NE by m-aminostyrene and the subs...,[325°C.]
2,The PMR-15 composite cured at 316°C exhibits ...,"[PMR-15, 316°C, 270°C., 274°C, 190°C.]"


In [9]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [8]:
import nltk
import spacy
# Fixed the misspelled label in the printed message ("NTLK" -> "NLTK").
print('NLTK version: %s' % (nltk.__version__))

from nltk import word_tokenize, pos_tag, ne_chunk

# Resources required by word_tokenize, pos_tag and ne_chunk; each download is skipped
# by NLTK when the package is already up to date.
nltk.download('words')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('maxent_ne_chunker')

NTLK version: 3.8.1


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\shukl\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\shukl\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shukl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\shukl\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [10]:
# Same three example passages as defined earlier in the notebook, repeated here as input
# for the NLTK chunking cell.
sentences = ['Theoretically, when the application temperature is above Tg, the polymer behaves like a rubber and when below Tg, the polymer is a plastic (applicable to amorphous polymers only in this respect). For the two polymers discussed, PE has a Tg of around −128°C while PVC has a Tg of around 68°C, which is a significant difference.'
             ,'Substituting NE by m-aminostyrene and the subsequent curing yielded a polyimide with Tg values in excess of 325°C.',
             ' The PMR-15 composite cured at 316°C exhibits a Tg of 270°C. Curing at 274°C results in a decrease in Tg to 190°C.'
             ]

In [11]:
# Tokenize, POS-tag and chunk each passage, then print every line of the chunk tree's
# text form that contains an /NNP tag, with the tag itself removed.
for sentence in sentences:
    tree = ne_chunk(pos_tag(word_tokenize(sentence)))
    for line in str(tree).split('\n'):
        if '/NNP' not in line:
            continue
        print(line.replace('/NNP', ''))
    print("--------------------------------------------------")

  (GPE Tg)
  (GPE Tg)
  (ORGANIZATION PE)
  Tg
  −128°C
  (ORGANIZATION PVC)
  Tg
  68°C
--------------------------------------------------
  NE
  Tg
  (ORGANIZATION polyimide)
  325°C
--------------------------------------------------
  (ORGANIZATION PMR-15)
  Tg
  270°C
  (GPE Tg)
  Tg
  190°C
--------------------------------------------------
