In [None]:
import sys
from google.colab import drive

# Attach Google Drive to the Colab filesystem.
drive.mount('/content/drive')

# Put the workflow folder on the import path (position 1, right after the
# first entry) so modules stored there can be imported by later cells.
sys.path.insert(1, "drive/MyDrive/workflow/")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install pymatgen transformers nltk ipywidgets seqeval[gpu]
!jupyter nbextension enable --py widgetsnbextension

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Enabling notebook extension jupyter-js-widgets/extension...
Paths used for configuration of notebook: 
    	/root/.jupyter/nbconfig/notebook.json
Paths used for configuration of notebook: 
    	
      - Validating: [32mOK[0m
Paths used for configuration of notebook: 
    	/root/.jupyter/nbconfig/notebook.json


In [None]:
import pandas as pd
from ipywidgets import widgets
from IPython.display import display
import re
from pymatgen.core import Composition
from torch.utils.data import DataLoader
from torch import cuda
from transformers import BertTokenizerFast
from seqeval.metrics import classification_report
import os
import json

import psie

import nltk
# Download NLTK's "punkt" resource without printing the download log.
# NOTE(review): nothing in this notebook calls nltk directly; presumably psie
# needs this resource for sentence splitting — confirm.
nltk.download("punkt", quiet=True)

True

In [None]:
# Pick the GPU when one is visible to torch, otherwise fall back to the CPU.
if cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


In [None]:
# Selector for the property to extract; "Band Gap" is pre-selected.
print("Extraction Target: ")
radio_buttons = widgets.RadioButtons(
    options=["Band Gap", "Curie Temperature"],
    value="Band Gap",
    description="",
)
display(radio_buttons)

Extraction Target: 


RadioButtons(options=('Band Gap', 'Curie Temperature'), value='Band Gap')

In [None]:
# Translate the widget selection into the short code used in folder names.
# An unrecognised selection raises immediately instead of leaving
# `extr_target` undefined (or stale from a previous run of this cell).
_TARGET_CODES = {"Curie Temperature": "Tc", "Band Gap": "Gap"}
if radio_buttons.value not in _TARGET_CODES:
    raise ValueError(f"Unsupported extraction target: {radio_buttons.value!r}")
extr_target = _TARGET_CODES[radio_buttons.value]

MAX_LEN = 256                                                                   # Token length passed to the tokenizer (padding/truncation)
MAIN_DIR = os.path.join("drive", "MyDrive", "workflow")
MODEL_DIR = os.path.join("models", extr_target, "ner")                          # Fine-tuned NER model
CORPUS = os.path.join("corpus", extr_target, "relevant_sentences.json")
OUTPUT = "test_extraction"                                                      # Name of the output file

In [None]:
# Class id -> tag string. Ids 0 and 1 are shared by both targets; id 2 is the
# tag of the selected property.
id_to_BOI = {
    1: "B-CHEM",     # Chemical entity
    0: "O",          # No entity
}

_target_tag = {"Tc": "B-TEMP", "Gap": "B-BANDGAP"}.get(extr_target)
if _target_tag is not None:
    id_to_BOI[2] = _target_tag

In [None]:
# Read the corpus of candidate sentences for the selected target.
corpus_path = os.path.join(MAIN_DIR, CORPUS)
with open(corpus_path, "r") as f:
    data = json.load(f)

# Tokenizer stored alongside the fine-tuned NER model.
tokenizer = BertTokenizerFast.from_pretrained(os.path.join(MAIN_DIR, MODEL_DIR))

# Only the first 1000 corpus entries are used; sources and sentences are cut
# with the same slice so that index i refers to the same entry in both.
sources = data["source"][0:1000]
sentences = psie.NerUnlabeledDataset(data["sentence"][0:1000], tokenizer, max_len=MAX_LEN)

# shuffle=False keeps the loader order identical to the dataset order.
sentences_params = dict(batch_size=10, shuffle=False, num_workers=0)
sentences_loader = DataLoader(sentences, **sentences_params)

In [None]:
# Load the fine-tuned NER checkpoint with three output classes (one per entry
# of id_to_BOI) and move it to the selected device.
ner_checkpoint = os.path.join(MAIN_DIR, MODEL_DIR)
model = psie.BertForNer.from_pretrained(ner_checkpoint, num_labels=3)
model.to(device)

BertForNer(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

# NER predictions

In [None]:
# Run the NER model over every batch of the loader.
# NOTE(review): the extraction cell below indexes predictions[n][i] and
# compares it with tag strings such as "O", so predict presumably returns one
# sequence of tag strings per sentence, mapped through id_to_BOI — confirm in psie.
predictions = model.predict(sentences_loader, device, id_to_BOI)

In [None]:
# For every sentence, group consecutive tokens carrying the same non-"O" tag
# into one mention and collect the mentions per tag:
#   extr_labels[n] == {tag: ["tok tok ...", ...], ...}
SPECIAL_TOKENS = ("[CLS]", "[SEP]", "[PAD]")

extr_labels = []
for n in range(len(predictions)):

    # Re-tokenise the sentence so that tokens line up with the predicted tags.
    tokens = tokenizer.tokenize(
        "[CLS]" + psie.preprocess_text(sentences[n]["plain"]) + "[SEP]",
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
    )
    tags = predictions[n]
    # Never index past the shorter of the two sequences.
    n_tokens = min(MAX_LEN, len(tokens), len(tags))

    extracted = {}
    i = 0
    while i < n_tokens:
        entity = tags[i]
        if entity == "O" or tokens[i] in SPECIAL_TOKENS:
            i += 1
            continue

        # Consume the whole run of tokens tagged with the same entity.
        entry = []
        while i < n_tokens and tags[i] == entity:
            entry.append(tokens[i])
            i += 1

        extracted.setdefault(entity, []).append(" ".join(entry))
        # i now points at the first token after the run. It must NOT be
        # advanced again here: that token may start a different entity, and
        # skipping it would drop that entity's first token.

    extr_labels.append(extracted)

#### Extract the sentences with multiple mentions of Chem and Tc/Gap
The extracted sentences are saved in a JSON file and will be processed by the BERT model fine-tuned for relation classification.

In [None]:
relational = []

for idx, labels in enumerate(extr_labels):
    counts = [len(mentions) for mentions in labels.values()]
    # Keep only sentences with exactly two entity types, each mentioned more
    # than once.
    if len(counts) != 2 or counts[0] <= 1 or counts[1] <= 1:
        continue

    # Work on a copy so the per-sentence label dict is left untouched.
    record = labels.copy()
    relational.append(record)
    record["sentence"] = sentences[idx]["plain"]
    record["source"] = sources[idx]

print("Relational/Total: ", len(relational), "/", len(predictions))

multi_path = os.path.join(MAIN_DIR, "extraction", extr_target, "multiple_mentions_"+OUTPUT+".json")
with open(multi_path, 'w') as f:
    json.dump(relational, f)

Relational/Total:  3 / 133


#### Extract the sentences with exactly 1 mention of Chem and 1 mention of Tc/Gap

In [None]:
relevant = []

for i, labels in enumerate(extr_labels):
    # Number of mentions found for each entity type in this sentence.
    n_entries = [len(mentions) for mentions in labels.values()]
    if n_entries == [1, 1]:
        # Copy before adding the metadata keys: writing "sentence"/"source"
        # into extr_labels[i] itself would change the mention counts seen by
        # any later pass over extr_labels (including a re-run of this cell).
        record = labels.copy()
        record["sentence"] = sentences[i]["plain"]
        record["source"] = sources[i]
        relevant.append(record)

print("Relevant/Total: ", len(relevant), "/", len(predictions))

Relevant/Total:  1 / 133


Cleaning of the sentences with single mentions of CHEM and Tc/Gap.
At every step, the sentences that raise an exception are printed for debugging purposes.


In [None]:
# Turn every single-mention sentence into one (compound, target) row.
# Sentences that cannot be cleaned are printed and left out of the database.
database = {"compound": [], extr_target: [], "sentence": [], "source": []}

for record in relevant:
    # Raw word-piece strings from the NER step; each list holds exactly one
    # mention because `relevant` was filtered on [1, 1] mention counts.
    raw_chem = record["B-CHEM"][0]
    raw_trgt = record[id_to_BOI[2]][0]
    sentence = record["sentence"]

    # Undo the tokenisation of the compound: drop the separating spaces, the
    # "#" continuation markers and unknown-token placeholders.
    chem_text = (
        raw_chem.strip().replace(" ", "").replace("#", "").replace("[UNK]", "")
    )

    try:
        if not chem_text:
            raise ValueError("empty chemical mention after clean-up")

        # Find the compound as it is written in the sentence (case-insensitive).
        # re.escape makes every regex metacharacter in the mention literal,
        # not just the parentheses, plus sign and period.
        chem = re.findall(
            "(?i)[^a-zA-Z0-9]*" + re.escape(chem_text) + "[^a-zA-Z]",
            sentence,
        )[0].strip()

        # Trim one leading/trailing comma or period picked up by the pattern.
        if chem.endswith((",", ".")):
            chem = chem[:-1]
        if chem.startswith((",", ".")):
            chem = chem[1:]

        # Replace a spelled-out element name with its symbol.
        if chem in psie.ELEMENT_NAMES:
            chem = psie.ELEMENTS[psie.ELEMENT_NAMES.index(chem)]

        # "[UNK]" has to be removed BEFORE the bracket characters are
        # stripped, otherwise it no longer matches and "UNK" is left behind.
        trgt = raw_trgt.replace("#", "").strip().replace("[UNK]", "")
        for symbol in "[]{}=":
            trgt = trgt.replace(symbol, "")

        # Restore the usual capitalisation of the units.
        if extr_target == "Tc":
            trgt = trgt.replace("k", "K").replace("c", "C")
        elif extr_target == "Gap":
            trgt = trgt.replace("ev", "eV")

        if trgt.endswith((",", ".")):
            trgt = trgt[:-1]
        if trgt.startswith((",", ".")):
            trgt = trgt[1:]

    except Exception:
        ### Print the cases that raise an exception (for debugging purposes)
        print(chem_text, raw_trgt.replace("#", "").strip(), sentence, "\n\n")
    else:
        # Append all four columns together so they always have equal length.
        database["compound"].append(chem)
        database[extr_target].append(trgt)
        database["sentence"].append(sentence)
        database["source"].append(record["source"])

In [None]:
# Rows that survived the cleaning step vs. single-mention sentences found.
print(f"Database entries: {len(database['compound'])} / {len(relevant)}")

Database entries: 1 / 1


The chemical entity is converted to a pymatgen `Composition` object and its reduced formula is computed; entries for which this fails are printed and excluded from the saved database.

In [None]:
database = pd.DataFrame(database)

# Positional indices of the rows whose compound pymatgen can parse.
valid_i = []

for i, comp in enumerate(database["compound"]):
    try:
        # Only used as a validity check: the reduced formula itself is not stored.
        Composition(comp).get_reduced_formula_and_factor()
    except Exception:
        # The entries that raise an exception are printed for debugging purposes
        print(comp, '\t', database['sentence'].iloc[i], '\n\n')
    else:
        valid_i.append(i)

In [None]:
# Rows with a parsable compound vs. single-mention sentences found.
print(f"Database entries: {len(valid_i)} / {len(relevant)}")

Database entries: 1 / 1


In [None]:
# Write only the rows with a parsable compound to the output CSV.
single_path = os.path.join(MAIN_DIR, "extraction", extr_target, "single_mentions_"+OUTPUT+".csv")
database.iloc[valid_i].to_csv(single_path)

In [None]:
# Preview the first rows. This is the full frame: the valid_i filter was
# applied only to the copy written to CSV, not to `database` itself.
database.head()

Unnamed: 0,compound,Gap,sentence,source
0,CdSe,eV,We demonstrate the change in the band gap of u...,704.3451
