# CheMastery: Identify information in chemical recipes

In [1]:
import pandas as pd
import numpy as np
import os
import chemdataextractor
from chemdataextractor import Document
import spacy
from spacy.tokens import DocBin
from spacy import displacy
import re

In [2]:
# Directory that holds the input text file. Defaults to the original location;
# set the CHEMASTERY_DATA_DIR environment variable to run the notebook elsewhere
# without editing this cell.
main = os.environ.get('CHEMASTERY_DATA_DIR', '/Users/Viktoria/Desktop/Chemicals')
os.chdir(main)

In [3]:
# Parse the recipes file into a ChemDataExtractor Document.
# The context manager closes the file handle after from_file() returns,
# instead of leaving it open for the lifetime of the kernel.
with open('exercise_experimentals.txt', 'rb') as f:
    doc = Document.from_file(f)

In [4]:
# Show the number of elements in the document and the document's type
print(f"{len(doc)}   {type(doc)}")

11   <class 'chemdataextractor.doc.document.Document'>


## Question 1. What was added to what?

### Step 1. Get the sentences of each text up to and including the last occurrence of an addition. <br> Step 2. Get the named entities that stand for the ingredients. <br> Step 3. Get the syntactic role of each ingredient to find out 'what was added to what'. In 'X was added to Y', X is the subject and Y is the object.

In [5]:
# Build `recipes`: one lower-cased string per document element, containing the
# element's sentences up to and including the last sentence that mentions an
# addition.

recipes = []

for i in range(len(doc)):

    sentences = [str(sentence).lower() for sentence in doc[i].sentences]

    # positions of the sentences that mention an addition
    # ('add' also covers 'added', 'adding' and 'addition')
    hits = [pos for pos, text in enumerate(sentences) if 'add' in text]

    # If nothing was added in this element, keep an empty recipe so that the
    # positions in `recipes` still line up with the elements of `doc`.
    last_hit = hits[-1] if hits else -1

    recipes.append(' '.join(sentences[:last_hit + 1]))

In [6]:
#Collect the lower-cased text of every chemical entity mention found in the document

entities = [str(cem).lower() for cem in doc.cems]

In [19]:
# For every recipe, build a dictionary (indices) that maps each chemical name
# found in the text to its (start, end) character offsets.
# These dictionaries will be passed to spacy to get the syntax of the sentences.

tagged_entities = []

for recipe in recipes:

    # record the first occurrence of every entity that is present in this recipe
    indices = {}
    for entity in entities:
        start = recipe.find(entity)
        if start != -1:
            indices[entity] = (start, start + len(entity))

    # only keep full chemical names: a name that is contained in another
    # recorded name (e.g. 'h2o' inside a longer name) is discarded
    keys = list(indices)
    contained = [word for word in keys if sum(word in k for k in keys) > 1]
    for word in contained:
        del indices[word]

    tagged_entities.append(indices)

In [20]:
# Parse every recipe with spaCy and mark the chemical names found earlier as
# entities. Each entity span is labelled with the chemical name itself, so the
# name can be read back from any token of the span.

nlp = spacy.load("en_core_web_sm") # load a new spacy model

db = DocBin() # create a DocBin object
corpora = []

for recipe_text, indices in zip(recipes, tagged_entities):

    spacy_doc = nlp(recipe_text) # create doc object from text
    ents = []

    for key, (start, end) in indices.items(): # add character indexes
        # "expand" widens the span to whole tokens when the character offsets
        # fall inside a token
        span = spacy_doc.char_span(start, end, label=key, alignment_mode="expand")

        if span is None:
            print('none')
        else:
            ents.append(span)

    # Expanded spans may overlap or coincide, and doc.ents rejects overlapping
    # spans. filter_spans keeps the longest span of each overlapping group.
    spacy_doc.ents = spacy.util.filter_spans(ents) # label the text with the ents
    db.add(spacy_doc)

    corpora.append(spacy_doc)

In [21]:
#Check on a single recipe if the tagging process was successful

#spacy_doc.has_annotation("TAG") #gives True

In [22]:
#Visualise the sentence structure of a single recipe

#displacy.render(spacy_doc, style="dep")

In [23]:
# Now find out whether the ingredient in the recipe was a subject or an object
# X added to Y -> X is the subject, Y is the object
#
# The (start, end) value stored for a chemical in tagged_entities is replaced by
# the dependency label of a subject/object token that belongs to that chemical.
# Chemicals without such a token keep their (start, end) tuple.

for c in range(len(corpora)):

    indices = tagged_entities[c]

    #get the name and the syntactic position
    for token in corpora[c]:

        if 'subj' in token.dep_ or 'obj' in token.dep_:

            # The entity spans were labelled with the chemical name, so the
            # token's entity label identifies the chemical it belongs to.
            # A plain substring test on the token text would also match short
            # tokens (e.g. 'n' or '4') against unrelated chemical names.
            if token.ent_type_ in indices:

                #add the syntactic information
                indices[token.ent_type_] = token.dep_


In [25]:
# Keep the entity dictionary of the first recipe at hand for inspection
indices=tagged_entities[0]

In [26]:
#In a structure "X added to Y", X is the subject of addition, i.e. the chemical being added & Y is object, i.e. the recipient

def clean_results(value):
    """Translate a stored entity value into a human-readable role.

    Parameters
    ----------
    value : container supporting ``in``
        Typically a dependency label string; any other container (such as a
        tuple of offsets) is accepted as well.

    Returns
    -------
    str
        'added to mixture' if 'subj' is found in ``value``, otherwise
        'recipient' if 'obj' is found, otherwise 'order of addition unknown'.
    """
    # checked in order: the subject marker takes precedence over the object marker
    role_by_marker = (
        ('subj', 'added to mixture'),
        ('obj', 'recipient'),
    )

    for marker, role in role_by_marker:
        if marker in value:
            return role

    return 'order of addition unknown'

In [31]:
# One dictionary per recipe: chemical name -> human-readable role
results = [
    {name: clean_results(role) for name, role in recipe_entities.items()}
    for recipe_entities in tagged_entities
]

## Question 2. How much of each constituent was added?

In [33]:
# Display the text of the first recipe
recipes[0]

'4-methylmorpholine n-oxide (1.76 ml, 8.42 mmol) and potassium osmate dihydrate (97.3 mg, 0.38 mmol) were added to a solution of (5r,7as)‐5‐butyl‐1h,3h,5h,7ah‐pyrrolo[1,2‐c][1,3]oxazol‐3‐one (347 mg, 1.91 mmol) in acetone/h2o (2:1, 54 ml).'

In [35]:
# Check that this chemical name is among the extracted entities
'4-methylmorpholine n-oxide' in entities

True

In [None]:
# Match the chemical name followed by its parenthesised quantities,
# e.g. "<name> (1.76 ml, 8.42 mmol)". Group 1 holds the text between the brackets.
entity = '4-methylmorpholine n-oxide'

# re.escape makes the name match literally, since chemical names may contain
# regex metacharacters such as '(', '[' or '.'. The brackets are written as
# \( ... \) because [(.*)] is a character class that matches a single character.
pattern = re.compile(re.escape(entity) + r'\s*\(([^)]*)\)\s*')

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/Viktoria/anaconda3/envs/Flawless/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2061, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'TypeError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/Viktoria/anaconda3/envs/Flawless/lib/python3.8/site-packages/IPython/core/ultratb.py", line 1101, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "/Users/Viktoria/anaconda3/envs/Flawless/lib/python3.8/site-packages/IPython/core/ultratb.py", line 248, in wrapped
    return f(*args, **kwargs)
  File "/Users/Viktoria/anaconda3/envs/Flawless/lib/python3.8/site-packages/IPython/core/ultratb.py", line 281, in _fixed_getinnerframes
    records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))
  File "/Users/Viktoria/anaconda3/en