# CheMastery: Identify information in chemical recipes

In [1]:
import pandas as pd
import numpy as np
import os
import chemdataextractor
from chemdataextractor import Document
import spacy
from spacy.tokens import DocBin
from spacy import displacy
import re
from nltk.tokenize import TweetTokenizer
import nltk
from nltk.corpus import stopwords
import json

In [62]:
main = '/Users/Viktoria/Desktop/Chemistry_project'
code = os.path.join(main, 'notebooks')
data = os.path.join(main, 'raw data')
preproc = os.path.join(main, 'preprocessed_data')

In [3]:
os.chdir(data)
f = open('exercise_experimentals.txt', 'rb')
doc = Document.from_file(f)

In [4]:
print(len(doc), ' ', type(doc))

11   <class 'chemdataextractor.doc.document.Document'>


## Question 1. What was added to what?

### Step 1. Get the sentences of the text up until the last occurrence of addition. <br> Step 2. Get the named entities that stand for the ingredients. <br> Step 3. Get the syntactic position of the ingredients to find out 'what was added to what'. In 'X was added to Y' X is the subject, Y is the object. 

In [5]:
#Create a list of strings (recipes) which contains the texts of each recipe up until the last occurrence of the phrase that contains the addition

recipes=[]

for i in range(0, len(doc)):
    
    paragraph=[]
    tracker=[]
    
    for sentence in doc[i].sentences:
        text=str(sentence).lower()
        paragraph.append(text)
        
        #check if the addition phrase is present
        if 'add' in text or 'addition' in text:
            tracker.append(1)
        else:
            tracker.append(0)
            
    #consider the text until the last occurrence of the 'add' phrase
    paragraph = paragraph[0:np.max(np.nonzero(tracker))+1]    
    paragraph = ' '.join([p for p in paragraph])
    recipes.append(paragraph)

In [6]:
#Get a list of all the chemical elements in each recipe

entities=[]
for i in range(len(doc.cems)):
    entities.append(str(doc.cems[i]).lower())

In [7]:
# Create dictionaries for each recipe (indices) with the name of the entity, starting position, ending position. 
# These dictionaries will be passed to spacy to get the syntax of the sentences.

tagged_entities = []

for recipe in recipes:
    
    indices={}
    
    #check which entities appear in the text
    for entity in entities:
        
        #look for the entity 
        if entity in recipe:
            
            #get the start & end position and add it to the dict
            start = recipe.index(entity)
            end = recipe.index(entity)+len(entity)
            indices[entity]=start, end
            
    #only match the full chemical name, not if e.g. the word 'h2o' is contained in a longer name
    keys = list(indices.keys())
    for word in keys: 
        if sum(word in k for k in keys) > 1: #more than 1 occurrences 
            indices.pop(word) #pop the short word that is contained by other words
    
    
    tagged_entities.append(indices)

In [8]:
# Add the named entities to spacy, then get the information which one was the subject and which one the object of the sentence

nlp = spacy.load("en_core_web_sm") # load a new spacy model

db = DocBin() # create a DocBin object
corpora = [] # these will be the syntactically tagged recipes

for i,v in enumerate(recipes):

    indices=tagged_entities[i]

    spacy_doc = nlp(recipes[i]) # create doc object from text
    ents = []
    
    for (key,value) in indices.items(): # add character indexes
        span = spacy_doc.char_span(value[0], value[1], label=key, alignment_mode="expand")
        
        if span is None:
            print('none')
        else:
            ents.append(span)
          
    spacy_doc.ents = ents # label the text with the ents
    db.add(spacy_doc)
    
    corpora.append(spacy_doc)

In [9]:
#Check on a single recipe if the tagging process was successful

#spacy_doc.has_annotation("TAG") #gives True

In [10]:
#Visualise the sentence structure of a single recipe

#displacy.render(spacy_doc, style="dep")

In [11]:
# Now find out whether the ingredient in the recipe was a subject or an object
# X was added to Y -> X is the subject, Y is the object

for c in range(len(corpora)):

    ingredients={}
    indices = tagged_entities[c]

    #get the name and the syntactic position
    for token in corpora[c]:
    
        if 'subj' in token.dep_ or 'obj' in token.dep_:
     
            #if this is a chemical entity we are interested in
            for k in indices.keys():
                if token.text in k:
                  
                    #add the syntactic information
                    indices[k] = token.dep_


In [23]:
#In a structure "X was added to Y", X is the subject, i.e. the chemical being added & Y is object, i.e. the recipient

def clean_results(value):
    
    if 'subj' in value:
        value = 'Order of addition: Added to mixture'
    elif 'obj' in value:
        value = 'Order of addition: Recipient'
    else:
        value = 'Order of addition: Unknown'
        
    return value

In [24]:
# Prepare results to Questions 1.

res1=[]

for i in tagged_entities:
    ingredients = {key:clean_results(value) for (key,value) in i.items()}
    res1.append(ingredients)

In [25]:
res1

[{'(5r,7as)‐5‐butyl‐1h,3h,5h,7ah‐pyrrolo[1,2‐c][1,3]oxazol‐3‐one': 'Order of addition: Recipient',
  '4-methylmorpholine n-oxide': 'Order of addition: Added to mixture',
  'h2o': 'Order of addition: Unknown',
  'potassium osmate dihydrate': 'Order of addition: Unknown',
  'acetone': 'Order of addition: Recipient'},
 {'mesitylene': 'Order of addition: Recipient',
  'ammonia': 'Order of addition: Recipient',
  'methanol': 'Order of addition: Added to mixture',
  'toluene': 'Order of addition: Recipient',
  'benzyl alcohol': 'Order of addition: Recipient',
  'n2': 'Order of addition: Unknown',
  'h2ptcl6': 'Order of addition: Recipient'},
 {'dmso': 'Order of addition: Unknown',
  '2-methylbenzaldehyde': 'Order of addition: Recipient',
  'et2o': 'Order of addition: Recipient',
  '(1s,2s)-1,2-bis(2-hydroxyphenyl)-1,2-diaminoethane': 'Order of addition: Recipient',
  'nitrogen': 'Order of addition: Recipient'},
 {'cacl2': 'Order of addition: Unknown',
  'dichloromethane': 'Order of addition:

## Question 2. How much of each constituent was added?

### Quantities are either in brackets after the named entity (0.01 mmol, 4.28 mg), or shortly before it '2 ml dry toluene'

In [41]:
res2=[]

sw = stopwords.words('english')

#for each recipe
for i,recipe in enumerate(recipes):
    
    res=[]
    
    #and each ingredient
    for entity in tagged_entities[i].keys():
        
        #look for the information in brackets
        pattern = re.compile(re.escape(entity) + r"\s*\(.*?\)")
        try:
            
            #expression found
            quant = re.search(pattern, recipe).group()
            res.append(quant)
        
        #tokenize and find the information preceding the entity
        except:
            
            sent = ' '.join([w for w in recipe.split() if w not in sw])
            tokenizer = TweetTokenizer()
            wordlist = tokenizer.tokenize(sent)
            for i,w in enumerate(wordlist):
                if w in entities:
                    
                    #add the 3 words preceding the named entity
                    res.append(' '.join([wordlist[i-3], wordlist[i-2], wordlist[i-1], wordlist[i]]))
                    
    res2.append(res)

In [42]:
res2

[['(5r,7as)‐5‐butyl‐1h,3h,5h,7ah‐pyrrolo[1,2‐c][1,3]oxazol‐3‐one (347 mg, 1.91 mmol)',
  '4-methylmorpholine n-oxide (1.76 ml, 8.42 mmol)',
  'h2o (2:1, 54 ml)',
  'potassium osmate dihydrate (97.3 mg, 0.38 mmol)',
  '1.91 mmol ) acetone',
  ') acetone / h2o'],
 ['standard . solution h2ptcl6',
  '2 ml dry toluene',
  '0.5 mmol kh n2',
  'purged 2 bar ammonia',
  'pressurized 7 bar ammonia',
  'cooled room temperature ammonia',
  '1 ml dry methanol',
  'monitor conversion gcms mesitylene',
  'standard . solution h2ptcl6',
  '2 ml dry toluene',
  '0.5 mmol kh n2',
  'purged 2 bar ammonia',
  'pressurized 7 bar ammonia',
  'cooled room temperature ammonia',
  '1 ml dry methanol',
  'monitor conversion gcms mesitylene',
  'standard . solution h2ptcl6',
  '2 ml dry toluene',
  '0.5 mmol kh n2',
  'purged 2 bar ammonia',
  'pressurized 7 bar ammonia',
  'cooled room temperature ammonia',
  '1 ml dry methanol',
  'monitor conversion gcms mesitylene',
  'standard . solution h2ptcl6',
  '2 ml d

## Question 3. Type of addition: in portions or continuous?

In [28]:
#Look for diagnostic phrases that inform us about the type of addition (identified in the data)
#These will be stored in a text file so that the users of the code can edit these expressions any time

os.chdir(code)

with open('continuous_addition.txt', 'r+') as f:
    cont = f.readlines()  
    cont = [re.sub('\n', '', c.lower()) for c in cont]
    
with open('addition_in_portion.txt', 'r+') as f:
    por = f.readlines()  
    por = [re.sub('\n', '', p.lower()) for p in por]

In [29]:
res3 = []

for i,recipe in enumerate(recipes):
    
    if any(c in recipe for c in cont) and any(p in recipe for p in por):
        res3.append('Type of addition: Mention of both continuous and in-portion addition')
        
    elif any(c in recipe for c in cont):
        res3.append('Type of addition: Continuous')
        
    elif any(p in recipe for p in por):
        res3.append('Type of addition: Addition in portions')
        
    else:
        res3.append('Type of addition: Unknown')
    

In [30]:
res3

['Type of addition: Unknown',
 'Type of addition: Unknown',
 'Type of addition: Continuous',
 'Type of addition: Unknown',
 'Type of addition: Unknown',
 'Type of addition: Continuous',
 'Type of addition: Unknown',
 'Type of addition: Unknown',
 'Type of addition: Mention of both continuous and in-portion addition',
 'Type of addition: Unknown',
 'Type of addition: Continuous']

## Save results

In [122]:
results = []

for i in range(len(doc)):
    
    res=[]
    
    res.append('Recipe ' + str(i+1))
    res.append('Text: ' + doc[i].text)
    res.append('Ingredients: ')
    res.extend([str(a[0]) + ', ' + str(a[1]) for a in res1[i].items()])
    res.append('Quantities: ')
    res.extend([str(a) for a in res2[i]])
    res.append(str(res3[i]))
    
    results.append(res)
    


In [124]:
os.chdir(preproc)

with open('Recipes_preprocessed.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)