In [1]:
import pandas as pd
import numpy as np
import os
import chemdataextractor
from chemdataextractor import Document
import spacy
from spacy.tokens import DocBin
from spacy import displacy
import re

In [2]:
# Working directory that holds the input data. Set the CHEMICALS_DIR
# environment variable to point at the data on another machine; without it
# the original location is used.
main = os.environ.get('CHEMICALS_DIR', '/Users/Viktoria/Desktop/Chemicals')
os.chdir(main)

In [3]:
# Parse the experimental procedures into a ChemDataExtractor Document.
# The `with` block closes the file handle once the document has been built.
with open('exercise_experimentals.txt', 'rb') as f:
    doc = Document.from_file(f)

In [4]:
# Sanity check: number of elements in the document and the document's type
print(f"{len(doc)}   {type(doc)}")

11   <class 'chemdataextractor.doc.document.Document'>


## Question 1. What was added to what?

### Step 1. Get the sentences of the text up until the last occurrence of addition. <br> Step 2. Get the named entities that stand for the ingredients. <br> Step 3. Get the syntactic position of the ingredients to find out 'what was added to what'. In 'X added to Y', X is the subject and Y is the object.

In [5]:
# Build `recipes`: one lowercased string per document element, holding its
# sentences up to and including the last sentence that mentions an addition.

recipes = []

for i in range(len(doc)):

    sentences = [str(sentence).lower() for sentence in doc[i].sentences]

    # 'add' is a substring of 'added', 'adding' and 'addition', so a single
    # membership test covers every form of the addition phrase.
    add_positions = [pos for pos, text in enumerate(sentences) if 'add' in text]

    # An element with no addition at all contributes an empty string, which
    # keeps `recipes` index-aligned with `doc` instead of raising on the
    # empty list of positions.
    last_add = add_positions[-1] if add_positions else -1
    recipes.append(' '.join(sentences[:last_add + 1]))

In [6]:
# Display the truncated recipe texts (one lowercased string per document element)
recipes

['4-methylmorpholine n-oxide (1.76 ml, 8.42 mmol) and potassium osmate dihydrate (97.3 mg, 0.38 mmol) were added to a solution of (5r,7as)‐5‐butyl‐1h,3h,5h,7ah‐pyrrolo[1,2‐c][1,3]oxazol‐3‐one (347 mg, 1.91 mmol) in acetone/h2o (2:1, 54 ml).',
 'to a solution of h2ptcl6 (0.01 mmol, 4.28 mg) in 2 ml dry toluene in a 100 ml fisher porter tube was added 0.5 mmol of benzyl alcohol and 0.5 mmol of kh under n2 atmosphere. the tube was closed and was taken out from the glovebox and purged with 2 bar of ammonia for 3 times and finally pressurized with 7 bar of ammonia. the reaction mixture was stirred at 150°c for 24h, cooled down to room temperature and the ammonia pressure was released in the hood. 1 ml dry methanol was added to dissolve the precipitate and monitor the conversion in gcms with mesitylene as an internal standard.',
 'an oven-dried, 100-ml, two-necked, round-bottomed flask equipped with a magnetic stir bar (13 × 9 mm, octagon-type), a rubber septum, and a nitrogen line is evacua

In [7]:
# Collect the lowercased text of every chemical entity mention (CEM) found in
# the whole document. `doc.cems` is read once rather than on every iteration.

entities = [str(cem).lower() for cem in doc.cems]

In [8]:
# For every recipe, map each chemical name that occurs in it to the
# (start, end) character offsets of its first occurrence.
# These mappings are handed to spaCy later to mark the names as entities.

tagged_entities = []

for recipe in recipes:

    # first occurrence of every known entity that appears in this recipe
    located = {}
    for name in entities:
        start = recipe.find(name)
        if start < 0:
            continue
        located[name] = (start, start + len(name))

    # keep only full chemical names: a name that is a substring of another
    # located name (e.g. 'h2o' inside a hydrate) matches more than just itself
    # and is dropped
    standalone = {
        name: span
        for name, span in located.items()
        if sum(name in other for other in located) <= 1
    }

    tagged_entities.append(standalone)

In [9]:
# Display, per recipe, the mapping of chemical name -> (start, end) character offsets
tagged_entities

[{'(5r,7as)‐5‐butyl‐1h,3h,5h,7ah‐pyrrolo[1,2‐c][1,3]oxazol‐3‐one': (128, 189),
  'potassium osmate dihydrate': (52, 78),
  'h2o': (221, 224),
  'acetone': (213, 220),
  '4-methylmorpholine n-oxide': (0, 26)},
 {'ammonia': (255, 262),
  'toluene': (58, 65),
  'n2': (159, 161),
  'benzyl alcohol': (119, 133),
  'methanol': (465, 473),
  'mesitylene': (552, 562),
  'h2ptcl6': (17, 24)},
 {'dmso': (457, 461),
  'nitrogen': (140, 148),
  'et2o': (762, 766),
  '(1s,2s)-1,2-bis(2-hydroxyphenyl)-1,2-diaminoethane': (237, 287),
  '2-methylbenzaldehyde': (482, 502)},
 {'dichloromethane': (0, 15),
  'benzylamine': (64, 75),
  'cacl2': (178, 183),
  'benzophenone imine': (90, 108)},
 {'gd(otf)3·9h2o': (34, 47),
  'yellow precipitate': (761, 779),
  'thf': (627, 630),
  'dichloromethane': (500, 515),
  'diethyl ether': (293, 306),
  'pyridine': (5, 13),
  'ybl·4h2o': (68, 76)},
 {'thf': (92, 95),
  'hydrochloric acid': (321, 338),
  'lithium aluminum hydride': (122, 146),
  'methyl 3,3-dimethylpent

In [10]:
# Mark the located chemical names as entities in a spaCy doc of each recipe, so
# the dependency parse can later tell which one is the subject and which the object.

nlp = spacy.load("en_core_web_sm")  # small English pipeline (tagger + parser)

db = DocBin()  # container collecting every annotated doc
corpora = []   # annotated docs, index-aligned with `recipes`

for recipe, indices in zip(recipes, tagged_entities):

    spacy_doc = nlp(recipe)  # tokenise and parse the recipe text
    ents = []

    for key, (start, end) in indices.items():
        # "expand" snaps the character offsets outwards to whole tokens
        span = spacy_doc.char_span(start, end, label=key, alignment_mode="expand")

        if span is None:
            print('none')
        else:
            ents.append(span)

    # Expanded spans may end up sharing a token, and assigning overlapping
    # spans to `doc.ents` raises; filter_spans keeps the longest span of each
    # overlapping group and returns the others unchanged.
    spacy_doc.ents = spacy.util.filter_spans(ents)
    db.add(spacy_doc)

    corpora.append(spacy_doc)

In [11]:
#Check on a single recipe if the tagging process was successful

#spacy_doc.has_annotation("TAG") #gives True

In [12]:
#Visualise the sentence structure of a single recipe

#displacy.render(spacy_doc, style="dep")

In [15]:
# Inspect the dependency label of every token in the first recipe to see
# whether an ingredient acts as a subject or as an object.
# X added to Y -> X is the subject, Y is the object

for token in corpora[0]:
    print(f"{token.text} {token.dep_}")

4 nummod
- punct
methylmorpholine nmod
n cc
- punct
oxide nsubjpass
( punct
1.76 nummod
ml appos
, punct
8.42 nummod
mmol appos
) punct
and cc
potassium compound
osmate compound
dihydrate conj
( punct
97.3 nummod
mg appos
, punct
0.38 nummod
mmol npadvmod
) punct
were auxpass
added ROOT
to prep
a det
solution pobj
of prep
( punct
5r,7as)‐5‐butyl‐1h,3h,5h,7ah‐pyrrolo[1,2‐c][1,3]oxazol‐3‐one pobj
( punct
347 appos
mg conj
, punct
1.91 nummod
mmol appos
) punct
in prep
acetone pobj
/ punct
h2o punct
( punct
2:1 ROOT
, punct
54 nummod
ml appos
) punct
. punct


In [None]:
# Collect the tokens of the first recipe that belong to a tagged chemical name
# and act as a subject or an object, mapped to their dependency label.
ingredients = {}
entity_names = list(tagged_entities[0])

for token in corpora[0]:

    # `any` is required here: a bare list of booleans is truthy whenever it is
    # non-empty, which would let every token through regardless of its text.
    part_of_entity = any(token.text in name for name in entity_names)
    is_argument = 'subj' in token.dep_ or 'obj' in token.dep_

    if part_of_entity and is_argument:
        ingredients[token.text] = token.dep_
    
            

In [None]:
# Replace each token key in `ingredients` by the full chemical name that
# contains it, keeping the dependency label as the value.
for token_text in list(ingredients):
    for entity_name in tagged_entities[0]:
        if token_text in entity_name:
            ingredients[entity_name] = ingredients.pop(token_text)
            # Stop at the first match: the token key is gone after the pop, so
            # a second matching name would raise KeyError.
            break

In [None]:
# Display the ingredient -> dependency-label mapping for the first recipe
ingredients
    

In [None]:
# Display the chemical names located in the first recipe, for comparison with `ingredients`
tagged_entities[0]