In [35]:
import datetime
import json
import os
import sys
from tqdm import tqdm

# Make the project's modules importable from this notebook: resolve '../chembase/'
# (relative to the current working directory) to an absolute path and add it to
# sys.path. The membership check keeps re-running this cell from appending the
# same path repeatedly.
module_path = os.path.abspath(os.path.join('../chembase/'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import utils.db_handler as mongo
import extractors.numerical_property_extractor as prop_ext

%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Extract property data from papers and upload to MongoDB
------------------

#### First, we will walk through this process for a single paper to illustrate each step. Then we will apply the same framework, in a loop, to all papers in the corpus. The general steps are outlined here:

1. Connect to the database containing the corpus and the collection of molecular entities and their properties

2. Identify article documents in the corpus collection that are to be subjected to property extraction. In this case, we are looking for documents that are from the publisher 'RSC' whose raw filetype is 'HTML'.

3. Based on the publisher and article datatype you select, choose the appropriate parser to convert the article to a ChemDataExtractor Document() object.
    - For many articles, this can be done using the information present in the article's MetaData field, as shown in the for loop at the bottom
    
    
4. Convert the extracted property data from the JSON format produced by Document.records.serialize() to one appropriate for the MolecularEntities database collection.
    - Make sure that the DOI of the article is included in the resulting JSON object to ensure proper citation and attribution of the property information
    

5. Upload the molecular entity JSON to the MongoDB
    - Before creating a new object for a molecular entity, first check whether the compound already exists in the collection; if it does, append the new data to the existing entry instead

In [2]:
# 1. Connect to the database containing the corpus and the collection of
# molecular entities and their properties

# Location of the local JSON file holding the MongoDB credentials. It can be
# overridden with the MONGO_PASSWORDS_PATH environment variable so the notebook
# is not tied to a single machine; the original location is kept as the fallback.
passwords_path = os.environ.get(
    'MONGO_PASSWORDS_PATH',
    '/Users/wesleytatum/Desktop/post_doc/BETO/mongo_passwords.json')

# Load user and password from the local file. The `with` block closes the file
# on exit, so no explicit f.close() is needed.
with open(passwords_path, 'r') as f:
    mongo_passwords = json.load(f)

user = mongo_passwords['UW_corpus']['user']
password = mongo_passwords['UW_corpus']['password']

# Initialize MongoDBHandler()
mongo_handler = mongo.MongoDBHandler(user, password)

In [3]:
# 2. Identify article documents in the corpus collection that are to be
# subjected to property extraction.

%autoreload

doi_list = mongo_handler.retrieve_all_article_doi()
doi = doi_list[1001] #for now, the first 1000 articles are abstract only

article = mongo_handler.retrieve_doc_by_doi(doi)

article

{'_id': '10.1039/A908130E',
 'DOI': '10.1039/A908130E',
 'Title': 'Organic materials for electronic and optoelectronic devicesBasis of a presentation given at Materials Chemistry Discussion No.\xa02, 13–15 September 1999 ...  - Journal of Materials Chemistry (RSC Publishing) DOI:10.1039/A908130E',
 'Abstract': 'This article concentrates on our recent results on several classes of photo- and electro-active organic materials that permit thin film formation and discusses their synthesis, properties, functions and potential applications for electronic and optoelectronic devices. The materials studied include amorphous molecular materials, titanyl phthalocyanine, oligothiophenes with well-defined structures, and non-conjugated polymers containing pendant oligothiophenes or other π-electron systems. The thin films of these materials find potential applications for use in organic electroluminescent, photovoltaic, electrochromic, and other devices.',
 'Raw': '<html xmlns="http://www.w3.org/199

In [6]:
# 3. Based on the publisher and article datatype you select, choose the appropriate
# parser to convert the article to a ChemDataExtractor Document() object.

%autoreload

if article['MetaData']['Publisher'] == 'RSC':
    if article['MetaData']['RawType'] == 'HTML':
        extractor = prop_ext.PropertyExtractor(article_format = 'HTML',
                                               publisher = 'RSC')

        raw_records = extractor.extract(article['Raw'])

#remove unwanted property information (e.g. 'roles', 'labels')
filtered_properties = extractor.filter_properties(raw_records, recs_with_props_only = True)

In [7]:
# Display the filtered records produced by extractor.filter_properties() above.
filtered_properties
# raw_records  (swap with the line above to inspect the unfiltered records instead)

[{'names': ['m-MTDAB'], 'melting_points': [{'value': '183', 'units': '°C'}]},
 {'names': ['p-MTDAB'], 'melting_points': [{'value': '210', 'units': '°C'}]},
 {'names': ['p-FTDAB'], 'melting_points': [{'value': '228', 'units': '°C'}]},
 {'names': ['p-ClTDAB'], 'melting_points': [{'value': '181', 'units': '°C'}]},
 {'names': ['p-BrTDAB'], 'melting_points': [{'value': '165', 'units': '°C'}]},
 {'names': ['o-MTDATz'], 'melting_points': [{'value': '175', 'units': '°C'}]},
 {'names': ['o-MTDAPB'], 'melting_points': [{'value': '279', 'units': '°C'}]},
 {'names': ['t-Bu-TBATA'],
  'uvvis_spectra': [{'peaks': [{'value': '353'}]},
   {'peaks': [{'value': '465'}]}]},
 {'names': ['o-PTDATA'],
  'uvvis_spectra': [{'peaks': [{'value': '346'}]},
   {'peaks': [{'value': '479'}]}]},
 {'names': ['m-PTDATA'],
  'uvvis_spectra': [{'peaks': [{'value': '339'}]},
   {'peaks': [{'value': '490'}]}]},
 {'names': ['m-MTDAPD'],
  'uvvis_spectra': [{'peaks': [{'value': '342'}]},
   {'peaks': [{'value': '408'}]}]},


In [29]:
# 4. Convert the extracted property data from the JSON format produced by
# Document.records.serialize() to one appropriate for the MolecularEntities
# database collection.
%autoreload

# The article's DOI is passed along with the records; in the output below it
# appears as 'AssociatedDOI' inside each entity's 'Properties' entries, next to
# the entity's names under 'Synonyms'.
reformatted_properties = extractor.reformat_list_of_records(filtered_properties, doi)

# Display the reformatted records
reformatted_properties

[{'Synonyms': ['m-MTDAB'],
  'Properties': [{'melting_points': [{'value': '183', 'units': '°C'}],
    'AssociatedDOI': '10.1039/A908130E'}]},
 {'Synonyms': ['p-MTDAB'],
  'Properties': [{'melting_points': [{'value': '210', 'units': '°C'}],
    'AssociatedDOI': '10.1039/A908130E'}]},
 {'Synonyms': ['p-FTDAB'],
  'Properties': [{'melting_points': [{'value': '228', 'units': '°C'}],
    'AssociatedDOI': '10.1039/A908130E'}]},
 {'Synonyms': ['p-ClTDAB'],
  'Properties': [{'melting_points': [{'value': '181', 'units': '°C'}],
    'AssociatedDOI': '10.1039/A908130E'}]},
 {'Synonyms': ['p-BrTDAB'],
  'Properties': [{'melting_points': [{'value': '165', 'units': '°C'}],
    'AssociatedDOI': '10.1039/A908130E'}]},
 {'Synonyms': ['o-MTDATz'],
  'Properties': [{'melting_points': [{'value': '175', 'units': '°C'}],
    'AssociatedDOI': '10.1039/A908130E'}]},
 {'Synonyms': ['o-MTDAPB'],
  'Properties': [{'melting_points': [{'value': '279', 'units': '°C'}],
    'AssociatedDOI': '10.1039/A908130E'}]},
 {

Let's first look at the existing entry for a single molecular entity in the database, if it exists at all: 

In [40]:
# Handle to the collection of molecular entities
mol_ents = mongo_handler.return_client_mol_ents_object()

# Work with the second reformatted record and show the names it goes by
entity = reformatted_properties[1]
entity_names = entity['Synonyms']
print(entity_names)

# Look for documents whose 'Synonyms' include the entity's first listed name
name_query = {'Synonyms': entity_names[0]}
matches = [doc for doc in mol_ents.find(name_query)]

# Number of documents already stored for this name
print(len(matches))

['p-MTDAB']
0


In [41]:
# 5. Upload the molecular entity JSON to the MongoDB

mol_ents = mongo_handler.return_client_mol_ents_object()

entity = reformatted_properties[1]

names = entity['Synonyms']

# Get all possible matches that exist in the database. A document that matches
# under several of the entity's names is returned once per name, so matches are
# de-duplicated on '_id'; otherwise a single existing document would be counted
# as multiple matches.
matched_by_id = {}
for nm in names:
    for match in mol_ents.find({'Synonyms': nm}):
        matched_by_id.setdefault(match['_id'], match)
matched_ents = list(matched_by_id.values())

# Timezone-aware replacement for the deprecated datetime.utcnow(); the formatted
# string is unchanged.
t = datetime.datetime.now(datetime.timezone.utc)
timestamp = t.strftime('%Y %b %d %H:%M.%S')

if len(matched_ents) == 0:
    # No existing matches: create a new document for this entity
    new_ent = {'Synonyms': names,
               'Properties': entity['Properties'],
               'UploadTimestamp': timestamp}
    result = mol_ents.insert_one(new_ent)

else:
    if len(matched_ents) == 1:
        # Single match: this is the document to append to
        old_ent = matched_ents[0]
    else:
        # Multiple matches (probably due to multiple names or abbreviations):
        # update the matched entity carrying the longest name (assume that's the
        # IUPAC name). key=len is required -- a bare max() on strings returns the
        # alphabetically last name, not the longest one.
        longest_name = max(names, key=len)
        candidates = [m for m in matched_ents if longest_name in m['Synonyms']]
        # If no match lists the longest name, fall back to the first match so
        # the extracted properties are still attached to an existing entity.
        old_ent = candidates[0] if candidates else matched_ents[0]

    # Append the new properties to the existing document in place. Inserting a
    # merged copy instead would leave the old document behind and create a
    # duplicate entity. $push creates 'Properties' if the document lacks it.
    result = mol_ents.update_one(
        {'_id': old_ent['_id']},
        {'$push': {'Properties': {'$each': entity['Properties']}},
         '$set': {'UploadTimestamp': timestamp}})

# Show the result object of the insert/update
result

<pymongo.results.InsertOneResult at 0x15a57d140>

Now let's look at the database entry for this entity

In [44]:
# Same record that was uploaded above
entity = reformatted_properties[1]
entity_names = entity['Synonyms']
print(entity_names)

# Query the collection again by the entity's first listed name
name_query = {'Synonyms': entity_names[0]}
matches = [doc for doc in mol_ents.find(name_query)]

# Report how many documents now exist for this name, then show the first one
print(len(matches))
matches[0]

['p-MTDAB']
1


{'_id': ObjectId('603fd9d2f533045dd0482c62'),
 'Synonyms': ['p-MTDAB'],
 'Properties': [{'melting_points': [{'value': '210', 'units': '°C'}],
   'AssociatedDOI': '10.1039/A908130E'}],
 'UploadTimestamp': '2021 Mar 03 18:47.46'}

The above process has been conveniently written into a single method, `MongoDBHandler.upload_properties_document()`, in `chembase/utils/db_handler.py`. The cells above carried out the same steps by hand for a single entity identified in the sample research article.


Below, we will now loop through all of the entities in a single document:

In [49]:
%autoreload
# 1. Connect to the database containing the corpus and the collection of
# molecular entities and their properties

#load user and password from local file
with open('/Users/wesleytatum/Desktop/post_doc/BETO/mongo_passwords.json', 'r') as f:
    mongo_passwords = json.load(f)
    f.close()
    
user = mongo_passwords['UW_corpus']['user']
password = mongo_passwords['UW_corpus']['password']
    
#initialize MongoDBHandler()
mongo_handler = mongo.MongoDBHandler(user, password)

print('Step 1: Complete')

########################

# 2. Identify article documents in the corpus collection that are to be
# subjected to property extraction.
doi_list = mongo_handler.retrieve_all_article_doi()
doi = doi_list[1001] #for now, the first 1000 articles are abstract only

article = mongo_handler.retrieve_doc_by_doi(doi)

print('Step 2: Complete')

########################

# 3. Based on the publisher and article datatype you select, choose the appropriate
# parser to convert the article to a ChemDataExtractor Document() object.
if article['MetaData']['Publisher'] == 'RSC':
    if article['MetaData']['RawType'] == 'HTML':
        extractor = prop_ext.PropertyExtractor(article_format = 'HTML',
                                               publisher = 'RSC')

        raw_records = extractor.extract(article['Raw'])

#remove unwanted property information (e.g. 'roles', 'labels')
filtered_properties = extractor.filter_properties(raw_records, recs_with_props_only = True)

print('Step 3: Complete')

########################

# 4. Convert the extracted property data from the JSON format produced by
# Document.records.serialize() to one appropriate for the MolecularEntities
# database collection.
reformatted_properties = extractor.reformat_list_of_records(filtered_properties, doi)

print('Step 4: Complete')

########################

# 5. Upload the molecular entities JSON to the MongoDB
pbar = tqdm(total = len(reformatted_properties), position = 0)
for entity in reformatted_properties:
    mongo_handler.upload_properties_document(entity)
    
    pbar.update()
    
print('Step 5: Complete')

Step 1: Complete
Step 2: Complete


  0%|          | 0/38 [06:59<?, ?it/s]
  0%|          | 0/38 [00:00<?, ?it/s]

Step 3: Complete
Step 4: Complete


100%|██████████| 38/38 [00:08<00:00,  3.86it/s]

Step 5: Complete


_________________
_________________

# Now to run this for all full-text articles in the corpus:

In [None]:
# 1. Connect to the database containing the corpus and the collection of
# molecular entities and their properties

# Location of the local JSON file holding the MongoDB credentials. It can be
# overridden with the MONGO_PASSWORDS_PATH environment variable; the original
# location is kept as the fallback.
passwords_path = os.environ.get(
    'MONGO_PASSWORDS_PATH',
    '/Users/wesleytatum/Desktop/post_doc/BETO/mongo_passwords.json')

# Load user and password from the local file. The `with` block closes the file
# on exit, so no explicit f.close() is needed.
with open(passwords_path, 'r') as f:
    mongo_passwords = json.load(f)

user = mongo_passwords['UW_corpus']['user']
password = mongo_passwords['UW_corpus']['password']

# Initialize MongoDBHandler()
mongo_handler = mongo.MongoDBHandler(user, password)

########################

# 2. Identify article documents in the corpus collection that are to be
# subjected to property extraction.
doi_list = mongo_handler.retrieve_all_article_doi()
# For now, the first 1000 articles are abstract only.
# NOTE(review): the slice starts at 1002, presumably because index 1001 was
# already processed as the single-article example; index 1000 is never
# processed -- confirm this is intended.
doi_list = doi_list[1002:]

# DOIs for which no parser is configured, kept so they can be reviewed afterwards
skipped_dois = []

# Wrapping the iterable lets tqdm advance once per article (including skipped
# ones) and close the bar when the loop ends.
for doi in tqdm(doi_list, position = 0):

    article = mongo_handler.retrieve_doc_by_doi(doi)

    ########################

    # 3. Based on the publisher and article datatype you select, choose the appropriate
    # parser to convert the article to a ChemDataExtractor Document() object.
    metadata = article['MetaData']

    if metadata['Publisher'] != 'RSC' or metadata.get('RawType') != 'HTML':
        # Skip unsupported articles. Falling through here would reuse
        # `extractor` and `raw_records` from the previous iteration and upload
        # the previous article's properties under this article's DOI.
        skipped_dois.append(doi)
        continue

    extractor = prop_ext.PropertyExtractor(article_format = 'HTML',
                                           publisher = 'RSC')

    raw_records = extractor.extract(article['Raw'])

    # Remove unwanted property information (e.g. 'roles', 'labels')
    filtered_properties = extractor.filter_properties(raw_records, recs_with_props_only = True)

    ########################

    # 4. Convert the extracted property data from the JSON format produced by
    # Document.records.serialize() to one appropriate for the MolecularEntities
    # database collection.
    reformatted_properties = extractor.reformat_list_of_records(filtered_properties, doi)

    ########################

    # 5. Upload the molecular entities JSON to the MongoDB
    for entity in reformatted_properties:
        mongo_handler.upload_properties_document(entity)

print('Articles skipped (no RSC/HTML parser):', len(skipped_dois))

100%|██████████| 38/38 [05:00<00:00,  7.91s/it]
  0%|          | 0/38 [09:32<?, ?it/s]
  6%|▌         | 123/2097 [2:00:21<29:14:52, 53.34s/it]