In [3]:
import os
import time
import pandas as pd
import re
from typing import Dict
import citation_normalisation as cn
import reference_parser as rp
import retrieve_COCONUT_references as rCr
import importlib
import datetime

# Read COCONUT references from file

In [2]:
# Read data from file
coconut_references = pd.read_csv('./coconut_references.csv')
unstructured_references = coconut_references['citationDOI']
COCONUT_IDs = coconut_references['coconut_id']

ID_ref_tuples = zip(COCONUT_IDs, unstructured_references)

# Get everything into the right format and filter empty reference lists
ID_ref_tuples = [(tup[0], eval(tup[1])) # the references are read as str and need to be converted to lists
                for tup in ID_ref_tuples
                if len(eval(tup[1])) != 0 # Don't include empty reference lists
                if eval(tup[1]) != ['NA']] # Don't include reference list with 'NA' as only element


# Test parsers for different reference notations

There is a variety of different reference notation styles in the COCONUT references. Most of them can be defined by specific regular expressions and the specific sub-units can hence be identified. The parser functions with the regular expressions used can be found in reference_parser.py

The single parsing functions can be used separately after calling an instance of reference_parser. 

Example: 
> parser = rp.reference_parser()

> parser.parse_general_pattern('Haba,Phytochem.,68,(2007),1255')

If the pattern is unknown, the instance of reference_parser can simply be called as a function to try all available parsing functions.

Example:
> parser = rp.reference_parser()

> parser('Haba,Phytochem.,68,(2007),1255')

All parsing functions return a dictionary which contains the parsed information or None if predefined the pattern could not be matched.

In [4]:
importlib.reload(rp)
parser = rp.reference_parser()

### Pattern N°1: 

In [5]:
## Check regex for typical pattern: 
# Pattern: Author,? (et al.)?, Journal, issue, (year), page(-page)?

# Examples that are supposed to be matched
general_pattern_references = [
    'Haba,Phytochem.,68,(2007),1255',
    'El-Sayed,Phytochem.,30,(1991),2442',
    'Fujita,J.Nat.Prod.,49,(1986),1122-1125',
    'Kim, et al., Chem Pharm Bull, 52, (2004), 1466',
    'Lansky et al.,J.Ethnopharmacol.,19,(2007),177-206',
    'Imperato,Chim.Ind.(Milan),71,(1989),86',
    'Cole,R.J.et al.,Can.J.Microbiol.,20(1974),1159',
    'Mathews.,J. Biol. Chem.,241(21),(1966),5008',
    " Fang,Chung Ts'ao Yao,12,(1981),1",
    'N.V.Thu,Pharmazie,26,(1971),504',
    'Ruan,Yun-Nan Chih Wu Yen Chiu,13,(1991),225',
    'Peng J.-P.,Phytochem.,41,(1996),283-285',
    ' Hussain,J.Nat.Prod.51.,(1988),809',
    'Bondarenko,Khim.Prir.Soedin,(1983),243',
    'Haba,Phytochem.,68,82007),1255',
    'Ingham,Phytochem.,15,819769,1489',
    'Bondarenko,Khim.Prir.Soedin,(1983),243',]


# Test
for ref in general_pattern_references:
    assert parser.parse_general_pattern(ref)
    print(' Original reference: {} \n Resulting dict:\n {}\n'.format(ref, parser.parse_general_pattern(ref)))
    
    


 Original reference: Haba,Phytochem.,68,(2007),1255 
 Resulting dict:
 {'authors': 'Haba', 'first_author_surname': 'Haba', 'journal': 'Phytochem.', 'volume': '68', 'issue': None, 'year': '2007', 'pages': '1255', 'first_page': '1255'}

 Original reference: El-Sayed,Phytochem.,30,(1991),2442 
 Resulting dict:
 {'authors': 'El-Sayed', 'first_author_surname': 'El-Sayed', 'journal': 'Phytochem.', 'volume': '30', 'issue': None, 'year': '1991', 'pages': '2442', 'first_page': '2442'}

 Original reference: Fujita,J.Nat.Prod.,49,(1986),1122-1125 
 Resulting dict:
 {'authors': 'Fujita', 'first_author_surname': 'Fujita', 'journal': 'J.Nat.Prod.', 'volume': '49', 'issue': None, 'year': '1986', 'pages': '1122-1125', 'first_page': '1122'}

 Original reference: Kim, et al., Chem Pharm Bull, 52, (2004), 1466 
 Resulting dict:
 {'authors': 'Kim, ', 'first_author_surname': 'Kim', 'journal': 'Chem Pharm Bull', 'volume': '52', 'issue': None, 'year': '2004', 'pages': '1466', 'first_page': '1466'}

 Original

### Pattern N°2

In [6]:
# Test parser for a rarer structured reference pattern
underscore_pattern_references = ['J_Agric_Food_Chem_2016_64_(21):4255-4263',
                                 'J_Nat_Prod_2015_78_(4):730-735',
                                 "Phytochemistry_2003;64:285-291",
                                'J_Ethnopharmacol_2008;118(3):448-54',
                                '"J_Nat_Prod_2002;65_(7):1030-1032"']


for ref in underscore_pattern_references:
    assert parser.parse_underscore_pattern(ref)
    print(' Original reference: {} \n Resulting dict:\n {}\n'.format(ref, parser.parse_underscore_pattern(ref)))


 Original reference: J_Agric_Food_Chem_2016_64_(21):4255-4263 
 Resulting dict:
 {'journal': 'J Agric Food Chem', 'year': '2016', 'volume': '64', 'issue': '21', 'pages': '4255-4263', 'first_page': '4255'}

 Original reference: J_Nat_Prod_2015_78_(4):730-735 
 Resulting dict:
 {'journal': 'J Nat Prod', 'year': '2015', 'volume': '78', 'issue': '4', 'pages': '730-735', 'first_page': '730'}

 Original reference: Phytochemistry_2003;64:285-291 
 Resulting dict:
 {'journal': 'Phytochemistry', 'year': '2003', 'volume': '64', 'issue': None, 'pages': '285-291', 'first_page': '285'}

 Original reference: J_Ethnopharmacol_2008;118(3):448-54 
 Resulting dict:
 {'journal': 'J Ethnopharmacol', 'year': '2008', 'volume': '118', 'issue': '3', 'pages': '448-54', 'first_page': '448'}

 Original reference: "J_Nat_Prod_2002;65_(7):1030-1032" 
 Resulting dict:
 {'journal': 'J Nat Prod', 'year': '2002', 'volume': '65', 'issue': '7', 'pages': '1030-1032', 'first_page': '1030'}



### Pattern N°3

In [7]:
# Regex for J.Chem.Soc. references 

jchemsoc_pattern_references = [
    'Gunasekera,J.Chem.Soc.,Perkin 1,(1975),2447',
    'Locksley,J.Chem.Soc.,C,(1971),1332',]

# Test
for ref in jchemsoc_pattern_references:
    assert parser.parse_jchemsoc_pattern(ref)
    print(' Original reference: {} \n Resulting dict:\n {}\n'.format(ref, parser.parse_jchemsoc_pattern(ref)))


 Original reference: Gunasekera,J.Chem.Soc.,Perkin 1,(1975),2447 
 Resulting dict:
 {'authors': 'Gunasekera', 'first_author_surname': 'Gunasekera', 'journal': 'J.Chem.Soc., Perkin 1', 'year': '1975', 'pages': '2447', 'first_page': '2447'}

 Original reference: Locksley,J.Chem.Soc.,C,(1971),1332 
 Resulting dict:
 {'authors': 'Locksley', 'first_author_surname': 'Locksley', 'journal': 'J.Chem.Soc., C', 'year': '1971', 'pages': '1332', 'first_page': '1332'}



### Pattern for Harborne´s Handbook of Natural Flavonoids

In [8]:
# Check regex for the Handbook of Natural Flavonoids (Harborne)

harborne_flavonoid_references = [
    'Harborne, The Handbook of Natural Flavonoids, 2, (1999), 115,Chalcones,dihydrochalcones and aurones',
    'Harborne, The Handbook of Natural Flavonoids, 1, (1999), 181.Flavonols',
    'Harborne, The Handbook of Natural Flavonoids, 1, (1999), 3.Flavone O-glycosides, John Wiley & Son',]

for ref in harborne_flavonoid_references:
    assert parser.parse_harborne_flavonoid_pattern(ref)
    print(' Original reference: {} \n Resulting dict:\n {}\n'.format(ref, parser.parse_harborne_flavonoid_pattern(ref)))

 Original reference: Harborne, The Handbook of Natural Flavonoids, 2, (1999), 115,Chalcones,dihydrochalcones and aurones 
 Resulting dict:
 {'authors': 'Harborne, J.B., Baxter, H.', 'title': 'The Handbook of Natural Flavonoids', 'volume': '2', 'year': '1999', 'chapter_no': '115', 'chapter_title': 'Chalcones,dihydrochalcones and aurones', 'publisher': 'Wiley', 'doi': '10.1016/S0039-9140(00)00629-9', 'isbn': '0-471-95893-2', 'original_str': 'Harborne, The Handbook of Natural Flavonoids, 2, (1999), 115,Chalcones,dihydrochalcones and aurones'}

 Original reference: Harborne, The Handbook of Natural Flavonoids, 1, (1999), 181.Flavonols 
 Resulting dict:
 {'authors': 'Harborne, J.B., Baxter, H.', 'title': 'The Handbook of Natural Flavonoids', 'volume': '1', 'year': '1999', 'chapter_no': '181', 'chapter_title': 'Flavonols', 'publisher': 'Wiley', 'doi': '10.1016/S0039-9140(00)00629-9', 'isbn': '0-471-95893-1', 'original_str': 'Harborne, The Handbook of Natural Flavonoids, 1, (1999), 181.Flavon

### Pattern for Harborne´s Phytochemical Dictionary

In [9]:
# Check regex for the Phytochemical Dictionary (Harborne)

harborne_phytochemdict_references = [
    'Harborne,Phytochemical Dictionary Second Edition,Taylor and Francis,(1999),Chapter54',]



for ref in harborne_phytochemdict_references:
    assert parser.parse_harborne_phytochemdict_pattern(ref)
    print(' Original reference: {} \n Resulting dict:\n {}\n'.format(ref, parser.parse_harborne_phytochemdict_pattern(ref)))

 Original reference: Harborne,Phytochemical Dictionary Second Edition,Taylor and Francis,(1999),Chapter54 
 Resulting dict:
 {'year': '1999', 'chapter_no': '54', 'authors': 'Harborne, J.B., Baxter, H., Moss, G.P.', 'publisher': 'Taylor & Francis', 'title': 'Phytochemical Dictionary. A Handbook of Bioactive Compounds from Plants (Second Edition)', 'doi': 'https://doi.org/10.4324/9780203483756', 'isbn': '9780748406203', 'original_str': 'Harborne,Phytochemical Dictionary Second Edition,Taylor and Francis,(1999),Chapter54'}



## Analysis of COCONUT reference composition

In [10]:
# How many references are there and what type of references are we dealing with?

total_number = 0 # Total number of references
unique_number = 0 # Tumber of unique references
PMID_number = 0 # Number of PubMed IDs
DOI_number = 0 # Number of DOIs
general_pattern_number = 0 # Number of references that can be matched exactly be the pattern above.

harborne_flavonoid_number = 0
harborne_phytochemdict_number = 0
underscore_pattern_number = 0
j_chem_soc_number = 0
no_digits_number = 0
suspiciously_short_number = 0
unmatched_references = []
no_digits_references = []
suspiciously_short_references = []

references = []
for tup in ID_ref_tuples:
    for ref in tup[1]:
        if ref != "NA":
            total_number += 1
            if ref not in references:
                unique_number += 1
                references.append(ref)
                # Check for DOI
                if cn.contains_DOI(ref):
                    DOI_number += 1
                # Check for PMID (reference str is a number of at least 6 digits)
                elif ref.isdigit():
                    if len(ref) > 3:
                        PMID_number += 1
                # Check for the most frequent reference notation pattern
                elif parser.parse_general_pattern(ref):
                    general_pattern_number += 1
                # Check for other reference notation patterns
                elif parser.parse_underscore_pattern(ref):
                    underscore_pattern_number += 1
                elif parser.parse_jchemsoc_pattern(ref):
                    j_chem_soc_number += 1
                # Check for Harbornes Handbook of Natural Flavonoids
                elif parser.parse_harborne_flavonoid_pattern(ref):
                    harborne_flavonoid_number += 1
                # Check for Harbornes Phytochemical Dictionary Second Edition
                elif parser.parse_harborne_phytochemdict_pattern(ref):
                    harborne_phytochemdict_number += 1
                # Check for (useless) references that contain no digits
                elif re.search('\D+', ref).group() == ref:
                    no_digits_number += 1
                    no_digits_references.append(ref)
                elif len(ref) < 10:
                    suspiciously_short_number += 1
                    suspiciously_short_references.append(ref)
                else:
                    unmatched_references.append((tup[0], ref))


In [11]:
non_specific_ref = unique_number - DOI_number - PMID_number - general_pattern_number - harborne_flavonoid_number - harborne_phytochemdict_number - underscore_pattern_number - j_chem_soc_number

print('There are {} COCONUT entries'.format(len(COCONUT_IDs)))
print('{} of them have a total of {} references ({} of them are unique).'.format(len(ID_ref_tuples), total_number, unique_number))
print('{} of them are PMIDs.'.format(PMID_number))
print('{} of them are DOIs.'.format(DOI_number))
print('Another {} of them follow a very specific pattern (as in: Haba,Phytochem.,68,(2007),1255). This would offer enough information'.format(general_pattern_number))
print('Another {} of them follow a different specific pattern (as in: J_Nat_Prod_2015_78_(4):730-735).'.format(underscore_pattern_number))
print('Another {} of them follow a different pattern that only occures in J Chem Soc references (as in: Locksley,J.Chem.Soc.,C,(1971),1332)'.format(j_chem_soc_number))
print('Another {} of them come from Harborne´s Handbook of Natural Flavonoids'.format(harborne_flavonoid_number))
print('Another {} of them come from Harborne´s Phytochemical Dictionary Second Edition'.format(harborne_phytochemdict_number))
print('That leaves us with {} unique references that do not match a specific pattern (that we know).'.format(non_specific_ref))
print('{} of them do not contain any digit.'.format(no_digits_number))
print('{} of the remaining references are shorter than 10 characters.'.format(suspiciously_short_number))


There are 406747 COCONUT entries
70969 of them have a total of 158824 references (66151 of them are unique).
20953 of them are PMIDs.
12232 of them are DOIs.
Another 29297 of them follow a very specific pattern (as in: Haba,Phytochem.,68,(2007),1255). This would offer enough information
Another 182 of them follow a different specific pattern (as in: J_Nat_Prod_2015_78_(4):730-735).
Another 111 of them follow a different pattern that only occures in J Chem Soc references (as in: Locksley,J.Chem.Soc.,C,(1971),1332)
Another 589 of them come from Harborne´s Handbook of Natural Flavonoids
Another 59 of them come from Harborne´s Phytochemical Dictionary Second Edition
That leaves us with 2728 unique references that do not match a specific pattern (that we know).
285 of them do not contain any digit.
155 of the remaining references are shorter than 10 characters.


### Excerpt of remaining references

In [12]:
for ref in unmatched_references[:150]:
    print(ref)

('CNP0106606', 'Cole,Handbook of Secondary Fungal Metabolites,Volume I,(2003)')
('CNP0309481', 'Yin, et al., Modern Study of Chinese Drugs and Clinical Applications (1), Xueyuan Press, Beijing, (1993).')
('CNP0309481', 'Buckingham(Executive Editor), Dictionary of Natural Products, Chapman & Hall, 1994, Vol1-7')
('CNP0309481', 'Buckingham(Executive Editor)Dictionary of Natural ProductsChapman & Hall 1995, Vol8')
('CNP0309481', 'Buckingham(Executive Editor)Dictionary of Natural ProductsChapman & Hall 1996, Vol9')
('CNP0309481', 'Buckingham(Executive Editor)Dictionary of Natural ProductsChapman & Hall 1997, Vol10')
('CNP0309481', 'Buckingham(Executive Editor)Dictionary of Natural ProductsChapman & Hall 1998, Vol11.')
('CNP0251815', 'Ohmiya,The Alkaloids,47,(1995),1,Lupine alkaloids')
('CNP0251815', 'Ji, et al., Pharmacological Action and Application of Available Composition of Traditional Chinese Medicine, Heilongjiang Science and technology Press, Heilongjiang, (1995).')
('CNP0251815', '

### Retrieve information for all COCONUT references 
#### (or read the already retrieved information from a file)

In [35]:
if os.path.exists('./COCONUT_reference_retrieval_raw_output.tsv'):
    with open('COCONUT_reference_retrieval_raw_output.tsv', 'r') as retrieved_data:
        retrieved_data = [line.split('\t') for line in retrieved_data.readlines()[1:]]
else:
    if os.path.exists('./coconut_references.csv'):
        # Warning: This may take multiple days.
        retrieval_data = rCr.retrieval_coordination('./coconut_references.csv')
    else:
        print('The COCONUT reference file at given path!')
    

In [36]:
# Count retrieved entries
print(len(retrieved_data))

66150


#### Sort retrieved reference data by query type (DOI, PMID or keyword) [or read sorted data from file]

In [37]:
# Sort retrieved data by query string type (PMID, DOI, keyword)
PMID_based_dicts = []
DOI_based_dicts = []
keyword_based_dicts =  []
failed_queries = []

# Check if the retrieved dicts have already been filtered and saved before
if os.path.exists('./retrieved_dicts_filtered.csv'):
    with open('./retrieved_dicts_filtered.csv') as filtered_retrieved_dicts:
        for entry in filtered_retrieved_dicts.readlines():
            query_type, retrieved_dict = entry.split(', ', 1)
            if query_type == 'PMID':
                PMID_based_dicts.append(eval(retrieved_dict))
            elif query_type == 'DOI':
                DOI_based_dicts.append(eval(retrieved_dict))
            elif query_type == 'KEYWORD':
                keyword_based_dicts.append(eval(retrieved_dict))
# If no file with filtered dicts exists, filter the dictionaries from xml str and PubMed objects
else:
    for ref_str, retrieved_dict in retrieved_data:
        # Get rid of some xml str and other elements that eval() does not agree with
        xml_str = re.search("\'xml\':.+\>\'\,", retrieved_dict)
        a = deepcopy(retrieved_dict)
        if xml_str:
            retrieved_dict = retrieved_dict.replace(xml_str.group(), '')
        #for what_makes_eval_unhappy in re.findall('\'.+?\'\:\s\<.+?\>\,?\>?\,?', retrieved_dict):
        for what_makes_eval_unhappy in re.findall('\<\<?(?:metapub|bound|function|Element).+?\>\>?', retrieved_dict):
            retrieved_dict = retrieved_dict.replace(what_makes_eval_unhappy, 'False')
        retrieved_dict = eval(retrieved_dict)

        if retrieved_dict:
            if retrieved_dict['query_str_type'] == 'PMID':
                PMID_based_dicts.append(retrieved_dict)
            elif retrieved_dict['query_str_type'] == 'DOI':
                DOI_based_dicts.append(retrieved_dict)
            elif retrieved_dict['query_str_type'] == 'unstructured_ID':
                keyword_based_dicts.append(retrieved_dict)
        else:
            failed_queries.append(ref_str)# Write filtered dicts to file

    # Write filtered dicts to file
    with open("retrieved_dicts_filtered.csv", "a") as output:
        for retrieved_dict in DOI_based_dicts:
            output.write("DOI, " + str(retrieved_dict) + '\n')
        for retrieved_dict in PMID_based_dicts:
            output.write("PMID, " + str(retrieved_dict) + '\n')
        for retrieved_dict in keyword_based_dicts:
            output.write("KEYWORD, " + str(retrieved_dict) + '\n')
            
print('There are {} retrieved references based on DOI queries, '.format(len(DOI_based_dicts))
     + '{} based on PMID queries '.format(len(PMID_based_dicts))
     + 'and {} on keyword queries with the reference strings.'.format(len(keyword_based_dicts)))

There are 11676 retrieved references based on DOI queries, 20860 based on PMID queries and 33227 on keyword queries with the reference strings.


#### Example comparison of reference str with retrieved data based on keyword query:

In [38]:
example_dict = keyword_based_dicts[0]
norm_example_dict = cn.normalize_crossref_dict(example_dict)
norm_example_dict

{'title': 'Two new coumarins from Murraya plants.',
 'abstract': None,
 'DOI': '10.1248/cpb.37.819',
 'issue': '3',
 'volume': '37',
 'year': 1989,
 'journal': 'Chemical and Pharmaceutical Bulletin',
 'authors': ['Ito, C.', 'Furukawa, H.'],
 'first_author_surname': 'Ito',
 'pages': '819-820',
 'first_page': '819',
 'reference_retrieved_from': 'Crossref',
 'query_str_type': 'unstructured_ID',
 'query_str': 'Ito,Chem. Pharm. Bull.,37,(1989),819',
 'PMID': None}

In [39]:
parsed_info = parser(norm_example_dict['query_str'])
parsed_info

{'authors': 'Ito',
 'first_author_surname': 'Ito',
 'journal': 'Chem. Pharm. Bull.',
 'volume': '37',
 'issue': None,
 'year': '1989',
 'pages': '819',
 'first_page': '819'}

In [40]:
importlib.reload(cn)
cn.is_same_publication(parsed_info, norm_example_dict)

True

### Match all retrieved data based on keywords queries with parsed data from reference strings

In [41]:
importlib.reload(cn)
confirmed_retrieved_dicts = []
harborne_dicts = []
falsified_retrieved_dicts = []
for retrieved_dict in keyword_based_dicts:
    norm_keyword_dict = cn.normalize_crossref_dict(retrieved_dict)
    if norm_keyword_dict:
        parsed_info = parser(norm_keyword_dict['query_str'])
        if parsed_info:
            # If a reference can be confirmed as in the reference str: Good.
            if cn.is_same_publication(parsed_info, norm_keyword_dict):
                confirmed_retrieved_dicts.append(retrieved_dict)
            # If a reference can be identified as one of the known books: Good.
            elif parser.parse_harborne_flavonoid_pattern(norm_keyword_dict['query_str']):
                harborne_dicts.append(parser.parse_harborne_flavonoid_pattern(norm_keyword_dict['query_str']))
            elif parser.parse_harborne_phytochemdict_pattern(norm_keyword_dict['query_str']):
                harborne_dicts.append(parser.parse_harborne_phytochemdict_pattern(norm_keyword_dict['query_str']))
            # Retrieved info does not overlap with parsed info and does not refer to known book: Bad.
            else:
                falsified_retrieved_dicts.append(retrieved_dict)

In [42]:
print('Out of the {} references that were retrieved based on keyword queries, {} belong to the original publication or can be allocated to one of the parsed book references.'.format(len(keyword_based_dicts), len(confirmed_retrieved_dicts)+len(harborne_dicts)))
print('That leaves us with {} keyword_based queries that led to False information retrieval.'.format(len(falsified_retrieved_dicts)))

Out of the 33227 references that were retrieved based on keyword queries, 16837 belong to the original publication or can be allocated to one of the parsed book references.
That leaves us with 13209 keyword_based queries that led to False information retrieval.


#### Show examples of original references and references based on queries

In [43]:
for confirmed_info in confirmed_retrieved_dicts[:10]:
    confirmed_info = cn.normalize_crossref_dict(confirmed_info)
    original_ref_str = confirmed_info['query_str']
    improved_ref_str = cn.create_normalized_reference_str(confirmed_info)
    print(' Original reference: \n {} \n Normalised reference: \n {} \n'.format(original_ref_str, improved_ref_str))

 Original reference: 
 Ito,Chem. Pharm. Bull.,37,(1989),819 
 Normalised reference: 
 Ito, Chemical and Pharmaceutical Bulletin, 1989, 37 (3), 819 

 Original reference: 
 Lambden,J. Bacteriol.,115,(1973),992 
 Normalised reference: 
 Lambden, Journal of Bacteriology, 1973, 115 (3), 992 

 Original reference: 
 Nash,R.J.et al.,Tet.Lett.,35,(1994),7849-7852 
 Normalised reference: 
 Nash, Tetrahedron Letters, 1994, 35 (41), 7849 

 Original reference: 
 Fujimoto,Chem.Pharm.Bull.,54,(2006),550 
 Normalised reference: 
 Fujimoto et al., Chemical and Pharmaceutical Bulletin, 2006, 54 (4), 550 

 Original reference: 
 Westley,J. Antibiotics,27,(1974),744 
 Normalised reference: 
 Westley et al., The Journal of Antibiotics, 1974, 27 (10), 744 

 Original reference: 
 Dong,Chem.Pharm.Bull.,56,(2008),1600 
 Normalised reference: 
 Dong et al., Chemical and Pharmaceutical Bulletin, 2008, 56 (11), 1600 

 Original reference: 
 Ezaki,J. Antibiotics,34,(1981),1363 
 Normalised reference: 
 Ezaki e

### Extended query for unmatched references

The information that could not be confirmed is going to be used again in a Crossref query. This time, the string is cleaned up and not only the first, but the first 200 results are checked.

#### Example

In [44]:
example_wrong_data = falsified_retrieved_dicts[0]
cn.normalize_crossref_dict(example_wrong_data)

{'title': None,
 'abstract': None,
 'DOI': '10.1111/ppl.2001.111.issue-1',
 'issue': '1',
 'volume': '111',
 'year': 2001,
 'pages': False,
 'first_page': False,
 'reference_retrieved_from': 'Crossref',
 'query_str_type': 'unstructured_ID',
 'query_str': 'Morvan-Bertrand,Physiol Plant,111,(2001),225',
 'PMID': None}

In [45]:
# Parse original query_str
example_dict = parser(example_wrong_data['query_str'])
example_dict

{'authors': 'Morvan-Bertrand',
 'first_author_surname': 'Morvan-Bertrand',
 'journal': 'Physiol Plant',
 'volume': '111',
 'issue': None,
 'year': '2001',
 'pages': '225',
 'first_page': '225'}

#### Second information retrieval

Warning, this may take ~ 1 day (if the data has not been saved yet).

In [46]:
if os.path.exists('./COCONUT_reference_second_retrieval_raw_output.tsv'):
    with open('COCONUT_reference_second_retrieval_raw_output.tsv', 'r') as retrieved_data:
        second_retrieved_data = [line.split('\t') for line in retrieved_data.readlines()[1:]]
else:
    if os.path.exists('./retrieved_dicts_filtered.csv'):
        # Warning: This may take multiple days.
        second_retrieval_data = rCr.retrieval_coordination('./retrieved_dicts_filtered.csv')
    else:
        print('The COCONUT reference file at given path!')

In [47]:
falsified_twice_dicts = []
second_retrieval_confirmed_dicts = []
for data in second_retrieved_data:
    norm_dict = cn.normalize_crossref_dict(eval(data[1]))
    parsed_ref_dict = parser(data[0])
    if norm_dict:    
        match = cn.is_same_publication(parsed_ref_dict, norm_dict)
        if match:
            second_retrieval_confirmed_dicts.append(eval(data[1]))
    else:
        falsified_twice_dicts.append(data)
confirmed_retrieved_dicts += second_retrieval_confirmed_dicts


In [48]:
print('During the second information retrieval, an additional {} confirmed references have been retrieved'.format(len(second_retrieval_confirmed_dicts)),
     'There are now {} confirmed dicts and {} falsified_dicts.'.format(len(confirmed_retrieved_dicts)+len(second_retrieval_confirmed_dicts), len(falsified_twice_dicts)))

During the second information retrieval, an additional 2344 confirmed references have been retrieved There are now 20880 confirmed dicts and 10829 falsified_dicts.


### Create normalised reference strings

#### Generate dict that maps all "old" references to a dictionary containing a normalised reference str, the DOI and the PMID

In [49]:
importlib.reload(cn)
references = {}
a = 0
# Map retrieved info (ref str, DOI, PMID) to original ref str
retrieved_data = confirmed_retrieved_dicts+ DOI_based_dicts + PMID_based_dicts
for ref in retrieved_data:
    if ref["reference_retrieved_from"] == "Crossref":
        norm_dict = cn.normalize_crossref_dict(ref)
        if norm_dict['query_str'][0] == '{':
            norm_dict['query_str'] = eval(norm_dict['query_str'])['query_str']
    elif ref['reference_retrieved_from'] == 'MetaPub':
        norm_dict = cn.normalize_metapub_dict(ref) 
    if cn.create_normalized_reference_str(norm_dict):
        references[norm_dict['query_str']] = {'reference': cn.create_normalized_reference_str(norm_dict),
                                              'DOI': norm_dict['DOI'],
                                              'PMID': norm_dict['PMID']}
# Map old Harborne ref str to same kind of dict
for ref in harborne_dicts:
    if 'volume' in ref.keys():
        harborne_str = 'Harborne, {}, {}, ({}), Chapter {}'.format(ref['title'],
                                                        ref['volume'],
                                                        ref['year'], 
                                                        ref['chapter_no'])
    else:
        harborne_str = 'Harborne, {}, ({}), Chapter {}'.format(ref['title'],
                                                        ref['year'], 
                                                        ref['chapter_no'])
    if 'chapter_title' in ref.keys():
        harborne_str += ' - {}'.format(ref['chapter_title'])
    references[ref['original_str']] = {'reference': harborne_str,
                                      'DOI': ref['doi'],
                                      'PMID': None}
    


In [53]:
for key in list(references.keys())[:50000:5000]:
    print('_____\n{}'.format(key))
    print(references[key])

_____
Ito,Chem. Pharm. Bull.,37,(1989),819
{'reference': 'Ito, Chemical and Pharmaceutical Bulletin, 1989, 37 (3), 819', 'DOI': '10.1248/cpb.37.819', 'PMID': None}
_____
Cao,J.Nat.Prod.,67,(2004),986
{'reference': 'Cao et al., Journal of Natural Products, 2004, 67 (6), 986', 'DOI': '10.1021/np040058h', 'PMID': None}
_____
Westley,J. Antibiotics,32,(1979),874
{'reference': 'Westley et al., The Journal of Antibiotics, 1979, 32 (9), 874', 'DOI': '10.7164/antibiotics.32.874', 'PMID': None}
_____
Evans,Phytochem.,12,(1973),2505
{'reference': 'Evans, Phytochemistry, 1973, 12 (10), 2505', 'DOI': '10.1016/0031-9422(73)80464-9', 'PMID': None}
_____
10.1021/np070664n
{'reference': 'Chen et al., Journal of Natural Products, 2008, 71 (3), 431', 'DOI': '10.1021/np070664n', 'PMID': None}
_____
10.1021/ol502216j
{'reference': 'Grudniewska et al., Organic Letters, 2014, 16 (18), 4695', 'DOI': '10.1021/ol502216j', 'PMID': None}
_____
10.7164/antibiotics.28.83
{'reference': 'Shimura et al., The Journal 

#### Write reference dict to json file

In [56]:
with open('COCONUT_reference_dict.json', 'w') as output:
    output.write(str(references))

In [71]:
importlib.reload(cn)


for test_ref in ['Morvan-Bertrand,Physiol Plant,111,(2001),225',
                 '20512739', 
                 '10.1021/ol502216j'
                ]:
    print(cn.get_final_dict_from_ref_str(test_ref))

{'title': 'Endogenous gibberellins in Lolium perenne\n and influence of defoliation on their contents in elongating leaf bases and in leaf sheaths', 'abstract': None, 'DOI': '10.1034/j.1399-3054.2001.1110214.x', 'issue': '2', 'volume': '111', 'year': 2001, 'journal': 'Physiologia Plantarum', 'authors': ['Morvan-Bertrand, A.', 'Ernstsen, A.', 'Lindgård, B.', 'Koshioka, M.', 'Le Saos, J.', 'Boucaud, J.', "Prud'homme, M.", 'Junttila, O.'], 'first_author_surname': 'Morvan-Bertrand', 'pages': '225-231', 'first_page': '225', 'reference_retrieved_from': 'Crossref', 'query_str_type': 'Crossref_extended_query', 'query_str': "{'authors': 'Morvan-Bertrand', 'first_author_surname': 'Morvan-Bertrand', 'journal': 'Physiol Plant', 'volume': '111', 'issue': None, 'year': '2001', 'pages': '225', 'first_page': '225', 'reference_retrieved_from': 'Crossref', 'query_str_type': 'unstructured_ID', 'query_str': 'Morvan-Bertrand,Physiol Plant,111,(2001),225'}", 'PMID': None}
{'Morvan-Bertrand,Physiol Plant,111