## All imports

In [1]:
!pip install yake



You should consider upgrading via the 'c:\users\shweata\appdata\local\programs\python\python38\python.exe -m pip install --upgrade pip' command.


In [2]:
import os
import glob
import xml.etree.ElementTree as ET
import pathlib
import yake
import subprocess
import logging
from bs4 import BeautifulSoup

## Defining all the functions

In [17]:
# Show INFO-level messages so the progress logs emitted by the functions below are visible.
logging.basicConfig(level=logging.INFO)
# All the functions
def querying_pygetpapers_sectioning(query, hits, output_directory, using_terms = False, terms_txt=None):
    """Query pygetpapers, download the papers as XML and section them with ami.

    Both tools are run as external commands. Arguments are handed over as a
    list (no shell), so queries and paths containing spaces, quotes or shell
    metacharacters reach the tools unchanged. A tool that cannot be started or
    exits with a non-zero status is reported through logging.error.

    Args:
        query (str): query to pygetpapers
        hits (int or str): no. of papers to download
        output_directory (str): CProject Directory (where papers get downloaded)
        using_terms (bool, optional): pygetpapers --terms flag. Defaults to False.
        terms_txt (str, optional): path to text file with terms. Required when
            using_terms is True. Defaults to None.

    Raises:
        ValueError: if using_terms is True but terms_txt is None.
    """
    import shutil  # local import: only needed here to locate the executables

    if using_terms and terms_txt is None:
        # Previously the literal string "None" was passed to --terms.
        raise ValueError('terms_txt must be provided when using_terms is True')

    def _run_tool(arguments):
        # Resolve the program through PATH first; on Windows shutil.which honours
        # PATHEXT, so wrapper scripts (.bat/.cmd) are found without a shell.
        executable = shutil.which(arguments[0]) or arguments[0]
        command = [executable] + [str(argument) for argument in arguments[1:]]
        try:
            completed = subprocess.run(command)
        except OSError as error:
            # Keeps the old non-raising behaviour when a tool is missing.
            logging.error(f'could not run {arguments[0]}: {error}')
            return
        if completed.returncode != 0:
            logging.error(f'{arguments[0]} exited with status {completed.returncode}')

    pygetpapers_arguments = ['pygetpapers', '-q', query, '-k', hits,
                             '-o', output_directory, '-x']
    if using_terms:
        pygetpapers_arguments += ['--terms', terms_txt]

    logging.info('querying pygetpapers')
    _run_tool(pygetpapers_arguments)
    logging.info('running ami section')
    _run_tool(['ami', '-p', output_directory, 'section'])

def parse_xml(output_directory, results_txt, body_section='figure'):
    """Dump the text of every sectioned XML file matching a section name into one text file.

    Globs ``<cwd>/<output_directory>/*/sections/**/*<body_section>*.xml``,
    strips the markup from each file and writes its text (newlines removed)
    as one line of results_txt.

    Args:
        output_directory (str): CProject directory
        results_txt (str): name of text file to write parsed XML
        body_section (str, optional): substring of the section file names to
            select (e.g. 'figure'). Defaults to 'figure'.
    """
    section_pattern = os.path.join(os.getcwd(), output_directory, "*", "sections",
                                   "**", f"*{body_section}*.xml")
    # Sorted so the order of lines in results_txt is reproducible between runs.
    glob_results = sorted(glob.glob(section_pattern, recursive=True))
    # Only the count goes to INFO; the full path list is kept at DEBUG level.
    logging.info(f'number of XML files globbed: {len(glob_results)}')
    logging.debug(f'globbed_xml: {glob_results}')
    # The context manager flushes and closes the file even if parsing fails.
    with open(results_txt, "w", encoding='utf-8') as results_file:
        for result in glob_results:
            root = ET.parse(result).getroot()
            xmlstr = ET.tostring(root, encoding='utf8', method='xml')
            soup = BeautifulSoup(xmlstr, features='lxml')
            text = soup.get_text(separator="").replace('\n', '')
            print(text, file=results_file)
    logging.info(f'wrote text to {results_txt}')
    
def key_phrase_extraction(results_txt, terms_txt):
    """Run YAKE over the parsed-XML text dump and store the resulting key phrases.

    The phrases are written to terms_txt on a single line, joined with ', '.

    Args:
        results_txt (str): text file with parsed XML text
        terms_txt (str): name of text file that receives the comma-separated key phrases
    """
    with open(results_txt, encoding='utf-8') as source_file:
        corpus = source_file.read()

    extractor = yake.KeywordExtractor(lan='en', n=2, top=50, features=None)
    # The first item of each returned entry is taken as the phrase.
    phrases = [entry[0] for entry in extractor.extract_keywords(corpus)]
    logging.info('extracted key phrases')

    joined_phrases = ', '.join(map(str, phrases))
    pathlib.Path(terms_txt).write_text(joined_phrases, encoding='utf-8')
    logging.info(f'wrote the phrases to {terms_txt}')

## Defining all variables

In [8]:
# Search query sent to pygetpapers.
OD_QUERY = '(cyclic voltammetry) AND batteries'
# Number of papers to download; an int, as documented for the `hits` argument.
OD_HITS = 50
# CProject directory that receives the downloaded and sectioned papers.
OD_OUTPUT = 'cyclic_voltammetry_20210824_1'
# Text dump of the parsed XML (the misspelt file name is kept so existing outputs still match).
OD_RESULTS = 'cyclic_volammtery.txt'
# File that receives the extracted key phrases.
OD_TERMS = 'terms.txt'

## 1. Query [`pygetpapers`](https://pypi.org/project/pygetpapers/)
`pygetpapers` is a command-line tool which downloads open scientific papers from repositories such as EPMC (Europe PMC), bioRxiv and arXiv.
![image](https://user-images.githubusercontent.com/70576776/130623817-73596788-a3b1-4a35-9332-1d0cf375a7d7.png)
In this Demo, we've used `pygetpapers` to download `50` papers in XML-format on `(cyclic voltammetry) AND batteries` from EPMC. 
## 2. Section papers using [`ami-section`](https://github.com/petermr/ami3)
We use `ami`'s sectioning functionality to create smaller sections (like Introduction, Method, Results, Figures, and so on) for each paper. 
![image](https://user-images.githubusercontent.com/70576776/130624722-aecb3ff3-c26c-490a-92c5-30bb98b25318.png)


In [9]:
# First run: plain query (using_terms keeps its default of False), then ami sectioning.
querying_pygetpapers_sectioning(OD_QUERY, OD_HITS, OD_OUTPUT)

INFO:root:querying pygetpapers
INFO:root:running ami section


`pygetpapers` gives us: 
![image](https://user-images.githubusercontent.com/70576776/130625542-192e3133-91d7-4b6d-815f-9cc3db924a4f.png)

After ami-section: 
![image](https://user-images.githubusercontent.com/70576776/130625282-407b6f91-7ed6-4735-90e7-6334bd798f97.png)


## 3. Get text from Figure Caption (or section of your choice)
- Sectioning papers makes it easy to select specific sections, such as Results, Methods or Figure Captions, and extract their text.



In [13]:
# body_section keeps its default ('figure'), so only XML files with 'figure' in their name are dumped.
parse_xml(OD_OUTPUT,OD_RESULTS)

INFO:root:number of XML files globbed: 139
INFO:root:wrote text to cyclic_volammtery.txt


Globbed files:
```
'C:\\Users\\shweata\\snowball\\cyclic_voltammetry_20210824_1\\PMC7645205\\sections\\3_floats-group\\0_figure_1.xml', 'C:\\Users\\shweata\\snowball\\cyclic_voltammetry_20210824_1\\PMC7645205\\sections\\3_floats-group\\1_figure_2.xml', 'C:\\Users\\shweata\\snowball\\cyclic_voltammetry_20210824_1\\PMC7645205\\sections\\3_floats-group\\3_figure_3.xml', 'C:\\Users\\shweata\\snowball\\cyclic_voltammetry_20210824_1\\PMC7645205\\sections\\3_floats-group\\4_figure_4.xml', 'C:\\Users\\shweata\\snowball\\cyclic_voltammetry_20210824_1\\PMC7645205\\sections\\3_floats-group\\5_figure_5.xml', 'C:\\Users\\shweata\\snowball\\cyclic_voltammetry_20210824_1\\PMC7645205\\sections\\3_floats-group\\6_figure_6.xml', 'C:\\Users\\shweata\\snowball\\cyclic_voltammetry_20210824_1\\PMC7693081\\sections\\3_floats-group\\0_figure_1.xml', 'C:\\Users\\shweata\\snowball\\cyclic_voltammetry_20210824_1\\PMC7693081\\sections\\3_floats-group\\1_figure_2.xml',... 
```

## 4. Extracting Key Phrases from Figure Caption Text using [`YAKE!`](https://pypi.org/project/yake/)
Finally, we can use any unsupervised key phrase extractor to extract key phrases from the text dump produced in the previous step. We chose YAKE.

We then write all the key phrases to a text file called `terms.txt`.

In [14]:
# Reads the text dump in OD_RESULTS and writes the extracted key phrases to OD_TERMS.
key_phrase_extraction(OD_RESULTS, OD_TERMS)

INFO:root:extracted key phrases
INFO:root:wrote the phrases to terms.txt


Here are the Key Phrases extracted using YAKE (copy-pasted from `terms.txt`)
```
Figure, KOH, RDC, cyclic voltammetry, electrodes, SiO, electrode, scan rate, PAAK, rate, current, cyclic, American Chemical Society, voltammetry, image, disk electrode, cyclic voltammetry curves, carbon disk electrode, scan, CMC, electrolyte, KOH alkaline polymer, voltage, DMSO, HCE, cells, images, gold electrodes, Chemical Society, American Chemical, discharge, concentration KOH solution, KOH solution, disk, spectra, cell, KOH alkaline, Cyclic voltammograms, cycles, DMC, composite electrodes, working electrode, alkaline polymer electrolyte, profiles, curves, solution, voltammograms, concentration KOH, calendered electrodes, polymer electrolyte membranes
```

In [15]:
# Output locations for the second run.
OD_OUTPUT_2 = 'cyclic_voltammetry_2'
# NOTE(review): OD_RESULTS_2 is not used in the visible part of the notebook.
OD_RESULTS_2= 'cyclic_volammtery_2.txt'
# Absolute path (under the current working directory) to the custom terms list.
# NOTE(review): the OS_ prefix differs from the OD_ prefix of the other constants — possibly a typo.
OS_CUSTOM_TERMS = os.path.join(os.getcwd(), 'custom_terms_list.txt')

## 5. Choose the relevant terms and re-run the query
We created a custom terms list based on the extracted key phrases and re-ran the query. 
Copy-pasted from `custom_terms_list.txt`
```
disk electrode, scan rate, calendered electrodes
```

In [16]:
# Second run: same query, this time passing the custom terms file through pygetpapers' --terms flag.
querying_pygetpapers_sectioning(OD_QUERY, OD_HITS, OD_OUTPUT_2, using_terms=True, terms_txt=OS_CUSTOM_TERMS)

INFO:root:querying pygetpapers
INFO:root:running ami section


## Comparison between 1st and 2nd run
Inspecting `pygetpapers_log.txt` (which is inside the CProject directory) from both runs:
1st run: 
```
INFO:root:Making log file at cyclic_voltammetry_20210824_1\pygetpapers_log.txt
INFO:root:Final query is (cyclic voltammetry) AND batteries
INFO:root:Total Hits are 3508
```
2nd run:
```
INFO:root:Making log file at cyclic_voltammetry_2\pygetpapers_log.txt
INFO:root:Final query is ((cyclic voltammetry) AND batteries AND (disk electrode OR  scan rate OR  calendered electrodes))
INFO:root:Total Hits are 359
```
We've narrowed the results to roughly one tenth (359 of 3508) of the papers available on EPMC for `(cyclic voltammetry) AND batteries` (1st run) by
- downloading a sample of 50 papers, 
- extracting the key phrases,
-  choosing the relevant phrases from the initial list of phrases
-  and re-running the query by adding those chosen phrases. 