In [1]:
import os
import warnings

# API credentials are read from the environment so that secrets are never
# stored in (or committed with) the notebook. Any key that was previously
# hardcoded in this cell should be treated as leaked and rotated.
#   SERP_API_KEY      -> serp_api
#   COHERE_API_KEY    -> cohere_api
#   ELSEVIER_API_KEY  -> elsevier_apikey
serp_api = os.environ.get('SERP_API_KEY', '')
cohere_api = os.environ.get('COHERE_API_KEY', '')
elsevier_apikey = os.environ.get('ELSEVIER_API_KEY', '')

# Warn early instead of failing later with an opaque authentication error.
for _env_name, _env_value in (
    ('SERP_API_KEY', serp_api),
    ('COHERE_API_KEY', cohere_api),
    ('ELSEVIER_API_KEY', elsevier_apikey),
):
    if not _env_value:
        warnings.warn(f'{_env_name} is not set; requests that need it will fail.')

In [2]:
import requests
from bs4 import BeautifulSoup
import bs4
import time
import lxml
import cohere
from cohere.classify import Example 
import pandas as pd
from unidecode import unidecode
import json
from qa.search import embedding_search, get_results_paragraphs_multi_process, get_results_paragraphs_from_paper

In [3]:
class paper_parser():
    """Search ScienceDirect and extract text/metadata from the matching articles.

    Constructing an instance immediately performs network I/O: it runs the
    search and then downloads the full-text XML of every hit. Each article is
    downloaded at most once per instance; the parsed document is cached and
    reused by all ``get_*`` methods.

    Attributes set by ``__init__``:
        dic: ``{title: pii}`` for the search hits (papers sharing an identical
            title collapse into one entry because the title is the key).
        piis: list of PII identifiers, in search-result order.
        paragraphs: one list of paragraph strings per paper.
        titles: list of paper titles.
        authors: one author name string per paper ('' if none was found).
        organizations: one comma-separated organization string per paper.
    """

    _SEARCH_URL = 'https://api.elsevier.com/content/search/sciencedirect?'
    _ARTICLE_URL = 'https://api.elsevier.com/content/article/pii/{pii}'

    def __init__(self, query, apikey, count, start, sort, view, timeout):
        """Run the search and eagerly fetch all per-paper data.

        Args:
            query: search string sent to the ScienceDirect search endpoint.
            apikey: Elsevier API key, sent as the ``X-ELS-APIKey`` header.
            count, start, sort, view: forwarded unchanged as query parameters.
            timeout: timeout (seconds) applied to every HTTP request.
        """
        self.query = query
        self.apikey = apikey
        self.count = count
        self.start = start
        self.sort = sort
        self.view = view
        self.timeout = timeout
        # pii -> parsed article document; avoids re-downloading the same article.
        self._soup_cache = {}
        self.dic = self.search_for_papers(self.query, self.apikey, self.count, self.start, self.sort, self.view)
        self.piis = list(self.dic.values())
        self.paragraphs = self.get_all_paragraphs()
        self.titles = self.get_all_titles()
        self.authors = self.get_all_1st_authors()
        self.organizations = self.get_all_organizations()

    def search_for_papers(self, query, apikey, count, start, sort, view, timeout=None):
        """Query the search endpoint and return ``{title: pii}`` for the hits.

        Args:
            timeout: optional request timeout in seconds; defaults to the
                instance's ``timeout`` attribute when that is available.

        Raises:
            requests.HTTPError: if the API answers with an error status.
            KeyError: if the response has no ``'search-results'`` object.
        """
        if timeout is None:
            timeout = getattr(self, 'timeout', None)
        headers = {'X-ELS-APIKey': apikey, 'Accept': 'application/json'}
        params = {'query': query, 'count': count, 'start': start, 'sort': sort, 'view': view}
        r = requests.get(self._SEARCH_URL, headers=headers, params=params, timeout=timeout)
        # Surface authentication/quota problems as HTTP errors rather than as
        # a confusing KeyError while parsing the error payload.
        r.raise_for_status()
        return self._parse_search_results(json.loads(r.text))

    @staticmethod
    def _parse_search_results(json_resp):
        """Build ``{title: pii}`` from a decoded search response.

        Entries lacking a title or a PII (for example an error placeholder in
        an empty result set) are skipped; a missing ``'entry'`` list yields an
        empty dict.
        """
        dic = {}
        for paper in json_resp['search-results'].get('entry', []):
            title = paper.get('dc:title')
            pii = paper.get('pii')
            if title is None or pii is None:
                continue
            dic[title] = pii
        return dic

    @staticmethod
    def _tag_texts(soup, tag_name):
        """Return the concatenated text of every ``tag_name`` element in ``soup``."""
        return [''.join(tag.find_all(string=True)) for tag in soup.find_all(tag_name)]

    def _get_soup(self, pii):
        """Return the parsed article for ``pii``, downloading it on first use only."""
        # setdefault keeps this usable on instances created without __init__.
        cache = self.__dict__.setdefault('_soup_cache', {})
        if pii not in cache:
            cache[pii] = self.get_text(pii, self.apikey, self.timeout)
        return cache[pii]

    def get_paragraphs(self, pii):
        """Return the ``ce:para`` texts of one article, newline-free and ASCII-transliterated."""
        soup = self._get_soup(pii)
        return [unidecode(text.replace('\n', '')) for text in self._tag_texts(soup, 'ce:para')]

    def get_all_1st_authors(self):
        """Return one author name per paper, in ``self.piis`` order.

        The name is built from the first ``ce:given-name`` and the first
        ``ce:surname`` element in document order (assumed to belong to the
        first author). A missing part is omitted, so a paper without author
        tags yields '' instead of raising IndexError.
        """
        authors = []
        for pii in self.piis:
            soup = self._get_soup(pii)
            given_names = self._tag_texts(soup, 'ce:given-name')
            surnames = self._tag_texts(soup, 'ce:surname')
            first_name = given_names[0] if given_names else ''
            last_name = surnames[0] if surnames else ''
            authors.append(' '.join(part for part in (first_name, last_name) if part))
        return authors

    def get_all_organizations(self):
        """Return, per paper, all ``sa:organization`` texts joined with ', '."""
        return [
            ', '.join(self._tag_texts(self._get_soup(pii), 'sa:organization'))
            for pii in self.piis
        ]

    def get_all_paragraphs(self):
        """Return a list with one paragraph list per paper, in ``self.piis`` order."""
        return [self.get_paragraphs(pii) for pii in self.piis]

    def get_all_titles(self):
        """Return the titles of the search hits."""
        return list(self.dic.keys())

    def get_text(self, pii, apikey, timeout):
        """Download one article by PII and return it parsed as XML (never cached).

        HTTP error statuses are not raised here: an error payload is parsed
        like any other document, so the extraction methods simply find no
        matching elements.
        """
        url = self._ARTICLE_URL.format(pii=pii)
        # NOTE(review): "content-type" on a GET is unusual; an "Accept" header
        # may be what was intended - confirm against the Elsevier API docs.
        response = requests.get(url, headers={"X-ELS-APIKey":apikey, "content-type": "text/xml;charset=UTF-8"}, timeout = timeout)
        soup = BeautifulSoup(response.content, "xml")
        return soup

In [8]:
# Look up a single paper by its title: first hit only, ranked by relevance.
# Note that constructing the parser already performs all network requests.
pp = paper_parser(
    query='Machine learning assisted design of FeCoNiCrMn high-entropy alloys with ultra-low hydrogen diffusion coefficients',
    apikey=elsevier_apikey,
    count=1,
    start=0,
    sort='relevance',
    view='COMPLETE',
    timeout=60,
)

In [9]:
# The parser already fetched and stored these results when it was constructed;
# reuse them instead of calling the get_all_* methods, which would download
# every article again.
example_papers = pp.paragraphs
example_titles = pp.titles
example_authors = pp.authors
example_organizations = pp.organizations

In [11]:
# Show the PII identifiers of the papers returned by the search.
pp.piis

['S1359645421009137']

In [16]:
# Build one row per paragraph of the first paper; the paper-level title,
# author and organization strings are repeated on every row.
df = pd.DataFrame(example_papers[0], columns = ['paragraphs']).assign(
    titles=example_titles[0],
    authors=example_authors[0],
    organizations=example_organizations[0],
)
# Writing the table to disk is currently disabled:
# df.to_csv('S1359645421009137.csv')

In [29]:
def get_relevant_paper_from_question(question):
    """Find papers relevant to a question and return their paragraphs.

    The question is reduced to comma-separated keywords by a Cohere
    generation call; the keywords are then used as a ScienceDirect query
    (first hit only, ranked by relevance). The extracted keywords are printed
    as a side effect.

    Args:
        question: natural-language question.

    Returns:
        A pair ``(paragraphs, paragraph_sources)`` of equal-length lists:
        ``paragraphs`` holds every paragraph string of the retrieved papers and
        ``paragraph_sources[i]`` is the PII of the paper that ``paragraphs[i]``
        came from.
    """
    # Use the shared credential rather than a key embedded in the function.
    co = cohere.Client(cohere_api)
    response = co.generate(
        model='command-xlarge-20221108',
        prompt='Extract the key words and seperate them by commas from the following question:\n\nQuestion: '+ question +'\n\nKey words:',
        max_tokens=50,
        temperature=0.6,
        k=0,
        p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop_sequences=["--"],
        return_likelihoods='NONE')
    # The completion usually starts with a space and may end with a newline;
    # strip it so the search query is clean.
    key_words = response.generations[0].text.strip()
    print(key_words)
    pp = paper_parser(key_words, elsevier_apikey, 1, 0, 'relevance', 'COMPLETE', 60)
    paragraphs = []
    paragraph_sources = []
    # pp.paragraphs was populated by the constructor, so no article is
    # downloaded again here.
    for pii, paper_paragraphs in zip(pp.piis, pp.paragraphs):
        paragraphs.extend(paper_paragraphs)
        paragraph_sources.extend([pii] * len(paper_paragraphs))
    return paragraphs, paragraph_sources

In [31]:
# Try the question -> keywords -> paper lookup on a sample question.
get_relevant_paper_from_question('What is the best way to synthesize a membrane with NIPS?')



NIPS, membrane, synthesis


In [5]:
# Load the per-paragraph table for this paper from disk. The file must already
# exist next to the notebook; no active cell in view writes it.
df = pd.read_csv('S1359645421009137.csv')

In [6]:
# Size of the first element returned for this paper
# (presumably its list of paragraphs - TODO confirm against qa.search).
len(get_results_paragraphs_from_paper('S1359645421009137')[0])

34