In [1]:
import json
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup

def search_consensus(search_term:str, since_year=None, page_size = 10, page_num = 0): #Consensus
    """Query the Consensus paper-search endpoint and return links to the matching paper pages.

    Args:
        search_term: Free-text query. It is percent-encoded before being placed
            in the URL, so characters such as ``&``, ``#`` or ``+`` stay part of
            the query instead of altering the query string.
        since_year: Optional lower bound sent as ``year_min``; omitted when falsy.
        page_size: Value sent as the ``size`` query parameter.
        page_num: Value sent as the ``page`` query parameter.

    Returns:
        A list of ``https://consensus.app/papers/<url_slug>/<paper_id>/`` URLs,
        one per entry of the response's ``papers`` array. Any failure (network
        error, HTTP error status, malformed JSON, missing keys) is printed and
        an empty list is returned, so callers can always iterate the result.
    """
    timeframe = f"&year_min={since_year}" if since_year else ''
    # safe='' also escapes '/', so the term can never leak into the URL structure.
    encoded_term = quote(str(search_term), safe='')
    search_url = (
        "https://consensus.app/api/paper_search/"
        f"?query={encoded_term}&page={page_num}&size={page_size}{timeframe}"
        "&domain=bio,chem,med,psych"
    )

    try:
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(search_url, timeout=30)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
        response.raise_for_status()

        # Decode the byte-string to a normal string and then load it as JSON
        parsed_results = json.loads(response.content.decode('utf-8'))

        # Build one paper-page link per hit from its url_slug and paper_id
        return [
            f"https://consensus.app/papers/{paper['url_slug']}/{paper['paper_id']}/"
            for paper in parsed_results['papers']
        ]

    except Exception as e:
        # Best-effort: report the problem and hand back "no results".
        print(e)
        return []

def get_articles_data(links):
    """Fetch each paper page and scrape its title, authors, journal, year and abstract.

    The selectors target the ``data-testid`` markup of the paper pages whose
    links ``search_consensus`` builds. NOTE(review): the original inline tag on
    this function read "PubMed" -- confirm which site was intended.

    Args:
        links: A single URL, a list/tuple of URLs, or ``None`` (treated as
            "no links").

    Returns:
        A list with one dict per successfully fetched page, holding the keys
        ``title``, ``authors``, ``journal``, ``year`` and ``abstract``. A field
        whose element is missing from the page is ``None``. Pages that cannot
        be fetched or parsed are reported with ``print`` and skipped.
    """
    if links is None:
        # Nothing to fetch; avoids issuing a request for the literal value None.
        return []
    if not isinstance(links, (list, tuple)):
        links = [links]

    articles_data = []

    for article_link in links:
        try:
            # Bound the wait, and reject HTTP error pages rather than scraping
            # them into a record whose fields are all None.
            response = requests.get(article_link, timeout=30)
            response.raise_for_status()

            # Parse the HTML content
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract the paper title
            title_tag = soup.find('h1', {'data-testid': 'paper-title'})
            title = title_tag.get_text() if title_tag is not None else None

            # Extract the year of publication
            year_tag = soup.find('meta', {'property': 'citation_year'})
            year = year_tag['content'] if year_tag is not None else None

            # Extract the journal title
            journal_tag = soup.find('p', {'data-testid': 'journal-title'})
            journal = journal_tag.get_text() if journal_tag is not None else None

            # Extract the authors
            authors_tag = soup.find('p', {'data-testid': 'hide-more-author'})
            authors = authors_tag.get_text() if authors_tag is not None else None
            if authors is not None:
                # Keep only the text before the first '+'. partition() leaves
                # the string intact when there is no '+', whereas slicing with
                # find()'s -1 result would cut off the last character.
                authors = authors.partition('+')[0]

            # Extract the abstract
            abstract_tag = soup.find('p', {'data-testid': 'abstract'})
            abstract = abstract_tag.get_text().strip() if abstract_tag is not None else None

            articles_data.append({"title":title,
                                  "authors":authors,"journal":journal,
                                  "year":year,
                                  "abstract":abstract})
        except Exception as e:
            # Best-effort scrape: report the failure and continue with the next link.
            print(e)
    return articles_data

In [2]:
# Free-text query submitted to the paper search.
keyword = "health quality"

# Walk result pages 0-9 (papers from 2022 onward) and pool the scraped
# metadata of every hit into one flat list.
articles_data = []
for page_num in range(10):
    links = search_consensus(keyword, page_num=page_num, since_year=2022)
    articles_data.extend(get_articles_data(links))


In [3]:
# Preview only the first few records; echoing the whole list floods the cell
# output with every full abstract.
articles_data[:3]

[{'title': 'Quality of the Healthcare Services During COVID-19 Pandemic in Selected European Countries',
  'authors': 'Magdalena Tuczyńska, R. Staszewski, M. Matthews‑Kozanecka',
  'journal': 'Frontiers in Public Health',
  'year': '2022',
  'abstract': "Background There are several definitions of the quality of healthcare services. It may be defined as a level of value provided by any health care resource, as determined by some measurement. Scientists use a variety of quality measures to attempt to determine health care quality. They use special indicators or based on a patients' or healthcare professional's perception. This article aims to provide a short review of the available data on the quality of healthcare services in selected European countries during the COVID-19 pandemic. Methodology The research was done by the use of online databases such as PubMed, Google Scholar, and Science Direct. All the studies focused on the quality of healthcare services, yet the studies used diffe

In [4]:
# Number of scraped records. The collection loop requests 10 pages with the
# default page_size of 10, so 100 is the count to expect when every page is
# full and every fetch succeeds.
len(articles_data)

100

In [None]:
import pandas as pd

# One row per scraped article; the columns come from the record keys
# (title, authors, journal, year, abstract).
df = pd.DataFrame(articles_data)

In [32]:
# Persist the table as data.csv, resolved against the current working directory.
# With pandas' default index=True the row index is written as an extra,
# unnamed first column.
df.to_csv("data.csv")