In [29]:
import requests
from bs4 import BeautifulSoup
import re
import os
import logging
import random
import time

In [30]:
base_url = 'https://www.ncbi.nlm.nih.gov'
search_url = 'https://www.ncbi.nlm.nih.gov/pmc/?term=(Inflammatory Bowel Disease%5BTitle%5D)+AND+(Gut microbiome)'

In [31]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [11]:
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
]

In [12]:
def download_pdf(pdf_url, filename):
    headers = {'User-Agent': random.choice(user_agents)}
    try:
        response = requests.get(pdf_url, headers=headers)
        response.raise_for_status()

        # Create the downloaded_pdfs folder in the current working directory if it doesn't exist
        current_dir = os.getcwd()
        downloaded_pdfs_dir = os.path.join(current_dir, 'downloaded_pdfs')
        os.makedirs(downloaded_pdfs_dir, exist_ok=True)

        file_path = os.path.join(downloaded_pdfs_dir, filename)
        with open(file_path, 'wb') as f:
            f.write(response.content)
        logger.warning(f"Downloaded: {filename}")
    except requests.exceptions.RequestException as e:
        logger.error(f"Error downloading PDF: {pdf_url}")
        logger.error(str(e))

In [13]:
def scrape_articles(url, retry_count=3, delay=5, max_pages=4, current_page=1):
    headers = {'User-Agent': random.choice(user_agents)}
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        articles = soup.find_all('div', class_='rslt')

        for article in articles:
            article_url = base_url + article.find('a')['href']
            logger.warning(f"Article URL:{article_url}")
            retry_article = 3
            while retry_article > 0:
                try:
                    article_response = requests.get(article_url, headers=headers)
                    article_response.raise_for_status()
                    article_soup = BeautifulSoup(article_response.text, 'html.parser')

                    pdf_link = article_soup.find('a', href=re.compile(r'\.pdf$'))

                    if pdf_link:
                        pdf_url = base_url + pdf_link['href']
                        pdf_filename = pdf_link['href'].split('/')[-1]
                        download_pdf(pdf_url, pdf_filename)
                        logger.warning(f"Downloaded: {pdf_filename}")
                    else:
                        logger.warning(f"No PDF found for: {article_url}")

                    time.sleep(5)  # Add a random delay between article requests
                    break  # Break the loop if the request is successful
                except (requests.exceptions.RequestException, ConnectionResetError) as e:
                    logger.error(f"Error processing article: {article_url}")
                    logger.error(str(e))
                    retry_article -= 1
                    if retry_article > 0:
                        logger.warning(f"Retrying article {article_url} in {delay} seconds...")
                        time.sleep(delay)
                    else:
                        logger.error(f"Max retry attempts reached for article: {article_url}. Skipping.")

        if current_page < max_pages:
            pagination = soup.find('div', class_='pagination')
            next_link = pagination.find('a', class_ = 'active page_link next')
            logger.warning(f"Current page: {current_page}")

            if next_link:
                next_page = int(next_link.get('page', '0'))
                logger.warning(f"Next page: {next_page}")
                if next_page > current_page:
                    logger.info(f"Moving to page {next_page}")
                    time.sleep(random.uniform(2, 5))  # Add a random delay before moving to the next page

                    # Construct the URL for the next page
                    next_url = f"{search_url}&page={next_page}"
                    logger.warning (f"Next URL:{next_url}")

                    scrape_articles(next_url, retry_count, delay, max_pages, next_page)

    except (requests.exceptions.RequestException, ConnectionResetError) as e:
        if retry_count > 0:
            logger.warning(f"Request error occurred. Retrying in {delay} seconds...")
            time.sleep(delay)
            scrape_articles(url, retry_count - 1, delay)
        else:
            logger.error("Max retry attempts reached. Skipping this request.")

In [14]:
scrape_articles(search_url)

AttributeError: 'NoneType' object has no attribute 'find'

In [32]:
import requests
from bs4 import BeautifulSoup
import re
import os
import logging
import random
import time

base_url = 'https://www.ncbi.nlm.nih.gov'
search_url = 'https://www.ncbi.nlm.nih.gov/pmc/?term=(Inflammatory+Bowel+Disease%5BTitle%5D)+AND+(Gut+microbiome)'
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
]

def download_pdf(pdf_url, filename):
    headers = {'User-Agent': random.choice(user_agents)}
    try:
        response = requests.get(pdf_url, headers=headers)
        response.raise_for_status()

        current_dir = os.getcwd()
        downloaded_pdfs_dir = os.path.join(current_dir, 'downloaded_pdfs')
        os.makedirs(downloaded_pdfs_dir, exist_ok=True)

        file_path = os.path.join(downloaded_pdfs_dir, filename)
        with open(file_path, 'wb') as f:
            f.write(response.content)
        logger.warning(f"Downloaded: {filename}")
    except requests.exceptions.RequestException as e:
        logger.error(f"Error downloading PDF: {pdf_url}")
        logger.error(str(e))

def scrape_articles(url, retry_count=3, delay=5, max_pages=4, current_page=1):
    headers = {'User-Agent': random.choice(user_agents)}
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        articles = soup.find_all('div', class_='rslt')

        for article in articles:
            article_url = base_url + article.find('a')['href']
            logger.warning(f"Article URL:{article_url}")
            retry_article = 3
            while retry_article > 0:
                try:
                    article_response = requests.get(article_url, headers=headers)
                    article_response.raise_for_status()
                    article_soup = BeautifulSoup(article_response.text, 'html.parser')

                    pdf_link = article_soup.find('a', href=re.compile(r'\.pdf$'))

                    if pdf_link:
                        pdf_url = base_url + pdf_link['href']
                        pdf_filename = pdf_link['href'].split('/')[-1]
                        download_pdf(pdf_url, pdf_filename)
                        logger.warning(f"Downloaded: {pdf_filename}")
                    else:
                        logger.warning(f"No PDF found for: {article_url}")

                    time.sleep(5)  # Add a random delay between article requests
                    break  # Break the loop if the request is successful
                except (requests.exceptions.RequestException, ConnectionResetError) as e:
                    logger.error(f"Error processing article: {article_url}")
                    logger.error(str(e))
                    retry_article -= 1
                    if retry_article > 0:
                        logger.warning(f"Retrying article {article_url} in {delay} seconds...")
                        time.sleep(delay)
                    else:
                        logger.error(f"Max retry attempts reached for article: {article_url}. Skipping.")

        # Check pagination and construct the next page URL
        if current_page < max_pages:
            logger.warning(f"Current page: {current_page}")
            next_page = current_page + 1  # Increment the current page
            logger.warning(f"Next page: {next_page}")

            # Construct the URL for the next page
            next_url = f"{search_url}&page={next_page}"
            logger.warning(f"Next URL: {next_url}")

            time.sleep(random.uniform(2, 5))  # Add a random delay before moving to the next page
            scrape_articles(next_url, retry_count, delay, max_pages, next_page)

    except (requests.exceptions.RequestException, ConnectionResetError) as e:
        if retry_count > 0:
            logger.warning(f"Request error occurred. Retrying in {delay} seconds...")
            time.sleep(delay)
            scrape_articles(url, retry_count - 1, delay)
        else:
            logger.error("Max retry attempts reached. Skipping this request.")

# Start scraping
scrape_articles(search_url)




KeyboardInterrupt: 