In [None]:
import pandas as pd
import cloudscraper
from bs4 import BeautifulSoup
import re
import os
import json
import time

In [None]:
def extract_article_name(header):
    """Return the text of the first ``<h1>`` element found inside ``header``."""
    title_tag = header.find('h1')
    return title_tag.text


def extract_authors(header) -> list:
    """Extract the unique author names of an article, in order of appearance.

    Args:
        header: Parsed element (e.g. a BeautifulSoup tag) that is searched for
            the ``<div class="cg p">`` block holding the author links.

    Returns:
        The text of every ``<a>`` inside that block, de-duplicated while
        keeping the first occurrence of each name. An empty list is returned
        when the block is not present.
    """
    authors_block = header.find("div", {'class': "cg p"})
    if authors_block is None:
        # No authors block on this page: report "no authors" instead of
        # failing with AttributeError on None.
        return []

    names = [a_link.text for a_link in authors_block.find_all("a")]
    # dict.fromkeys de-duplicates while preserving insertion order; a set
    # would scramble the byline and give a different order on every run.
    return list(dict.fromkeys(names))

def extract_article_sections(article):
    """Return the top-level body sections of an article, minus the references.

    Args:
        article: Parsed ``<article>`` element that is searched for the
            ``<section class="body main-article-body">`` container.

    Returns:
        A list of the container's direct ``<section>`` children, skipping any
        whose ``<h2>`` heading reads "References". Sections without an
        ``<h2>`` are kept. An empty list is returned when the container is
        not present.
    """
    article_content = article.find("section", {'class': 'body main-article-body'})
    if article_content is None:
        return []

    kept = []
    # recursive=False: only direct children, so nested subsections are not
    # returned a second time on their own.
    for section in article_content.find_all('section', recursive=False):
        heading = section.h2
        # A section may have no <h2> at all; only a heading that actually
        # says "References" (ignoring surrounding whitespace) is dropped.
        if heading is not None and heading.text.strip() == "References":
            continue
        kept.append(section)
    return kept

def extract_section_texts(sections):
    """Flatten the paragraphs of the given sections into one string.

    For each section the text of its ``<p>`` elements is joined with single
    spaces; every section's text is prefixed with a newline, so the result
    starts with ``"\\n"`` whenever at least one section is given.
    """
    chunks = []
    for section in sections:
        paragraph_texts = [p_tag.text for p_tag in section.find_all('p')]
        chunks.append("\n" + " ".join(paragraph_texts))
    return "".join(chunks)
    
    
def extract_time_and_doi_section(article):
    """Return the stripped text of the first ``<div>`` in the citation section.

    The citation section is the ``<section>`` carrying the class string
    ``"pmc-layout__citation font-secondary font-xs"``.
    """
    citation_attrs = {'class': "pmc-layout__citation font-secondary font-xs"}
    citation_section = article.find('section', citation_attrs)
    citation_div = citation_section.find("div")
    return citation_div.text.strip()

def extract_date_and_doi(text):
    """Pull the publication date and DOI out of a citation string.

    Args:
        text: Citation text such as
            ``"Front Plant Sci. 2020 Mar 6;11:199. doi: 10.3389/fpls.2020.00199"``.

    Returns:
        ``{'date': 'YYYY Mon D', 'doi': '<doi>'}`` when a date of the form
        "four digits, three letters, one or two digits" is followed later by
        ``doi:`` and a non-whitespace token; ``None`` otherwise (including
        for empty or ``None`` input).
    """
    if not text:
        return None

    pattern = r"(\d{4}\s+[A-Za-z]{3}\s+\d{1,2}).*?doi:\s*(\S+)"
    # DOTALL: text taken from HTML can contain line breaks between the date
    # and the DOI, which "." would otherwise refuse to cross.
    # IGNORECASE: accept "DOI:" / "Doi:" as well as "doi:".
    match = re.search(pattern, text, flags=re.DOTALL | re.IGNORECASE)
    if match is None:
        return None
    return {'date': match.group(1), 'doi': match.group(2)}
    

def extract_article(soup):
    """Build a dictionary describing the article contained in a parsed page.

    The ``<article>`` inside ``<main id="main-content">`` is located, then the
    helper extractors are used to obtain the title, authors, body text and
    the citation's date/DOI.

    Returns:
        A dict with the keys ``'name'``, ``'authors'``, ``'text'``, ``'date'``
        and ``'doi'``. ``'date'`` and ``'doi'`` are ``None`` when the citation
        text could not be parsed.
    """
    article = soup.find('main', {"id": "main-content"}).find("article")
    header = article.find("div", {'class': "ameta p font-secondary font-xs"})

    name = extract_article_name(header)
    authors = extract_authors(header)
    texts = extract_section_texts(extract_article_sections(article))

    citation_text = extract_time_and_doi_section(article)
    citation_info = extract_date_and_doi(citation_text)
    if citation_info is None:
        date, doi = None, None
    else:
        date, doi = citation_info["date"], citation_info['doi']

    return {
        'name': name,
        "authors": authors,
        "text": texts,
        'date': date,
        'doi': doi,
    }


In [96]:
# Load the publication table from a CSV file located next to the notebook.
# The 'Link' column of this table is what the download loop iterates over.
df_pubs = pd.read_csv("./SB_publication_PMC.csv")
# Create the cloudscraper client that is used to fetch the article pages.
scraper = cloudscraper.create_scraper() 

In [None]:
# Collect every value of the 'Link' column of the publication table.
links = list(df_pubs['Link'].values)
# Fetch each link and parse the response body with the 'html.parser' backend.
for link in links:

    res = scraper.get(link)
    soup = BeautifulSoup(res.content,'html.parser')
    # NOTE(review): in the visible code `res` and `soup` are overwritten on
    # every iteration and nothing is stored, so only the last page survives
    # the loop. Presumably extract_article(soup) should be called and its
    # result collected here - confirm. The HTTP status is also not checked.
    

Date: 2020 Mar 6
DOI: 10.3389/fpls.2020.00199
