In [None]:
from selenium import webdriver
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup




In [None]:
# Set up Selenium with headless Chrome
chrome_options = Options()

chrome_options.headless = True
driver = webdriver.Chrome(options=chrome_options)



In [None]:
import requests


def get_climate_articles(year, page):
    """
    This function scrapes the Nature Climate Change website for articles published in a given year and page.
    
    Parameters:
    year: int, the year of publication
    page: int, the page number to scrape
    
    return: DataFrame, a DataFrame containing the title, authors, and summary of the articles on the page

    """

    f_url = f"https://www.nature.com/nclimate/articles?searchType=journalSearch&sort=PubDate&type=article&year={year}&page={page}"
    driver.get(f_url)
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    articles = soup.find_all("div", class_="u-full-height")
    article_list = []
    for article in articles:
        title = article.find("h3", class_="c-card__title").text
        link = article.find("a", class_="c-card__link u-link-inherit")["href"]
        link = f"https://www.nature.com{link}"

        authors = article.find_all("ul", class_="c-author-list c-author-list--compact c-author-list--truncated")
        authors = [author.text for author in authors]
        authors_tags = article.find_all("ul", class_="c-author-list c-author-list--compact c-author-list--truncated")
        authors = []
        for author_tag in authors_tags:
            author_names = author_tag.find_all("li")
            authors.extend([author_name.text.strip() for author_name in author_names])
        if not authors:
            authors = ["No authors"]
        summary = article.find("div", class_="c-card__summary").text
        # abstract  
        response = requests.get(link)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        abstract = soup.find("div", class_="c-article-section__content")
        if abstract:
            abstract = abstract.text
        else:
            abstract = "No content"
        reference = soup.find("p", class_="c-article-references__text")
        
        content = soup.find("div", class_="main-content")
        if content:
            content = content.text
        else:
            content = "No content"

        if reference:
            reference = reference.text
        else:
            reference = "No content"

        article_list.append({"title": title, "authors": authors, "summary": summary, "link": link, "abstract": abstract, "content":content, "reference": reference})
    return pd.DataFrame(article_list)

# df = get_climate_articles(2021,1)
# df.head()


all_data = []

pages = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
years = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]

for page in pages:
    for year in years:
        df = get_climate_articles(year, page)
        all_data.append(df)
        print(f"Extracted data for year {year}, page {page}")

# Concatenate all data into a single DataFrame
final_df = pd.concat(all_data, ignore_index=True)

# Save the concatenated DataFrame to a single CSV file
final_df.to_csv("all_climate_articles.csv", index=False)
print("Saved all_climate_articles.csv")


In [4]:
df = get_climate_articles(2021,1)
df.head()

Unnamed: 0,title,authors,summary,link,abstract,content,reference
0,\nGlobal warming decreases connectivity among ...,"[Joana Figueiredo, Christopher J. Thomas, Emma...",\nThe authors develop a high-resolution model ...,https://www.nature.com/articles/s41558-021-012...,"Global warming is killing corals; however, the...",No content,"Cesar, H., Burke, L. & Pet-Soede L. The Econom..."
1,\nGlobal warming decreases connectivity among ...,"[Joana Figueiredo, Christopher J. Thomas, Emma...",\nThe authors develop a high-resolution model ...,https://www.nature.com/articles/s41558-021-012...,"Global warming is killing corals; however, the...",No content,"Cesar, H., Burke, L. & Pet-Soede L. The Econom..."
2,\nHysteresis of the intertropical convergence ...,"[Jong-Seong Kug, Ji-Hoon Oh, Jongsoo Shin]",\nIn idealized model experiments where CO2 inc...,https://www.nature.com/articles/s41558-021-012...,With the unprecedented rate of global warming ...,No content,"Joos, F. & Spahni, R. Rates of change in natur..."
3,\nHysteresis of the intertropical convergence ...,"[Jong-Seong Kug, Ji-Hoon Oh, Jongsoo Shin]",\nIn idealized model experiments where CO2 inc...,https://www.nature.com/articles/s41558-021-012...,With the unprecedented rate of global warming ...,No content,"Joos, F. & Spahni, R. Rates of change in natur..."
4,\nContextualizing cross-national patterns in h...,"[Brayton Noll, Tatiana Filatova, Alessandro Ta...",\nThe context and motivation around adaptation...,https://www.nature.com/articles/s41558-021-012...,Understanding social and behavioural drivers a...,No content,"Coronese, M., Lamperti, F., Keller, K., Chiaro..."


In [5]:
df['link'][0]

'https://www.nature.com/articles/s41558-021-01248-7'

In [6]:
df['authors'][0]

['Joana Figueiredo', 'Christopher J. Thomas', 'Emmanuel Hanert']

In [7]:
df['title'][0]

'\nGlobal warming decreases connectivity among coral populations\n'

In [8]:
df['summary'][1]

'\nThe authors develop a high-resolution model of coral larval dispersal for the southern Great Barrier Reef. They show that 2\u2009°C of warming decreases larval dispersal distance and connectivity of reefs, hampering post-disturbance recovery and the potential spread of warm-adapted genes.\n'

In [9]:
df['abstract'][3]

'With the unprecedented rate of global warming in recent decades, whether or not anthropogenic climate change is irreversible is an important question. Based on idealized CO2 ramp-up until 1,468 ppm and symmetric ramp-down model experiments, here we show that the intertropical convergence zone (ITCZ) does not respond linearly to CO2 forcing, but exhibits strong hysteresis behaviour. While the location of the ITCZ changes minimally during the ramp-up period, it moves sharply south as soon as CO2 begins to decrease, and its centre eventually resides in the Southern Hemisphere during the ramp-down period. Such ITCZ hysteresis is associated with delays in global energy exchanges between the tropics and extratropics. The delayed energy exchanges are explained by two distinct hysteresis behaviours of the Atlantic Meridional Overturning Circulation and slower warming/cooling in the Southern Ocean. We also suggest that the ITCZ hysteresis can lead to hysteresis in regional hydrological cycles.

In [10]:
df['reference'][1]

'Cesar, H., Burke, L. & Pet-Soede L. The Economics of Worldwide Coral Reef Degradation (Cesar Environmental Economics Consulting, 2003).'

In [11]:
for row in df['content']:
    if row == 'No content':
        print("No content available")
    else:
        print('Content available')

No content available
No content available
No content available
No content available
No content available
No content available
No content available
No content available
No content available
No content available
No content available
No content available
No content available
No content available
No content available
No content available
No content available
No content available
No content available
No content available
No content available
No content available
No content available
No content available
No content available
No content available
No content available
No content available
No content available
No content available
No content available
No content available
Content available
Content available
No content available
No content available
No content available
No content available
No content available
No content available


In [12]:
df.shape

(40, 7)

In [13]:
# df = get_climate_articles(2025,1)
# df.head()

In [14]:
# df.shape

In [15]:
all_data = []

pages = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
years = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]

for page in pages:
    for year in years:
        df = get_climate_articles(year, page)
        all_data.append(df)
        print(f"Extracted data for year {year}, page {page}")

# Concatenate all data into a single DataFrame
final_df = pd.concat(all_data, ignore_index=True)

# Save the concatenated DataFrame to a single CSV file
final_df.to_csv("all_climate_articles.csv", index=False)
print("Saved all_climate_articles.csv")


Extracted data for year 2011, page 1
Extracted data for year 2012, page 1
Extracted data for year 2013, page 1
Extracted data for year 2014, page 1
Extracted data for year 2015, page 1
Extracted data for year 2016, page 1
Extracted data for year 2017, page 1


ReadTimeout: HTTPSConnectionPool(host='www.nature.com', port=443): Read timed out. (read timeout=None)

In [None]:

df = pd.read_csv("all_climate_articles.csv")
df.head()

In [None]:
df.shape

In [None]:
profile_report = ProfileReport(df)
profile_report.to_file("climate_articles_profile.html")
print("Saved climate_articles_profile.html")
profile_report.to_notebook_iframe()

In [None]:
df.describe

In [None]:
df["title"][0]

In [None]:
df['authors'][0]

In [None]:
df['summary'][0]

In [None]:
# Use a smaller subset of the data to generate the profile report
subset_df = df.sample(frac=0.1, random_state=1)  # Adjust the fraction as needed

profile = ProfileReport(subset_df, title="Climate Articles Profiling Report", explorative=True)
profile.to_file("climate_articles_profile.html")
profile.to_notebook_iframe()

In [None]:
def get_journal_index():
    """
    This function scrapes the Nature Climate Change website for journals on a given page."""
    url = "https://www.nature.com/siteindex"

    driver.get(url)
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    journals = soup.find_all("li", class_="grid mq640-grid-12 text13 pa0 grid-6")
    journal_last = soup.find_all("li", class_="grid mq640-grid-12 text13 pa0 grid-6 last")
    journals.extend(journal_last)
    
    journal_list = []
    for journal in journals:
        
        title = journal.find("a").text
        link = journal.find("a")["href"]
        link = f"https://www.nature.com{link}"
        journal_list.append({"title": title, "link": link})

    return pd.DataFrame(journal_list)



In [None]:
df_index = get_journal_index()
df_index.head()

In [None]:
df_index.shape

In [None]:
df_index['title'][0]

In [None]:
df_index['link'][0]

In [None]:
df_index.head()

In [None]:
df_index

In [None]:
def get_journal_articles(url):
    """
    This function scrapes the Nature Climate Change website for articles published in a given journal.
    
    Parameters:
    link: str, the URL of the journal
    
    return: DataFrame, a DataFrame containing the title, authors, and summary of the articles in the journal

    """
    driver.get(url)
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    articles = soup.find_all("div", class_="u-full-height")
    article_list = []
    for article in articles:
        title = article.find("h3", class_="c-card__title").text
        link = article.find("a", class_="c-card__link u-link-inherit")["href"]
        link = f"https://www.nature.com{link}"
        print(link)

        authors = article.find_all("ul", class_="c-author-list c-author-list--compact c-author-list--truncated")
        authors = [author.text for author in authors]
        authors_tags = article.find_all("ul", class_="c-author-list c-author-list--compact c-author-list--truncated")
        authors = []
        for author_tag in authors_tags:
            author_names = author_tag.find_all("li")
            authors.extend([author_name.text.strip() for author_name in author_names])
        if not authors:
            authors = ["No authors"]
        # summary = article.find("div", class_="c-card__summary").text
        # summary = article.find("div", class_="c-card__summary").text
        # abstract  
        response = requests.get(link)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        abstract = soup.find("div", class_="c-article-section__content")
        if abstract:
            abstract = abstract.text
        else:
            abstract = "No content"
        reference = soup.find("p", class_="c-article-references__text")
        if reference:
            reference = reference.text
        else:
            reference = "No content"
        
        content = soup.find("div", class_="main-content")
        if content:
            content = content.text
        else:
            content = "No content"

        article_list.append({"title": title, "authors": authors, "link": link,"abstract":abstract, "content":content, "reference": reference})
    return pd.DataFrame(article_list)



In [None]:
# get_journal_articles("https://www.nature.com/aps/")

In [None]:
dataset=get_journal_articles("https://www.nature.com/aps/")

In [None]:
dataset.shape

In [None]:
dataset['title'][37]

In [None]:
dataset['authors'][37]

In [None]:
dataset['abstract'][37]

In [None]:
dataset['reference'][37]

In [None]:
# Loop through the links extracted from the df_index DataFrame

all_journal_data = []
for link in df_index['link']:
    try:
        df = get_journal_articles(link)
        all_journal_data.append(df)
        print(f"Extracted data for journal: {link}")
    except requests.exceptions.ReadTimeout:
        print(f"Read timeout occurred for journal: {link}")
    except Exception as e:
        print(f"An error occurred for journal: {link} - {e}")

# Concatenate all data into a single DataFrame
all_journal_data = pd.concat(all_journal_data, ignore_index=True)

'''
extracts the regional Nature websites from the Nature Climate Change website.
    These regions include:

    Nature Africa
    Nature China
    Nature India
    Nature Italy
    Nature Japan
    Nature Middle East
'''
urls = ["https://www.nature.com/natafrica","https://www.nature.com/nindia","https://www.nature.com/natitaly","https://www.natureasia.com/ja-jp","https://www.nature.com/nmiddleeast"]
for url in urls:
    df = get_journal_articles(url)
    all_journal_data.append(df)
    

# Save the concatenated DataFrame to a single CSV file
all_journal_data.to_csv("all_journal_articles.csv", index=False)
print("Saved all_journal_articles.csv")