In [1]:
import datetime
import pickle
import time
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Proxy settings in the mapping shape that requests accepts (scheme -> proxy URL).
# NOTE(review): `proxies` is not passed to any requests.get call visible in this
# notebook, and it only has an "http" entry — confirm whether it is still needed.
http_proxy = "http://92.63.168.248:80"
proxies = dict(http=http_proxy)

In [2]:
def scrape_keywords(keyword, timeout=30):
    """Collect NOS search results for one keyword, page by page.

    Pages are requested starting at 1 until one of the stop conditions is hit:
    a non-200 response, a page without the expected ``form > ul`` container,
    the "Geen resultaten gevonden" marker, or a page with no result links.

    Parameters
    ----------
    keyword : str
        Search term. It is percent-encoded before being placed in the query
        string, so characters such as ``&`` or ``#`` cannot alter the URL.
    timeout : float, optional
        Seconds to wait for each HTTP response (default 30).

    Returns
    -------
    list of dict
        One ``{"title", "url", "date"}`` entry per result link whose URL
        contains ``/artikel/``. ``date`` is formatted ``YYYY-MM-DD``; ``title``
        or ``date`` is None when the corresponding element is missing.

    Raises
    ------
    requests.RequestException
        Propagated from ``requests.get`` on connection errors or timeouts.
    ValueError
        If an article's ``datetime`` attribute does not match the expected
        ``%Y-%m-%dT%H:%M:%S%z`` layout.
    """
    urls = []
    page = 1

    # Loop through the pages of the search results
    while True:
        url = 'https://nos.nl/zoeken?q=' + quote(keyword, safe='') + '&page=' + str(page)
        response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
            # Do not try to parse an error page; return what was collected so far.
            break

        soup = BeautifulSoup(response.text, "html.parser")

        # Locate the result list; bail out instead of raising if the layout differs
        form = soup.find('form')
        search = form.find('ul') if form is not None else None
        if search is None:
            print(f"Unexpected page layout for {url}; stopping.")
            break

        # Check if there are any results
        marker = search.find('span')
        if marker is not None and marker.get_text() == "Geen resultaten gevonden":
            break

        links = search.select('li > a')
        if not links:
            # No links and no "no results" marker: stop rather than page forever.
            break

        # Loop through the articles on the page
        for link in links:
            href = link.get("href")
            if not href:
                continue
            article_url = 'https://nos.nl' + href

            # Filter first, so non-article links are never parsed for title/date
            if '/artikel/' not in article_url:
                continue

            heading = link.find('h2')
            title = heading.get_text().strip() if heading is not None else None

            stamp = link.find('time')
            date = stamp.get('datetime') if stamp is not None else None
            if date:
                parsed_datetime = datetime.datetime.strptime(
                    date, '%Y-%m-%dT%H:%M:%S%z'
                ).strftime('%Y-%m-%d')
            else:
                parsed_datetime = None

            urls.append({"title": title, "url": article_url, 'date': parsed_datetime})

        page += 1

    return urls

In [5]:
# Search terms, each scraped separately with scrape_keywords
keywords = [
    'hittestress',
    'hittegolf',
    'warmte golf',
    'hitteplan',
    'temperatuurrecords',
    'hoge temperaturen',
    'tropische temperaturen',
    'warmte in Nederland',
    'hoge temperaturen Nederland',
    'RIVM hitte',
    'KNMI hitte',
]

# Flat list of every entry returned for every keyword; nothing is de-duplicated here
result = []
for keyword in keywords:
    keyword_hits = scrape_keywords(keyword)
    print(f"{keyword}: {len(keyword_hits)}")
    result.extend(keyword_hits)

hittestress: 44
hittegolf: 412
warmte golf: 12
hitteplan: 82
temperatuurrecords: 26
hoge temperaturen: 598
tropische temperaturen: 168
warmte in Nederland: 452
hoge temperaturen Nederland: 216
RIVM hitte: 64
KNMI hitte: 100


In [6]:
# One row per scraped entry; keep a single row per URL and renumber the rows from 0
data = (
    pd.DataFrame(result)
    .drop_duplicates(subset=['url'])
    .reset_index(drop=True)
)
data.shape

(1448, 3)

In [119]:
# Progress flag: every row starts out as not yet scraped
data["scraped"] = False

# Empty placeholder columns for the per-article details
for placeholder in ("tag", "collections", "text", "subheadings", "image_urls"):
    data[placeholder] = None

# Save the prepared frame to disk
with open('articles.df.pkl', 'wb') as pickle_file:
    pickle.dump(data, pickle_file)

print(data.shape)
data.head()

(1448, 9)


Unnamed: 0,title,url,date,scraped,tag,collections,text,subheadings,image_urls
0,Zuid-Franse druiven groeien door klimaatverand...,https://nos.nl/artikel/2481980-zuid-franse-dru...,2023-07-08,False,,,,,
1,"Steeds meer mensen een airco, maar experts zie...",https://nos.nl/artikel/2481925-steeds-meer-men...,2023-07-07,False,,,,,
2,Hofplein in Rotterdam op de schop: minder auto...,https://nos.nl/artikel/2479795-hofplein-in-rot...,2023-06-21,False,,,,,
3,'Klimaatverandering kan desastreuze gezondheid...,https://nos.nl/artikel/2477696-klimaatverander...,2023-06-04,False,,,,,
4,"Varkens in slachthuizen lijden onnodig, NVWA g...",https://nos.nl/nieuwsuur/artikel/2471637-varke...,2023-04-16,False,,,,,


In [120]:
# NOTE(review): pickle.load can execute arbitrary code from the file it reads; only
# load 'articles.df.pkl' when it was produced by this notebook.
with open('articles.df.pkl', 'rb') as f:
    df_articles = pickle.load(f)

# Class attribute strings of the <div> elements deleted from each page before
# extraction (per the original comment: the recommended-articles and share divs).
classes = ['sc-703c8009-0 bNWqny', 'sc-ec7ecbea-0 ijYtkw sc-e0c07641-6 hkOGKG']


def _extract_article_fields(soup):
    """Extract tag, collections, subheadings, body text and images from a parsed page.

    The <div> elements listed in ``classes`` are removed from ``soup`` first, so
    the passed-in soup is modified.

    NOTE(review): the CSS class names below are hard-coded; presumably they are
    generated by the site's build and may change — re-check them if fields come
    back empty.

    Returns
    -------
    dict
        Keys match the DataFrame columns ``tag``, ``collections``,
        ``subheadings``, ``text`` and ``image_urls``. ``tag`` is None when the
        pill element is absent; list-valued fields are None when empty;
        ``text`` is always a string (possibly empty).
    """
    # Strip the unwanted divs before anything else is read from the page
    for element in classes:
        for div in soup.find_all("div", {'class': element}):
            div.decompose()

    # Tag text: only a missing element is treated as "no tag"
    tag_element = soup.find(True, {'data-testid': "pill-with-label"})
    tag = tag_element.text.strip() if tag_element is not None else None

    # Subheadings
    subheadings = [heading.get_text() for heading in soup.select('h2.sc-b9829a65-0.jOMHUX')]

    # Body text: the first matching div is skipped and divs containing a table
    # body are left out, as in the original extraction.
    body_divs = soup.select('div.sc-e0c07641-1.eHATPt')[1:]
    paragraphs = [div.get_text() for div in body_divs if div.find('tbody') is None]
    text = "\n".join(paragraphs).strip()

    # Images: a <figure> without an <img src=...> is skipped instead of raising
    image_urls = []
    for figure in soup.find_all('figure'):
        image = figure.find('img')
        image_url = image.get('src') if image is not None else None
        if image_url is None:
            continue

        image_caption = figure.find('figcaption')
        if image_caption is not None:
            image_caption = image_caption.get_text().strip()

        image_urls.append({"image_url": image_url, "caption": image_caption})

    # Collections
    collections = [
        paragraph.get_text()
        for paragraph in soup.select('div.sc-703c8009-0.sc-db08e33-0.bNWqny.dxKSik p')
    ]

    return {
        "tag": tag,
        "collections": collections or None,
        "subheadings": subheadings or None,
        "text": text,
        "image_urls": image_urls or None,
    }


# Row labels still to be scraped, taken once up front so a failed URL is not retried forever
pending = df_articles.index[df_articles["scraped"] == False].tolist()
length = len(pending)
number = 1
failed = []

# run through all articles and retrieve tags, collections, subheadings, text, and image urls
for row_label in pending:
    url = df_articles.at[row_label, "url"]

    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as error:
        # Leave the row flagged as not scraped rather than storing an error page
        print(f"Skipping {url}: {error}")
        failed.append(url)
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    fields = _extract_article_fields(soup)

    # add items to dataframe
    for column, value in fields.items():
        df_articles.at[row_label, column] = value
    df_articles.at[row_label, "scraped"] = True

    print(f"{number}/{length} complete.", end="\r")

    number += 1

if failed:
    print(f"\n{len(failed)} article(s) could not be retrieved.")

1448/1448 complete.

In [121]:
# Remove the progress flag and renumber the rows from 0.
# errors='ignore' keeps this cell re-runnable: a second execution no longer raises
# KeyError because the 'scraped' column is already gone.
df_articles = (
    df_articles
    .drop(columns=['scraped'], errors='ignore')
    .reset_index(drop=True)
)

In [122]:
# Write the scraped articles to CSV; with to_csv defaults the row index is written too.
df_articles.to_csv(path_or_buf="nos_keywords2.csv")
# Last expression of the cell: show the frame
df_articles

Unnamed: 0,title,url,date,tag,collections,text,subheadings,image_urls
0,Zuid-Franse druiven groeien door klimaatverand...,https://nos.nl/artikel/2481980-zuid-franse-dru...,2023-07-08,,[Economie],Tegen de heuvels rond Maastricht werden vijfti...,[Einde aan uienteelt],[{'image_url': 'https://cdn.nos.nl/image/2023/...
1,"Steeds meer mensen een airco, maar experts zie...",https://nos.nl/artikel/2481925-steeds-meer-men...,2023-07-07,,[Binnenland],Op hete dagen zijn mobiele airco's in winkels ...,,[{'image_url': 'https://cdn.nos.nl/image/2023/...
2,Hofplein in Rotterdam op de schop: minder auto...,https://nos.nl/artikel/2479795-hofplein-in-rot...,2023-06-21,,"[In samenwerking met, Rijnmond, Regionaal nieuws]",Het Hofplein in het centrum van Rotterdam gaat...,[Kosten hoger dan verwacht],[{'image_url': 'https://cdn.nos.nl/image/2023/...
3,'Klimaatverandering kan desastreuze gezondheid...,https://nos.nl/artikel/2477696-klimaatverander...,2023-06-04,Klimaat,"[Collectie, Klimaat, Binnenland, Buitenland]",Dat klimaatverandering kan leiden tot bijvoorb...,"[Directe en indirecte gevolgen, 'Toch wel een ...",[{'image_url': 'https://cdn.nos.nl/image/2023/...
4,"Varkens in slachthuizen lijden onnodig, NVWA g...",https://nos.nl/nieuwsuur/artikel/2471637-varke...,2023-04-16,,"[Nieuwsuur, Binnenland]",Vanaf volgend jaar gaat de overheid optreden t...,"[Vechten, Hittestress]",[{'image_url': 'https://cdn.nos.nl/image/2023/...
...,...,...,...,...,...,...,...,...
1443,KNMI: 's middags kans op zwaar onweer,https://nos.nl/artikel/2045194-knmi-s-middags-...,2015-07-04,,[Binnenland],Zondagmiddag komt er met zwaar onweer een eind...,,[{'image_url': 'https://cdn.nos.nl/image/2015/...
1444,Utrecht klaar voor Grand Départ,https://nos.nl/artikel/2045063-utrecht-klaar-v...,2015-07-04,,[Binnenland],Utrecht maakt zich op voor de start van de Tou...,"[Vignetten , Iets minder warm]",[{'image_url': 'https://cdn.nos.nl/image/2015/...
1445,Code oranje beëindigd; lokaal problemen door b...,https://nos.nl/artikel/2039710-code-oranje-bee...,2015-06-05,,[Binnenland],Het KNMI heeft even na 21.00 uur code oranje b...,"[Code oranje, België]",[{'image_url': 'https://cdn.nos.nl/image/2015/...
1446,90-jarige weerman: dit werk houdt een mens jong,https://nos.nl/artikel/2020237-90-jarige-weerm...,2015-02-19,,[Binnenland],Hij heeft veel bewonderaars die graag een plan...,[Vreeslyk gehuil ],[{'image_url': 'https://cdn.nos.nl/image/2015/...
