In [28]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import random
import numpy as np
import pandas as pd
import re

In [29]:
def setup_driver_mac():
    """Create a Chrome WebDriver with a randomized user agent.

    The ChromeDriver binary is resolved through
    ``ChromeDriverManager().install()``. The browser is started with
    ``--start-maximized``, a ``user-agent`` argument picked at random from a
    small pool of desktop Chrome strings, the ``enable-automation`` switch
    excluded and ``useAutomationExtension`` disabled.

    Returns:
        The ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    service = Service(ChromeDriverManager().install())
    options = Options()
    options.add_argument("--start-maximized")

    # Pool of user agents; one is chosen per driver so that successive runs
    # do not all present the same string.
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    ]
    options.add_argument(f"user-agent={random.choice(user_agents)}")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    return webdriver.Chrome(service=service, options=options)

In [30]:
def setup_driver(brave_path=r"C:\Program Files\BraveSoftware\Brave-Browser\Application\brave.exe"):
    """Create a Chrome-compatible WebDriver that launches the Brave browser.

    Args:
        brave_path: Filesystem path of the browser executable assigned to
            ``Options.binary_location``. Defaults to the Windows install
            location this notebook originally hard-coded; pass another path
            to run on a different machine.

    Returns:
        The ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    options = Options()
    options.binary_location = brave_path
    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)

In [31]:
def get_page_source(url, driver, wait_seconds=10):
    """Navigate ``driver`` to ``url`` and return the page's HTML.

    Args:
        url: Address to load.
        driver: Object exposing ``get(url)`` and a ``page_source`` attribute
            (a Selenium WebDriver in this notebook).
        wait_seconds: Fixed pause, in seconds, between navigation and reading
            the source, giving scripts on the page time to render. Defaults
            to 10, the delay this function has always used; values <= 0 skip
            the pause.

    Returns:
        The value of ``driver.page_source`` after the pause.
    """
    driver.get(url)
    if wait_seconds > 0:
        time.sleep(wait_seconds)
    return driver.page_source

In [32]:
def extract_articles_jornal_noticias(soup):
    """Collect article teasers from a parsed jn.pt page.

    Teaser cards are ``<div>`` elements whose class list contains ``light``
    plus one of the component classes ``HC002``, ``ITC001``, ``PC001`` or
    ``ITC002`` (some additionally filtered by ``data-instance``). For each
    card the first ``<a>``, the ``<h2 class="title">`` and the
    ``<div class="info">`` are read.

    Args:
        soup: Parsed document (a ``BeautifulSoup`` object in this notebook).

    Returns:
        A list of dicts with keys ``"title"``, ``"link"`` and ``"category"``.
        Each matched card appears once. ``title`` / ``category`` are ``None``
        when the card lacks the corresponding element; cards without a
        usable ``href`` are skipped.
    """
    # Local import so this cell does not depend on an edit to the import cell.
    from urllib.parse import urljoin

    def _light_divs(component, instance=None):
        """Return <div> tags with classes ``component`` and ``light`` (and, if given, that data-instance)."""
        def _matches(tag):
            if tag.name != "div":
                return False
            classes = tag.get("class", [])
            if component not in classes or "light" not in classes:
                return False
            return instance is None or tag.get("data-instance") == instance

        return soup.find_all(_matches)

    divs_hc002 = _light_divs("HC002")
    divs_itc001 = _light_divs("ITC001", "i01")
    divs_pc001 = _light_divs("PC001", "i01")
    divs_hc002data = _light_divs("HC002", "i02")
    divs_itc002 = _light_divs("ITC002", "i04")

    # The first line used to report the i02-only count; it now reports the
    # unfiltered HC002 count, and the last line is labelled to tell them apart.
    print(f"HC002 light none found: {len(divs_hc002)}")
    print(f"ITC001 light none found: {len(divs_itc001)}")
    print(f"PC001 light corners-square none hover-scale found: {len(divs_pc001)}")
    print(f"ITC002 light corners-square none found: {len(divs_itc002)}")
    print(f"HC002 light none (data-instance i02) found: {len(divs_hc002data)}")

    all_divs = divs_hc002 + divs_itc001 + divs_pc001 + divs_itc002 + divs_hc002data

    articles = []
    seen_divs = set()
    for div in all_divs:
        # The unfiltered HC002 selector also matches every i02 card, so the
        # same element can occur twice; de-duplicate by object identity.
        if id(div) in seen_divs:
            continue
        seen_divs.add(id(div))

        a_tag = div.find("a")
        href = a_tag.get("href") if a_tag is not None else None
        if not href:
            continue

        title_tag = div.find("h2", class_="title")
        info_tag = div.find("div", class_="info")

        articles.append({
            "title": title_tag.get_text(strip=True) if title_tag is not None else None,
            # urljoin leaves absolute URLs alone and resolves relative ones
            # against the site root.
            "link": urljoin("https://www.jn.pt", href),
            "category": info_tag.get_text(strip=True) if info_tag is not None else None,
        })

    return articles

In [33]:
def scroll_and_find_element(driver, locator, scroll_amount=500, timeout=10, pause=20):
    """Look for an element, scrolling down step by step until it is present.

    The element is waited for with ``WebDriverWait``; on each timeout the page
    is scrolled ``scroll_amount`` pixels further (capped at the current
    document height) and the wait is retried. The search is re-run after
    every scroll, including the one that reaches the bottom of the page.

    Args:
        driver: Selenium WebDriver controlling the page.
        locator: ``(By, value)`` tuple passed to
            ``EC.presence_of_element_located``.
        scroll_amount: Pixels to advance per attempt; must be positive.
        timeout: Seconds each ``WebDriverWait`` attempt lasts (default 10).
        pause: Seconds to sleep after each scroll (default 20).

    Returns:
        The located element, or ``None`` once the bottom of the page has been
        reached without finding it.

    Raises:
        ValueError: If ``scroll_amount`` is not positive (the loop could
            never reach the end of the page).
    """
    # Local import so this cell does not depend on an edit to the import cell.
    from selenium.common.exceptions import TimeoutException

    if scroll_amount <= 0:
        raise ValueError("scroll_amount must be positive")

    wait = WebDriverWait(driver, timeout)
    scroll_position = 0
    while True:
        try:
            return wait.until(EC.presence_of_element_located(locator))
        except TimeoutException:
            # Only "not found in time" triggers another scroll; any other
            # error (or Ctrl-C) propagates to the caller.
            # Height is re-read on every attempt because it can change as
            # the page loads more content.
            page_height = driver.execute_script("return document.body.scrollHeight")
            if scroll_position >= page_height:
                print("End of page reached, it didnt find the element")
                return None

            scroll_position = min(scroll_position + scroll_amount, page_height)
            driver.execute_script(f"window.scrollTo(0,{scroll_position})")
            time.sleep(pause)

In [34]:
def extract_full_article(article_url, driver):
    """Load an article page and return the text of its paragraphs.

    The page is fetched through ``get_page_source`` and parsed with
    ``html.parser``; the stripped text of every ``<p>`` element is joined
    with newline characters.
    """
    html = get_page_source(article_url, driver)
    parsed = BeautifulSoup(html, "html.parser")

    paragraph_texts = []
    for paragraph in parsed.find_all("p"):
        paragraph_texts.append(paragraph.get_text(strip=True))
    return "\n".join(paragraph_texts)

In [36]:
def _scroll_through_page(driver, steps=4):
    """Scroll the current page from top to bottom in ``steps`` equal jumps, pausing 1-2 s after each."""
    # Measured per call: each page has its own height.
    total_height = driver.execute_script("return document.body.scrollHeight")
    step_height = total_height / steps
    for step in range(1, steps + 1):
        driver.execute_script(f"window.scrollTo(0, {step * step_height});")
        time.sleep(random.randint(1, 2))


def main():
    """Scrape article teasers from jn.pt and display them as a DataFrame.

    Steps:
      1. Open the home page and click the button at a fixed XPath.
      2. Scroll through the page, then try to open the link inside the
         ``crise-politica`` element.
      3. Extract teasers from the page that is showing, then keep clicking
         the "next page" link and extracting until it can no longer be found
         or clicked.
      4. Build a DataFrame, drop rows with duplicate titles and display it.

    The browser is always closed, including when a step raises.
    """
    url = "https://www.jn.pt/"
    driver = setup_driver_mac()
    articles = []

    try:
        get_page_source(url, driver)
        driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[2]/div[2]/div[2]/button[1]').click()
        time.sleep(1)

        # Scroll the whole home page once before looking for the section link.
        _scroll_through_page(driver)

        time.sleep(1)
        driver.execute_script("window.scrollTo(0, 0);")
        time.sleep(1)

        try:
            element_crise_politica = scroll_and_find_element(driver,(By.XPATH, '//*[@id="crise-politica"]/div/div/div/div[1]/div/a'))
            driver.execute_script("arguments[0].scrollIntoView();", element_crise_politica)
            time.sleep(2)
            element_crise_politica.click()
            time.sleep(10)
            print("Crise Politica button pressed.")
        except Exception:
            # Best effort: scraping continues on whatever page is showing.
            print("Crise Politica couldnt be pressed")

        _scroll_through_page(driver)
        articles += extract_articles_jornal_noticias(BeautifulSoup(driver.page_source, "html.parser"))
        driver.execute_script("window.scrollTo(0, 0);")

        try:
            while True:
                next_page_button = scroll_and_find_element(driver,(By.XPATH,'//*[@id="app"]/div[3]/div[2]/div/div[2]/div[1]/div/div/div/div[13]/div/div/div[6]/a'))
                if next_page_button is None:
                    # No further page: end pagination explicitly instead of
                    # relying on an AttributeError from None.click().
                    print("Next page can't be pressed")
                    break
                next_page_button.click()

                print("Next button pressed.")

                _scroll_through_page(driver)
                articles += extract_articles_jornal_noticias(BeautifulSoup(driver.page_source, "html.parser"))
                driver.execute_script("window.scrollTo(0, 0);")
        except Exception:
            print("Next page can't be pressed")
    finally:
        # Release the browser even if a step above failed.
        driver.quit()

    # Explicit columns keep drop_duplicates working when nothing was scraped.
    df = pd.DataFrame(articles, columns=["title", "link", "category"])
    df = df.drop_duplicates(subset=['title'],ignore_index=True)

    display(df)
    
# Run the scraper only when this code is executed as the main module
# (e.g. as a notebook cell or script), not when it is imported.
if __name__ == "__main__":
    main()


Crise Politica button pressed.
HC002 light none found: 12
ITC001 light none found: 0
PC001 light corners-square none hover-scale found: 0
ITC002 light corners-square none found: 0
HC002 light none found: 12
End of page reached, it didnt find the element
Next page can't be pressed


Unnamed: 0,title,link,category
0,Carneiro critica Governo e compara Conselho de...,https://www.jn.pt/4833319811/carneiro-critica-...,Legislativas
1,Legislativas: Portugueses preferem estabilidad...,https://www.jn.pt/7058462869/legislativas-port...,Sondagem
2,Pedro Nuno acusa Montenegro de ter fugido sist...,https://www.jn.pt/1335174180/pedro-nuno-acusa-...,PS
3,Sondagem Legislativas: AD deixa PS para trás. ...,https://www.jn.pt/1547311803/sondagem-legislat...,Barómetro
4,"Montenegro diz que é tempo de ""olhar para o fu...",https://www.jn.pt/6947033525/montenegro-diz-qu...,Vila do Conde
5,PS vai atualizar programa eleitoral em cinco d...,https://www.jn.pt/1090115365/ps-vai-atualizar-...,Legislativas
6,Montenegro diz ter a maior experiência de escr...,https://www.jn.pt/6810106387/montenegro-diz-te...,Spinumviva
7,Fenprof alerta Governo em gestão para urgência...,https://www.jn.pt/3457907890/fenprof-alerta-go...,Ensino
8,Do semáforo nas intervenções ao chumbo da moçã...,https://www.jn.pt/8290538235/do-semaforo-nas-i...,Legislativas
9,Presidente executiva da Sonae diz que 2025 vai...,https://www.jn.pt/2746641373/presidente-execut...,Estabilidade
