In [None]:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm.auto import tqdm

from dire33Utils import getHrefLinks, getQandA

In [2]:
# Chrome launch options used by the WebDriver created further down.
options = Options()
# Request headless mode through a command-line switch. The `Options.headless`
# setter is deprecated since Selenium 4.8 and removed in later 4.x releases,
# where assigning it would just create a plain attribute and leave the
# browser running with a visible window.
options.add_argument("--headless")
# Fixed window size for the browser session.
options.add_argument("--window-size=1280,1024")


In [4]:
def getAllHrefForArgument(driver: webdriver.Chrome, link: str, numeroPagine: int) -> set[str]:
    """
    Collect the links found in the 'elencoA' elements of a paginated listing.

    The listing at `link` is opened, then for each index from 1 to `numeroPagine` the
    function waits up to 5 seconds for an element with class name 'elencoA', passes
    all such elements to `getHrefLinks`, and requests another page by writing the
    index into the 'pgg' element and submitting the `paginazione` form via JavaScript.

    :param driver: Chrome WebDriver used to load and paginate the listing
    :type driver: webdriver.Chrome
    :param link: URL of the first listing page
    :type link: str
    :param numeroPagine: Maximum number of scrape-and-paginate iterations to perform
    :type numeroPagine: int
    :return: The set of links gathered; collection stops early (returning what was
    gathered so far) when the listing does not appear in time or the pagination
    script fails
    """
    driver.get(link)
    linkList = set()
    # NOTE(review): the first page is already loaded by `driver.get`, and iteration 1
    # submits pgg='1'. If 'pgg' is 1-based this re-fetches page 1 and never scrapes
    # page `numeroPagine` -- confirm against the site's pagination before changing it.
    for currentIndex in tqdm(range(1, numeroPagine+1)):
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'elencoA')))
        except TimeoutException:
            # No listing within the timeout: treat it as the end of the results.
            # Only the timeout is caught so that interrupts and real errors surface.
            break
        elencoWebElementList = driver.find_elements(By.CLASS_NAME, "elencoA")
        linkList.update(getHrefLinks(elencoWebElementList))
        # Kept on a single line: a line break inside an f-string replacement field
        # is only accepted by Python 3.12+.
        paginationScript = (
            f"document.getElementById('pgg').value='{currentIndex}'; paginazione.submit();")
        try:
            driver.execute_script(paginationScript)
        except WebDriverException:
            # Pagination controls missing or script failed: keep what was collected.
            break
    return linkList

In [5]:
def getTextFromArticle(driver: webdriver.Chrome, link: str) -> tuple[str]:
    """
    Open an article page and return its text as processed by `getQandA`.

    The page is loaded with `driver`; the function then waits up to 5 seconds for an
    element with class name 'txtArticolo' to be present and hands that element's
    `.text` to `getQandA`. The wait is not guarded here, so a timeout raised by
    `WebDriverWait.until` propagates to the caller.

    :param driver: Chrome WebDriver used to load the article
    :type driver: webdriver.Chrome
    :param link: URL of the article to open
    :type link: str
    :return: Whatever `getQandA` returns for the article text (presumably the
    question and the answer -- confirm against `dire33Utils.getQandA`)
    """
    driver.get(link)
    articleLocator = (By.CLASS_NAME, 'txtArticolo')
    articleIsPresent = EC.presence_of_element_located(articleLocator)
    articleElement = WebDriverWait(driver, 5).until(articleIsPresent)
    articleText = articleElement.text
    return getQandA(articleText)

In [6]:
# Topic listing pages whose article links are collected.
linkToSearchList = [
    "https://www.dica33.it/esperto-risponde/stomaco/",
    "https://www.dica33.it/esperto-risponde/reflusso/",
    "https://www.dica33.it/esperto-risponde/celiachia/",
    "https://www.dica33.it/esperto-risponde/peso/",
    "https://www.dica33.it/esperto-risponde/domande-stomaco-e-intestino/",
    "https://www.dica33.it/esperto-risponde/domande-mente-e-cervello/",
    "https://www.dica33.it/esperto-risponde/domande-scheletro-e-articolazioni/",
    "https://www.dica33.it/esperto-risponde/domande-fegato/",
    "https://www.dica33.it/esperto-risponde/domande-pelle/",
    "https://www.dica33.it/esperto-risponde/domande-cuore-circolazione-e-malattie-del-sangue/",
    "https://www.dica33.it/esperto-risponde/domande-orecchie-naso-e-gola/",
    "https://www.dica33.it/esperto-risponde/domande-occhio-e-vista/",
]

# A single browser session, configured by `options`, is reused for every topic.
driver = webdriver.Chrome(options=options)

# Union of the links returned for each topic, with up to 40 listing
# iterations requested per topic.
linkSet = set()
for topicUrl in tqdm(linkToSearchList):
    topicLinks = getAllHrefForArgument(driver, topicUrl, 40)
    linkSet.update(topicLinks)
 

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

NameError: name 'getHrefLinks' is not defined

In [None]:
# Results of `getTextFromArticle` for every collected article link.
QandA = set()
# (link, error) pairs for articles that could not be scraped, kept so that
# failures stay visible instead of being silently dropped.
failedLinks = []

try:
    for link in tqdm(linkSet):
        try:
            QandA.add(getTextFromArticle(driver, link))
        except Exception as error:
            # Best-effort scrape: one bad article must not abort the whole run.
            # Catching `Exception` rather than using a bare `except` lets
            # KeyboardInterrupt/SystemExit through, so the loop can be stopped.
            failedLinks.append((link, repr(error)))
finally:
    # Always release the browser, even when the loop is interrupted.
    driver.quit()

if failedLinks:
    print(f"{len(failedLinks)} of {len(linkSet)} articles could not be scraped")

In [None]:
import pickle

# Persist the collected article links to 'dire33.pkl' in the working directory.
# NOTE(review): only `linkSet` is written; the `QandA` set built in the previous
# cell is not saved anywhere in the visible cells -- confirm this is intended.
with open('dire33.pkl', 'wb') as pickleFile:
    pickle.dump(linkSet, pickleFile)