In [19]:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm.auto import tqdm

from dire33Utils import getHrefLinks, getQandA, loadSetPickleSnapshot, createSetPickleSnapshot

In [20]:
# Chrome options shared by the drivers that are created with `options=options`.
options = Options()
# `options.headless = True` is the deprecated setter; on recent Selenium 4
# releases the property no longer exists and the assignment silently does
# nothing. Passing the Chrome flag works regardless of the Selenium version.
options.add_argument("--headless=new")
options.add_argument("--window-size=1280,1024")


In [21]:
def getAllHrefForArgument(driver: webdriver.Chrome, link: str, numeroPagine: int) -> set[str]:
    """
    Collect the href links listed on up to `numeroPagine` result pages of `link`.

    The function opens `link`, then for each page waits (max 5 s) for an element
    with class ``elencoA``, hands every such element to `getHrefLinks`, and moves
    on by writing the loop index into the ``pgg`` form field and submitting the
    ``paginazione`` form through JavaScript.

    NOTE(review): the page opened by `driver.get` is scraped in the first
    iteration and the form is then submitted with ``pgg`` equal to that same
    index. If ``pgg`` is 1-based this re-requests the page that was just
    scraped and the last page is never visited -- confirm the site's numbering.

    :param driver: Selenium Chrome driver used for navigation and scraping
    :type driver: webdriver.Chrome
    :param link: URL of the first listing page
    :type link: str
    :param numeroPagine: maximum number of listing pages to scrape
    :type numeroPagine: int
    :return: the links gathered so far; scraping stops early (without raising)
        when the listing does not show up in time or the pagination script fails
    """
    driver.get(link)
    linkList: set[str] = set()
    for currentIndex in tqdm(range(1, numeroPagine + 1)):
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'elencoA')))
        except WebDriverException:
            # TimeoutException is a WebDriverException subclass: no listing
            # appeared, so there is nothing more to scrape. Catching the
            # Selenium base class (instead of a bare `except:`) keeps the
            # best-effort behaviour but lets KeyboardInterrupt through.
            break
        elencoWebElementList = driver.find_elements(By.CLASS_NAME, "elencoA")
        linkList.update(getHrefLinks(elencoWebElementList))
        if currentIndex == numeroPagine:
            # Nothing is scraped after the last iteration, so loading one more
            # page would only waste a request.
            break
        try:
            # The index is passed as a script argument rather than interpolated
            # with a multi-line f-string, which only parses on Python >= 3.12.
            driver.execute_script(
                "document.getElementById('pgg').value = arguments[0]; paginazione.submit();",
                str(currentIndex))
        except WebDriverException:
            # Pagination form missing or script failed: return what we have.
            break
    return linkList

In [27]:
def getTextFromArticle(driver: webdriver.Chrome, link: str) -> str:
    """
    Open `link` and return the visible text of its ``txtArticolo`` element.

    The element is awaited for up to 5 seconds. If it is present but its text
    is still empty, the function polls for up to 5 more seconds until the text
    becomes non-empty (the previous retry only re-waited for the presence of an
    element that was already there, so it re-read the same empty text at once).

    :param driver: Selenium Chrome driver used to load the page
    :type driver: webdriver.Chrome
    :param link: URL of the article page
    :type link: str
    :return: the element's text, or an empty string if it never gets any text
    :raises TimeoutException: if no ``txtArticolo`` element appears within 5 s
    """
    locator = (By.CLASS_NAME, 'txtArticolo')
    driver.get(link)
    article = WebDriverWait(driver, 5).until(
        EC.presence_of_element_located(locator))
    result = article.text
    if result:
        return result

    try:
        # `until` returns the first truthy value produced by the callable, i.e.
        # the non-empty text. The element is looked up again on every poll so a
        # re-rendered node does not leave us holding a stale reference.
        return WebDriverWait(driver, 5).until(
            lambda d: d.find_element(*locator).text)
    except TimeoutException:
        # Keep the original contract: an article without text yields "" so the
        # caller can decide whether to retry.
        return ""

In [None]:
# Scrape the article links of every topic listing into `linkSet`.
driver = webdriver.Chrome(options=options)
linkSetPath = "dire33LinkSet.pkl"
linkSet = set()

# Maximum number of listing pages visited per topic.
pagesPerTopic = 40

linkToSearchList = [
    "https://www.dica33.it/esperto-risponde/stomaco/",
    "https://www.dica33.it/esperto-risponde/reflusso/",
    "https://www.dica33.it/esperto-risponde/celiachia/",
    "https://www.dica33.it/esperto-risponde/peso/",
    "https://www.dica33.it/esperto-risponde/domande-stomaco-e-intestino/",
    "https://www.dica33.it/esperto-risponde/domande-mente-e-cervello/",
    "https://www.dica33.it/esperto-risponde/domande-scheletro-e-articolazioni/",
    "https://www.dica33.it/esperto-risponde/domande-fegato/",
    "https://www.dica33.it/esperto-risponde/domande-pelle/",
    "https://www.dica33.it/esperto-risponde/domande-cuore-circolazione-e-malattie-del-sangue/",
    "https://www.dica33.it/esperto-risponde/domande-orecchie-naso-e-gola/",
    "https://www.dica33.it/esperto-risponde/domande-occhio-e-vista/",
]

try:
    for link in tqdm(linkToSearchList):
        linkSet.update(getAllHrefForArgument(driver, link, pagesPerTopic))
finally:
    # Always release the browser, even when the cell is interrupted or a
    # topic fails; otherwise the Chrome process is left running.
    driver.quit()
 

In [None]:
# Hand the scraped link set and the snapshot path to createSetPickleSnapshot
# (helper from dire33Utils; presumably pickles the set to that file -- confirm).
# linkSetPath is re-assigned so this cell does not depend on the scraping
# cell's copy of the variable.
linkSetPath = "dire33LinkSet.pkl"
createSetPickleSnapshot(linkSet,linkSetPath)

In [23]:
# Reload the link snapshot via loadSetPickleSnapshot (helper from dire33Utils)
# so the cells below can run without repeating the scraping step.
# NOTE(review): assumed to return the set stored by createSetPickleSnapshot;
# if it unpickles the file, only load snapshots you created yourself.
linkSetPath = "dire33LinkSet.pkl"
pickleLinkSet = loadSetPickleSnapshot(linkSetPath)

In [26]:
# Smoke test on a single article. The driver is bound to a name and quit in
# `finally`; passing an anonymous webdriver.Chrome() left the browser running.
sampleDriver = webdriver.Chrome()
try:
    sampleText = getTextFromArticle(sampleDriver, "https://www.dica33.it/esperto-risponde/domanda-naso-chiuso-opzioni-e-opportunita-211907.asp")
finally:
    sampleDriver.quit()
sampleText

"Domande e risposte »\nRisposte di Orecchie naso e gola »\n\n06 maggio 2012\nNaso chiuso: opzioni e opportunità\nSalve, ho praticamente da sempre il naso chiuso e mi sono finalmente deciso ad approfondire il problema.\nLe analisi hanno fornito i seguenti risultati:\nRinomanometria nasale: con pressione 75 il flusso narice sinistra è di 28 ccm/s mentre il flusso narice destra è di 32 ccm/s.\nTAC del massiccio facciale ha evidenziato: Scoliosi del setto nasale con sperone osteo-cartilagineo aggettante nella porzione inferiore della fossa nasale di sinistra e riduzione del corrispondente spazio aereo che è ai limiti della norma anche controlateralmente per il ridotto diametro trasverso.\nCoesistenza di pseudocisti mucose nei recessi alveolari di ambedue i seni mascellari. In via collaterale discreta tumefazione dei tessuti molli e della volta del cavo rinofaringeo con ridotta evidenza, bilateralmente, dei recessi tubarici e delle fossette laterali. Conservata la pneumatizzazione delle mas

In [29]:
from concurrent.futures import ThreadPoolExecutor
from selenium import webdriver
from queue import Queue

# Idle browsers; a worker takes one, uses it and puts it back.
driver_queue = Queue()

# Links whose article text came back empty and should be fetched again.
retrySet = set()

def create_driver():
    """Start a Chrome driver, register it in `driver_queue` and return it."""
    driver = webdriver.Chrome()  
    driver_queue.put(driver)
    return driver


num_drivers = 10
drivers = [create_driver() for _ in range(num_drivers)]

def process_link(link):
    """
    Fetch the article text for `link` with a driver borrowed from the pool.

    :param link: article URL
    :return: the article text; ``'Error'`` if fetching raised. Links that
        yield an empty text are also recorded in `retrySet`.
    """
    driver = driver_queue.get()
    try:
        result = getTextFromArticle(driver, link)
        if (result is None or result == ""):
            print(f"Result for {link} is empty")
            retrySet.add(link)
    except Exception as e:
        print(f"Error processing link {link}: {str(e)}")
        result = 'Error'
    finally:
        # Return the driver to the pool whatever happened.
        driver_queue.put(driver)
    return result


def _scrape_links(links):
    """Run `process_link` over `links` on the pool; results keep the input order."""
    with ThreadPoolExecutor(max_workers=num_drivers) as executor:
        return list(tqdm(executor.map(process_link, links), total=len(links)))


# Freeze the iteration order so every result can be paired with its link.
allLinks = list(pickleLinkSet)
resultsByLink = {}
try:
    resultsByLink.update(zip(allLinks, _scrape_links(allLinks)))

    # Snapshot and clear before retrying: workers add to retrySet while the
    # pool is running, and iterating the live set could raise
    # "Set changed size during iteration". After the second pass retrySet
    # holds only the links that are still empty.
    retryLinks = list(retrySet)
    retrySet.clear()
    if retryLinks:
        # Merge instead of overwriting, so first-pass results are not lost.
        resultsByLink.update(zip(retryLinks, _scrape_links(retryLinks)))
finally:
    # Close every pooled browser, also when the run is interrupted.
    for pooledDriver in drivers:
        try:
            pooledDriver.quit()
        except Exception as e:
            print(f"Could not close driver: {str(e)}")

# One entry per link, in the same order as `allLinks`.
results = [resultsByLink[link] for link in allLinks]

  0%|          | 0/8177 [00:00<?, ?it/s]

Result for https://www.dica33.it/esperto-risponde/domanda-celiachia-e-le-medicine-5090.asp is empty
Result for https://www.dica33.it/esperto-risponde/domanda-itraconazolo-e-integratori-228836.asp is empty
Result for https://www.dica33.it/esperto-risponde/domanda-valore-ggt-sballato-279793.asp is empty
Result for https://www.dica33.it/esperto-risponde/domanda-reflusso-273682.asp is empty
Result for https://www.dica33.it/esperto-risponde/domanda-deodorante-anti-sudorazione-e-depilazione-laser-294253.asp is empty
Result for https://www.dica33.it/esperto-risponde/domanda-sinusite-purulenta-221266.asp is empty
Result for https://www.dica33.it/esperto-risponde/domanda-risonanza-magnetica-296113.asp is empty
Result for https://www.dica33.it/esperto-risponde/domanda-colesterolo-ldl-269751.asp is empty
Result for https://www.dica33.it/esperto-risponde/domanda-levopraid-298381.asp is empty
Result for https://www.dica33.it/esperto-risponde/domanda-fenomeno-di-raynaud-317076.asp is empty
Result fo

KeyboardInterrupt: 

In [None]:
import pickle

# Serialise QandA to 'dire33.pkl' in binary mode.
# NOTE(review): QandA is not assigned in any visible cell -- confirm where it
# is built before relying on this cell after a kernel restart.
with open('dire33.pkl', 'wb') as pickleFile:
    pickle.dump(QandA, pickleFile)

In [None]:
# Print the number of entries in QandA.
# NOTE(review): QandA is not assigned in any visible cell -- confirm its origin.
print(len(QandA))