In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from tqdm.auto import tqdm
from Medic1Utils import getHrefLinks

In [None]:
# Chrome configuration used by the link-discovery driver.
options = Options()
# The `options.headless = True` setter was deprecated in Selenium 4.8 and removed
# in later 4.x releases, where the assignment merely sets a plain attribute and
# Chrome still opens a window. Passing the flag as a browser argument is the
# supported way to request headless mode.
options.add_argument("--headless=new")
options.add_argument("--window-size=1280,1024")

In [None]:
def getAllHrefForArgument(driver: webdriver.Chrome, link: str, numeroPagine: int) -> set[str]:
    """
    Gather the href links found on the paginated listing that starts at ``link``.

    Pages ``1..numeroPagine`` are requested in order by appending
    ``&pagina=<n>`` to ``link``. On every page the function waits up to 5
    seconds for an element with class ``titconsulto``; if that wait raises, the
    crawl stops and the links gathered so far are returned.

    :param driver: Chrome WebDriver used to load each page
    :type driver: webdriver.Chrome
    :param link: base URL of the listing; the page parameter is appended to it
    :type link: str
    :param numeroPagine: highest page number to request (numbering starts at 1)
    :type numeroPagine: int
    :return: the links that ``getHrefLinks`` extracts from the ``titconsulto``
        elements of every visited page
    :rtype: set[str]
    """
    collectedLinks = set()
    titleLocator = (By.CLASS_NAME, "titconsulto")

    for pageNumber in tqdm(range(1, numeroPagine + 1)):
        driver.get(link + f"&pagina={pageNumber}")

        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located(titleLocator))
        except Exception:
            # Any failure while waiting ends the crawl; keep what was found so far.
            break

        titleElements = driver.find_elements(*titleLocator)
        collectedLinks.update(getHrefLinks(titleElements))

    return collectedLinks

In [None]:
def _readElementText(element) -> tuple:
    """
    Return ``(text, source)`` for a web element.

    The element's ``.text`` is used when it is non-empty (source
    ``"FROM_TEXT"``); otherwise the ``textContent`` attribute is returned
    instead (source ``"FROM_CONTENT"``).
    """
    text = element.text
    if text:
        return text, "FROM_TEXT"
    return element.get_attribute("textContent"), "FROM_CONTENT"


def getTextFromArticle(driver: webdriver.Chrome, link: str) -> tuple[tuple, tuple]:
    """
    Load an article page and return the question text and the expert's answer.

    The page is opened with ``driver`` and the function waits up to 10 seconds
    for the ``txtArticolo`` element. A failed wait is only reported on stdout;
    the elements are looked up regardless, so exceptions raised by
    ``find_element`` (for example when an element is missing) propagate to the
    caller.

    :param driver: Chrome WebDriver used to load the page
    :type driver: webdriver.Chrome
    :param link: URL of the article to read
    :type link: str
    :return: ``((question, question_source), (answer, answer_source))`` where
        the question comes from the ``txtArticolo`` element, the answer from the
        ``espertoRispostaM`` element, and each source is ``"FROM_TEXT"`` or
        ``"FROM_CONTENT"`` depending on how the text was obtained
    :rtype: tuple[tuple, tuple]
    """
    driver.get(link)
    try:
        # The empty string is contained in any text, so this condition is
        # effectively satisfied as soon as the element can be located.
        WebDriverWait(driver, 10).until(
            EC.text_to_be_present_in_element(
                (By.CLASS_NAME, 'txtArticolo'), '')
        )
    except Exception:
        # Best effort: report the failed wait and still try to read the page.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt through.
        print("Timed out waiting for page to load")

    domanda = _readElementText(driver.find_element(By.CLASS_NAME, 'txtArticolo'))
    risposta = _readElementText(driver.find_element(By.CLASS_NAME, "espertoRispostaM"))

    return (domanda, risposta)

In [None]:
driver = webdriver.Chrome(options=options)
# NOTE: this path is reassigned in the snapshot cell before it is ever used.
linkSetPath = "Medic1LinkSet.pkl"
linkSet = set()

# One listing URL per topic tag to crawl.
linkToSearchList = ["https://www.medicitalia.it/consulti/?tag=celiachia",
                    "https://www.medicitalia.it/consulti/?tag=asma",
                    "https://www.medicitalia.it/consulti/?tag=allergia",
                    "https://www.medicitalia.it/consulti/?tag=insonnia",
                    "https://www.medicitalia.it/consulti/?tag=emicrania",
                    "https://www.medicitalia.it/consulti/?tag=malattia-di-alzheimer",
                    "https://www.medicitalia.it/consulti/?tag=diabete",
                    "https://www.medicitalia.it/consulti/?tag=disturbi-della-vista",
                    "https://www.medicitalia.it/consulti/?tag=dermatite",
                    "https://www.medicitalia.it/consulti/?tag=salute-orale"
                    ]

# Upper bound on the number of listing pages requested for each tag.
MAX_PAGES_PER_TAG = 70

try:
    # Dismiss the cookie banner once, on the first listing, before crawling.
    driver.get(linkToSearchList[0])
    # Wait until the button can actually be clicked, not merely until it is in
    # the DOM, so the click does not hit an element that is not interactable yet.
    accept_cookies_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, "pt-accept-all")))
    accept_cookies_button.click()
    for link in tqdm(linkToSearchList):
        linkSet.update(getAllHrefForArgument(driver, link, MAX_PAGES_PER_TAG))
finally:
    # This driver is not used by later cells; close Chrome even if the crawl
    # fails so the browser process is not left running.
    driver.quit()

In [None]:
from Utils import createObjectPickleSnapshot

# Hand the collected link set to the snapshot helper together with the target
# path. Presumably this pickles `linkSet` to disk so the crawl does not have to
# be repeated — the helper lives in Utils and is not shown here; confirm.
linkSetPath = "MedicItaliaLinkSet.pkl"
createObjectPickleSnapshot(linkSet, linkSetPath)

In [None]:
from Utils import loadObjectPickleSnapshot


# Read the link snapshot back from the same path the save cell writes to.
# NOTE(review): if the helper unpickles the file (as its name suggests), only
# load snapshots produced by this notebook — unpickling untrusted files can
# execute arbitrary code.
linkSetPath = "MedicItaliaLinkSet.pkl"
pickleLinkSet = loadObjectPickleSnapshot(linkSetPath)

In [None]:
from concurrent.futures import ThreadPoolExecutor
from selenium import webdriver
from queue import Queue

# Idle browsers available to the worker threads: a worker takes one driver per
# link and puts it back when done, so a driver is never used by two threads at
# the same time.
driver_queue = Queue()


def create_driver():
    """Start a Chrome instance, add it to ``driver_queue`` and return it."""
    # NOTE(review): unlike the link-discovery driver, this one is created
    # without the shared `options` — confirm whether that is intentional.
    driver = webdriver.Chrome()
    driver_queue.put(driver)
    return driver


num_drivers = 14
drivers = []


def process_link(link):
    """
    Scrape one article with a driver borrowed from ``driver_queue``.

    :param link: URL passed to ``getTextFromArticle``
    :return: the value returned by ``getTextFromArticle``, or ``None`` when
        scraping raised (the error is printed)
    """
    driver = driver_queue.get()
    try:
        return getTextFromArticle(driver, link)
    except Exception as e:
        print(f"Error processing link {link}: {str(e)}")
        # Returning None keeps a single bad page from aborting the whole run.
        return None
    finally:
        # Always hand the driver back, otherwise the pool shrinks on every error.
        driver_queue.put(driver)


try:
    for driver_index in range(num_drivers):
        drivers.append(create_driver())

    with ThreadPoolExecutor(max_workers=num_drivers) as executor:
        scraped = list(
            tqdm(executor.map(process_link, pickleLinkSet), total=len(pickleLinkSet)))
finally:
    # Close every browser that was started, even if scraping was interrupted.
    for pooled_driver in drivers:
        try:
            pooled_driver.quit()
        except Exception as e:
            print(f"Error closing driver: {str(e)}")

# Keep only the links that were scraped successfully.
results = [item for item in scraped if item is not None]

In [None]:
import pandas as pd

# Put the scraped results in a DataFrame and write it out as a comma-separated
# file (the DataFrame index is written too, as per the pandas default).
df = pd.DataFrame(data=results)
df.to_csv(path_or_buf="dica33DataNotClean.csv", sep=",")