In [None]:
import sys
# Append the parent directory ("../." resolves to "..") to the module search
# path. Presumably that is where the project-local `Utils` package imported
# further down lives — TODO confirm.
sys.path.append("../.")

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm.auto import tqdm

from Utils.Medic1Utils import getAllHrefForArgument, getTextFromArticle
from Utils.Utils import createObjectPickleSnapshot

In [None]:
# Chrome configuration used by the link-collection driver.
options = Options()
# Request headless mode through the command-line switch. The `options.headless`
# setter is deprecated since Selenium 4.8 and removed in later releases, where
# the assignment silently becomes a plain attribute and Chrome opens a visible
# window. `--headless` is the switch the old setter added, so behaviour on
# older Selenium versions is unchanged.
options.add_argument("--headless")
# Explicit 1280x1024 browser window.
options.add_argument("--window-size=1280,1024")

In [None]:
# Collect consultation links for every tag listing with a single Chrome
# instance configured by `options`.
driver = webdriver.Chrome(options=options)
# NOTE(review): this path is not used in this cell, and the save cell further
# down reassigns `linkSetPath` before writing the snapshot. It is kept only so
# the notebook namespace stays unchanged.
linkSetPath = "Medic1LinkSet.pkl"
linkSet = set()

linkToSearchList = ["https://www.medicitalia.it/consulti/?tag=celiachia",
                    "https://www.medicitalia.it/consulti/?tag=asma",
                    "https://www.medicitalia.it/consulti/?tag=allergia",
                    "https://www.medicitalia.it/consulti/?tag=insonnia",
                    "https://www.medicitalia.it/consulti/?tag=emicrania",
                    "https://www.medicitalia.it/consulti/?tag=malattia-di-alzheimer",
                    "https://www.medicitalia.it/consulti/?tag=diabete",
                    "https://www.medicitalia.it/consulti/?tag=disturbi-della-vista",
                    "https://www.medicitalia.it/consulti/?tag=dermatite",
                    "https://www.medicitalia.it/consulti/?tag=salute-orale"
                    ]

try:
    # Open the first listing once and click the element with id
    # "pt-accept-all" (the cookie-consent button, going by the variable name)
    # before crawling.
    driver.get(linkToSearchList[0])
    # Wait until the button is actually clickable: an element that is merely
    # present in the DOM can still be hidden or disabled, and clicking it
    # then raises.
    accept_cookies_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, "pt-accept-all")))
    accept_cookies_button.click()
    for link in tqdm(linkToSearchList):
        # 100 is passed through as the third argument of the helper
        # (presumably a page or result limit — TODO confirm).
        linkSet.update(getAllHrefForArgument(driver, link, 100))
finally:
    # Always release the browser process, even when the wait times out or a
    # listing page fails part-way through the crawl.
    driver.quit()

In [None]:
# Snapshot the collected link set to disk (a pickle file, judging by the
# helper's name and the .pkl path) so it can be reloaded later.
linkSetPath = "MedicItaliaLinkSet.pkl"
createObjectPickleSnapshot(linkSet, linkSetPath)

In [None]:
# Take the loader from `Utils.Utils`, the same module this notebook imports
# `createObjectPickleSnapshot` from. The bare `Utils` package only exposes the
# name if its __init__ re-exports it.
# NOTE(review): confirm `loadObjectPickleSnapshot` is defined in Utils/Utils.py.
from Utils.Utils import loadObjectPickleSnapshot


# Reload the link set written by the snapshot cell above.
# NOTE: if the helper uses pickle (as its name suggests), loading executes code
# embedded in the file — only load snapshots this notebook produced.
linkSetPath = "MedicItaliaLinkSet.pkl"
pickleLinkSet = loadObjectPickleSnapshot(linkSetPath)

In [None]:
from concurrent.futures import ThreadPoolExecutor
from selenium import webdriver
from queue import Queue

# Pool of idle browsers: `process_link` takes one from this queue for each
# link and puts it back when done.
driver_queue = Queue()


def create_driver():
    """Start a Chrome instance with default settings and register it in the pool.

    Returns:
        The new WebDriver, which has also been put on ``driver_queue``.
    """
    browser = webdriver.Chrome()
    driver_queue.put(browser)
    return browser


# Size of the browser pool; the thread pool below uses the same number of workers.
num_drivers = 14
drivers = [create_driver() for _ in range(num_drivers)]


def process_link(link):
    """Scrape one link with a browser borrowed from ``driver_queue``.

    Blocks until a driver is available, runs ``getTextFromArticle`` on
    ``link`` and always hands the driver back to the pool afterwards.

    Args:
        link: The link to scrape.

    Returns:
        Whatever ``getTextFromArticle`` returns, or ``None`` when it raised.
        Failures are printed instead of propagated, so one bad page does not
        abort the whole ``executor.map`` run.
    """
    driver = driver_queue.get()
    # Default for the failure path: without it, `return result` raised
    # UnboundLocalError whenever the scrape threw.
    result = None
    try:
        result = getTextFromArticle(driver, link)
    except Exception as e:
        print(f"Error processing link {link}: {str(e)}")
    finally:
        # Return the browser to the pool on success and on failure alike.
        driver_queue.put(driver)
    return result


# Scrape every saved link in parallel, with one worker thread per pooled browser.
try:
    with ThreadPoolExecutor(max_workers=num_drivers) as executor:
        results = list(
            tqdm(executor.map(process_link, pickleLinkSet), total=len(pickleLinkSet)))
finally:
    # The pooled Chrome instances were never shut down. Close them once the
    # run ends (or fails) so no browser processes linger after this cell.
    for pooled_driver in drivers:
        try:
            pooled_driver.quit()
        except Exception as quit_error:
            # Keep closing the remaining browsers even if one refuses to quit.
            print(f"Error closing driver: {quit_error}")

# Drop links that produced no payload so None entries never reach the
# DataFrame step.
results = [scraped for scraped in results if scraped is not None]

In [None]:
import pandas as pd

# Build a table from the scraped results and pickle it as the raw
# ("NotClean") dataset.
df = pd.DataFrame(data=results)
df.to_pickle(path="MedicItaliaDataNotClean.pkl")