## Dependencies

In [1]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pickle
from pprint import pprint
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.chrome.webdriver import WebDriver
import json
import traceback
import pandas as pd
import threading
import os
import base64
import requests
import csv

In [2]:
def web_driver():
    """Create a headless Chrome WebDriver configured for scraping.

    Returns:
        A new ``webdriver.Chrome`` instance started with the flags listed
        below. The function does not close it; that is left to the caller.
    """
    # Flags are applied in this exact order.
    chrome_flags = (
        "--verbose",
        "--no-sandbox",
        "--headless",
        "--disable-gpu",
        "--window-size=1920,1200",
        "--disable-dev-shm-usage",
    )

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    # Start the browser with the options collected above.
    return webdriver.Chrome(options=chrome_options)

## Functions

In [3]:


def wait_resource(driver, string, by='XPATH', retry=3, sec=20):
    """Wait until an element matching ``string`` is present.

    Args:
        driver: A WebDriver, or a WebElement whose subtree should be searched.
        string: The locator expression (XPath, class name, tag name or CSS
            selector, depending on ``by``).
        by: Name of the locator strategy: 'XPATH', 'CLASS_NAME', 'TAG_NAME'
            or 'CSS_SELECTOR'.
        retry: Maximum number of attempts when ``driver`` is a WebDriver.
        sec: Seconds each attempt waits before giving up.

    Returns:
        True as soon as the element is present. False when every attempt
        failed, or immediately after the first failure when ``driver`` is not
        a WebDriver (such an object cannot be refreshed).

    Raises:
        ValueError: If ``by`` is not one of the supported strategy names.
    """
    strategies = {
        'XPATH': By.XPATH,
        'CLASS_NAME': By.CLASS_NAME,
        'TAG_NAME': By.TAG_NAME,
        'CSS_SELECTOR': By.CSS_SELECTOR,
    }
    # Fail loudly on a typo instead of silently refreshing the page and
    # reporting the element as missing.
    if by not in strategies:
        raise ValueError(f"Unsupported locator strategy: {by!r}")
    by_object = strategies[by]

    for _ in range(retry):
        try:
            WebDriverWait(driver, sec).until(
                EC.presence_of_element_located((by_object, string)),
            )
            return True
        except Exception:
            # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit
            # still propagate and the run can be interrupted.
            if not isinstance(driver, WebDriver):
                return False
            # Reload the page before the next attempt.
            driver.refresh()

    return False


def get_category_links(root):
    """Collect the name and URL of every category link on the page at ``root``.

    A dedicated browser is started for the lookup and is always shut down
    before returning, including on the early "not found" exit and on errors.

    Args:
        root: URL of the page that lists the categories.

    Returns:
        A list of ``[name, link]`` pairs, one per ``<a>`` found inside the
        container element; an empty list when the container never appears.
    """
    container_xpath = '/html/body/main/div[2]/div/div[2]/div[3]'
    links = []

    driver = web_driver()
    try:
        driver.get(root)

        if not wait_resource(driver, container_xpath, by='XPATH'):
            print("Errore ottenimento links")
            return []

        div_element = driver.find_element(By.XPATH, container_xpath)
        a_elements = div_element.find_elements(By.TAG_NAME, 'a')

        for count, a in enumerate(a_elements, start=1):
            link = a.get_attribute('href')
            name = a.get_attribute('innerText')
            links.append([name, link])
            print('Link numero', count, 'preso.')
    finally:
        # quit() closes every window and ends the browser session, so no
        # browser process is left behind on any exit path.
        driver.quit()

    return links


def save_file(df, page, category_name, step=10):
    """Append ``df`` to ``./data/<category_name>.csv``, writing the header once.

    The header row is written only when the target file does not exist yet or
    is empty. This keeps the file well-formed when the first chunk saved is a
    "leftover" chunk, and prevents a second header from being inserted when an
    existing file is appended to.

    Args:
        df: DataFrame holding the rows to append. An empty frame is skipped.
        page: Page number that triggered the save. Kept for backward
            compatibility; it no longer decides whether the header is written.
        category_name: Base name (without extension) of the CSV file.
        step: Save interval used by the callers. Kept for backward
            compatibility; unused here.
    """
    # Nothing to persist; also avoids writing a header-only or blank line.
    if df.empty:
        return

    os.makedirs("./data", exist_ok=True)
    path = "./data/" + category_name + ".csv"

    write_header = not os.path.exists(path) or os.path.getsize(path) == 0
    df.to_csv(path, mode='a', index=False, quoting=csv.QUOTE_ALL, header=write_header)


def check_interrupt():
    """Report whether a ``stop.txt`` file exists in the current working directory.

    Returns:
        True (after printing 'Interrupted') when the file is present,
        False otherwise.
    """
    stop_flag = os.path.join(os.getcwd(), 'stop.txt')
    if not os.path.exists(stop_flag):
        return False

    print('Interrupted')
    return True

def encode_image_to_base64(img):
    """Return the Base64 representation of the bytes ``img`` as a UTF-8 string."""
    return str(base64.b64encode(img), "utf-8")

                            
def get_document_data(driver, url):
    """Open ``url`` and extract the question, the answer and their dates.

    Args:
        driver: WebDriver used to load the page.
        url: Address of the page to scrape.

    Returns:
        A dict with keys 'URL', 'question', 'answer', 'quest_date' and
        'ans_date'. Fields whose element is not found stay as empty strings.
    """
    question_css = 'div.col.cons.px-4.pb-0'
    question_time_css = 'time.text-secondary.small'
    answer_css = 'div.col.cons.px-4.pt-4.pb-0'
    answer_footer_css = 'div.col.text-right.small.text-secondary'

    doc = dict.fromkeys(('URL', 'question', 'answer', 'quest_date', 'ans_date'), '')
    doc['URL'] = url

    driver.get(url)

    # Question text; its date is looked up inside the question block itself.
    question_found = wait_resource(driver, question_css, by='CSS_SELECTOR', retry=2, sec=5)
    if question_found:
        question_block = driver.find_element(By.CSS_SELECTOR, question_css)
        doc['question'] = question_block.get_attribute("innerText")

        date_found = wait_resource(question_block, question_time_css, by='CSS_SELECTOR', retry=1, sec=1)
        if date_found:
            question_time = question_block.find_element(By.CSS_SELECTOR, question_time_css)
            doc['quest_date'] = question_time.get_attribute("innerText")

    # Answer text.
    answer_found = wait_resource(driver, answer_css, by='CSS_SELECTOR', retry=2, sec=5)
    if answer_found:
        answer_block = driver.find_element(By.CSS_SELECTOR, answer_css)
        doc['answer'] = answer_block.get_attribute("innerText")

    # Answer date: a <time> tag inside the footer block.
    footer_found = wait_resource(driver, answer_footer_css, by='CSS_SELECTOR', retry=1, sec=1)
    if footer_found:
        footer_block = driver.find_element(By.CSS_SELECTOR, answer_footer_css)
        if wait_resource(footer_block, 'time', by='TAG_NAME', retry=1, sec=1):
            answer_time = footer_block.find_element(By.TAG_NAME, 'time')
            doc['ans_date'] = answer_time.get_attribute("innerText")

    return doc


def get_doctor_data(driver, url):
    """Open a doctor's profile page at ``url`` and extract its fields.

    Args:
        driver: WebDriver used to load the page.
        url: Address of the profile page.

    Returns:
        A dict with keys 'URL', 'name', 'specialization', 'location',
        'ranking', 'n_replies', 'n_likes' and 'picture'. Text fields default
        to '' and the others to None when the matching element is not found.
        'picture' holds the Base64-encoded image, or None when the image could
        not be downloaded.
    """
    doc = {'URL': url, 'name': '', 'specialization': '', 'location': '', 'ranking': None, 'n_replies': None, 'n_likes': None, 'picture': None}

    driver.get(url)

    # Name
    if(wait_resource(driver, 'h1.profilo-h1.text-center.text-md-left', by='CSS_SELECTOR', retry=2, sec=1)):
        elem = driver.find_element(By.CSS_SELECTOR, 'h1.profilo-h1.text-center.text-md-left')
        doc['name'] = elem.get_attribute("innerText")

    # Stars: the value is parsed from the element's inline style, taking the
    # text after the last ': ' and dropping its final character.
    if(wait_resource(driver, 'div.pref.Stars', by='CSS_SELECTOR', retry=2, sec=1)):
        elem = driver.find_element(By.CSS_SELECTOR, 'div.pref.Stars')
        doc['ranking'] = float(elem.get_attribute("style").split(': ')[-1][:-1])

    # N. replies ('.' is stripped before the integer conversion)
    if(wait_resource(driver, '/html/body/main/div/div/section/div[1]/div[3]/p[1]/span[1]', by='XPATH', retry=2, sec=1)):
        elem = driver.find_element(By.XPATH, '/html/body/main/div/div/section/div[1]/div[3]/p[1]/span[1]')
        doc['n_replies'] = int(elem.get_attribute("innerText").replace('.', ''))

    # N. likes
    if(wait_resource(driver, '/html/body/main/div/div/section/div[1]/div[3]/p[2]/span[1]', by='XPATH', retry=2, sec=1)):
        elem = driver.find_element(By.XPATH, '/html/body/main/div/div/section/div[1]/div[3]/p[2]/span[1]')
        doc['n_likes'] = int(elem.get_attribute("innerText").replace('.', ''))

    # Location
    if(wait_resource(driver, 'div.box_sedi', by='CSS_SELECTOR', retry=1, sec=1)):
        elem = driver.find_element(By.CSS_SELECTOR, 'div.box_sedi')                  # only the first address

        # The XPaths start with './/' so the search stays inside `elem`;
        # a leading '//' would search the whole document instead.
        locality_xpath = ".//span[@itemprop='addressLocality']"
        region_xpath = ".//span[@itemprop='addressRegion']"

        location, region = '', ''
        if(wait_resource(elem, locality_xpath, by='XPATH', retry=1, sec=1)):
            loc = elem.find_element(By.XPATH, locality_xpath)
            location = loc.get_attribute("innerText")

        if(wait_resource(elem, region_xpath, by='XPATH', retry=1, sec=1)):
            reg = elem.find_element(By.XPATH, region_xpath)
            region = reg.get_attribute("innerText")

        # e.g. Milano (MI); missing parts are dropped so no stray space is stored
        doc['location'] = ' '.join(part for part in (location, region) if part)

    # Specialization
    if(wait_resource(driver, "//div[@class='row py-2 mx-0']//span[@itemprop='medicalSpecialty']", by='XPATH', retry=1, sec=1)):
        spec = driver.find_element(By.XPATH, "//div[@class='row py-2 mx-0']//span[@itemprop='medicalSpecialty']")
        doc['specialization'] = spec.get_attribute("innerText")

    # Picture
    if(wait_resource(driver, 'img.img-fluid.rounded.m-auto', by='CSS_SELECTOR', retry=1, sec=1)):
        image_elem = driver.find_element(By.CSS_SELECTOR, 'img.img-fluid.rounded.m-auto')
        image_url = image_elem.get_attribute('src')
        if image_url:
            try:
                # A timeout keeps one unresponsive image host from hanging the
                # whole run; HTTP error pages are not stored as pictures.
                response = requests.get(image_url, timeout=10)
                response.raise_for_status()
                doc['picture'] = encode_image_to_base64(response.content)
            except requests.RequestException as exc:
                # The picture is optional: keep the other fields of the record.
                print('Picture download failed', image_url, exc)

    return doc
        


def q_a_scraping(category_name, root, driver):
    """Scrape the question/answer documents of one category, saving in chunks.

    Reads the page count from the listing at ``root`` (capped at 200 pages),
    visits every link with class ``titconsulto`` on each listing page and
    collects one record per link. Records are written through ``save_file``
    every 10 pages and once more at the end for any leftovers.

    Args:
        category_name: Category label; stored in each record under 'Category'
            and used as the CSV file name.
        root: URL of the category listing.
        driver: WebDriver used for all navigation.

    Returns:
        The records collected but not yet saved. This is non-empty only when
        the run stopped because ``check_interrupt`` returned True; otherwise
        an empty list.
    """
    q_a = []
    num_pages = 1
    max_pages = 200
    step = 10

    driver.get(root)

    # get number of pages
    # NOTE(review): a dotted, multi-class value is passed with By.CLASS_NAME;
    # this appears to rely on Selenium turning it into a CSS selector - confirm.
    if(wait_resource(driver, "text-center.numeri-pagina.rounded.shadow", by='CLASS_NAME', retry=2, sec=5)):
        div_element = driver.find_element(By.CLASS_NAME, 'text-center.numeri-pagina.rounded.shadow')
        page_info = div_element.find_element(By.XPATH, './/*[contains(text(), "Pagina ")]')
        num_pages = int(page_info.get_attribute('innerText').split('di ')[-1])

    num_pages = min(num_pages, max_pages)
    print(f'Categoria {category_name} pagina 1/{num_pages}')
    for page in range(1,num_pages+1):
        if (page%step) == 0:
            print(f'Starting {category_name} pagina {page}/{num_pages}')
        try:
            path = root + '?pagina=' + str(page)
            driver.get(path)

            if(wait_resource(driver, "titconsulto", by='CLASS_NAME')):
                a_elements = driver.find_elements(By.CLASS_NAME, "titconsulto")
                # Collect the hrefs first: navigating away invalidates the elements.
                urls = [a.get_attribute("href") for a in a_elements]

                for url in urls:
                    try:
                        if url:
                            if check_interrupt():
                                return q_a

                            # get question, answer and metadata
                            doc = get_document_data(driver, url)
                            doc['Category'] = category_name
                            # save
                            q_a.append(doc)
                        else:
                            # `url` may be None here, so it is printed as a
                            # separate argument instead of concatenated.
                            print('Skip url:', path, url)
                    except Exception:
                        # `Exception` (not a bare except) so KeyboardInterrupt
                        # still stops the run.
                        print('Eccezione url', url)
                        traceback.print_exc()
        except Exception:
            print('Eccezione pagina', page)
            traceback.print_exc()

        if (page%step) == 0:          # save every `step` pages
            df = pd.DataFrame(q_a)
            save_file(df, page, category_name, step)
            print(f'Saved category {category_name} pagina {page}/{num_pages}')
            q_a = []

    if len(q_a) != 0:       # save leftovers
        print('salvando residui')
        df = pd.DataFrame(q_a)
        # 42 is an arbitrary page value for the final, out-of-cycle save.
        save_file(df, 42, category_name, step)
        q_a = []
    return q_a



def doctors_scraping(driver, root):
    """Scrape every doctor profile linked from the listing at ``root``.

    Reads the page count from the listing, visits each listing page, follows
    the profile links found on it and collects one record per profile.
    Records are written through ``save_file`` (file name 'doctors') every 10
    pages and once more at the end for any leftovers.

    Args:
        driver: WebDriver used for all navigation.
        root: URL of the doctors listing.

    Returns:
        The records collected but not yet saved. This is non-empty only when
        the run stopped because ``check_interrupt`` returned True. An empty
        list is returned otherwise, including when the page count is not found.
    """
    doctors = []

    driver.get(root)
    if not wait_resource(driver, 'span.page-numbers.current.px-3', by='CSS_SELECTOR', retry=1, sec=1):
        print('Number of pages not found')
        return []

    # The page count is the number after the last 'di ' in the pager text.
    span = driver.find_element(By.CSS_SELECTOR, 'span.page-numbers.current.px-3').get_attribute("innerText")
    num_pages = int(span.split('di ')[-1])

    step = 10
    print('Starting page 1')
    for page in range(1, num_pages+1):
        if (page%step) == 0:
            print(f'Starting page {page}/{num_pages}')
        try:
            path = root + '?pagina=' + str(page)
            driver.get(path)

            if wait_resource(driver, '/html/body/main/div/div/section[1]/div', by='XPATH', retry=2, sec=1):
                a_elements = driver.find_elements(By.XPATH, '/html/body/main/div/div/section[1]/div//a[contains(@class, "d-block")]')
                # Collect the hrefs first: navigating away invalidates the elements.
                urls = [a.get_attribute("href") for a in a_elements]

                for url in urls:
                    try:
                        if url:
                            if check_interrupt():
                                return doctors

                            # get doctor data
                            doc = get_doctor_data(driver, url)

                            # save
                            doctors.append(doc)
                        else:
                            # `url` may be None here, so it is printed as a
                            # separate argument instead of concatenated.
                            print('URL skipped', path, url)
                    except Exception:
                        # `Exception` (not a bare except) so KeyboardInterrupt
                        # still stops the run.
                        print('Eccezione url', url)
                        traceback.print_exc()

            else:
                print('Page skipped', page)

        except Exception:
            print('Eccezione pagina', page)
            traceback.print_exc()

        if (page%step) == 0:          # save every `step` pages
            df = pd.DataFrame(doctors)
            save_file(df, page, 'doctors', step)
            print(f'Saved pagina {page}/{num_pages}')
            doctors = []

    if len(doctors) != 0:       # save leftovers
        print('salvando residui')
        df = pd.DataFrame(doctors)
        # 42 is an arbitrary page value for the final, out-of-cycle save.
        save_file(df, 42, 'doctors', step)
        doctors = []
    return doctors

### Get category links

In [4]:
# Fetch the [name, link] pair of every category listed on the consultations page.
category_links = get_category_links('https://www.medicitalia.it/consulti/')

Link numero 1 preso.
Link numero 2 preso.
Link numero 3 preso.
Link numero 4 preso.
Link numero 5 preso.
Link numero 6 preso.
Link numero 7 preso.
Link numero 8 preso.
Link numero 9 preso.
Link numero 10 preso.
Link numero 11 preso.
Link numero 12 preso.
Link numero 13 preso.
Link numero 14 preso.
Link numero 15 preso.
Link numero 16 preso.
Link numero 17 preso.
Link numero 18 preso.
Link numero 19 preso.
Link numero 20 preso.
Link numero 21 preso.
Link numero 22 preso.
Link numero 23 preso.
Link numero 24 preso.
Link numero 25 preso.
Link numero 26 preso.
Link numero 27 preso.
Link numero 28 preso.
Link numero 29 preso.
Link numero 30 preso.
Link numero 31 preso.
Link numero 32 preso.
Link numero 33 preso.
Link numero 34 preso.
Link numero 35 preso.
Link numero 36 preso.
Link numero 37 preso.
Link numero 38 preso.
Link numero 39 preso.
Link numero 40 preso.
Link numero 41 preso.
Link numero 42 preso.
Link numero 43 preso.
Link numero 44 preso.
Link numero 45 preso.
Link numero 46 pres

### Q&A scraping

In [6]:
def q_a_scraping_worker(category_name, category_link):
    """Thread target: scrape one category with its own browser.

    Starts a dedicated WebDriver, runs ``q_a_scraping`` and saves whatever
    records it returns unsaved. The browser is always shut down, even when
    scraping or saving raises.

    Args:
        category_name: Category label, also used as the CSV file name.
        category_link: URL of the category listing.
    """
    driver = web_driver()
    try:
        documents = q_a_scraping(category_name, category_link, driver)
        if len(documents) != 0:
            print('Salvando residui')
            df = pd.DataFrame(documents)
            save_file(df, 42, category_name, -1)
    finally:
        # quit() closes every window and ends the session, so a failing
        # thread does not leave a browser process behind.
        driver.quit()

# Threads started below, kept so they can all be joined afterwards.
threads = []


# One thread per category; each worker creates its own browser, and all
# threads are started without any cap on how many run at the same time.
for category_name, category_link in category_links:
    thread = threading.Thread(target=q_a_scraping_worker, args=(category_name, category_link))
    threads.append(thread)
    print('Creating thread')
    thread.start()

# Block until every category worker has finished.
for thread in threads:
    thread.join()


"indici = [21,22,27,31]\ncategories = []\n\nfor i in indici:\n    categories.append(category_links[i])\n\nfor category_name, category_link in categories:\n    thread = threading.Thread(target=q_a_scraping_worker, args=(category_name, category_link))\n    threads.append(thread)\n    print('Creating thread')\n    thread.start()\n\nfor thread in threads:\n    thread.join()"

### Doctors scraping

In [7]:
# Scrape all doctor profiles with a single browser and save any records that
# doctors_scraping returned without having written them to disk.
path = 'https://www.medicitalia.it/specialisti/'
driver = web_driver()
try:
    doctors = doctors_scraping(driver, path)
    if len(doctors):
        print('Salvando i residui')
        df = pd.DataFrame(doctors)
        save_file(df, 42, 'doctors', -1)
finally:
    # Always release the browser, even when scraping or saving fails.
    driver.quit()

Starting page 1
Starting page 10/489
Saved pagina 10/489
Starting page 20/489
Saved pagina 20/489
Starting page 30/489
Saved pagina 30/489
Starting page 40/489
Saved pagina 40/489
Starting page 50/489
Saved pagina 50/489
Starting page 60/489
Saved pagina 60/489
Starting page 70/489
Saved pagina 70/489
Starting page 80/489
Saved pagina 80/489
Starting page 90/489
Saved pagina 90/489
Starting page 100/489
Saved pagina 100/489
Starting page 110/489
Saved pagina 110/489
Starting page 120/489
Saved pagina 120/489
Starting page 130/489
Saved pagina 130/489
Starting page 140/489
Saved pagina 140/489
Starting page 150/489
Saved pagina 150/489
Starting page 160/489
Saved pagina 160/489
Starting page 170/489
Saved pagina 170/489
Starting page 180/489
Saved pagina 180/489
Starting page 190/489
Saved pagina 190/489
Starting page 200/489
Eccezione pagina 200
Saved pagina 200/489


Traceback (most recent call last):
  File "c:\Users\Andrea\miniconda3\envs\scraping\lib\site-packages\urllib3\connection.py", line 175, in _new_conn
    (self._dns_host, self.port), self.timeout, **extra_kw
  File "c:\Users\Andrea\miniconda3\envs\scraping\lib\site-packages\urllib3\util\connection.py", line 95, in create_connection
    raise err
  File "c:\Users\Andrea\miniconda3\envs\scraping\lib\site-packages\urllib3\util\connection.py", line 85, in create_connection
    sock.connect(sa)
TimeoutError: [WinError 10060] Impossibile stabilire la connessione. Risposta non corretta della parte connessa dopo l'intervallo di tempo oppure mancata risposta dall'host collegato

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\Andrea\miniconda3\envs\scraping\lib\site-packages\urllib3\connectionpool.py", line 710, in urlopen
    chunked=chunked,
  File "c:\Users\Andrea\miniconda3\envs\scraping\lib\site-packages\urllib3\connec

Starting page 210/489
Saved pagina 210/489
Starting page 220/489
Saved pagina 220/489
Starting page 230/489
Saved pagina 230/489
Starting page 240/489
Saved pagina 240/489
Starting page 250/489
Saved pagina 250/489
Starting page 260/489
Saved pagina 260/489
Starting page 270/489
Saved pagina 270/489
Starting page 280/489
Saved pagina 280/489
Starting page 290/489
Saved pagina 290/489
Starting page 300/489
Saved pagina 300/489
Starting page 310/489
Saved pagina 310/489
Starting page 320/489
Saved pagina 320/489
Starting page 330/489
Saved pagina 330/489
Starting page 340/489
Saved pagina 340/489
Starting page 350/489
Saved pagina 350/489
Starting page 360/489
Saved pagina 360/489
Starting page 370/489
Saved pagina 370/489
Starting page 380/489
Saved pagina 380/489
Starting page 390/489
Saved pagina 390/489
Starting page 400/489
Saved pagina 400/489
Starting page 410/489
Saved pagina 410/489
Starting page 420/489
Saved pagina 420/489
Starting page 430/489
Saved pagina 430/489
Starting pa

In [8]:
# Reload the doctors CSV written by the scraping step and display it.
df = pd.read_csv('./data/doctors.csv')
df

Unnamed: 0,URL,name,specialization,location,ranking,n_replies,n_likes,picture
0,https://www.medicitalia.it/m.cecchini/,Dr. Maurizio Cecchini,Cardiologia,Pisa (PI),4.7,109073.0,3635.0,UklGRtwWAABXRUJQVlA4INAWAABwgACdASrIAPoAPk0ejE...
1,https://www.medicitalia.it/a.ferraloro/,Dr. Antonio Ferraloro,Neurologia,Messina (ME),4.9,75488.0,2330.0,UklGRroWAABXRUJQVlA4IK4WAACQdACdASrIAPoAPlEijk...
2,https://www.medicitalia.it/felice.cosentino/,Dr. Felice Cosentino,Gastroenterologia e endoscopia digestiva,Milano (MI),4.8,71469.0,2264.0,UklGRnwiAABXRUJQVlA4IHAiAABQkwCdASrIAPoAPlEkjk...
3,https://www.medicitalia.it/giovanniberetta/,Dr. Giovanni Beretta,Andrologia,Firenze (FI),4.8,56298.0,1226.0,UklGRuYVAABXRUJQVlA4INoVAACQYgCdASrIAPoAPlEkj0...
4,https://www.medicitalia.it/matteopacini/,Dr. Matteo Pacini,Psichiatria,Milano (MI),4.8,43952.0,998.0,UklGRiwzAABXRUJQVlA4ICAzAADwfQCdASrIAPoAPlEijU...
...,...,...,...,...,...,...,...,...
9752,https://www.medicitalia.it/antoniovarriale/,Dr. Antonio Varriale,Odontoiatria e odontostomatologia,Giugliano In Campania (Napoli) (NA),0.0,0.0,0.0,UklGRuQTAABXRUJQVlA4INgTAAAQWwCdASrIAPoAPlEkj0...
9753,https://www.medicitalia.it/yleniacrocetto/,Dr.ssa Ylenia Crocetto,Psicologia,Caserta (CE),0.0,0.0,0.0,UklGRgASAABXRUJQVlA4IPQRAABQTwCdASrIAPoAPlEkj0...
9754,https://www.medicitalia.it/giorgiapurisiol/,Dr.ssa Giorgia Purisiol,Psicologia,Verona (VR),0.0,0.0,0.0,UklGRl4ZAABXRUJQVlA4IFIZAADwdACdASrIAPoAPlEkj0...
9755,https://www.medicitalia.it/mariagiuseppacuttano/,Dr.ssa Maria Giuseppa Cuttano,Urologia,Pisa (PI),0.0,0.0,0.0,UklGRgASAABXRUJQVlA4IPQRAABQTwCdASrIAPoAPlEkj0...
