### Dependencies

In [48]:
import pandas as pd
import os
import base64
import requests
import csv
import traceback
import re
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.webdriver import WebDriver

In [49]:
def web_driver():
    """Create a Chrome WebDriver configured with the scraper's command-line flags.

    Returns:
        The ``webdriver.Chrome`` instance built from the options below.
    """
    chrome_flags = (
        "--verbose",
        '--no-sandbox',
        '--headless',
        '--disable-gpu',
        "--window-size=1920,1200",
        '--disable-dev-shm-usage',
    )

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    # Hand the fully populated options object to a fresh Chrome session.
    return webdriver.Chrome(options=chrome_options)

### Funzioni

In [87]:
def wait_resource(driver, string, by='XPATH', retry=3, sec=20):
    """Wait for an element to be present, refreshing the page between attempts.

    Args:
        driver: Object handed to ``WebDriverWait``. Only instances of the
            Chrome ``WebDriver`` class are refreshed and retried; for anything
            else the first failed wait ends the call.
        string: Locator value (XPath expression, CSS selector, class name, ...).
        by: Name of a ``By`` locator constant, e.g. ``'XPATH'``, ``'CLASS_NAME'``,
            ``'TAG_NAME'`` or ``'CSS_SELECTOR'``.
        retry: Maximum number of wait attempts.
        sec: Timeout, in seconds, of each individual wait.

    Returns:
        True as soon as the element is present, False when every attempt failed.

    Raises:
        ValueError: If ``by`` does not name a ``By`` locator constant.
    """
    # Resolve the locator strategy up front: an unknown name used to leave the
    # variable unbound, and the resulting error was swallowed by the retry loop.
    by_object = getattr(By, by, None) if isinstance(by, str) and by.isupper() else None
    if by_object is None:
        raise ValueError(f"Unsupported locator strategy: {by!r}")

    retries = 0
    while retries < retry:
        try:
            WebDriverWait(driver, sec).until(
                EC.presence_of_element_located((by_object, string)),
            )
            return True
        except Exception:
            # ``Exception`` (not a bare ``except``) so Ctrl-C still interrupts.
            if not isinstance(driver, WebDriver):
                return False
            # NOTE: the page is refreshed after every failed attempt,
            # including the last one.
            driver.refresh()
            retries += 1

    return False

def save_file(df, page, category_name, step=10):
    """Append ``df`` to ``./data/<category_name>.csv``, writing the header once.

    The header row is written only when the target file does not exist yet or
    is empty, so chunks can be appended in any order (including the residual
    save after an interrupted run) without losing or duplicating the header.

    Args:
        df: DataFrame holding the rows to append. An empty frame is ignored.
        page: Page counter supplied by the caller. Kept for interface
            compatibility; it no longer decides whether the header is written.
        category_name: Base name (without extension) of the CSV file.
        step: Save interval supplied by the caller. Kept for interface
            compatibility only.
    """
    if df.empty:
        # Nothing to persist; avoids writing a bogus header for an empty chunk.
        return

    path = "./data/" + category_name + ".csv"
    # The output directory may not exist on a fresh checkout.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    write_header = not os.path.exists(path) or os.path.getsize(path) == 0
    df.to_csv(path, mode='a', index=False, quoting=csv.QUOTE_ALL, header=write_header)

def check_interrupt():
    """Report whether a ``stop.txt`` marker file exists in the working directory.

    Returns:
        True (after printing 'Interrupted') when the marker exists, else False.
    """
    stop_marker = os.path.join(os.getcwd(), 'stop.txt')
    if not os.path.exists(stop_marker):
        return False

    print('Interrupted')
    return True

def encode_image_to_base64(img):
    """Return the base64 encoding of the bytes ``img`` as a UTF-8 decoded string."""
    return str(base64.b64encode(img), "utf-8")


def get_doctor_data(driver, url, image_timeout=30):
    """Load a doctor's profile page and extract the fields of interest.

    Each field is optional: when its element is missing or its text cannot be
    parsed, the default value is kept instead of aborting the whole record.

    Args:
        driver: Selenium driver used to open ``url`` and query the page.
        url: Address of the doctor's profile page.
        image_timeout: Timeout, in seconds, for downloading the profile picture.

    Returns:
        A dict with keys ``URL``, ``name``, ``specialization``, ``n_replies``
        (int, default 0) and ``picture`` (base64 string, or None when no
        picture could be downloaded).
    """
    article = '//*[@id="wrap"]/div/div/div/div[2]/div[1]/main/article'
    details = article + '/div[14]'

    def inner_text(xpath):
        """Return the innerText of the element at ``xpath``, or None if absent."""
        if wait_resource(driver, xpath, by='XPATH', retry=1, sec=1):
            return driver.find_element(By.XPATH, xpath).get_attribute("innerText")
        return None

    doc = {'URL': url, 'name': '', 'specialization': '', 'n_replies': 0, 'picture': None}
    driver.get(url)

    # Name
    name = inner_text(article + '/h1/b')
    if name is not None:
        doc['name'] = name

    # Specialization: only read the details block when its third heading says so.
    heading = inner_text(details + '/h2[3]/span')
    if heading and 'Specializzazione' in heading:
        block = inner_text(details)
        # Guard the split: without the marker, indexing [1] would raise.
        if block and 'Specializzazione\n' in block:
            doc['specialization'] = block.split('Specializzazione\n', 2)[1].split(';')[0]

    # N. replies: the counter is either a link inside the list item or the item itself.
    for xpath in (details + '/ul[1]/li/a', details + '/ul[1]/li'):
        text = inner_text(xpath)
        if text is None:
            continue
        try:
            doc['n_replies'] = int(text.split(' rispost')[0])
        except ValueError:
            # Unexpected text: keep the default of 0 rather than losing the doctor.
            pass
        break

    # Picture
    img_xpath = "//div[@class='txtArticolo']/img"
    if wait_resource(driver, img_xpath, by='XPATH', retry=1, sec=1):
        image_url = driver.find_element(By.XPATH, img_xpath).get_attribute('src')
        if image_url:
            try:
                # A timeout keeps one stalled download from hanging the run, and
                # raise_for_status avoids storing an HTTP error page as the picture.
                response = requests.get(image_url, timeout=image_timeout)
                response.raise_for_status()
                doc['picture'] = encode_image_to_base64(response.content)
            except requests.RequestException:
                traceback.print_exc()

    return doc


def doctors_scraping(driver, root):
    """Scrape every doctor profile linked from the alphabetical index pages.

    For each letter in ``pages`` the index page ``root + letter`` is opened,
    the profile links are collected and each profile is scraped with
    ``get_doctor_data``. Results are flushed to ``doctors.csv`` through
    ``save_file`` every ``step`` index pages, and once more at the end.

    Args:
        driver: Selenium driver used for all navigation.
        root: Base URL of the index; the page letter is appended to it.

    Returns:
        The doctors collected but not yet saved when the run is interrupted
        via ``check_interrupt``; an empty list otherwise.
    """
    doctors = []

    pages = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'z']

    step = 5
    for i, page in enumerate(pages):
        page_number = i+1
        print(f'Starting page {page_number}/{len(pages)} ({page})')
        try:
            path = root + page
            driver.get(path)

            if wait_resource(driver, 'div.txtArticolo', by='CSS_SELECTOR', retry=2, sec=2):

                a_elements = driver.find_elements(By.XPATH, "//div[@class='txtArticolo']/a")

                # Every profile link appears twice on the page; dict.fromkeys
                # removes duplicates while keeping the page order reproducible.
                urls = list(dict.fromkeys(a.get_attribute("href") for a in a_elements))

                for j, url in enumerate(urls):
                    try:
                        print(f'Pagina {page_number} ({page}), url {j}, {url}')
                        if url:
                            if check_interrupt():
                                return doctors

                            # Scrape the profile and queue it for the next save.
                            doctors.append(get_doctor_data(driver, url))
                        else:
                            # url may be None here, so it must not be concatenated.
                            print('URL skipped', url)
                    except Exception:
                        # Log and move on to the next profile; Ctrl-C still propagates.
                        print('Eccezione url', url)
                        traceback.print_exc()

            else:
                print('Page skipped', page)

        except Exception:
            print('Eccezione pagina', page)
            traceback.print_exc()

        if (page_number%step) == 0:          # save every 'step' pages
            df = pd.DataFrame(doctors)
            save_file(df, page_number, 'doctors', step)
            print(f'Saved pagina {page_number}/{len(pages)}')
            doctors = []

    if len(doctors) != 0:       # save what is left after the last full step
        print('salvando residui')
        df = pd.DataFrame(doctors)
        save_file(df, 42, 'doctors', step)
        doctors = []
    return doctors

### Q&A scraping

In [51]:
def _extract_or_none(page_driver, xpath, extract):
    """Return ``extract(element)`` for the first ``xpath`` match, or None on any failure."""
    try:
        return extract(page_driver.find_element(By.XPATH, xpath))
    except Exception:
        return None


def scraping(j,k,driver):
    """Scrape the question/answer pages linked from listing pages ``j`` to ``k - 1``.

    For each listing page the pagination form is submitted through JavaScript,
    the Q&A links are collected and every link is opened in its own temporary
    driver so the listing driver stays on the current page.

    Args:
        j: First listing page number (inclusive).
        k: Last listing page number (exclusive).
        driver: Selenium driver already positioned on the category listing.

    Returns:
        A DataFrame with one row per Q&A page and the columns ``URL``,
        ``Question Date``, ``Question``, ``Answer`` and ``Doctor profile``;
        fields that could not be read are None.
    """
    article = '//*[@id="wrap"]/div/div/div/div[2]/div[1]/main/article'
    qa_block = article + '/div[15]'

    data = []

    for i in range(j,k):

        next_page_script = "document.getElementById('pgg').value='{}'; paginazione.submit();".format(i)
        driver.execute_script(next_page_script)

        for link in driver.find_elements(By.XPATH, article + '/div[14]/div/div/h3/a'):
            url = link.get_attribute("href")
            if not url:
                # Nothing to open for an anchor without an href.
                continue

            curr_driver = web_driver()
            try:
                curr_driver.get(url)

                # The retry budget is per URL; sharing one counter across the
                # whole listing page would leave later URLs without any wait.
                retries = 0
                while retries <= 5:
                    try:
                        WebDriverWait(curr_driver, 20).until(
                            EC.presence_of_element_located((By.XPATH, qa_block))
                        )
                        break
                    except Exception:
                        curr_driver.refresh()
                        retries += 1

                date = _extract_or_none(curr_driver, qa_block + '/div[4]', lambda el: el.text)
                profile_link = _extract_or_none(
                    curr_driver, qa_block + '/div[5]/div/h2/b/a', lambda el: el.get_attribute("href")
                )
                # The question is the sixth line of the block's rendered text.
                question = _extract_or_none(curr_driver, qa_block, lambda el: el.text.split('\n')[5])
                answer = _extract_or_none(curr_driver, qa_block + '/div[5]/div', lambda el: el.text)
            finally:
                # Always end the per-URL browser session, even when the page failed.
                curr_driver.quit()

            data.append({
                "URL": url,
                "Question Date": date,
                "Question": question,
                "Answer": answer,
                "Doctor profile": profile_link
            })

    df = pd.DataFrame(data)
    return df

In [52]:

def processing(df, category):
    """Split each raw scraped answer into answer date, answer text and location.

    The raw ``Answer`` text is searched for a 'Risposta del <day> <month> <year>'
    date, a leading 'Dott.'/'Dott.ssa'/'Prof.' signature (the answer text is
    what follows it), a trailing signature (the answer text is cut before it)
    and a final '<word> (<word>)' location.

    Args:
        df: DataFrame with the columns ``URL``, ``Question Date``, ``Question``,
            ``Answer`` and ``Doctor profile``. ``Answer`` may be missing
            (None/NaN) for pages that could not be scraped.
        category: Value stored in the ``Category`` column of every output row.

    Returns:
        A new DataFrame with the columns ``URL``, ``Category``,
        ``Question Date``, ``Question``, ``Answer Date``, ``Answer``,
        ``Location`` and ``Doctor profile``. Parts that are not found are None.
    """
    # Compiled once: the patterns do not depend on the row.
    name_words = r'\s+((?:\w+\s?)+)'
    prof_re = re.compile(r'Prof\.' + name_words)
    dottssa_re = re.compile(r'Dott\.ssa' + name_words)
    dott_re = re.compile(r'Dott\.' + name_words)
    any_dott_re = re.compile(r'Dott\.(?:ssa)?' + name_words)
    date_re = re.compile(r'Risposta del (\d+ \w+ \d+)')
    location_re = re.compile(r'(\w+ \(\w+\))$')

    columns = [
        "URL", "Category", "Question Date", "Question",
        "Answer Date", "Answer", "Location", "Doctor profile",
    ]
    new_data = []

    for _, row in df.iterrows():
        raw_answer = row['Answer']
        if not isinstance(raw_answer, str):
            # Failed scrapes store None (or NaN); the regexes need a string.
            raw_answer = ""

        match_prof = prof_re.search(raw_answer)
        match_doctor = dottssa_re.search(raw_answer) or dott_re.search(raw_answer)
        # A doctor signature takes precedence over a professor one.
        leading = match_doctor or match_prof

        match_date = date_re.search(raw_answer)
        match_location = location_re.search(raw_answer)

        location = match_location.group(1) if match_location else None
        date = match_date.group(1).strip() if match_date else None
        answer = raw_answer[leading.end():].strip() if leading else None

        if answer:
            # Cut the trailing signature; 'Dott.ssa' is handled like 'Dott.'
            # so female doctors' signatures are trimmed as well.
            trailing = prof_re.search(answer) or any_dott_re.search(answer)
            if trailing:
                answer = answer[:trailing.start()].strip()

        new_data.append({
                "URL": row["URL"],
                "Category": category,
                "Question Date": row["Question Date"],
                "Question": row["Question"],
                "Answer Date": date,
                "Answer": answer,
                "Location": location,
                "Doctor profile": row["Doctor profile"]
            })

    # Explicit columns keep the schema stable even when there are no rows.
    new_df = pd.DataFrame(new_data, columns=columns)
    return new_df

In [53]:
def q_a_scraping(urls, category_names, max_pages=330, step=10):
    """Scrape the Q&A of every category and append the results to per-category CSVs.

    For each category URL the listing is opened, the number of pages is read
    from the ``paginazione`` element and the pages are scraped in chunks of
    ``step``. Each chunk goes through ``processing`` and is appended to
    ``./data/<category name>.csv``; the header is written with the first chunk.

    Args:
        urls: Category listing URLs.
        category_names: Category names, aligned index by index with ``urls``.
        max_pages: Upper bound on the number of listing pages scraped per category.
        step: Number of listing pages scraped between two writes to disk.
    """
    # The output directory may not exist on a fresh checkout.
    os.makedirs("./data", exist_ok=True)

    for idx,url in enumerate(urls):
        driver = web_driver()
        try:
            driver.get(url)

            retries = 0
            while retries <= 5:
                try:
                    WebDriverWait(driver, 20).until(
                        EC.presence_of_element_located((By.XPATH, '//*[@id="paginazione"]'))
                    )
                    break
                except Exception:
                    driver.refresh()
                    retries += 1
            if(retries == 6):
                # Pagination never appeared: skip this category. The finally
                # clause below still ends the browser session.
                continue

            # Number of pages: second-to-last line of the text preceding 'di'.
            num_pages=driver.find_element(By.XPATH, '//*[@id="paginazione"]').text
            num_pages = num_pages.split('di')[0]
            num_pages = num_pages.split('\n')[-2]
            num_pages = int(num_pages)

            start_page = 1
            num_pages = min(num_pages, max_pages)
            for j in range(start_page,num_pages+1,step):
                # The last chunk is clipped to the final page.
                df = scraping(j, min(j+step, num_pages+1), driver)
                new_df = processing(df, category_names[idx])
                with open("./data/" + category_names[idx] + ".csv",'ab') as f:
                    # Only the first chunk of a category carries the header row.
                    new_df.to_csv(f, index=False, header=(j == start_page))
        finally:
            driver.quit()

### Get category names

In [None]:
# Collect the URL and display name of every category linked from the index page.
path = 'https://www.dica33.it/esperto-risponde/temi-piu-trattati/temi-piu-trattati.asp'

urls = []
category_names = []

driver = web_driver()
try:
    driver.get(path)

    if(wait_resource(driver, 'div.txtArticolo a', by='CSS_SELECTOR', retry=1, sec=1)):
        a_elements = driver.find_elements(By.CSS_SELECTOR, 'div.txtArticolo a')
        urls = [a.get_attribute("href") for a in a_elements]
        category_names = [a.text for a in a_elements]
finally:
    # End the browser session even when the page could not be read.
    driver.quit()

### Scraping

In [None]:
# Run the Q&A scraper over the category URLs and names held in `urls` and `category_names`.
q_a_scraping(urls, category_names)

### Doctors scraping

In [88]:
# Scrape the doctor profiles starting from the alphabetical index.
path = 'https://www.dica33.it/esperto-risponde/medici-specialisti/'
driver = web_driver()
try:
    doctors = doctors_scraping(driver, path)
    # doctors_scraping returns unsaved records only when the run was
    # interrupted through stop.txt; persist them so they are not lost.
    if len(doctors):
        print('Salvando i residui')
        df = pd.DataFrame(doctors)
        save_file(df, 42, 'doctors', -1)
finally:
    # End the browser session even when scraping or saving failed.
    driver.quit()

Starting page 1/24 (a)
Pagina 1 (a), url 0, https://www.dica33.it/esperto-risponde/medici-online-paolo-adamoli-9003.asp
Pagina 1 (a), url 1, https://www.dica33.it/esperto-risponde/medici-online-elsa-alberti-15634.asp
Pagina 1 (a), url 2, https://www.dica33.it/esperto-risponde/medici-online-pietro-arcadi-43165.asp
Pagina 1 (a), url 3, https://www.dica33.it/esperto-risponde/medici-online-fabio-arpaia-13243.asp
Pagina 1 (a), url 4, https://www.dica33.it/esperto-risponde/medici-online-franco-abeniacar-17460.asp
Pagina 1 (a), url 5, https://www.dica33.it/esperto-risponde/medici-online-gian-luca-maria-alati-1171.asp
Pagina 1 (a), url 6, https://www.dica33.it/esperto-risponde/medici-online-alessandro-ahrens-2975.asp
Pagina 1 (a), url 7, https://www.dica33.it/esperto-risponde/medici-online-alberto-alexandre-10059.asp
Pagina 1 (a), url 8, https://www.dica33.it/esperto-risponde/medici-online-giulia-roberta-arcidiacono-41175.asp
Pagina 1 (a), url 9, https://www.dica33.it/esperto-risponde/medici-o

In [89]:
# Reload the doctors CSV from disk and display it.
df = pd.read_csv('./data/doctors.csv')
df

Unnamed: 0,URL,name,specialization,n_replies,picture
0,https://www.dica33.it/esperto-risponde/medici-...,Dott. PAOLO ADAMOLI,Pediatria,2,/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAgGBgcGBQgHBw...
1,https://www.dica33.it/esperto-risponde/medici-...,Dott.ssa ELSA ALBERTI,Pediatria,3,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAUDBAQEAwUEBA...
2,https://www.dica33.it/esperto-risponde/medici-...,Dott. PIETRO ARCADI,Igiene e medicina preventiva,2,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAUDBAQEAwUEBA...
3,https://www.dica33.it/esperto-risponde/medici-...,Dott. FABIO ARPAIA,Odontoiatria,37,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAUDBAQEAwUEBA...
4,https://www.dica33.it/esperto-risponde/medici-...,Dott. FRANCO ABENIACAR,Odontoiatria,1,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAUDBAQEAwUEBA...
...,...,...,...,...,...
2957,https://www.dica33.it/esperto-risponde/medici-...,Dott. ALESSANDRO ZALAFFI,Neurochirurgia,5,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAUDBAQEAwUEBA...
2958,https://www.dica33.it/esperto-risponde/medici-...,Dott.ssa ELENA ZINGARO,,11,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAUDBAQEAwUEBA...
2959,https://www.dica33.it/esperto-risponde/medici-...,Dott. PIETRO ZIZZO,Medicina generale,1,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAUDBAQEAwUEBA...
2960,https://www.dica33.it/esperto-risponde/medici-...,Dott. COSIMO ZAZA,Malattie apparato respiratorio,1,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAUDBAQEAwUEBA...
