In [224]:
from selenium import webdriver
from selenium.webdriver.common.by import By

import pandas as pd
import re

from os import path
import warnings
import folium
 
warnings.filterwarnings("ignore", category=FutureWarning)

driver = webdriver.Firefox()

In [221]:
# Extract All Reviews from URL
def Get_Reviews_Data_From_URL(url, filename):
    
    # XPath Elements
    total_reivews_xpath = """//*[@id="fullIntro"]/span[1]"""
    category_title_xpath = """//*[@id="layout-content-container"]/div[1]/div[1]/div/ol/li[2]/a/span"""
    subcategory_title_xpath = """//*[@id="feedbacks-index-container"]/div[1]/h1"""
    review_client_fullname_xpath = """//*[@id="rank-{}"]/div/div[1]/span[1]"""
    review_location_xpath = """//*[@id="rank-{}"]/div/div[1]/span[1]/a"""
    review_date_xpath = """//*[@id="rank-{}"]/div/div[1]/span[3]"""
    review_service_description_xpath = """//*[@id="rank-{}"]/div/div[1]/p"""
    review_client_feedback_xpath = """//*[@id="rank-{}"]/div/div[2]/p"""
    review_feedback_general_score_xpath = """//*[@id="rank-{}"]/div/div[3]/div/div/div[1]/div"""
    review_feedback_quality_score_xpath = """//*[@id="rank-{}"]/div/div[3]/div/div/div[2]/div"""
    review_feedback_price_score_xpath = """//*[@id="rank-{}"]/div/div[3]/div/div/div[3]/div"""
    review_feedback_time_score_xpath = """//*[@id="rank-{}"]/div/div[3]/div/div/div[4]/div"""
    review_feedback_treatment_score_xpath = """//*[@id="rank-{}"]/div/div[3]/div/div/div[5]/div"""
    professional_total_reviews_xpath = """//*[@id="feedbacks-section-container"]/div[{}]/div[2]/div/div[2]/div[2]"""
    professional_sysid_xpath = """//*[@id="feedbacks-section-container"]/div[{}]/div[1]/a"""
    professional_image_xpath = """//*[@id="feedbacks-section-container"]/div[{}]/div[1]/span/img"""
    professional_average_score_xpath = """//*[@id="feedbacks-section-container"]/div[{}]/div[2]/div/div[1]/div[2]"""
    # 

    # Initiate Parameters & Webrowser
    driver.get(url)
    page_max = int(int(driver.find_element(By.XPATH, total_reivews_xpath).text)/10) + 1 # Reminder: 10 Reviews per Page --> For ex. 115 reviews means we need to iterate throguh pages (115/10)+1 pages
    category_title = driver.find_element(By.XPATH, category_title_xpath).text
    subcategory_title = driver.find_element(By.XPATH, subcategory_title_xpath).text
    #

    # Initiate DataFrames
    if(path.exists(filename)):
        df_reviews = pd.read_csv(filename)
    else:
        df_reviews = pd.DataFrame(columns=['Date', 'Category', 'Subcategory', 'Customer Name', 'Location', 'Service Categories', 'Review', 'Overall Score', 'Quality Score', 'Price Score', 'Time Score', 'Treatment Score', 'Professional SysID', 'Is Active', 'Total Prof. Reviews', 'Avg. Score'])
    #
    for page in range(1,page_max+1): # Iterate each page from 1-(page_max+1) --> collecting data and appending it to the main DataFrame
        curr_url = url + '?page=' + str(page)
        driver.get(curr_url)
        
        for i in range(1, 11): # Iterate each review
            try: # Getting Review's Data from page

                # Sometimes the customer name will apears as "D.V. Tel-Aviv" instead of "Daniel Ventura, Tel-Aviv" --> First 'if' checks if after splitting it with ',' the value stays the same meaning it doesn't have ',' in it.
                if(driver.find_element(By.XPATH, review_client_fullname_xpath.format(i)).text[:-1].split(',')[0] == driver.find_element(By.XPATH, review_client_fullname_xpath.format(i)).text[:-1] ):
                    client_fullname = str(driver.find_element(By.XPATH, review_client_fullname_xpath.format(i)).text[:-1].rsplit('.', 1)[0])
                    location = str(driver.find_element(By.XPATH, review_client_fullname_xpath.format(i)).text[:-1].split('.')[-1].strip())
                else:
                    client_fullname = str(driver.find_element(By.XPATH, review_client_fullname_xpath.format(i)).text[:-1].split(',')[-2])
                    location = str(driver.find_element(By.XPATH, review_client_fullname_xpath.format(i)).text[:-1].split(',')[-1].strip())

                date = driver.find_element(By.XPATH, review_date_xpath.format(i)).text
                service_description = driver.find_element(By.XPATH, review_service_description_xpath.format(i)).text
                client_feedback = driver.find_element(By.XPATH, review_client_feedback_xpath.format(i)).text
                feedback_general_score = driver.find_element(By.XPATH, review_feedback_general_score_xpath.format(i)).text
                feedback_quality_score = driver.find_element(By.XPATH, review_feedback_quality_score_xpath.format(i)).text
                feedback_price_score = driver.find_element(By.XPATH, review_feedback_price_score_xpath.format(i)).text
                feedback_time_score = driver.find_element(By.XPATH, review_feedback_time_score_xpath.format(i)).text
                feedback_treatment_score = driver.find_element(By.XPATH, review_feedback_treatment_score_xpath.format(i)).text
                
                professional_sysid = Extract_Professional_SysID(professional_sysid_xpath.format(i*2), professional_image_xpath.format(i*2))
                professional_is_active = 1 if Is_Element_Exists(professional_sysid_xpath.format(i*2)) else 0
                professional_total_reviews = driver.find_element(By.XPATH, professional_total_reviews_xpath.format(i*2)).text if (Is_Element_Exists(professional_total_reviews_xpath.format(i*2))) else 'לא ידוע'
                professional_average_score = driver.find_element(By.XPATH, professional_average_score_xpath.format(i*2)).text if (Is_Element_Exists(professional_average_score_xpath.format(i*2))) else 'לא ידוע'
                
                df_reviews = df_reviews.append({'Date': date, 'Category': category_title, 'Subcategory': subcategory_title, 'Customer Name': client_fullname, 'Location': location, 'Service Categories': service_description, 'Review': client_feedback, 'Overall Score': feedback_general_score, 'Quality Score': feedback_quality_score, 'Price Score': feedback_price_score, 'Time Score': feedback_time_score, 'Treatment Score': feedback_treatment_score, 'Professional SysID': professional_sysid, 'Is Active': professional_is_active, 'Total Prof. Reviews': professional_total_reviews, 'Avg. Score': professional_average_score }, ignore_index=True)
            
            except Exception as e: 
                # print(i*page)
                # print(e)
                break # Exit the loop when can't find XPath element due to Exception --> meaning there is no other reviews left
    
    df_reviews['Date'] = pd.to_datetime(df_reviews['Date']) # Make sure the 'Date' column is in datetime object and not string
    df_reviews.sort_values(by=['Category', 'Subcategory', 'Date'],inplace=True, ascending=[True, True, False])
    df_reviews.to_csv(filename, index=False, encoding = 'utf-8-sig') # Exports the reviews to csv file
#

# Extract Professional SysID from given XPath --> A [div] in the XPath increasing by '2' for each review, therefore: i*2 for each unique XPath. for ex. 1st Review: .../div[2]/.. , 2nd Review: ..../div[4]/... etc.
def Extract_Professional_SysID(SysID_XPath, Image_XPath):
    if(Is_Element_Exists(SysID_XPath)):
        string = driver.find_element(By.XPATH, SysID_XPath).get_attribute('href')
        match = re.search(r'\/SpCard\/Sp\/(\d+)\?', string)

        return str(match.group(1))

    elif(Is_Element_Exists(Image_XPath)):
        string = driver.find_element(By.XPATH, Image_XPath).get_attribute('data-src')
        if(string.find("SP_no_pic") != -1 ):
            return "לא ידוע"
        else:
            match = re.search(r'\/Sp\/(\d+)\.jpg', string)
            return str(match.group(1))

    return "לא ידוע"


def Is_Element_Exists(XPath):
    try:
        driver.find_element(By.XPATH, XPath)
    except:
        return False
    return True

In [222]:
# Testing
url = "https://www.midrag.co.il/Content/FeedbacksIndex/570283"
filename = "data\\reviews.csv"
Get_Reviews_Data_From_URL(url, filename)


  df_reviews['Date'] = pd.to_datetime(df_reviews['Date']) # Make sure the 'Date' column is in datetime object and not string


In [261]:
# Create a Map that takes array of [locations, total_reviews] as an input, gets the Geo-Location of each location (using OpenStreeMap API) and Pin-Points the Top-5 Locations with the most reviews

import requests

map = folium.Map(location=[31.786060, 35.200779], zoom_start=7)

df = pd.read_csv("data\\reviews.csv")

df['Location'] = df['Location'].astype(str)
df['Location'] = df['Location'][~df['Location'].str.contains(r'\d')]
places = df['Location'].value_counts().to_dict()

places_dict = []
for location, total_reviews in places.items():
    places_dict.append({'name': location, 'total_reviews': total_reviews})

places_dict= sorted(places_dict, key=lambda x: x['total_reviews'], reverse=True)

for place in places_dict[:5]:
    popup = folium.Popup(f'<center><font size="2"><b>{place["name"]}</b></font><br><u>ביקורות</u>: {place["total_reviews"]}', max_width=300)

    url = f'https://nominatim.openstreetmap.org/search?q={place["name"]}+Israel&format=json'
    response = requests.get(url).json()
    if response:
        lat = response[0]['lat']
        lon = response[0]['lon']

    folium.Marker(location=[lat, lon], popup=popup).add_to(map)

map