In [1]:
# Install necessary packages
!pip install selenium webdriver_manager beautifulsoup4 pandas boto3

# Install Google Chrome
!apt-get update
!apt-get install -y wget unzip
!wget -q https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb
!apt --fix-broken install -y
!apt-get install -y google-chrome-stable

# Find out the installed Chrome version
!google-chrome --version


# Install the matching ChromeDriver
!wget -q https://storage.googleapis.com/chrome-for-testing-public/126.0.6478.55/linux64/chromedriver-linux64.zip
!unzip chromedriver-linux64.zip
!chmod +x chromedriver-linux64

Collecting selenium
  Downloading selenium-4.23.1-py3-none-any.whl.metadata (7.1 kB)
Collecting webdriver_manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.0-py3-none-any.whl.metadata (8.8 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting websocket-client~=1.8 (from selenium)
  Downloading websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Collecting botocore<1.30.0,>=1.29.100 (from boto3)
  Downloading botocore-1.29.165-py3-none-any.whl.metadata (5.9 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.23.1-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

In [3]:
!mv chromedriver-linux64 /usr/local/bin/

mv: cannot stat 'chromedriver-linux64': No such file or directory


In [4]:
!ls /usr/local/bin/chromedriver-linux64

LICENSE.chromedriver  chromedriver


In [5]:
import boto3
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from time import sleep, strftime
from random import randint
from datetime import date
import os

pd.set_option('max_colwidth', None)

class GoogleReviewsBot:
    def __init__(self, link, broker, driver_path='/usr/local/bin/chromedriver-linux64/chromedriver'):
        self.options = Options()
        self.options.add_argument('--headless')
        self.options.add_argument('--no-sandbox')
        self.options.add_argument('--disable-dev-shm-usage')
        self.options.add_experimental_option("detach", True)
        self.options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.link = link
        self.broker = broker
        self.service = Service(driver_path)
        self.driver = webdriver.Chrome(service=self.service, options=self.options)
        self.driver.get("https://www.google.com/")
        self.driver.set_page_load_timeout(60)
        self.driver.get(self.link)

    def click_xpath(self, xpath_id):
        WebDriverWait(self.driver, 20).until(EC.element_to_be_clickable((By.XPATH, xpath_id))).click()
        sleep(1)

    def get_num_reviews(self):
        total_number_of_reviews = self.driver.find_element(By.XPATH, '//div[contains(@class, "jANrlb")]')
        total_number_of_reviews = total_number_of_reviews.find_element(By.XPATH, './div[3]')
        total_number_of_reviews = total_number_of_reviews.text.split(" ")[0]
        total_number_of_reviews = total_number_of_reviews.replace(',', '')  # Remove commas
        return int(total_number_of_reviews)

    def click_more_buttons(self):
        more_buttons = self.driver.find_elements(By.XPATH, "//button[contains(@class, 'w8nwRe') and contains(@class, 'kyuRq')]")
        for button in more_buttons:
            try:
                button.click()
                sleep(1)
            except Exception as e:
                print(f"Error clicking 'Ver mais' button: {e}")

    def infinite_scroll(self, total_number_of_reviews):
        scrollable_div = self.driver.find_element(By.XPATH, '//div[contains(@class, "m6QErb") and contains(@class, "DxyBCb") and contains(@class, "kA9KIf") and contains(@class, "dS8AEf") and contains(@class, "XiKgde")]')
        loaded_reviews = 0
        while loaded_reviews < total_number_of_reviews and loaded_reviews < 50:
            self.driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', scrollable_div)
            sleep(2)
            self.click_more_buttons()
            current_reviews = self.get_reviews()
            loaded_reviews = len(current_reviews)

    def get_reviews(self):
        response = BeautifulSoup(self.driver.page_source, 'html.parser')
        reviews = response.find_all('div', class_="jftiEf fontBodyMedium")
        return reviews

    def parse_reviews(self, reviews, total_number_of_reviews):
        rev_dict = {
            'Review Rate': [],
            'Review Time': [],
            'Review Text': []
        }
        for review in reviews:
            review_elements = review.find_all('span', {'class': 'kvMYJc'})
            for review_element in review_elements:
                full_stars = len(review_element.find_all('span', {'class': 'hCCjke google-symbols NhBTye elGi1d'}))
                empty_stars = len(review_element.find_all('img', {'class': 'hCCjke '}))
                num_stars = full_stars + empty_stars
            review_time = review.find('span', class_='rsqaWe').text
            review_element = review.find('span', class_='wiI7pd')
            review_text = review_element.text if review_element is not None else ''
            rev_dict['Review Rate'].append(full_stars)
            rev_dict['Review Time'].append(review_time)
            rev_dict['Review Text'].append(review_text)
        rev_dict = pd.DataFrame(rev_dict)
        rev_dict['Broker'] = self.broker
        rev_dict['Link'] = self.link
        rev_dict['Average Rating'] = self.driver.find_element(By.XPATH, '//div[contains(@class, "jANrlb")]').find_element(By.XPATH, './div[1]').text
        rev_dict['Total number of ratings'] = total_number_of_reviews
        return rev_dict

    def run(self):
        self.click_xpath("//button[contains(@class, 'hh2c6') and contains(@class, 'G7m0Af')]")
        sleep(5)
        scrollable_div = self.driver.find_element(By.XPATH, '//div[contains(@class, "m6QErb") and contains(@class, "DxyBCb") and contains(@class, "kA9KIf") and contains(@class, "dS8AEf") and contains(@class, "XiKgde")]')
        self.driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', scrollable_div)

        total_number_of_reviews = self.get_num_reviews()
        self.infinite_scroll(total_number_of_reviews)
        reviews = self.get_reviews()
        reviews = self.parse_reviews(reviews, total_number_of_reviews)
        return reviews


In [8]:
broker_df=pd.read_csv('/kaggle/input/dataset1/final.csv',encoding='utf-8')

In [9]:
broker_df

Unnamed: 0.1,Unnamed: 0,placeLabel,search_url,google_maps_url
0,1,Borj Ennar,https://www.google.com/maps/search/Borj+Ennar,"https://www.google.com/maps/place/Borj+Ennar/@34.735567,10.7596289,17z/data=!4m12!1m2!2m1!1sBorj+Ennar!3m8!1s0x13002d2d178de4f5:0xc381177ae752c70c!8m2!3d34.735567!4d10.764135!9m1!1b1!15sCgpCb3JqIEVubmFykgEYYWR1bHRfZW50ZXJ0YWlubWVudF9jbHVi4AEA!16s%2Fg%2F11gyxlykys?entry=ttu"
1,3,Bab Diwan,https://www.google.com/maps/search/Bab+Diwan,"https://www.google.com/maps/place/Beb+El+Diw%C3%A9n/@34.7347387,10.7577146,17z/data=!4m12!1m2!2m1!1sBab+Diwan!3m8!1s0x13002d2cf433e675:0xd40bddfba257ecaf!8m2!3d34.7347387!4d10.7622207!9m1!1b1!15sCglCYWIgRGl3YW6SARNoaXN0b3JpY2FsX2xhbmRtYXJr4AEA!16s%2Fg%2F11gdq14zkv?entry=ttu"
2,5,Bab El Kasbah,https://www.google.com/maps/search/Bab+El+Kasbah,"https://www.google.com/maps/place/%D8%A8%D8%A7%D8%A8+%D8%A7%D9%84%D9%82%D8%B5%D8%A8%D8%A9%E2%80%AD/@34.7333375,10.7595156,17z/data=!4m8!3m7!1s0x13002dd0092e4949:0x79f6d0678310448e!8m2!3d34.7333375!4d10.7595156!9m1!1b1!16s%2Fg%2F11gg5m_rlh?entry=ttu"
3,6,Stade TaÃ¯eb-Mehiri,https://www.google.com/maps/search/Stade+TaÃ¯eb-Mehiri,"https://www.google.com/maps/place/Stade+El+Mestiri+Ettadhamen/@36.8436614,10.0990286,17z/data=!4m8!3m7!1s0x12fd3295b8436539:0x79dc784a312c39b!8m2!3d36.8436614!4d10.0990286!9m1!1b1!16s%2Fg%2F11crzy4kbk?entry=ttu"
4,8,Grande MosquÃ©e de Sfax,https://www.google.com/maps/search/Grande+MosquÃ©e+de+Sfax,"https://www.google.com/maps/place/Grande+Mosqu%C3%A9e+de+Sfax/@34.7359595,10.7609287,17z/data=!4m8!3m7!1s0x13002d2c9f4ec867:0x8c221f05dcbe4534!8m2!3d34.7359595!4d10.7609287!9m1!1b1!16s%2Fg%2F1269nrly2?entry=ttu"
...,...,...,...,...
144,334,lycÃ©e de la rue de Russie,https://www.google.com/maps/search/lycÃ©e+de+la+rue+de+Russie,"https://www.google.com/maps/place/Lyc%C3%A9e+Rue+de+Russie/@36.795337,10.1776513,17z/data=!4m8!3m7!1s0x12fd340cdba5c65b:0x7d0c40c0327e0568!8m2!3d36.795337!4d10.1776513!9m1!1b1!16s%2Fg%2F11g1lqr58j?entry=ttu"
145,335,MusÃ©e du tapis de Kairouan,https://www.google.com/maps/search/MusÃ©e+du+tapis+de+Kairouan,"https://www.google.com/maps/place/Mus%C3%A9e+du+tapis/@35.672459,10.0993201,17z/data=!4m8!3m7!1s0x12fdc52a84dd6553:0xd5e9efba8fcbbf2!8m2!3d35.672459!4d10.0993201!9m1!1b1!16s%2Fg%2F11h0_1x78l?entry=ttu"
146,336,MosquÃ©e El Ajouzine,https://www.google.com/maps/search/MosquÃ©e+El+Ajouzine,"https://www.google.com/maps/place/Mosqu%C3%A9e+Al+Ajouzain/@34.7348887,8.454915,8z/data=!4m12!1m2!2m1!1sMosqu%C3%83%C2%A9e+El+Ajouzine!3m8!1s0x13002d2cf49dd767:0xebcb08a7efe59954!8m2!3d34.7348887!4d10.7620439!9m1!1b1!15sChZNb3NxdcODwqllIEVsIEFqb3V6aW5lkgEGbW9zcXVl4AEA!16s%2Fg%2F11csrpmpgy?entry=ttu"
147,337,Grande MosquÃ©e de Gafsa,https://www.google.com/maps/search/Grande+MosquÃ©e+de+Gafsa,"https://www.google.com/maps/place/La+Grande+Mosqu%C3%A9e+de+Gafsa/@34.4150807,8.7855187,17z/data=!4m8!3m7!1s0x12f896fdd5845a0f:0x362e5cd353a1f600!8m2!3d34.4150807!4d8.7855187!9m1!1b1!16s%2Fg%2F11f006ggx_?entry=ttu"


In [12]:
# Initialiser un DataFrame vide pour contenir tous les avis
all_reviews_list = pd.DataFrame()

# Obtenir la date d'aujourd'hui en tant que chaîne de caractères
today_str = date.today().strftime('%Y-%m-%d')

In [13]:
for index, row in broker_df.iterrows():
    broker = row['placeLabel']
    link = row['google_maps_url']
    print(f'Scraping {broker}...')
    my_bot = GoogleReviewsBot(link, broker)
    try:
        reviews = my_bot.run()
        print(f'{broker} scraped successfully!')

        # Sauvegarder les avis individuels du broker dans un fichier CSV (si nécessaire)
        # reviews.to_csv(f'{broker}-{today_str}.csv', index=False, encoding='utf-8')

        # Concaténer les avis avec la liste principale
        all_reviews_list = pd.concat([all_reviews_list, reviews], ignore_index=True)
    except Exception as e:
        print(f'Error scraping {broker}: {e}')
    finally:
        my_bot.driver.close()


Scraping Borj Ennar...
Borj Ennar scraped successfully!
Scraping Bab Diwan...
Bab Diwan scraped successfully!
Scraping Bab El Kasbah...
Bab El Kasbah scraped successfully!
Scraping Stade TaÃ¯eb-Mehiri...
Stade TaÃ¯eb-Mehiri scraped successfully!
Scraping Grande MosquÃ©e de Sfax...
Grande MosquÃ©e de Sfax scraped successfully!
Scraping Fondouk El Haddadine...
Fondouk El Haddadine scraped successfully!
Scraping Souk Kriaa...
Souk Kriaa scraped successfully!
Scraping MusÃ©e archÃ©ologique de Sousse...
MusÃ©e archÃ©ologique de Sousse scraped successfully!
Scraping Dar Am TaÃ¯eb...
Dar Am TaÃ¯eb scraped successfully!
Scraping MosquÃ©e Bou Ftata...
MosquÃ©e Bou Ftata scraped successfully!
Scraping MusÃ©e El Kobba...
MusÃ©e El Kobba scraped successfully!
Scraping Grande MosquÃ©e de Sousse...
Grande MosquÃ©e de Sousse scraped successfully!
Scraping Ribat de Sousse...
Ribat de Sousse scraped successfully!
Scraping Bab Charki...
Bab Charki scraped successfully!
Scraping MusÃ©e Dar Essid...
MusÃ©

In [21]:
data = pd.DataFrame(all_reviews_list)

In [24]:
data.to_csv('reviews.csv')

In [27]:
data.to_excel('reviews.xlsx', index=False)