In [6]:
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import re
import uuid
import json
driver = webdriver.Chrome()

class Scraper_Object:
    def __init__(self, category, url):
        self.category = category
        self.url = url
        self.crawler = []
        self.scraped_data = []
        driver.get(url)

    def accept_cookies(self):
        time.sleep(12)
        accept_cookies_button = driver.find_element(By.XPATH, '//*[@id="onetrust-accept-btn-handler"]')
        accept_cookies_button.click()
        time.sleep(1)

    def search(self):
        search_bar = driver.find_element(By.XPATH, '//input[@class="herosearch_searchInputField__Pp2MD"]')
        search_bar.send_keys(self.category)
        time.sleep(3)
        #The following line identifies the first suggested category by finding the following sibling of the 'categories' heading under the search box
        first_category = driver.find_element(By.XPATH, '//h4[contains(text(),"Categories")]//following-sibling::a')
        first_category.click()
        time.sleep(3)
    
    def create_crawler(self, length):
        self.length = length
        #creates a list of the html elements corresponding to different companies 
        business_list = driver.find_elements(By.XPATH, '//div[@class="paper_paper__1PY90 paper_outline__lwsUX card_card__lQWDv card_noPadding__D8PcU styles_wrapper__2JOo2"]/a')
        #Iterates through the html elements and puts each href into the crawler
        for index in range(0, self.length):
            href = business_list[index].get_attribute('href')
            self.crawler.append(href)
        return self.crawler

    def scrape_from_crawler(self):
        for business in self.crawler:
            self.scraped_data.append(scrape_stuff(business))

    #Method for saving the scraped data as a json file
    def save_json(self):
        file_name = "raw_data/{}.json".format(self.category.replace(" ", "_"))
        with open(file_name, 'w') as json_file:
            json.dump(self.scraped_data, json_file)

#A function that scrapes all of the relevant data from a single href from the crawler, and puts it into a dictionary
def scrape_stuff(url):
    driver.get(url)
    time.sleep(1)
    item_dictionary = {}  

    #Finds name of company
    Name = driver.find_element(By.XPATH, '//span[@class="typography_display-s__qOjh6 typography_appearance-default__AAY17 title_displayName__TtDDM"]/.').text
    item_dictionary['Name'] = Name

    #Gives the item a unique ID (uuid4)
    item_dictionary['ID'] = str(uuid.uuid4())

    #Gives the item a timestamp
    item_dictionary['Timestamp'] = time.time()

    #Adds the href to item_dictionary
    item_dictionary['Href'] = url

    #Finds the number of reviews
    try:
        Num_reviews = driver.find_element(By.XPATH, '//p[@class="typography_body-l__KUYFJ typography_appearance-default__AAY17"]/.').text
        Number_reviews = Num_reviews.split(' ')
        item_dictionary["Number of Reviews"] = Number_reviews[0]
    except:
        item_dictionary["Number of Reviews"] = "N/A"

    #Finds the rating 
    try:
        Rating = driver.find_element(By.XPATH, '//span[@class="typography_heading-m__T_L_X typography_appearance-default__AAY17"]').text
        item_dictionary["Rating"] = Rating
    except:
        item_dictionary["Rating"] = "N/A"
    #Finds the email of the company
    try:
        Email = driver.find_element(By.XPATH, '//a[@class="link_internal__7XN06 typography_body-m__xgxZ_ typography_appearance-action__9NNRY link_link__IZzHN link_underlined__OXYVM"]').text
        item_dictionary['Email']= Email
    except:
        item_dictionary['Email']= "N/A"

    return item_dictionary

    #

if __name__ == "__main__":
    tester = Scraper_Object('energy supplier', 'https://www.trustpilot.com/')
    tester.accept_cookies()
    tester.search()
    tester.create_crawler(20)
    tester.scrape_from_crawler()
    print(tester.scraped_data)
    tester.save_json()

[{'Name': 'Integrity Energy ', 'ID': 'b5b79b60-c5c1-45fd-b552-d7fc87ee3436', 'Timestamp': 1668700568.636299, 'Href': 'https://www.trustpilot.com/review/integrityenergy.com', 'Number of Reviews': '137', 'Rating': '4.8', 'Email': '(216) 420-9700'}, {'Name': 'EZ Energy Services  ', 'ID': '68b4762c-332a-4c31-bf10-f502f858ef8b', 'Timestamp': 1668700579.5063722, 'Href': 'https://www.trustpilot.com/review/ezenergyservices.com', 'Number of Reviews': '85', 'Rating': '4.8', 'Email': 'support@ezenergyservices.com'}, {'Name': 'ElectricityRates.com ', 'ID': 'c2f0813e-c866-4922-bf66-791be8728f9b', 'Timestamp': 1668700585.400636, 'Href': 'https://www.trustpilot.com/review/electricityrates.com', 'Number of Reviews': '1,510', 'Rating': '4.6', 'Email': 'support@ElectricityRates.com'}, {'Name': 'IGS Energy ', 'ID': 'b9997b4a-8b11-45d3-9173-dd5418ba26f8', 'Timestamp': 1668700589.0139148, 'Href': 'https://www.trustpilot.com/review/igs.com', 'Number of Reviews': '16,370', 'Rating': '4.4', 'Email': '877.995.