In [None]:
# Importing libraries and modules
import pandas as pd
from urllib.request import Request
from urllib.request import urlopen
import re
import requests
from bs4 import BeautifulSoup
import time
from tqdm.auto import tqdm

In [None]:
# Creating the Trustpilot scraper
class trustpilot_product_review_scraper:
    """Scrape every review from a paginated Trustpilot listing.

    On construction the scraper follows the ``rel="next"`` links, starting at
    ``trustpilot_site``, and records every page URL in ``self.pages``.
    Calling :meth:`scrape` then downloads each page and returns the reviews
    as a pandas DataFrame.

    Attributes:
        url: The start URL passed to the constructor.
        reviews_dict: Maps column name to the list of scraped values.
        sleep_time: Seconds to pause between consecutive HTTP requests.
        pages: URLs of all pages to scrape, start page first.
        start_page: 1-based number of the first page (always 1).
        end_page: 1-based number of the last page (``len(self.pages)``).
    """

    # Placeholder stored whenever a review lacks a field.
    MISSING = "NaN"
    # Seconds to wait for a response before giving up on a request.
    REQUEST_TIMEOUT = 30
    # Headers sent with every request.
    _HEADERS = {'User-Agent': 'Mozilla/5.0'}
    # Text that immediately precedes the ISO timestamp in the review's
    # embedded script.
    _DATE_MARKER = 'publishedDate":"'

    def __init__(self, trustpilot_site, sleep_time=1):
        """Store the settings and collect the URLs of all pages to scrape.

        Args:
            trustpilot_site: URL of the first review page.
            sleep_time: Seconds to pause between HTTP requests.

        Raises:
            Whatever ``urllib`` raises if a page cannot be downloaded while
            the pagination links are being followed.
        """
        self.url = trustpilot_site
        self.reviews_dict = {'names': [], "time": [], "locations": [],
                             "titles": [], "content": [], "ratings": []}
        self.sleep_time = sleep_time
        self.pages = [self.url]
        self.start_page = 1

        # Walk the pagination once up front so the total is known before scraping.
        print("Starting to collect all pages - relax, it will take a moment!")
        self.return_next_page(self.url)
        self.end_page = len(self.pages)

    def _fetch_soup(self, page):
        """Download ``page`` once and return it parsed as a BeautifulSoup tree."""
        req = Request(page, headers=self._HEADERS)
        # The context manager closes the connection even if read() fails.
        with urlopen(req, timeout=self.REQUEST_TIMEOUT) as response:
            html = response.read()
        return BeautifulSoup(html, "html.parser")

    def return_next_page(self, page):
        """Follow the ``rel="next"`` links from ``page`` and record each URL.

        Every newly discovered page URL is appended to ``self.pages``.  The
        walk is iterative, so long listings cannot exhaust the recursion
        limit, and URLs already seen are skipped so duplicate or circular
        links cannot cause repeated requests.

        Args:
            page: URL of the page to start from.
        """
        seen = set(self.pages)
        seen.add(page)
        current = page

        while current is not None:
            soup = self._fetch_soup(current)

            next_url = None
            for link in soup.find_all(rel="next"):
                href = link.get("href") or ""
                # Only absolute links are followed; relative ones are ignored.
                if href.startswith("ht") and href not in seen:
                    next_url = href
                    break

            if next_url is not None:
                self.pages.append(next_url)
                seen.add(next_url)
                # Be polite to the server between pagination requests.
                time.sleep(self.sleep_time)
            current = next_url

    @classmethod
    def _parse_published_date(cls, script_text):
        """Return the ``YYYY-MM-DD`` part of the published date in ``script_text``.

        Returns ``MISSING`` when the text is empty or has no date marker.
        """
        if not script_text:
            return cls.MISSING
        _, marker, tail = script_text.partition(cls._DATE_MARKER)
        if not marker:
            return cls.MISSING
        # The timestamp is ISO formatted; everything before 'T' is the date.
        return tail.split('T', 1)[0]

    def _extract_review(self, review):
        """Return one value per column for a single review element.

        Exactly one value is produced for every key of ``reviews_dict`` so
        the column lists always stay the same length; fields that cannot be
        found are filled with ``MISSING``.  When a field matches several
        elements, the first match is used.
        """
        missing = self.MISSING

        content_tag = review.find(class_="review-content__text")
        content = missing if content_tag is None else content_tag.get_text("\n").strip()

        title_tag = review.find(class_="review-content__title")
        title = missing if title_tag is None else title_tag.get_text().strip()

        header_tag = review.find('div', {'class': "review-content-header"})
        script_tag = None if header_tag is None else header_tag.find("script")
        published = self._parse_published_date(
            None if script_tag is None else script_tag.string)

        rating_tag = review.find(class_="star-rating star-rating--medium")
        img_tag = None if rating_tag is None else rating_tag.find('img')
        rating = missing if img_tag is None else img_tag.get('alt', missing)

        name_tag = review.find(class_="consumer-information__name")
        name = missing if name_tag is None else name_tag.get_text("\n").strip()

        location_tag = review.find(class_="consumer-information__location")
        span_tag = None if location_tag is None else location_tag.find("span")
        if span_tag is None or span_tag.string is None:
            location = missing
        else:
            # str() detaches the value from the parse tree.
            location = str(span_tag.string)

        return {'names': name, 'time': published, 'locations': location,
                'titles': title, 'content': content, 'ratings': rating}

    def page_scraper(self, page):
        """Scrape all reviews on ``page`` and add them to ``reviews_dict``.

        A page that cannot be downloaded or parsed is reported on stdout and
        skipped; nothing from that page is added and no exception is raised.

        Args:
            page: URL of the page to scrape.
        """
        try:
            soup = self._fetch_soup(page)
            rows = [self._extract_review(review)
                    for review in soup.find_all(class_="review")]
        except Exception as exc:
            print("Not able to scrape page {}: {}".format(page, exc), flush=True)
            return

        # Rows are only added once the whole page parsed successfully.
        for row in rows:
            for column, value in row.items():
                self.reviews_dict[column].append(value)

    # Scraping function - main function
    def scrape(self):
        """Scrape every collected page and return the reviews as a DataFrame.

        Results of a previous call are cleared first, so calling this method
        again (for example when a notebook cell is re-run) does not produce
        duplicate rows.

        Returns:
            pandas.DataFrame with one column per key of ``reviews_dict``.
        """
        for values in self.reviews_dict.values():
            values.clear()

        print("Total pages to scrape: {}".format(self.end_page - self.start_page+1), flush=True)
        time.sleep(self.sleep_time)
        print("Start page: {}; End page: {}".format(self.start_page, self.end_page))
        time.sleep(self.sleep_time)
        print()
        print("Starting to scrape pages!", flush=True)
        time.sleep(self.sleep_time)

        for page in tqdm(self.pages):
            self.page_scraper(page)
            # Pause between pages to avoid hammering the server.
            time.sleep(self.sleep_time)

        print("Completed!")

        # Returning to a Pandas Dataframe
        return pd.DataFrame(self.reviews_dict)
    

In [8]:
# Substitute the Trustpilot URL where you want to start scraping.
# The value below is only a placeholder: replace it with the first review
# page of the company you are interested in.
TRUSTPILOT_URL = "https://de.trustpilot.com/review/www.example.com"

# Creating the scraper already issues HTTP requests: it follows the pagination
# links to collect every page URL before any review is scraped.
review_scraper = trustpilot_product_review_scraper(TRUSTPILOT_URL)

# Starting to scrape all of the pages and storing them into a dataframe
df_reviews = review_scraper.scrape()
        

Starting to collect all pages - relax, it will take a moment!
Total pages to scrape: 23
Start page: 1; End page: 23

Starting to scrape pages!


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=23.0), HTML(value='')))


Completed!


In [None]:
# Display the DataFrame of scraped reviews; as the last expression of the
# cell it is rendered as the cell's output.
df_reviews

In [None]:
# Export the scraped reviews to an Excel workbook; the relative path is
# resolved against the current working directory.
# NOTE(review): to_excel is called with its defaults, so the DataFrame index
# is written as an extra first column - confirm that this is wanted.
df_reviews.to_excel("Trustpilot_reviews.xlsx")