In [None]:
'''
Yelp: https://www.yelp.com/search?find_desc=Restaurants&find_loc=Portland%2C+OR&ns=1
City: Portland, Oregon
Business type: Restaurants
Settings: United States; English 
'''

In [None]:
'''
Main search result pages: 1-10 (Each has 30 restaurants)
Info to be extracted: Business name; Business url

Business pages: 
Business info: 
- Rating
- Price range
- Cuisine styles
- Number of reviews
- Number of photos
- Health score
- Whether the business has paid for the 'Highlights' service
Review info: 1-5 review pages (Each has 20 reviewers)
- Reviewer's name
- Reviewer's number of friends
- Reviewer's number of reviews
- Reviewer's number of photos
- Reviewer's location
- Reviewer's rating
'''

In [None]:
'''
CSV files: Write the header row only once, not once per appended row
- Business info
- Reviews info
'''

In [None]:
'''
Currently working:
- Main search result pages: Business name; Business url
- Business pages: Review info

Currently not working:
- Business pages: Business info
- It works when run for a single business.
- It does not work when looping over all businesses.
'''

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import time

In [None]:
# ScraperAPI proxies the requests to reduce the chance of getting blocked; it is
# optional, but every later cell fetches pages through `client.get(...)`.
# The key is read from the SCRAPER_API_KEY environment variable so that no
# credential is stored in the notebook. Set it before starting the kernel.
import os

from scraper_api import ScraperAPIClient

scraper_api_key = os.environ.get('SCRAPER_API_KEY')
if not scraper_api_key:
    raise RuntimeError('Set the SCRAPER_API_KEY environment variable to your ScraperAPI key.')
client = ScraperAPIClient(scraper_api_key)

In [None]:
# Search results: 10 pages with 30 results each.
# This cell handles page 1 and creates the two accumulator lists that the
# following cell keeps extending for pages 2-10.
url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Portland%2C+OR&ns=1'
main_page = client.get(url).text
soup = BeautifulSoup(main_page, 'lxml')
subpages = soup.select('.text-size--inherit__373c0__2fB3p .link-color--inherit__373c0__3dzpk')

# Only matches 2..31 are kept (30 links); the first two matches are skipped,
# presumably because they are sponsored listings.
result_links = subpages[2:32]
object_name = [link.string for link in result_links]
object_href = [link.get('href') for link in result_links]

In [None]:
# Main pages: 2-10
# Later pages are addressed through the `start` offset: 30, 60, ..., 270.
# NOTE: this cell extends and then rewrites object_name / object_href in place,
# so re-running it without re-running the page-1 cell duplicates entries and
# prefixes the domain twice.
url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Portland%2C%20OR&start='
for offset in range(30, 300, 30):
    # HTML -> soup -> list of candidate links
    main_pages = client.get(url + str(offset)).text
    soup = BeautifulSoup(main_pages, 'lxml')
    subpages = soup.select('.text-size--inherit__373c0__2fB3p .link-color--inherit__373c0__3dzpk')
    # Business name and business link (excluding the sponsored results)
    result_links = subpages[2:32]
    object_name.extend(link.string for link in result_links)
    object_href.extend(link.get('href') for link in result_links)

# The scraped hrefs are site-relative; turn them into absolute URLs.
object_href = ['https://www.yelp.com' + href for href in object_href]
object_data = list(zip(object_name, object_href))
object_df = pd.DataFrame(object_data, columns=['name', 'url'])

In [None]:
# Subpages: Extract information
# For every business in `object_df`, append one summary row to Rating.csv and
# one row per reviewer (first REVIEW_PAGES review pages) to Review.csv.

REVIEW_PAGES = 5        # review pages scraped per business
REVIEWS_PER_PAGE = 20   # step of the `start` offset between review pages

RATING_COLUMNS = ['business',
                  'hyperreference',
                  'rating',
                  'styles',
                  'price_range',
                  'num_reviews',
                  'num_photos',
                  'health_score',
                  'highlights']
REVIEW_COLUMNS = ['business',
                  'hyperreference',
                  'reviewers',
                  'reviewers_location',
                  'reviewers_num_friends',
                  'reviewers_num_reviews']


def _texts(page_soup, selector):
    """Return the text of every element of `page_soup` matching the CSS `selector`."""
    return [tag.get_text() for tag in page_soup.select(selector)]


def _first_or_na(values):
    """Return the first item of `values`, or 'NA' when `values` is empty."""
    return values[0] if values else 'NA'


def _int_tokens(texts, position):
    """Return the integers found at space-separated index `position` of each text.

    Texts that are too short, or whose token is not purely digits (thousands
    separators are tolerated), are skipped instead of raising.
    """
    numbers = []
    for text in texts:
        tokens = text.strip().split(' ')
        if position < len(tokens):
            token = tokens[position].replace(',', '')
            if token.isdigit():
                numbers.append(int(token))
    return numbers


def _scrape_business_row(page_soup, business_name, business_link):
    """Build the single Rating.csv record (a dict keyed by RATING_COLUMNS) for one business.

    Every field falls back to 'NA' when its selector matches nothing usable, so
    a missing field can no longer wipe out the whole row.
    """
    # Total rating, e.g. '5 star rating'. The hashed class name differs per star
    # value, so match on the shared prefix instead of the 4-star class only.
    ratings = [tag.get('aria-label')
               for tag in page_soup.select('[class*="i-stars--large-"]')
               if tag.get('aria-label')]

    # Style, e.g. Southern, Cajun/Creole, Breakfast & Brunch (all joined in one cell)
    styles = _texts(page_soup, '.margin-r1__373c0__zyKmV .link-size--inherit__373c0__1VFlE')

    # Price range, e.g. $$
    prices = [text.replace(' ', '') for text in
              _texts(page_soup, '.text-bullet--after__373c0__3fS1Z.text-size--large__373c0__3t60B')]

    # Number of reviews: first token of the matched text
    num_reviews = _int_tokens(
        _texts(page_soup, '.text-color--mid__373c0__jCeOG.text-size--large__373c0__3t60B'), 0)

    # Number of photos: third token of the matched text
    num_photos = _int_tokens(
        _texts(page_soup, '.display--inline__373c0__3JqBP .secondary-white__373c0__2OPxz .text--truncated__373c0__3sLaf'), 2)

    # Health score. Caveat: some are missing; some have non-digit characters
    health_scores = _int_tokens(
        _texts(page_soup, '.layout-2-units__373c0__38itL .border-color--default__373c0__3-ifU:nth-child(1) .border-color--default__373c0__3-ifU .border-color--default__373c0__3-ifU .text-weight--bold__373c0__1elNz'), 0)

    # Highlights: first word of the matched element
    highlights = [text.split(' ')[0] for text in
                  _texts(page_soup, '.display--inline-block__373c0__1ZKqC.margin-r1__373c0__zyKmV')]

    return {'business': business_name,
            'hyperreference': business_link,
            'rating': _first_or_na(ratings),
            'styles': ', '.join(styles) if styles else 'NA',
            'price_range': _first_or_na(prices),
            'num_reviews': _first_or_na(num_reviews),
            'num_photos': _first_or_na(num_photos),
            'health_score': _first_or_na(health_scores),
            'highlights': _first_or_na(highlights)}


def _scrape_reviewer_rows(page_soup):
    """Return (name, location, num_friends, num_reviews) tuples for one review page.

    The four lists are zipped per page, so a field missing on one page cannot
    shift the alignment of the following pages.
    Reviewer photo counts and per-review ratings are not collected: their
    selectors do not yield exactly one value per reviewer, so they cannot be
    aligned with the other columns.
    """
    names = _texts(page_soup, '.text-color--blue-dark__373c0__1jX7S .link-size--inherit__373c0__1VFlE')
    locations = _texts(page_soup, '.lemon--span__373c0__3997G.text-color--normal__373c0__3xep9.text-weight--bold__373c0__1elNz.text-size--small__373c0__3NVWO')
    friends = [int(text) for text in
               _texts(page_soup, '.icon--18-friends+ .border-color--default__373c0__3-ifU b')]
    reviews = [int(text) for text in
               _texts(page_soup, '.icon--18-review+ .border-color--default__373c0__3-ifU b')]
    return list(zip(names, locations, friends, reviews))


def _append_csv(path, frame):
    """Append `frame` to the CSV file at `path`, writing the header only when the file is empty."""
    # The encoding is set on open(): to_csv ignores its own `encoding` argument
    # when it is handed an already opened text file.
    with open(path, 'a', newline='', encoding='utf8') as f:
        # In append mode the position starts at the end, so 0 means "empty file".
        frame.to_csv(f, index=False, header=(f.tell() == 0))


for i, (business_name, business_link) in enumerate(zip(object_df['name'], object_df['url'])):
    try:
        soup = BeautifulSoup(client.get(business_link).text, 'lxml')

        # One-to-one mapping: exactly one summary row per business
        rating_row = _scrape_business_row(soup, business_name, business_link)
        rating_df = pd.DataFrame([rating_row], columns=RATING_COLUMNS)
        _append_csv('Rating.csv', rating_df)

        # One-to-many mapping: 20 reviewers per page * 5 pages
        # The first review page is the business page that was already fetched.
        review_rows = _scrape_reviewer_rows(soup)
        # Use '?' when the business link has no query string yet.
        separator = '&' if '?' in business_link else '?'
        for k in range(1, REVIEW_PAGES):
            review_link = f'{business_link}{separator}start={k * REVIEWS_PER_PAGE}'
            other_soup = BeautifulSoup(client.get(review_link).text, 'lxml')
            review_rows.extend(_scrape_reviewer_rows(other_soup))

        review_df = pd.DataFrame([(business_name, business_link, *row) for row in review_rows],
                                 columns=REVIEW_COLUMNS)
        _append_csv('Review.csv', review_df)

        print(f'Page {i}: O')
        time.sleep(2)

    except Exception as exc:
        # Best effort: one failing business must not stop the crawl, but report
        # why it failed. KeyboardInterrupt is no longer swallowed.
        print(f'Page {i}: X ({type(exc).__name__}: {exc})')