In [None]:
# created by nov05 on 2020-01-18

In [22]:
import time
import random
from datetime import datetime
from requests import Session
from lxml import html

def yelpScrapePage(business_id, 
                   page=0, # page
                   date_range=None # date range
                   ): 
    ''' Do NOT use multi-threading to avoid getting blocked.
    '''
    status_code, results, keep_scraping = None, [], True
    
    base_url = "https://www.yelp.com/biz/" # add business id
    api_url = "/review_feed?sort_by=date_desc&start=" # add number

    with Session() as s:
        url = base_url + business_id + api_url + str(page*20)
        with s.get(url, timeout=5) as r:    
            status_code = r.status_code
            print('status code:', status_code)
            if status_code != 200:
                return status_code, results, keep_scraping
            response = dict(r.json()) 
            _html = html.fromstring(response['pagination'])
            text = _html.xpath("//div[@class='page-of-pages arrange_unit arrange_unit--fill']/text()")
            total_pages = int(text[0].strip().split(' ')[-1])
            if page+1 > total_pages:
                keep_scraping = False
                return status_code, results, keep_scraping
            
            _html = html.fromstring(response['review_list'])
            dates, stars, texts, review_ids, user_ids = [], [], [], [], []
            dates = _html.xpath("//div[@class='review-content']/descendant::span[@class='rating-qualifier']/text()")
            dates = [datetime.strptime(d.strip(), format("%m/%d/%Y")) for d in dates]
            stars = _html.xpath("//div[@class='review-content']/descendant::div[@class='biz-rating__stars']/div/@title")
            stars = [float(s.split(' ')[0]) for s in stars]
            texts = [e.text for e in _html.xpath("//div[@class='review-content']/p")]
            review_ids = _html.xpath("//div[@class='review review--with-sidebar']/@data-review-id")
            user_ids = [s.split(':')[1] for s in _html.xpath("//div[@class='review review--with-sidebar']/@data-signup-object")]
            results = [[date, star, text, review_id, user_id] 
                        for date, star, text, review_id, user_id 
                        in zip(dates, stars, texts, review_ids, user_ids)]
            print(f"page: {page}, total pages: {total_pages}")
            
            # filter by date
            if date_range is not None:
                idx0, idx1 = None, None
                for i in range(len(dates)):
                    if dates[i]<=date_range[1]:
                        idx0 = i
                        break
                for i in range(len(dates)):
                    if dates[len(dates)-1-i]>=date_range[0]:
                        idx1 = len(dates)-1-i
                        break
                print(dates[i], "indexes", idx0, idx1)
                if idx0 is None or idx1 is None or idx1<idx0: 
                    results = []
                else:
                    results = results[idx0:idx1+1]
                    keep_scraping = False

    return status_code, results, keep_scraping

def yelpScraper(business_id,
                date_range=None):
    status_code, results = None, []
    if date_range is not None: print(date_range[0], date_range[1])
        
    for i in range(1000):
        status_code, r, keep_scraping = yelpScrapePage(business_id, page=i, date_range=date_range)
        print('keep scraping:', keep_scraping)
        if status_code != 200:
            return status_code, []
        results = results + r
        if keep_scraping == False:
            break
        # scrape slowly to avoid being blocked
        time.sleep(random.uniform(1, 3))

    return status_code, results

# test
import pandas as pd
business_id = 'jga_2HO_j4I7tSYf5cCEnQ'

date_range = (datetime.strptime('2017-05-14', '%Y-%m-%d'),
              datetime.strptime('2017-05-20', '%Y-%m-%d'))
data = yelpScraper(business_id, date_range=date_range)[1]
pd.DataFrame(data)

2017-05-14 00:00:00 2017-05-20 00:00:00
status code: 200
page: 0, total pages: 4
2019-11-25 00:00:00 indexes None 19
keep scraping: True
status code: 200
page: 1, total pages: 4
2018-08-23 00:00:00 indexes None 19
keep scraping: True
status code: 200
page: 2, total pages: 4
2017-11-05 00:00:00 indexes None 19
keep scraping: True
status code: 200
page: 3, total pages: 4
2017-06-10 00:00:00 indexes 7 8
keep scraping: False


Unnamed: 0,0,1,2,3,4
0,2017-05-20,5.0,From the start this passion project has been n...,2Ux1p0hugv3AMntNMUKj5w,qZXNUj_VM6GXTSMAsPiB0Q
1,2017-05-14,5.0,"They have not officially opened yet, but have ...",DAgeYxOpMUiCQFwYZI0eBA,_Cc708kGSarBQ3VIdWMa-A


In [18]:
data = yelpScraper(business_id)[1]
print("reviews found:", len(data))
pd.DataFrame(data).head()

status code: 200
page: 0 , total pages: 4
keep scraping: True
status code: 200
page: 1 , total pages: 4
keep scraping: True
status code: 200
page: 2 , total pages: 4
keep scraping: True
status code: 200
page: 3 , total pages: 4
keep scraping: True
status code: 200
keep scraping: False
reviews found: 70


Unnamed: 0,0,1,2,3,4
0,2019-11-25,3.0,So when I was a kid I always had a cat. Today....,pZ-vmy9OVD-bYgA10A7mYw,R1RDHafKh9T426DijeRixA
1,2019-11-12,5.0,Love this place! We have been probably 6 or 7 ...,j7tZoyQbQys67S_lqaWdWQ,X5pB5Wr1SDhPO92JHv6YCg
2,2019-10-13,5.0,"This is such a neat concept! Visit with, play,...",01Qjq3yLZb2_n2w9ba6X1Q,YsjDcwNc9bgmesjwlBzaXw
3,2019-09-06,5.0,Super fun w the kitties s d the drag queens an...,qC_Ij8DPaDqvO-5oZ9tapA,Ix0LRMnciWHcamtn0HbFZA
4,2019-08-06,3.0,The cats are adorable and we had such a fun ti...,pK_tztOt1Xuzw30C_1d4-A,otsV60LW8fg3sDI6TKg2fw


In [19]:
pd.DataFrame(data).tail()

Unnamed: 0,0,1,2,3,4
65,2017-05-31,5.0,"After a long day at work, hanging out with som...",5NQWDNdYA1sn5hlcTt0_YQ,4cZ3f3i0Slv2W9mLN_VZBw
66,2017-05-27,5.0,What a great little cafe and wine bar shop wit...,mAAfu9UET7OjyA-9z34NAQ,KuEmun7kYxjo8suC5cjjtg
67,2017-05-20,5.0,From the start this passion project has been n...,2Ux1p0hugv3AMntNMUKj5w,qZXNUj_VM6GXTSMAsPiB0Q
68,2017-05-14,5.0,"They have not officially opened yet, but have ...",DAgeYxOpMUiCQFwYZI0eBA,_Cc708kGSarBQ3VIdWMa-A
69,2017-05-13,5.0,As of this review the Cafe is not fully operat...,lLCTJRgGsC7Tuv7Y9qXPDg,wi_a3hdD7pnZU4mzjCbdIw
