# Burpple User Reviews Scraper

In [4]:
import xml.etree.ElementTree as ET 
import requests
from bs4 import BeautifulSoup
import json
import re
import pandas as pd

from threading import Thread
import time
from random import choice

import pprint
pp = pprint.PrettyPrinter(indent=2)

## Defining Functions 

In [25]:
def scrape(url, name, class_):
    """Scrapes `url` using requests and bs4 for a `class_` with the specific`name`,
        and returns the scraped section of interest, `scraped` as a result_set. 

    Uses randomized User-Agents as headers for the scraping to avoid being blocked by the site.
    """

    try:
        desktop_agents = ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
                 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
                 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
                 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
                 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
                 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
                 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
                 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
                 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
                 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0']

        def random_headers():
            return {'User-Agent': choice(desktop_agents),
                    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}

        r = requests.get(url, timeout=10, headers=random_headers())
        soup = BeautifulSoup(r.text, 'html.parser')            
        scraped = soup.find_all(name, class_=class_)
        if scraped:
            return scraped
        else:
            print('scraping failed.')

    except requests.Timeout as e:
        print('Timeout error:\n', e)
    except requests.RequestException as e:
        print('General requests error:\n', e)
    except e:
        print('Some other error occurred:\n', e)

In [34]:
def extractUsers(results_dict, scraped_data):
    """
    """
    for u in scraped_data:
        if u.find('span', class_='searchUser-body-city').text.strip() == 'Singapore': 
            username = u.find('span', class_='searchUser-body-name').text.replace('\n','')
            if username == 'Burpple Guides': continue # exception 

            userstats = u.find('div', class_='searchUser-body-stats').text.replace('\n','') 
            try:
                userlevel = re.search('Level ([0-9]{1,}) Burppler', userstats).group(1)
                user_numreviews = re.search('([0-9]{1,}) Reviews', userstats).group(1)
            except AttributeError as e:
                print('Stats not found:', e) 
                # e.g. no reviews given yet.

            userpage = u.find('a').get('href')

            results_dict[username] = {
                            'Link' :  f'{userpage}',
                            'Level' : f'{userlevel}',
                            'NumReviews' : f'{user_numreviews}'
                           }

            # add badge-type to dict if exists
            userbadge = u.find('img', class_='searchUser-avatar-badge')
            if userbadge: results_dict[username]['Badge'] = userbadge.get('alt')[:2]

    return results_dict

In [28]:
def getUserLists(soup):
    ls_dict = {}
    for ls in soup:
        # if there is a list?
        linked_ls = ls.find('a', class_='a--grey')
        if not linked_ls:
            unlinked_ls = ls.find('span') 
            if unlinked_ls: 
                ls_name = unlinked_ls.text
                ls_dict[ls_name] = {}
        else:
            ls_name = linked_ls.text
            ls_link = linked_ls.get('href')
            ls_dict[ls_name] = {'Link': ls_link}

        # Find categories of the list
        ls_cats = ls.find('span', class_='box-content-preTitle')
        if ls_cats:
            ls_cats_links = ls_cats.find_all('a', class_='a--lightGrey')
            cats_dict = {}
            for cat in ls_cats_links:
                cat_name = cat.text
                cat_link = cat.get('href')
                cats_dict[cat_name] = cat_link
            ls_dict[ls_name]['Categories'] =cats_dict
            
    return ls_dict

In [29]:
def getUserReviews(soup):
    # page_soup contains the find_all() of the food card items on a user's review timeline
    review_dict = {}
    for rev in soup:
        restaurant_name = rev.find('div', class_='card-item-set--link-title').text.replace('\n','')
        restaurant_address = rev.find('div', class_='card-item-set--link-subtitle').text.replace('\n','')
        review_title = rev.find('div', class_='food-description-title').text
        review_desc = rev.find('div', class_='food-description-body')

        if review_title in review_dict: review_title += ' (Review2)'
        review_dict[review_title] = {'RestaurantName' : restaurant_name,
                                     'RestaurantAddress' : restaurant_address}
        if review_desc: 
            review_dict[review_title]['Review'] = review_desc.text.replace('\n','')
            
    return review_dict

In [30]:
def getUserWishlist(soup):
    wishlist_dict = {}
    for wishlist in soup:
        restaurant_name = wishlist.find('span', class_='searchVenue-header-name-name headingMedium').text
        restaurant_link = wishlist.find('a').get('href')
        wishlist_dict[restaurant_name] = restaurant_link
    return wishlist_dict

## TO DO:

In [None]:
def tryLoadMore(page_soup):
    # for each page that loads..
    # check if there is a load more button in the pagesoup : (masonryViewMore-btn)
    # or if there is no more such thing (only a masonryViewMore-end)
    ## -> or can lookat whether masonViewMore-end has the argument of data-visible='true'/'false'
    vm = page_soup.find_all(name='div',class_='masonryViewMore')
    if vm:
        re.search(vm.text)
        loadMore = True
        return loadMore
    else:
        print('Error, no loadmore section found on this page')
        
## Add f'?Offset={i*offset}' where i (num_pages) and offset = no. per page
# Users: Offset 12
# Lists: Offset 8
# Wishlists: offset 12
# Reviews: Offset 12
    

## Scraping

### Extract Basic User Information from the Search Page
For multiple users, to first populate a dictionary of results with the basic information about the users.

In [32]:
error_list = []
results_dict = {}
num_pages = 10

In [35]:
# collect basic user information from the search page
for pg in range(1,num_pages):
    users_url = f'https://www.burpple.com/search/sg?offset={pg*12}&q=&type=users'
    users_soup = scrape(users_url,name='div',class_='card-item card-item--header')
    if users_soup:
        extractUsers(results_dict, users_soup)
    else:
        error_list.append(users_url)
    time.sleep(2)
print(len(results_dict),' users have been scraped')

92  users have been scraped


In [36]:
pp.pprint(results_dict)

{ 'Ah Leong San': {'Level': '9', 'Link': '/@Ahleongsan', 'NumReviews': '1551'},
  'Alainlicious Eats': { 'Level': '10',
                         'Link': '/@Alainlicious',
                         'NumReviews': '3736'},
  'Alex Chua': {'Level': '10', 'Link': '/@makanarts', 'NumReviews': '2721'},
  'Alex Ortega': {'Level': '9', 'Link': '/@9778', 'NumReviews': '1408'},
  'Alvin Ong': {'Level': '10', 'Link': '/@munafique', 'NumReviews': '2222'},
  'Blanche Tan': {'Level': '9', 'Link': '/@Blancheeze', 'NumReviews': '1788'},
  'Blueskies Cottonclouds': { 'Badge': 'Tm',
                              'Level': '10',
                              'Link': '/@blueskiescottonclouds',
                              'NumReviews': '2902'},
  'Bryan Lee': {'Level': '9', 'Link': '/@bryandmlee', 'NumReviews': '1384'},
  'Cady McBronzie': {'Level': '9', 'Link': '/@10812', 'NumReviews': '1409'},
  'Carrie Carrie': { 'Badge': 'Ss',
                     'Level': '9',
                     'Link': '/@tic',
    

### Extract Lists, Reviews and Wishlists for each user.
This requires scraping from 3 pages: User's Profile Page, Reviews page, and Wishlists page (respectively). 

Currently keep running into NoneType errors, likely because there are some users who do not have wishlists or  reviews at all, and the bs4 find_all function(s) return a None. Have included a Try/Except clause to see if that helps.

In [112]:
for user in results_dict.values():
    try:
        if int(user['NumReviews']) > 0 :
            link = user['Link']
            userpage = f'http://www.burpple.com{link}'
            profilepage_soup = scrape(userpage, name= 'div', class_='profile-page page')
            for soup in profilepage_soup:
                profilepage_stats = soup.find_all('a')
                for stat in profilepage_stats:
                    if stat.get('href').split(r'/')[-1] == 'wishlist':
                        user['NumWishlists'] = stat.span.text

            userreviews = userpage + '/timeline'
            review_soup = scrape(userreviews, 
                          name='div',
                          class_='food card feed-item')
            user['Reviews'] = getUserReviews(review_soup)

            # check if user has any lists?
            lists_soup = soup.find_all(name='div', class_='box-content card-item card-item--header')
            user['Lists'] = getUserLists(lists_soup)

            if 'NumWishlists' in user and int(user['NumWishlists']) > 0: # check if user has wishlists
                userwishlists = userpage + '/wishlist'
                wishlist_soup = scrape(userwishlists, 
                              name='div',
                              class_='searchVenue-header card-item card-item--header')
                user['Wishlist'] = getUserWishlist(wishlist_soup)
    except Exception as e:
        error_list.append('Unable to scrape for user {}:\n{}'.format(user['Link'],e))

scraping failed.
scraping failed.


In [117]:
# error_list
len(error_list), len(results_dict)

(96, 92)

In [115]:
pp.pprint(results_dict)

{ 'Ah Leong San': { 'Level': '9',
                    'Link': '/@Ahleongsan',
                    'Lists': { 'Bandung Food': { 'Link': '/list/515323/bandung-food'},
                               'Fusion Food': { 'Link': '/list/512969/fusion-food'},
                               'Greek A! ': {'Link': '/list/509000/greek-a'},
                               'Korean Tasties': { 'Link': '/list/511983/korean-tasties'},
                               'Michelin Bib Gourmand Hawkers 2018': { 'Link': '/list/515404/michelin-bib-gourmand-hawkers-2018'},
                               'Penang Food': { 'Link': '/list/515827/penang-food'},
                               'Surprise Food Items!': { 'Link': '/list/510369/surprise-food-items'},
                               'Value For $ Set Lunches': { 'Link': '/list/509758/value-for-set-lunches'}},
                    'NumReviews': '1551',
                    'NumWishlists': '2',
                    'Reviews': { '$3 Fishball Noodle With Fish Cake': { 

                                'Fried Kway Teow With Chicken': { 'RestaurantAddress': '\n'
                                                                                       '5001 '
                                                                                       'Beach '
                                                                                       'Road, '
                                                                                       'Singapore\n',
                                                                  'RestaurantName': '\n'
                                                                                    'Diandin '
                                                                                    'Leluk '
                                                                                    'Thai '
                                                                                    'Restaurant\n',
                                                  

                                            'béni': '/beni-singapore?bp_ref=%2F%40blueskiescottonclouds%2Fwishlist'}},
  'Bryan Lee': { 'Level': '9',
                 'Link': '/@bryandmlee',
                 'Lists': { 'Bali': { 'Categories': { 'Travel': '/categories/sg/travel'},
                                      'Link': '/list/483226/bali'},
                            'Burpple Beyond': { 'Link': '/list/514947/burpple-beyond'},
                            'Delivery': {'Link': '/list/494790/delivery'},
                            'Hong Kong & Macao': { 'Categories': { 'Travel': '/categories/sg/travel'},
                                                   'Link': '/list/497680/hong-kong-macao'},
                            'Malaysia': { 'Categories': { 'Travel': '/categories/sg/travel'},
                                          'Link': '/list/501101/malaysia'},
                            'Michelin Guide Street Food Festival': { 'Categories': { 'Michelin Guide Singapore 2017': '/cat

                                                                                                                         'Four '
                                                                                                                         'different '
                                                                                                                         'soft-serves '
                                                                                                                         'available '
                                                                                                                         'each '
                                                                                                                         'time, '
                                                                                                                         'as '
                                                                                 

                                                                       '#foodbossindia\n'
                                                                       '#losangeleseats\n'
                                                                       '#eatingnyc\n'
                                                                       '#damien_tc\n'
                                                                       '#singaporeinsiders\n'
                                                                       '#thisisinsiderfood\n'
                                                                       '#jktfoodbang\n'
                                                                       '#exploreflavours\n'
                                                                       '#asiafoodporn\n'
                                                                       '#feedthepanda\n'
                                                                       '#foodie\n'
                  

                                                                                                                   '#BakeForLove '
                                                                                                                   '#MadAboutSucre '
                                                                                                                   '#Dessert '
                                                                                                                   '#Sweet'},
                               'Chocolate Chip Cookies (RM9 for a bag of eight)': { 'RestaurantAddress': '\n'
                                                                                                         '143, '
                                                                                                         'Jalan '
                                                                                                         'SS '
                                   

                                                                                  'simple '
                                                                                  'and '
                                                                                  'refreshing '
                                                                                  'dessert '
                                                                                  'to '
                                                                                  'have '
                                                                                  'after '
                                                                                  'a '
                                                                                  'feast '
                                                                                  'during '
                                                                                  'CNY. '
 

                                                                                             'am '
                                                                                             'leaning '
                                                                                             'towards '
                                                                                             '88 '
                                                                                             'for '
                                                                                             'its '
                                                                                             'non-existent '
                                                                                             'fats '
                                                                                             'to '
                                                                                     

                                                                                                   '(Japanese '
                                                                                                   'sushi '
                                                                                                   'rice) '
                                                                                                   'are '
                                                                                                   'double '
                                                                                                   'portions '
                                                                                                   'of '
                                                                                                   'fatty '
                                                                                                   'tuna '
                          

                                                                                                              'roasted, '
                                                                                                              'with '
                                                                                                              'a '
                                                                                                              'crisp '
                                                                                                              'skin '
                                                                                                              'that '
                                                                                                              'crackles '
                                                                                                              'softly '
                                                

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




                                                                                                   'three '
                                                                                                   'varieties '
                                                                                                   'are '
                                                                                                   'distinguished '
                                                                                                   'by '
                                                                                                   'their '
                                                                                                   'cocoa '
                                                                                                   'content '
                                                                                                   '(55%, '
                  

                                                                                        '#sgfoodhunt '
                                                                                        '#sgfoodhunter '
                                                                                        '#sgfoodtrend '
                                                                                        '#myfooddiary '
                                                                                        '#sgfooddiary '
                                                                                        '#sgcafe '
                                                                                        '#sgcafefood '
                                                                                        '#sgcafehopping '
                                                                                        '#foodforfoodie '
                                                                  

                                                                                                                                                                                          'food '
                                                                                                                                                                                          'experiences '
                                                                                                                                                                                          'to '
                                                                                                                                                                                          'our '
                                                                                                                                                                                          'customers. '
                    

                                                                                                                                                                                                                                       'serving '
                                                                                                                                                                                                                                       'awesome '
                                                                                                                                                                                                                                       'local '
                                                                                                                                                                                                                                       'dishes. '
                                  

                                                                             'the '
                                                                             'coffeeshop. '
                                                                             'Similarly, '
                                                                             'the\xa0'
                                                                             'Roast '
                                                                             'Duck\xa0'
                                                                             '当归烤鸭\xa0'
                                                                             '($12/lower '
                                                                             'quarter '
                                                                             '& '
                                                                             '$11/upper '
                                 

                                                                                 'Middle Eastern': '/categories/sg/middle-eastern'},
                                                                 'Link': '/list/519581/mediterranean-middle-eastern'},
                             'Melaka': {'Link': '/list/514278/melaka'},
                             'Penang': {'Link': '/list/518448/penang'}},
                  'NumReviews': '2707',
                  'NumWishlists': '339',
                  'Reviews': { 'Aged & Smoked Duck': { 'RestaurantAddress': '\n'
                                                                            '21 '
                                                                            'Campbell '
                                                                            'Lane, '
                                                                            'Singapore\n',
                                                       'RestaurantName': '\n'
                 

                              'places for a cuppa': { 'Categories': { 'Cafes & Coffee': '/categories/sg/cafes-and-coffee'},
                                                      'Link': '/list/476958/places-for-a-cuppa'},
                              'supper nights': { 'Categories': { 'Supper': '/categories/sg/supper'},
                                                 'Link': '/list/465202/supper-nights'}},
                   'NumReviews': '2336',
                   'NumWishlists': '0',
                   'Reviews': { '[UPCOMING EVENT] Starker Music Carnival @ Zhongshan park\n•\nNo carnival is complete without the food and fresh booze!': { 'RestaurantAddress': '\n'
                                                                                                                                                                                 '1 '
                                                                                                                                               

In [116]:
with open('sample2_users_scraped.json', 'w') as f:
    json.dump(results_dict, f, indent=4, ensure_ascii=False)

## Testing

In [39]:
sample_user = results_dict['Alex Chua']
link = sample_user['Link']
userpage= f'http://www.burpple.com{link}'
print(userpage)

profilepage_soup = scrape(userpage, name= 'div', class_='profile-page page')

# for soup in profilepage_soup:
# #     s = soup.find_all(name='div', class_='box-content card-item card-item--header')
#     profilepage_stats = soup.find_all('a')
# for stat in profilepage_stats:
#     print(stat.text)
# #     if stat.get('href').split(r'/')[-1] == 'wishlist':
# #         print('Number of wishlists: ' ,stat.span.text)

http://www.burpple.com/@makanarts
