In [1]:
# Import BeautifulSoup to parse the html, plus selenium and time to simulate a Chrome window
# Simulation of the chrome window allows us to get around the issue of the infinite scroll site
# Time is used to pause execution to allow the simulated Chrome window to load
# pandas and numpy are imported to manipulate the data

import re
import time
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from selenium import webdriver

In [2]:
# Set up the scraper parameters: the postcode and url to scrape, the chromedriver path,
# and how long (in seconds) your device needs to render the page initially and after each scroll
postcode = 'SW1A 2AA'
# The postcode contains a space, which is not valid in a raw query string, so percent-encode it
url = f'https://deliveroo.co.uk/restaurants/london/westminster?postcode={quote(postcode)}&collection=all-restaurants'
chrome_driver_path = r'/home/selvino/chrome-driver/chromedriver'
initial_load_time = 2
scroll_pause_time = 1.5

In [3]:
# Assign the webdriver to a variable which simulates an instance of Chrome on the url of choice
# The initial sleep gives the webpage time to load before we interact with it
# Get the height of the device's screen to know how much needs to be scrolled each time
# Tell the browser to click on the accept cookies button, if the page is showing one

driver = webdriver.Chrome(executable_path=chrome_driver_path)
driver.get(url)
time.sleep(initial_load_time)
screen_height = driver.execute_script('return window.screen.height;')
# Only click when the button exists: calling .click() on a null querySelector result
# raises a JavaScript error, which would abort the cell when no cookie banner is shown
driver.execute_script(
    'var cookieButton = document.querySelector(".accept-cookies-button");'
    'if (cookieButton) { cookieButton.click(); }'
)
# Clicking the £10 off button is currently disabled:
# driver.execute_script('document.querySelectorAll(".ccl-d0484b0360a2b432")[1].click()')

In [4]:
# Scroll through the website with a while loop so the full infinite-scrolling page renders and ends up in the html source
# On every pass the window is scrolled to the next multiple of the screen height, so it moves down by one screen each time
# After each scroll we pause to let newly loaded restaurants render, then measure the page height again
# Once the next place to scroll to lies beyond the height of the page, we take it that we have reached the bottom
# I.e. at the bottom the page height stops growing while the place to scroll to keeps increasing, so the loop ends

i = 1
keep_scrolling = True
while keep_scrolling:
    driver.execute_script(f"window.scrollTo(0, {screen_height} * {i});")
    time.sleep(scroll_pause_time)
    i += 1
    scroll_height = driver.execute_script("return document.body.scrollHeight")
    keep_scrolling = screen_height * i <= scroll_height

In [5]:
# Take the rendered page source from the browser and parse it into a BeautifulSoup object,
# which has useful methods for searching and reading the html
page_html = driver.page_source
soup = BeautifulSoup(page_html, 'html.parser')

In [6]:
# It looks like every restaurant that can be ordered from (clicked on) is wrapped in an a tag
# All of the a tags that we want carry the aria-label attribute
# Therefore, we need to filter the html for a tags that have the aria-label attribute
# The a tags holding restaurants are also much longer than the other a tags that have aria-labels
# Display one of the a tags as an example
anchor_tags = soup.find_all('a')
anchor_tags[6]

<a aria-label="Restaurant Waitrose &amp; Partners. Delivers in 10 to 20 minutes. Rated 4.8 from 500+ reviews. Serves Alcohol, Grocery, British, and Breakfast." class="HomeFeedUICard-1cc6964985e41c86" href="/menu/london/victoria/waitrose-victoria-street?day=today&amp;postcode=SW1A2AA&amp;time=ASAP"><span class="HomeFeedUICard-ebc18e264cc4b96c"><span class="HomeFeedUICard-c6f6e82a465d3148" style="border-style:solid;border-top-color:initial;border-right-color:initial;border-bottom-color:initial;border-left-color:initial"><div class="ccl-45bd106b75353ec9" style='background-image: url("https://rs-menus-api.roocdn.com/images/632c42ed-67db-468e-b593-5bd7c4378e97/image.jpeg?width=215&amp;height=121&amp;auto=webp&amp;format=jpg&amp;fit=crop&amp;v=");'><div class="ccl-d5fa9efd412993ea ccl-0220f29e0da451eb"></div><div class="ccl-0ca6410c5e153903"><div class="HomeFeedUICard-19d5d31598e0335f"><div class="HomeFeedUICard-dce4e0d3970c801d" style="background-color:initial"><span class="HomeFeedUICard-d

In [7]:
# Define a filter function that is True only for a tags that have the aria-label attribute

def has_aria_label(tag):
    """Return True if ``tag`` is an ``<a>`` tag with an ``aria-label`` attribute, else False.

    Intended to be passed to ``soup.find_all`` as a filter function. ``tag`` only
    needs a ``name`` attribute and a ``has_attr`` method. A bool is always returned,
    including for tags that are not ``<a>`` tags.
    """
    return tag.name == 'a' and bool(tag.has_attr('aria-label'))

In [8]:
# Every potential restaurant is an a tag with an aria-label, but so are some short navigation links
# So we also filter on the length of the tag's html: only tags longer than 300 characters are kept
# (300 was found by trial and error, starting from 100 and raising it until the short links were gone)
potential_restaurants = soup.find_all(has_aria_label)
restaurants = list(filter(lambda tag_html: len(tag_html) > 300, map(str, potential_restaurants)))

In [9]:
# Within an a tag, most of the useful information is stated after the word Restaurant (with a space)
# It ends at the next " because aria-label is a html attribute
# We use those two as identifiers and take all the text between the start identifier and the end identifier

def find_main_data(text):
    """Return the text between ``'Restaurant '`` and the next double quote in ``text``.

    Parameters
    ----------
    text : str
        The html of one restaurant a tag.

    Returns
    -------
    str
        The text between the two identifiers. If ``'Restaurant '`` does not occur,
        an empty string is returned. If there is no closing double quote after it,
        everything up to the end of ``text`` is returned.
    """
    start_identifier = 'Restaurant '
    end_identifier = '"'
    identifier_start = text.find(start_identifier)
    # str.find returns -1 when nothing is found; using that as an index would slice garbage
    if identifier_start == -1:
        return ''
    search_start = identifier_start + len(start_identifier)
    search_end = text.find(end_identifier, search_start)
    if search_end == -1:
        # Without this check the slice would end at -1 and silently drop the last character
        return text[search_start:]
    return text[search_start:search_end]

In [10]:
# Distance is given in miles, as text such as '0.6 miles away' or '1 mile away'
# So we search for a number (with or without a decimal part) directly followed by 'mile away' or 'miles away'
# Matching the number itself, rather than stepping back a fixed number of characters,
# keeps the result correct for distances of any width (e.g. '12.5 miles away')

def find_distance_data(main_data):
    """Return the first distance phrase found in ``main_data``.

    Parameters
    ----------
    main_data : str
        Text to search, e.g. the html of one restaurant a tag.

    Returns
    -------
    str
        The matched phrase, e.g. ``'0.6 miles away'`` or ``'1 mile away'``,
        or an empty string when no distance is present.
    """
    match = re.search(r'\d+(?:\.\d+)?\s*miles? away', main_data)
    return match.group(0) if match else ''

In [13]:
# Delivery cost is found after a pound sign; if there is no cost at all, we assume delivery is free
# Luckily, the delivery cost comes before the text stating 'Buy X get £Y off', so the first price is the right one
# If the text contains 'Free delivery', then the cost is 0
# The price is matched as a whole number with an optional decimal part, so costs of any width (e.g. £10.00) are read in full

def find_delivery_cost(main_data):
    """Return the delivery cost found in ``main_data`` as a string such as ``'£3.99'``.

    Parameters
    ----------
    main_data : str
        Text to search, e.g. the html of one restaurant a tag.

    Returns
    -------
    str
        ``'£0.00'`` when the text contains ``'Free delivery'`` or no price at all;
        otherwise the first ``£`` amount in the text.
    """
    if 'Free delivery' in main_data:
        return '£0.00'
    match = re.search(r'£\d+(?:\.\d+)?', main_data)
    return match.group(0) if match else '£0.00'

In [14]:
# For each of the restaurants, find the main data, distance and delivery cost
# Put those into one dictionary per restaurant so we can make them into a dataframe
# enumerate supplies the running id, which avoids a counter variable called `id`
# (that name would shadow Python's built-in id() for the rest of the notebook)

restaurants_list = [
    {
        'id': restaurant_id,
        'main_data': find_main_data(restaurant),
        'distance': find_distance_data(restaurant),
        'delivery_cost': find_delivery_cost(restaurant),
    }
    for restaurant_id, restaurant in enumerate(restaurants)
]
    

In [15]:
# Make the dataframe from the list of dictionaries: one row per restaurant, one column per key
restaurant_df = pd.DataFrame(restaurants_list)

In [16]:
# Now we need to clean the data! For example, the main_data column has several bits of data in it
# sent_tokenize from nltk separates the main data into sentences, making it easier to get each bit of data

def _restaurant_name(main_data):
    """First sentence of main_data with '&amp;' unescaped and its last character (the full stop) removed."""
    first_sentence = sent_tokenize(main_data.replace('&amp;', '&'))[0]
    return first_sentence[:-1]


def _delivery_time_words(main_data):
    """Third to fifth space-separated words of the second sentence of main_data, e.g. ['10', 'to', '20']."""
    second_sentence = sent_tokenize(main_data)[1]
    return second_sentence.split(' ')[2:5]


restaurant_df['restaurant_name'] = restaurant_df['main_data'].map(_restaurant_name)
restaurant_df['delivery_time'] = restaurant_df['main_data'].map(_delivery_time_words)

In [17]:
# Unfortunately, some restaurants have no reviews, so the rating sentence is not always present
# Because this is more complicated, a dedicated function tackles it
# The rating sentence looks like 'Rated 4.8 from 500+ reviews'; the '+' is optional and is dropped

def find_review_data(main_data):
    """Extract the review score and review count from ``main_data``.

    Parameters
    ----------
    main_data : str
        The aria-label text of one restaurant.

    Returns
    -------
    dict
        ``{'score': float, 'number': int}`` when a rating sentence is found,
        e.g. ``{'score': 4.8, 'number': 500}`` for ``'Rated 4.8 from 500+ reviews'``.
        ``{'score': 'None', 'number': 'None'}`` (the strings, kept for compatibility
        with existing callers) when there is no rating or it cannot be parsed.
    """
    # One pattern covers a singular 'review', an optional '+' and thousands separators,
    # so unexpected wording falls through to the sentinel instead of raising in int()
    match = re.search(r'Rated (\d+(?:\.\d+)?) from (\d[\d,]*)\+? reviews?', main_data)
    if match is None:
        return {'score': 'None', 'number': 'None'}
    return {
        'score': float(match.group(1)),
        'number': int(match.group(2).replace(',', '')),
    }
    
# Parse the review data once per restaurant, then split each result into its two columns
review_data = restaurant_df['main_data'].map(find_review_data)
restaurant_df['rating_score'] = review_data.map(lambda review: review['score'])
restaurant_df['rating_number'] = review_data.map(lambda review: review['number'])

In [None]:
# Now that we have all the data we want, we need to clean it!

In [117]:
# Lastly, we need to get the cuisine data, which all follows the phrase 'Serves '
# Because it is a list, grammatically, the cuisines are separated by commas
# Splitting on commas gives the comma-separated cuisines as items in a list
# All items in that list except the last are a single cuisine each (first_cuisines)
# The last item is either the last cuisine (e.g. ' and Breakfast') or the only two cuisines (e.g. 'Italian and Grocery')
# Two cuisines aren't separated by commas, they are separated by ' and ' instead
# Therefore, we split the last item on ' and ', which gives two cuisines, or '' and the last cuisine
# Empty pieces (such as that '') are dropped and surrounding whitespace is removed from every cuisine
# The full stop that ends the sentence is removed first, and only if it is actually there

def find_cuisine_data(main_data):
    """Return the list of cuisines named after ``'Serves '`` in ``main_data``.

    Parameters
    ----------
    main_data : str
        The aria-label text of one restaurant, e.g.
        ``'... Serves Alcohol, Grocery, British, and Breakfast.'``.

    Returns
    -------
    list of str or None
        The cuisines in the order they are listed, e.g.
        ``['Alcohol', 'Grocery', 'British', 'Breakfast']``. An empty list when
        nothing follows ``'Serves '``. ``None`` when ``'Serves '`` does not occur.
    """
    start_identifier = 'Serves '
    first_splitter = ','
    last_splitter = ' and '
    cuisines_start = main_data.find(start_identifier)
    if cuisines_start == -1:
        return None
    cuisines_raw = main_data[cuisines_start + len(start_identifier):].strip()
    # Only drop the final character when it really is the sentence's full stop
    if cuisines_raw.endswith('.'):
        cuisines_raw = cuisines_raw[:-1]
    *first_cuisines, last_first_split = cuisines_raw.split(first_splitter)
    candidates = first_cuisines + last_first_split.split(last_splitter)
    # Filtering on the stripped text removes empty pieces without indexing into a possibly empty list
    return [cuisine.strip() for cuisine in candidates if cuisine.strip()]
    
    
    
# Store each restaurant's parsed cuisines in a new column, then show the first restaurant's list as a check
restaurant_df['cuisines'] = restaurant_df['main_data'].map(find_cuisine_data)
restaurant_df.loc[0, 'cuisines']

['Alcohol', 'Grocery', 'British', 'Breakfast']

In [118]:
# Collect every distinct cuisine across all restaurants, in order of first appearance
# Restaurants without cuisine data (None or an empty list) are skipped
# dict.fromkeys removes duplicates in a single pass while keeping insertion order,
# instead of re-scanning the growing list for every cuisine
cuisine_list = list(dict.fromkeys(
    cuisine
    for cuisines in restaurant_df['cuisines']
    if cuisines
    for cuisine in cuisines
))

In [120]:
# Display the dataframe to inspect the scraped and cleaned columns
restaurant_df

Unnamed: 0,id,main_data,distance,delivery_cost,restaurant_name,delivery_time,rating_score,rating_number,cuisines
0,0,Waitrose &amp; Partners. Delivers in 10 to 20 ...,0.6 miles away,£3.99,Waitrose & Partners,"[10, to, 20]",4.8,500,"[Alcohol, Grocery, British, Breakfast]"
1,1,Pret a Manger. Delivers in 15 to 30 minutes. R...,1.7 miles away,£3.99,Pret a Manger,"[15, to, 30]",4.8,500,"[Salads, Breakfast, Sandwiches, Healthy]"
2,2,Wellness &amp; Beauty by Whole Foods Market. D...,2.7 miles away,£4.49,Wellness & Beauty by Whole Foods Market,"[40, to, 60]",5,16,"[Grocery, Healthy, Snacks, Drinks]"
3,3,Margarita Mama's. Delivers in 10 to 10 minutes...,5.7 miles away,£3.99,Margarita Mama's,"[10, to, 10]",3.9,500,"[Italian, Pasta, Pizza]"
4,4,Brinkley’s Wines - Chelsea. Delivers in 20 to ...,2.7 miles away,£4.49,Brinkley’s Wines - Chelsea,"[20, to, 35]",4.9,500,"[Alcohol, British, Grocery, Drinks]"
...,...,...,...,...,...,...,...,...,...
793,793,Eaton Square. Delivers at . Rated 4.3 from 468...,,£0.00,Eaton Square,[.],4.3,468,"[Pizza, Salads, Burgers, Mediterranean]"
794,794,Good Eggs. Delivers at . Rated 4.7 from 79 rev...,,£0.00,Good Eggs,[.],4.7,79,"[Breakfast, Wraps, Healthy]"
795,795,"Yolk &amp; Bun. Delivers at . Serves Café, Bag...",,£0.00,Yolk & Bun,[.],,,"[Café, Bagels, Breakfast, Brunch]"
796,796,BENVENUTI. Delivers at . Serves Italian and Gr...,,£0.00,BENVENUTI,[.],,,"[Italian, Grocery]"
