In [20]:
# Import BeautifulSoup to parse the html, and selenium to drive a simulated Chrome window
# Simulating the Chrome window lets us get around the issue of the infinite scroll site
# time is used to pause execution so the simulated Chrome window has a chance to load
# pandas and numpy are imported to manipulate the data

import os
import re
import time
from urllib.parse import quote

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from selenium import webdriver

In [2]:
# Setup the parameters for url, chromedriver path and how long it takes your device to render the updated restaurants
# The postcode contains a space, so it is percent-encoded before being placed in the query string
# The chromedriver location can be overridden with the CHROMEDRIVER_PATH environment variable;
# the original local path is kept as the default so existing setups keep working
# Both times are in seconds: initial_load_time is the wait after opening the page, scroll_pause_time the wait after each scroll
postcode = 'SW1A 2AA'
url = f'https://deliveroo.co.uk/restaurants/london/westminster?postcode={quote(postcode)}&collection=all-restaurants'
chrome_driver_path = os.environ.get('CHROMEDRIVER_PATH', r'/home/selvino/chrome-driver/chromedriver')
initial_load_time = 2
scroll_pause_time = 1.5

In [3]:
# Assign the webdriver to a variable which simulates an instance of Chrome on the url of choice
# The sleep gives the webpage initial_load_time seconds to load initially
# Get the height of the devices screen to know how much needs to be scrolled each time
# Tell the browser to click on the accept cookies button, but only if that button is actually on the page

# NOTE(review): `executable_path` was deprecated in Selenium 4 in favour of a Service object - confirm the installed selenium version
driver = webdriver.Chrome(executable_path=chrome_driver_path)
driver.get(url)
time.sleep(initial_load_time)
screen_height = driver.execute_script('return window.screen.height;')
# querySelector returns null when the banner is absent; calling .click() on null would raise
# a JavaScript error and abort the cell, so the click is guarded
driver.execute_script(
    'var cookieButton = document.querySelector(".accept-cookies-button");'
    'if (cookieButton) { cookieButton.click(); }'
)
# Disabled: click on the £10 off button (the selector is a generated class name)
# driver.execute_script('document.querySelectorAll(".ccl-d0484b0360a2b432")[1].click()')

In [4]:
# Scroll through the website in a loop so the full infinite scrolling webpage renders and ends up in the html source
# Every iteration scrolls to the next multiple of the screen height, then pauses so new content can load
# After the pause the current height of the document is read again
# Once the next place to scroll to lies beyond the height of the document, we must be at the bottom of the page
# I.e. at the bottom the document stops growing while the place to scroll to keeps increasing, so the loop ends

i = 1
at_bottom = False
while not at_bottom:
    driver.execute_script(f"window.scrollTo(0, {screen_height} * {i});")
    i += 1
    time.sleep(scroll_pause_time)
    scroll_height = driver.execute_script("return document.body.scrollHeight")
    at_bottom = screen_height * i > scroll_height

In [5]:
# Take the fully rendered html from the browser and parse it into a BeautifulSoup object,
# which has useful methods for searching the tags and reading the text
page_html = driver.page_source
soup = BeautifulSoup(page_html, 'html.parser')

In [80]:
# Every restaurant that can be ordered from (clicked on) appears to be wrapped in an a tag
# All of the a tags we want carry the aria-label attribute
# So the html needs to be filtered down to the a tags that have an aria-label attribute
# The restaurant a tags are also much longer than the other a tags that have aria-labels
# Display one example a tag to inspect its structure
anchor_tags = soup.find_all('a')
anchor_tags[6]

<a aria-label="Restaurant Waitrose &amp; Partners. Delivers in 10 to 20 minutes. Rated 4.9 from 500+ reviews. Serves Alcohol, Grocery, British, and Breakfast." class="HomeFeedUICard-1cc6964985e41c86" href="/menu/london/victoria/waitrose-victoria-street?day=today&amp;postcode=SW1A2AA&amp;time=ASAP"><span class="HomeFeedUICard-ebc18e264cc4b96c"><span class="HomeFeedUICard-c6f6e82a465d3148" style="border-style:solid;border-top-color:initial;border-right-color:initial;border-bottom-color:initial;border-left-color:initial"><div class="ccl-45bd106b75353ec9" style='background-image: url("https://rs-menus-api.roocdn.com/images/632c42ed-67db-468e-b593-5bd7c4378e97/image.jpeg?width=268&amp;height=150&amp;auto=webp&amp;format=jpg&amp;fit=crop&amp;v=");'><div class="ccl-d5fa9efd412993ea ccl-0220f29e0da451eb"></div><div class="ccl-0ca6410c5e153903"><div class="HomeFeedUICard-19d5d31598e0335f"><div class="HomeFeedUICard-dce4e0d3970c801d" style="background-color:initial"><span class="HomeFeedUICard-d

In [81]:
# Define a filter function that is True only if the tag is an a tag and it has the aria-label attribute

def has_aria_label(tag):
    """Return True when `tag` is an <a> element carrying an aria-label attribute.

    Intended to be passed to `soup.find_all` as a filter function.

    Parameters
    ----------
    tag : object exposing `.name` and `.has_attr(key)`

    Returns
    -------
    bool
        Always a bool; tags that are not <a> elements give False rather than None.
    """
    return tag.name == 'a' and tag.has_attr('aria-label')

In [84]:
# Every potential restaurant is an a tag with an aria-label, but short a tags are not restaurants
# Keep only the tags whose html is longer than 300 characters
# 300 was picked by trial and error, starting from 100 and getting rid of the short links
potential_restaurants = soup.find_all(has_aria_label)
serialised_tags = map(str, potential_restaurants)
restaurants = [markup for markup in serialised_tags if len(markup) > 300]

In [127]:
# Within an a tag, most of the useful information is stated after the word Restaurant (with a space)
# Most of the useful data ends with a " because aria-label is a html attribute
# We will use that as a data identifier, and look for all the text after this start identifier and before the end identifier

def find_main_data(text):
    """Return the text between 'Restaurant ' and the next double quote.

    Parameters
    ----------
    text : str
        The html of one restaurant a tag.

    Returns
    -------
    str
        The text after the first 'Restaurant ' up to (not including) the next '"'.
        An empty string when 'Restaurant ' does not occur at all. When no closing
        quote follows, everything after 'Restaurant ' is returned.
    """
    start_identifier = 'Restaurant '
    end_identifier = '"'
    marker_position = text.find(start_identifier)
    # Without this guard the -1 from find() would be turned into a bogus start index
    if marker_position == -1:
        return ''
    search_start = marker_position + len(start_identifier)
    search_end = text.find(end_identifier, search_start)
    # A missing end identifier must not be used as a slice end: -1 would drop the last character
    if search_end == -1:
        return text[search_start:]
    return text[search_start:search_end]

In [111]:
# Distance is given in miles, so searching for the text 'miles away' or 'mile away' will help us find the distance
# Directly before those words is the distance itself, eg 0.3 miles away or 1 mile away
# The number is matched as a whole rather than by stepping back a fixed number of characters,
# so distances with more digits (eg 12.5 miles away) are not cut short

def find_distance_data(text):
    """Return the first '<number> mile(s) away' phrase found in `text`.

    Parameters
    ----------
    text : str
        The html of one restaurant a tag.

    Returns
    -------
    str
        The matched phrase, eg '0.3 miles away' or '1 mile away', or an empty
        string when the text holds no such phrase.
    """
    distance_match = re.search(r'\d+(?:\.\d+)? miles? away', text)
    if distance_match is None:
        return ''
    return distance_match.group()

In [163]:
# Delivery Cost will be found after a pound sign, although if there is no cost, we assume there is free delivery
# Luckily, the delivery cost comes before the text stating 'Buy X get £Y off', otherwise we get the wrong cost
# There are no offer badges that have pound signs except those with the delivery cost, so no issue with getting the wrong '£' text
# If the text contains free delivery, then the cost should be 0
# The whole amount after the pound sign is matched, so costs of £10 or more are not cut short

def find_delivery_cost(text):
    """Return the delivery cost stated in `text` as a string such as '£2.99'.

    Parameters
    ----------
    text : str
        The html of one restaurant a tag.

    Returns
    -------
    str
        '£0.00' when the text contains 'Free delivery' or holds no pound amount,
        otherwise the first pound amount found in the text.
    """
    if 'Free delivery' in text:
        return '£0.00'
    price_match = re.search(r'£\d+(?:\.\d{1,2})?', text)
    if price_match is None:
        return '£0.00'
    return price_match.group()

In [164]:
# Build one record per restaurant from the html of its a tag
# The id is the position of the restaurant in the restaurants list, starting at 0
# enumerate supplies the counter, so the builtin id() is no longer overwritten by a variable called id
restaurants_list = []
for restaurant_id, restaurant in enumerate(restaurants):
    restaurant_dictionary = {
        'id': restaurant_id,
        'main_data': find_main_data(restaurant),
        'distance': find_distance_data(restaurant),
        'delivery_cost': find_delivery_cost(restaurant)
    }
    restaurants_list.append(restaurant_dictionary)
    

In [165]:
# Turn the list of restaurant records into a DataFrame, one row per restaurant
# Then count how many restaurants share each delivery cost as a quick sanity check of the scrape
restaurant_df = pd.DataFrame(restaurants_list)
delivery_cost_counts = restaurant_df['delivery_cost'].value_counts()
delivery_cost_counts

£2.99    107
£3.49     55
£2.49     55
£3.99     31
£0.00      9
£2.50      8
£2.95      3
£4.99      2
£5.99      2
£9.99      2
£3.50      2
£3.95      2
£4.50      2
£4.00      1
£5.00      1
£1.50      1
£6.99      1
£4.95      1
£4.49      1
Name: delivery_cost, dtype: int64