In [1]:
# Import BeautifulSoup to parse the html and selenium, time to simulate a Chrome window
# Simulation of the chrome window allows us to get around the issue of the infinite scroll site
# Time is used to pause execution to allow the simulated Chrome window to load
# pandas and numpy are imported to manipulate the data

from bs4 import BeautifulSoup
from selenium import webdriver
import time
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Setup the parameters for url, chromedriver path and how long it takes your device to render the updated restaurants
postcode = 'SW1A 2AA'
url = f'https://deliveroo.co.uk/restaurants/london/westminster?postcode={postcode}&collection=all-restaurants'
chrome_driver_path = r'/home/selvino/chrome-driver/chromedriver'
initial_load_time = 2
scroll_pause_time = 1.5

In [3]:
# Assign the webdriver to a variable which simulates an instance of Chrome on the url of choice
# The 2 seconds sleep gives the webpage a 2 second break to load initially
# Get the height of the devices screen to know how much needs to be scrolled each time
# Tell the browser to click on the accept cookies button and the £10 off button

driver = webdriver.Chrome(executable_path=chrome_driver_path)
driver.get(url)
time.sleep(initial_load_time)
screen_height = driver.execute_script('return window.screen.height;')
driver.execute_script('document.querySelector(".accept-cookies-button").click()')
driver.execute_script('document.querySelectorAll(".ccl-d0484b0360a2b432")[1].click()')

In [4]:
# Create a whie loop that scrolls through the website. This is done so the full infinite scrolling webpage renders and is in the html source
# During every iteration of the loop, the window is scrolled through by the size of the screen
# This is done by scrolling to a multiple of the screen height, which increases by one every iteration
# When the place to scroll to exceeds where the window is currently at, it means we must be at the bottom of the page
# I.e, once we have scrolled to the bottom, the current location on the window is fixed while the place to scroll to increases on every iteration

i = 1
while True:
    driver.execute_script("window.scrollTo(0, {screen_height} * {i});".format(screen_height=screen_height, i = i))
    i += 1
    time.sleep(scroll_pause_time)
    scroll_height = driver.execute_script("return document.body.scrollHeight")
    if screen_height * i > scroll_height: 
        break

In [5]:
# Create a beautiful soup object that parses the html and has useful methods for reading the text
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [6]:
# Looks like all the restaurants that can be ordered from (clicked on) are wrapped in an a tag
# The aria-label attribute is in all of the a tags that we want
# Therefore, we need to filter through the html and find all the a tags that have the aria-label attribute
# The a tags with the restaurants in are also very long in comparison to the other a tags that have aria-labels
soup.find_all('a')[77]

<a aria-label="Restaurant Jak's - Walton Street. Delivers in 20 to 35 minutes. Rated 4.7 from 500+ reviews. Serves Italian, Pizza, Salads, and Pasta." class="HomeFeedUICard-1cc6964985e41c86" href="/menu/london/chelsea/jak's?day=today&amp;postcode=SW1A2AA&amp;time=ASAP"><span class="HomeFeedUICard-ebc18e264cc4b96c"><span class="HomeFeedUICard-c6f6e82a465d3148" style="border-style: solid; border-color: initial;"><div class="ccl-45bd106b75353ec9" style='background-image: url("https://rs-menus-api.roocdn.com/images/f9eb64b0-af34-46b7-96b2-e4372813e65b/image.jpeg?width=210&amp;height=118&amp;auto=webp&amp;format=jpg&amp;fit=crop&amp;v=");'><div class="ccl-d5fa9efd412993ea ccl-0220f29e0da451eb"></div><div class="ccl-0ca6410c5e153903"><div class="HomeFeedUICard-19d5d31598e0335f"><div class="HomeFeedUICard-dce4e0d3970c801d" style="background-color: initial;"><span class="HomeFeedUICard-d189665d30dc18af HomeFeedUICard-2002941ca3f01c1b"><ul class="HomeFeedUICard-5fdde013431a78b7" style="backgrou

In [7]:
# Define a function that returns only if the tag is an a tag and it has the aria label attribute

def has_aria_label(tag):
    if tag.name == 'a':
        return tag.has_attr('aria-label')

In [8]:
# All potential restaurants will meet this criteria. But we will also need to filter based on length of the a tag
# I chose 300 just by trial and error, starting from 100 and getting rid of the short links
potential_restaurants = soup.find_all(has_aria_label)
restaurants = [str(x) for x in potential_restaurants if len(str(x)) > 300 ]

In [9]:
# Within an a tag, most of the useful information is stated after the word Restaurant (with a space)
# Most of the useful data ends with a " because aria-label is a html attribute
# We will use that as a data identifier, and look for all the text after this start identifier and before the end identifier

def find_main_data(text):
    start_identifier = 'Restaurant '
    end_identifier = '"'
    search_start = text.find(start_identifier) + len(start_identifier)
    search_end = text.find(end_identifier, search_start)
    return text[search_start:search_end]

In [10]:
# Distance is given in miles, so searching for the text 'miles' or 'mile' away will help us find the distance
# Before the words 'miles away', the distance is given to one decimal place, eg x.xx miles away
# Therefore, we need to go back 4 from the position of miles away
# For mile away, distance is given as 1 mile away
# Therefore, we need to go back 2 from the position of mile away

def find_distance_data(main_data):
    identifier = 'miles away'
    sub_identifier = 'mile away'
    start_index = main_data.find(identifier)
    if start_index == -1:
        search_start = main_data.find(sub_identifier) -2
        search_end = search_start + 2 + len(sub_identifier)
    else:
        search_start = main_data.find(identifier) -4
        search_end = search_start + 4 + len(identifier)
    return main_data[search_start: search_end]

In [11]:
# Delivery Cost will be found after a pound sign, although if there is no cost, we assume there is free delivery
# Luckily, the delivery cost comes before the text stating 'Buy X get £Y off', otherwise we get the wrong cost
# There are no offer badges that have pound signs except those with the delivery cost, so no issue with getting the wrong '£' text
# If the text contains free delivery, then the cost should be 0
   
def find_delivery_cost(text):
    start_identifier = '£'
    end_identifier = 'delivery'
    search_start = text.find(start_identifier)
    search_end = text.find(end_identifier, search_start)
    if 'Free delivery' in text:
        return '£0.00'
    elif (search_start == -1) | (search_end == -1):
        return None
    else:
        return text[search_start: search_end - 1]

In [12]:
# For each of the restaurants, find the main data, distance and delivery cost
# Put those into a dictionary so we can make them into a dataframe

restaurants_list = []
id = 0
for restaurant in restaurants:
    restaurant_dictionary = {
        'id': id,
        'main_data': find_main_data(restaurant),
        'distance': find_distance_data(restaurant),
        'delivery_cost': find_delivery_cost(restaurant)
    }
    restaurants_list.append(restaurant_dictionary)
    id += 1
    

In [13]:
# Make the dataframe from the dictionary
restaurant_df = pd.DataFrame.from_dict(restaurants_list)

In [14]:
# Now we need to clean the data! For example, the main_data column has several bits of data in
# Sentence Tokenize from nltk seperates the main data into sentences, making it easier to get each bit of data
restaurant_df['restaurant_name'] = restaurant_df['main_data'].map(lambda x: sent_tokenize(x.replace('&amp;', '&'))[0][:-1])
restaurant_df['delivery_time'] = restaurant_df['main_data'].map(lambda x: sent_tokenize(x)[1].split(' ')[2:5])

In [15]:
# Unfortunately, some restaurants have no reviews
# Because it is more complicated, will make a function to specifically tackle this
# Some restaurants have no review data

def find_review_data(main_data):
    start_identifier = 'Rated '
    end_identifier = 'reviews'
    search_start = main_data.find(start_identifier)
    if search_start > -1:
        search_end = main_data.find(end_identifier, search_start) - 1
        review_data = main_data[search_start: search_end]
        review_score = float(review_data.split(' ')[1])
        review_number = int(review_data.split(' ')[-1].split('+')[0])
        return {'score': review_score, 'number': review_number} 
    else:
        return {'score': 'None', 'number': 'None' }
    
restaurant_df['rating_score'] = restaurant_df['main_data'].map(lambda x: find_review_data(x)['score'])
restaurant_df['rating_number'] = restaurant_df['main_data'].map(lambda x: find_review_data(x)['number'])

In [16]:
# Now we have all the data we want, we need to clean it!

In [148]:
# Lastly, we need to get the cuisine data, which all follows the phrase 'Serves '
# Because it is a list, grammatically, all the cuisines are seperated by commas
# Splitting on commas will give all the cuisines that are seperated by commas as items in an array (cuisines_first_split)
# All items in the array except the last will be the cuisine only (first_cuisines)
# The last item in this array will either be the last cuisine or the only two cuisines (last_first_split)
# If there are two cuisines, they aren't seperated by commas, they are seperated by ' and ' instead
# Therefore, we should seperate last_first_split by ' and ', and we will have two cuisines or '' and the last cuisine
# We then need to add the last cuisine(s) to the first_cuisines, and we only want to add it if the cuisine is more than one letter, this is so you don't add the empty string ''
# Finally, we need to remove the full stop from the final element in the cuisines list and remove all surrounding whitespace
# Fish and chips poses a strong problem as then we can't split on ' and ', so we need to remove it if it exists then re-add it
# If there are only two cuisines, then a split on ',' won't do anything, so we need to 
def find_cuisine_data(main_data):
    start_identifier = 'Serves '
    first_splitter = ','
    last_splitter = ' and '
    cuisines_start = main_data.find(start_identifier)
    if cuisines_start > -1:
        cuisines_raw = main_data[cuisines_start + len(start_identifier):]
        cuisines_first_split = cuisines_raw.split(first_splitter)
        fish_chips = [x for x in cuisines_first_split if x == 'Fish and chips']
        if len(cuisines_first_split) < 2:
            cuisines_first_split = cuisines_first_split[0].split('and Fish and chips')
        cuisines_first_split = [x for x in cuisines_first_split if x != 'Fish and chips']
        last_first_split = cuisines_first_split[-1]
        first_cuisines = cuisines_first_split[:-1]
        last_cuisines = last_first_split.split(last_splitter)
        for i in last_cuisines:
            if len(i) > 1:
                first_cuisines.append(i)
        if len(first_cuisines) > 0:
            first_cuisines[-1] = first_cuisines[-1][:-1]
        if fish_chips:
            first_cuisines.append(fish_chips[0])
        all_cuisines = [x.strip() for x in first_cuisines]
        return all_cuisines
    
    
    
restaurant_df['cuisines'] = restaurant_df['main_data'].map(lambda x: find_cuisine_data(x))
restaurant_df

Unnamed: 0,id,main_data,distance,delivery_cost,restaurant_name,delivery_time,rating_score,rating_number,cuisines,distance_miles,delivery_cost_pounds,min_delivery_time,max_delivery_time,avg_delivery_time
0,0,Sticks'n'Sushi. Delivers in 15 to 30 minutes. ...,0.7 miles away,£3.49,Sticks'n'Sushi,"[15, to, 30]",4.8,500,"[Sushi, Japanese]",0.7,3.49,15.0,30.0,22.5
1,1,Waitrose &amp; Partners. Delivers in 15 to 25 ...,0.6 miles away,£4.49,Waitrose & Partners,"[15, to, 25]",4.8,500,"[Alcohol, Grocery, British, Breakfast]",0.6,4.49,15.0,25.0,20.0
2,2,Chipotle Mexican Grill. Delivers in 15 to 25 m...,0.8 miles away,£3.49,Chipotle Mexican Grill,"[15, to, 25]",4.6,500,"[Mexican, Burritos, Salads, Chicken]",0.8,3.49,15.0,25.0,20.0
3,3,Fresh Bake. Delivers in 35 to 55 minutes.,2.1 miles away,£4.49,Fresh Bake,"[35, to, 55]",,,,2.1,4.49,35.0,55.0,45.0
4,4,Sushi Shop. Delivers in 25 to 40 minutes. Rate...,1.5 miles away,£4.49,Sushi Shop,"[25, to, 40]",4.8,500,"[Sushi, Japanese, Salads, Poke]",1.5,4.49,25.0,40.0,32.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
794,794,Pronto Cafe. Delivers at . Rated 4.6 from 18 r...,,,Pronto Cafe,[.],4.6,18,"[Sandwiches, Breakfast, British]",,,,,
795,795,Tech Lounge Cafe. Delivers at . Serves America...,,,Tech Lounge Cafe,[.],,,"[American, Coffee, Breakfast, Brunch]",,,,,
796,796,Bbar Restaurant . Delivers at . Serves Chicken...,,,Bbar Restaurant,[.],,,"[Chicken, African]",,,,,
797,797,Casa Maria. Delivers at . Rated 4.7 from 72 re...,,,Casa Maria,[.],4.7,72,"[Mediterranean, Middle Eastern]",,,,,


### Now we have all the information we need, let's clean it up!

In [149]:
# Let's see what our data looks like
restaurant_df

Unnamed: 0,id,main_data,distance,delivery_cost,restaurant_name,delivery_time,rating_score,rating_number,cuisines,distance_miles,delivery_cost_pounds,min_delivery_time,max_delivery_time,avg_delivery_time
0,0,Sticks'n'Sushi. Delivers in 15 to 30 minutes. ...,0.7 miles away,£3.49,Sticks'n'Sushi,"[15, to, 30]",4.8,500,"[Sushi, Japanese]",0.7,3.49,15.0,30.0,22.5
1,1,Waitrose &amp; Partners. Delivers in 15 to 25 ...,0.6 miles away,£4.49,Waitrose & Partners,"[15, to, 25]",4.8,500,"[Alcohol, Grocery, British, Breakfast]",0.6,4.49,15.0,25.0,20.0
2,2,Chipotle Mexican Grill. Delivers in 15 to 25 m...,0.8 miles away,£3.49,Chipotle Mexican Grill,"[15, to, 25]",4.6,500,"[Mexican, Burritos, Salads, Chicken]",0.8,3.49,15.0,25.0,20.0
3,3,Fresh Bake. Delivers in 35 to 55 minutes.,2.1 miles away,£4.49,Fresh Bake,"[35, to, 55]",,,,2.1,4.49,35.0,55.0,45.0
4,4,Sushi Shop. Delivers in 25 to 40 minutes. Rate...,1.5 miles away,£4.49,Sushi Shop,"[25, to, 40]",4.8,500,"[Sushi, Japanese, Salads, Poke]",1.5,4.49,25.0,40.0,32.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
794,794,Pronto Cafe. Delivers at . Rated 4.6 from 18 r...,,,Pronto Cafe,[.],4.6,18,"[Sandwiches, Breakfast, British]",,,,,
795,795,Tech Lounge Cafe. Delivers at . Serves America...,,,Tech Lounge Cafe,[.],,,"[American, Coffee, Breakfast, Brunch]",,,,,
796,796,Bbar Restaurant . Delivers at . Serves Chicken...,,,Bbar Restaurant,[.],,,"[Chicken, African]",,,,,
797,797,Casa Maria. Delivers at . Rated 4.7 from 72 re...,,,Casa Maria,[.],4.7,72,"[Mediterranean, Middle Eastern]",,,,,


### Cleaning Tasks:
1. distance should be a float, not a string
2. delivery_cost should be a float, not a string
3. delivery_time should actually be split into min, max and average delivery time and should all be floats
4. Each cuisine should have its own column equal to 1 if they serve that cuisine, and 0 otherwise
5. We will need to remove all unnecessary columns 


In [150]:
# Take the distance column, split on a space and take the first element if a distance is given
restaurant_df['distance_miles'] = restaurant_df['distance'].map(lambda x: float(x.split(' ')[0]) if x else None)

In [151]:
# Take all the characters after the pound sign
restaurant_df['delivery_cost_pounds'] = restaurant_df['delivery_cost'].map(lambda x: float(x[1:]) if x else None)

In [152]:
# Take the first and last element of delivery_time, and also get their average
# We can only do this where the delivery time has 3 elements and the first and last element start with a digit
def clean_delivery_data(delivery_time):
    min_time = None
    max_time = None
    avg_time = None
    
    if delivery_time:
        if (delivery_time[0][0].isdigit()) & (len(delivery_time) == 3) :
            min_time = float(delivery_time[0])
            max_time = float(delivery_time[-1])
            avg_time = (min_time + max_time) / 2
    
    return {'min': min_time, 'max': max_time, 'avg': avg_time}

restaurant_df['min_delivery_time'] = restaurant_df['delivery_time'].map(lambda x: clean_delivery_data(x)['min'])
restaurant_df['max_delivery_time'] = restaurant_df['delivery_time'].map(lambda x: clean_delivery_data(x)['max'])
restaurant_df['avg_delivery_time'] = restaurant_df['delivery_time'].map(lambda x: clean_delivery_data(x)['avg'])

In [153]:
# Need to make dummy variables for each possible cuisine, so first we create a list of all the cuisines
# Assuming cuisines isn't empty, we need to iterate over it and add a cuisine to our list only if it is not already in the list
cuisine_list = []
for cuisines in restaurant_df['cuisines']:
    if cuisines:
        for i in cuisines:
            if i not in cuisine_list:
                cuisine_list.append(i)
np.sort(cuisine_list)

array(['Acai', 'African', 'Alcohol', 'American', 'Argentinian', 'Asian',
       'Asian Fusion', 'BBQ', 'Bagels', 'Bakery', 'Bento', 'Breakfast',
       'British', 'Brunch', 'Bubble tea', 'Burgers', 'Burritos', 'Café',
       'Cakes', 'Cantonese', 'Caribbean', 'Charcuterie', 'Chicken',
       'Chinese', 'Coffee', 'Crêpe', 'Curry', 'Dessert', 'Dim Sum',
       'Drinks', 'Dumplings', 'Falafel', 'Fatayer', 'Fish and chips',
       'French', 'Fried chicken', 'Gelato', 'German', 'Greek', 'Grill',
       'Grocery', 'Hawaiian', 'Healthy', 'Hot pot', 'Ice cream', 'Indian',
       'Iranian', 'Italian', 'Jamaican', 'Japanese', 'Juices', 'Kebab',
       'Korean', 'Latin American', 'Lebanese', 'Mediterranean', 'Mexican',
       'Mezze', 'Middle Eastern', 'Milkshakes', 'Moroccan', 'Noodles',
       'Pakistani', 'Pancakes', 'Pasta', 'Peruvian', 'Pho', 'Piadina',
       'Pie', 'Pizza', 'Poke', 'Portuguese', 'Ramen', 'Russian', 'Salads',
       'Sandwiches', 'Seafood', 'Shanghainese', 'Sichuan', 'Smoot

In [178]:
# Make a new column for each cuisine, which is equal to 1 if the restaurant serves the cuisine and zero otherwise
restaurant_df_drop_empty = restaurant_df.dropna()
for cuisine in cuisine_list:
    restaurant_df_drop_empty[cuisine] = restaurant_df_drop_empty['cuisines'].map(lambda x: 1 if (cuisine in x) else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  restaurant_df_drop_empty[cuisine] = restaurant_df_drop_empty['cuisines'].map(lambda x: 1 if (cuisine in x) else 0)


In [179]:
# Let's have a look at what columns we have left
restaurant_df_drop_empty.columns

Index(['id', 'main_data', 'distance', 'delivery_cost', 'restaurant_name',
       'delivery_time', 'rating_score', 'rating_number', 'cuisines',
       'distance_miles',
       ...
       'Pakistani', 'Falafel', 'Bagels', 'German', 'Caribbean', 'African',
       'Fish and chips', 'Moroccan', 'Jamaican', 'Fatayer'],
      dtype='object', length=110)

In [182]:
# We don't need main_data, distance, delivery_cost, delivery_time
restaurant_df_clean = restaurant_df_drop_empty.drop(axis = 1, columns = ['main_data', 'distance', 'delivery_cost', 'delivery_time'])
restaurant_df_clean

Unnamed: 0,id,restaurant_name,rating_score,rating_number,cuisines,distance_miles,delivery_cost_pounds,min_delivery_time,max_delivery_time,avg_delivery_time,...,Pakistani,Falafel,Bagels,German,Caribbean,African,Fish and chips,Moroccan,Jamaican,Fatayer
0,0,Sticks'n'Sushi,4.8,500,"[Sushi, Japanese]",0.7,3.49,15.0,30.0,22.5,...,0,0,0,0,0,0,0,0,0,0
1,1,Waitrose & Partners,4.8,500,"[Alcohol, Grocery, British, Breakfast]",0.6,4.49,15.0,25.0,20.0,...,0,0,0,0,0,0,0,0,0,0
2,2,Chipotle Mexican Grill,4.6,500,"[Mexican, Burritos, Salads, Chicken]",0.8,3.49,15.0,25.0,20.0,...,0,0,0,0,0,0,0,0,0,0
4,4,Sushi Shop,4.8,500,"[Sushi, Japanese, Salads, Poke]",1.5,4.49,25.0,40.0,32.5,...,0,0,0,0,0,0,0,0,0,0
5,5,Margarita Mama's,3.9,500,"[Italian, Pasta, Pizza]",5.7,3.99,10.0,10.0,10.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
363,363,Supporting the NHS,4.9,500,[British],1.6,0.00,10.0,10.0,10.0,...,0,0,0,0,0,0,0,0,0,0
364,364,The Real Eating Company,,,"[British, Breakfast, Coffee]",1.7,4.49,35.0,55.0,45.0,...,0,0,0,0,0,0,0,0,0,0
365,365,Valentinas Kitchen,4.3,108,"[Italian, Pasta, Salads, Dessert]",0.9,3.49,40.0,65.0,52.5,...,0,0,0,0,0,0,0,0,0,0
366,366,East Pearl,,,[Chinese],1.5,4.99,30.0,30.0,30.0,...,0,0,0,0,0,0,0,0,0,0
