In [126]:
# Import BeautifulSoup to parse the html and urlopen to req html from a site
from bs4 import BeautifulSoup
from urllib.request import urlopen
from selenium import webdriver
import time
import re
import pandas as pd
import numpy as np
from pandasql import sqldf
import emoji

In [11]:
def index_first_number(string):
    while True: 
        for i in range(0, len(string)):
            if string[i].isdigit():
                return string.find(string[i])
        return 'None'

In [2]:
# Setup the parameters for url, chromedriver path and how long it takes your device to render the updated restaurants
postcode = 'EC1R0HX'
url = f'https://deliveroo.co.uk/restaurants/london/clerkenwell?postcode={postcode}&collection=all-restaurants'
chrome_driver_path = r'/home/selvino/chrome-driver/chromedriver'
initial_load_time = 2
scroll_pause_time = 1.5

In [3]:
# Assign the webdriver to a variable which simulates an instance of Chrome on the url of choice
# The 2 seconds sleep gives the webpage a 2 second break to load (can be changed)
# Get the height of the devices screen to know how much needs to be scrolled each time
# Tell the browser to click on the accept cookies button and the £10 off button

driver = webdriver.Chrome(executable_path=chrome_driver_path)
driver.get(url)
time.sleep(initial_load_time)
screen_height = driver.execute_script('return window.screen.height;')
driver.execute_script('document.querySelector(".accept-cookies-button").click()')
driver.execute_script('document.querySelectorAll(".ccl-d0484b0360a2b432")[1].click()')

In [4]:
# Create a whie loop that scrolls through the website. This is done so the full infinite scrolling webpage renders and is in the html source

i = 1
while True:
    driver.execute_script("window.scrollTo(0, {screen_height} * {i});".format(screen_height=screen_height, i = i))
    i += 1
    time.sleep(scroll_pause_time)
    scroll_height = driver.execute_script("return document.body.scrollHeight")
    if screen_height * i > scroll_height: 
        break

In [5]:
# Create a beautiful soup object that parses the html and has useful methods for reading the text
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [158]:
# Need to loop through the text on the source web page and make a list of all the restaurants' data
# The identifier variables are used to know where in the webpage to start looking and what seperates restaurants

start_identifier = 'restaurants'
restaurant_identifier= 'miles away'
restaurant_identifier_2 = 'mile away'

start_point = soup.get_text().find(start_identifier) + len(start_identifier)
useful_text = soup.get_text()[start_point:]
restaurants = []
search = True
search_start = 0
id = 0
while search:
    end_index = useful_text.find(restaurant_identifier, search_start)
    end_index2 = useful_text.find(restaurant_identifier_2, search_start)
    if (end_index == -1) & (end_index2 == -1):
        break
    else:
        if (end_index > end_index2) & (end_index2 != -1):
            end_index = end_index2
        
        search_end = end_index + len(restaurant_identifier)
        restaurant_data = useful_text[search_start : search_end]
        restaurant_dict = {}
        
        #Extract Cuisines and Distance
        cuisines_raw = restaurant_data.split('·')
        cuisine_list = [x for x in cuisines_raw if len(x) > 1]
        if len(cuisine_list) > 1: 
            raw_last_cuisine = cuisine_list[-1]
            last_cuisine = raw_last_cuisine[:index_first_number(raw_last_cuisine)]
            distance = raw_last_cuisine[index_first_number(raw_last_cuisine):]
            cuisine_list = cuisine_list[1:-1]
            cuisine_list.append(last_cuisine)
            
            restaurant_dict['Cuisines'] = cuisine_list
            restaurant_dict['Distance'] = distance
        
        #Extract Ratings
        if len(cuisines_raw) > 1:
            raw_rating = cuisines_raw[0]
            seperated_rating = raw_rating.split('.')
            if len(seperated_rating) > 1:
                rating = seperated_rating[0][-1] + '.' + seperated_rating[1]
                restaurant_dict['Rating'] = rating
        
        #Extract Name and Delivery Time
        if len(cuisine_list) > 1:
            raw_name = seperated_rating[0][:-1]
            split_raw_name = raw_name.split('–')[-1][6:]
            if len(split_raw_name) > 1:
                restaurant_dict['Name'] = split_raw_name
                restaurant_dict['Delivery Time'] = seperated_rating[0].split(r'\n')
                restaurant_dict['Restaurant ID'] = id

            
        if restaurant_dict:
            restaurants.append(restaurant_dict)
            id += 1
        search_start = search_end + 1
        

In [88]:
restaurants

[{'Cuisines': ['Mexican', 'Burritos', 'Salads', 'Chicken'],
  'Distance': '0.7 miles away',
  'Rating': '4.7 Excellent(500+)',
  'Name': 'Chipotle Mexican Grill',
  'Delivery Time': ['Free\ndelivery10 – 20minChipotle Mexican Grill4'],
  'Restaurant ID': 0},
 {'Cuisines': ['Salads', 'Poke', 'Hawaiian', 'Seafood'],
  'Distance': '0.7 miles away',
  'Rating': '4.8 Excellent(500+)',
  'Name': '🏄Honi Poke - Angel🏄',
  'Delivery Time': ['Free deliveryFree\ndelivery10 – 20min🏄Honi Poke - Angel🏄4'],
  'Restaurant ID': 1},
 {'Cuisines': ['American', 'Burgers', 'Milkshakes'],
  'Distance': '0.7 miles away',
  'Rating': '4.7 Excellent(500+)',
  'Name': 'Five Guys - Burger and Fries',
  'Delivery Time': ['Free deliveryFree\ndelivery15 – 30minFive Guys - Burger and Fries4'],
  'Restaurant ID': 2},
 {'Cuisines': ['German', 'Kebab'],
  'Distance': '0.4 miles away',
  'Rating': '4.6 Excellent(500+)',
  'Name': 'German Doner Kebab',
  'Delivery Time': ['Free deliveryFree\ndelivery10 – 20minGerman Doner

In [159]:
df = pd.DataFrame.from_dict(restaurants)

In [160]:
columns = ['Restaurant ID', 'Name', 'Cuisines','Rating', 'Distance']
restaurant_df = df[columns]

In [91]:
restaurant_df

Unnamed: 0,Restaurant ID,Name,Cuisines,Rating,Distance
0,0.0,Chipotle Mexican Grill,"[Mexican, Burritos, Salads, Chicken]",4.7 Excellent(500+),0.7 miles away
1,1.0,🏄Honi Poke - Angel🏄,"[Salads, Poke, Hawaiian, Seafood]",4.8 Excellent(500+),0.7 miles away
2,2.0,Five Guys - Burger and Fries,"[American, Burgers, Milkshakes]",4.7 Excellent(500+),0.7 miles away
3,3.0,German Doner Kebab,"[German, Kebab]",4.6 Excellent(500+),0.4 miles away
4,4.0,atis,"[Salads, Chicken, Healthy]",4.8 Excellent(500+),0.8 miles away
...,...,...,...,...,...
952,952.0,eliveryFree\ndelivery11:30 - 12:00TomorrowThe ...,"[British, Salads, Burgers, Chicken]",4.5 Excellent(103),1.9 miles away
953,953.0,eliveryFree\ndelivery11:30 - 12:00TomorrowHonm...,"[Chicken, Sushi]",,1.5 miles away
954,954.0,eliverySpecial OfferFree\ndelivery11:30 - 12:0...,"[Curry, Dessert]",,2.3 miles away
955,955.0,elivery45%\noff11:45 - 12:15TomorrowThe Big Ki...,"[Japanese, Dessert, Sandwiches, Burgers]",4.3 Very Good(300),2.3 miles away


In [161]:
restaurant_df['main_cuisine'] = restaurant_df['Cuisines'].map(lambda x: x[0]) 

In [162]:
restaurant_df_clean = restaurant_df.dropna()

In [163]:
def find_rating_number(rating):
    try:
        return rating.split(' ')[0]
    except:
        return None
        

In [164]:
def remove_emoji(text):
    remove_emoji = emoji.demojize(text).split(':person_surfing:')
    clean_list = [x for x in remove_emoji if len(x) > 1]
    return ''.join(clean_list)

In [165]:
restaurant_df_clean['rating_number'] = restaurant_df_clean['Rating'].map(lambda x: find_rating_number(x))
restaurant_df_clean['rating_text'] = restaurant_df_clean['Rating'].map(lambda x: x.split('(')[0].split(' ')[-1])
restaurant_df_clean['rating_count'] = restaurant_df_clean['Rating'].map(lambda x: x.split('(')[-1][:-1])
restaurant_df_clean['distance_miles'] = restaurant_df_clean['Distance'].map(lambda x: x.split(' ')[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  restaurant_df_clean['rating_number'] = restaurant_df_clean['Rating'].map(lambda x: find_rating_number(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  restaurant_df_clean['rating_text'] = restaurant_df_clean['Rating'].map(lambda x: x.split('(')[0].split(' ')[-1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

In [166]:
restaurant_df_clean

Unnamed: 0,Restaurant ID,Name,Cuisines,Rating,Distance,main_cuisine,rating_number,rating_text,rating_count,distance_miles
0,0.0,Chipotle Mexican Grill,"[Mexican, Burritos, Salads, Chicken]",4.7 Excellent(500+),0.7 miles away,Mexican,4.7,Excellent,500+,0.7
1,1.0,🏄Honi Poke - Angel🏄,"[Salads, Poke, Hawaiian, Seafood]",4.8 Excellent(500+),0.7 miles away,Salads,4.8,Excellent,500+,0.7
2,2.0,Five Guys - Burger and Fries,"[American, Burgers, Milkshakes]",4.7 Excellent(500+),0.7 miles away,American,4.7,Excellent,500+,0.7
3,3.0,German Doner Kebab,"[German, Kebab]",4.6 Excellent(500+),0.4 miles away,German,4.6,Excellent,500+,0.4
4,4.0,atis,"[Salads, Chicken, Healthy]",4.8 Excellent(500+),0.8 miles away,Salads,4.8,Excellent,500+,0.8
...,...,...,...,...,...,...,...,...,...,...
950,950.0,elivery25% off entire menu25%\noff11:30 - 12:0...,"[Asian, Sushi, Japanese, Poke]",4.8 Excellent(19),0.4 miles away,Asian,4.8,Excellent,19,0.4
951,951.0,"eliverySpend £35, get 25% offFree\ndelivery11:...","[American, Chicken, Dessert, Burgers]",4.1 Good(313),1.6 miles away,American,4.1,Good,313,1.6
952,952.0,eliveryFree\ndelivery11:30 - 12:00TomorrowThe ...,"[British, Salads, Burgers, Chicken]",4.5 Excellent(103),1.9 miles away,British,4.5,Excellent,103,1.9
955,955.0,elivery45%\noff11:45 - 12:15TomorrowThe Big Ki...,"[Japanese, Dessert, Sandwiches, Burgers]",4.3 Very Good(300),2.3 miles away,Japanese,4.3,Good,300,2.3


In [144]:
restaurant_df_clean['restaurant_name'] = restaurant_df_clean['Name'].map(lambda x: remove_emoji(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  restaurant_df_clean['restaurant_name'] = restaurant_df_clean['Name'].map(lambda x: remove_emoji(x))


In [150]:
restaurant_df_clean.drop('Name',axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [171]:
q = """
    SELECT
        main_cuisine,
        COUNT(Name)
    FROM
        df_test
    GROUP BY
        main_cuisine
"""
sqldf(q)

Unnamed: 0,main_cuisine,COUNT(Name)
0,Acai,2
1,Alcohol,29
2,American,78
3,Asian,18
4,Asian Fusion,1
...,...,...
68,Thai,34
69,Turkish,7
70,Vietnamese,8
71,Wine,2


In [167]:
df_test = restaurant_df_clean.applymap(str)

Unnamed: 0,Restaurant ID,Cuisines,Rating,Distance,main_cuisine,rating_number,rating_text,rating_count,distance_miles,restaurant_name
0,0.0,"['Mexican', 'Burritos', 'Salads', 'Chicken']",4.7 Excellent(500+),0.7 miles away,Mexican,4.7,Excellent,500+,0.7,Chipotle Mexican Grill
1,1.0,"['Salads', 'Poke', 'Hawaiian', 'Seafood']",4.8 Excellent(500+),0.7 miles away,Salads,4.8,Excellent,500+,0.7,Honi Poke - Angel
2,2.0,"['American', 'Burgers', 'Milkshakes']",4.7 Excellent(500+),0.7 miles away,American,4.7,Excellent,500+,0.7,Five Guys - Burger and Fries
3,3.0,"['German', 'Kebab']",4.6 Excellent(500+),0.4 miles away,German,4.6,Excellent,500+,0.4,German Doner Kebab
4,4.0,"['Salads', 'Chicken', 'Healthy']",4.8 Excellent(500+),0.8 miles away,Salads,4.8,Excellent,500+,0.8,atis
...,...,...,...,...,...,...,...,...,...,...
950,950.0,"['Asian', 'Sushi', 'Japanese', 'Poke']",4.8 Excellent(19),0.4 miles away,Asian,4.8,Excellent,19,0.4,elivery25% off entire menu25%\noff11:30 - 12:0...
951,951.0,"['American', 'Chicken', 'Dessert', 'Burgers']",4.1 Good(313),1.6 miles away,American,4.1,Good,313,1.6,"eliverySpend £35, get 25% offFree\ndelivery11:..."
952,952.0,"['British', 'Salads', 'Burgers', 'Chicken']",4.5 Excellent(103),1.9 miles away,British,4.5,Excellent,103,1.9,eliveryFree\ndelivery11:30 - 12:00TomorrowThe ...
955,955.0,"['Japanese', 'Dessert', 'Sandwiches', 'Burgers']",4.3 Very Good(300),2.3 miles away,Japanese,4.3,Good,300,2.3,elivery45%\noff11:45 - 12:15TomorrowThe Big Ki...
