## Final Zomato code for webscraping

In [26]:
import time                #for using sleep in order to load website
from selenium import webdriver    #webdriver for loading webpages
from selenium.webdriver.common.by import By   # for locating elements in the webpage
from selenium.common.exceptions import NoSuchElementException # For handling exceptions
import pandas as pd # Pandas data manipulation library for storing data as CSV file

driver = webdriver.Firefox()  #instance of Firefox WebDriver is created
# driver.maximize_window() #maximize the window size

##### Web scrapper for infinite scrolling page #####
driver.get("https://www.zomato.com/coimbatore")    #The driver.get method will navigate to a page given by the URL
time.sleep(2)  # Allow 2 seconds for the web page to open
scroll_pause_time = 1 # You can set your own pause time. My laptop is a bit slow so I use 1 sec
screen_height = driver.execute_script("return window.screen.height;")   # get the screen height of the web
i = 1

while True:
    # scroll one screen height each time
    driver.execute_script(f"window.scrollTo(0, {screen_height}*{i});")  
    i += 1
    time.sleep(scroll_pause_time)
    # update scroll height each time after scrolled, as the scroll height can change after we scrolled the page
    scroll_height = driver.execute_script("return document.body.scrollHeight;")  
    # Break the loop when the height we need to scroll to is larger than the total scroll height
    if (screen_height) * i > scroll_height:
        break


### Extract restaurant details once we reach the end of dynamic Zomato webpage ####
rest_list = []  ## Restaurant list variable to which dictionary of each restaurant details will be appended

row = 10 # row variable for iterating Restaurant elements XPATH (***This has to checked manually in the webpage manually before running this script since it will also change dynamically sometimes***)
column = 1 # column variable for iterating Restaurant elements XPATH

while True:  #infinite loop initiation
# Restaurants are diplayed in the website similar to (n-rows x 3-columns) matrix. So basically we are indexing the XPATH link using row and column cariable.
    if column > 3:   #Since 3 restaurants are displayed in 1 row, this condition is necessary to restrict the column within 3.
        row += 1     # Row is also incremented to 1 to move on to the next row when column variable completed 3.
        column = 1   # we also need the reset the column back to 1 

    try:
        rest_name = driver.find_element(By.XPATH, f"/html/body/div[1]/div/div[{row}]/div/div[{column}]/div/div/a[2]/div[1]/h4").text  #Returns retaurant name from corresponding XPATH element 
    except NoSuchElementException:  #If the XPATH isn't found this exception is raised and handled
        rest_name = None

    try:
        rating = driver.find_element(By.XPATH, f"/html/body/div[1]/div/div[{row}]/div/div[{column}]/div/div/a[2]/div[1]/div/div/div/div/div/div[1]").text #Returns retaurant rating from corresponding XPATH element
    except NoSuchElementException: #If the XPATH isn't found this exception is raised and handled
        rating = None

    try:
        offer_ = driver.find_elements(By.XPATH, f"/html/body/div[1]/div/div[{row}]/div/div[{column}]/div/div/a[1]/div[3]/p") ##Returns list of Offers from corresponding XPATH elements
        if len(offer_) > 1:  #checking the length if there are any multiple offers
            pro_offer = offer_[0].text #Zomato offer for only pro users
            offer = offer_[1].text # Zomato offer for all users
        else:                      
            pro_offer = None
            offer = driver.find_element(By.XPATH, f"/html/body/div[1]/div/div[{row}]/div/div[{column}]/div/div/a[1]/div[3]/p").text #Returns offer from corresponding XPATH element
    except NoSuchElementException: #If the XPATH isn't found this exception is raised and handled
        pro_offer = None
        offer = None

    try:
        del_time = driver.find_element(By.XPATH, f"/html/body/div[1]/div/div[{row}]/div/div[{column}]/div/div/a[1]/p").text #Returns delivery time from corresponding XPATH element
    except NoSuchElementException: #If the XPATH isn't found this exception is raised and handled
        del_time = None

    try:
        category = driver.find_element(By.XPATH,f"/html/body/div[1]/div/div[{row}]/div/div[{column}]/div/div/a[2]/div[2]/p[1]").text #Returns restaurant category from corresponding XPATH element
    except NoSuchElementException:  #If the XPATH isn't found this exception is raised and handled
        category = None

    try:
        cfo = driver.find_element(By.XPATH,f"/html/body/div[1]/div/div[{row}]/div/div[{column}]/div/div/a[2]/div[2]/p[2]").text #Returns *cost for one* from corresponding XPATH element
    except NoSuchElementException: #If the XPATH isn't found this exception is raised and handled
        cfo = None

    try:
        roc = driver.find_element(By.XPATH,f"/html/body/div[1]/div/div[{row}]/div/div[{column}]/div/div/a[2]/div[4]/div/div/div/p").text #Returns *recent orders count* from corresponding XPATH element
    except NoSuchElementException:  #If the XPATH isn't found this exception is raised and handled
        roc = None

    if (rest_name or rating or offer or del_time or category or cfo or roc) == None:     #This is the infinite loop breaker. When all the variables is none, loop breaks
        break


    rest_item = {                         #dictionary for storing the restaurant feature variables
        'rest_name' : rest_name,
        'rating' : rating,
        'pro_offer': pro_offer,
        'offer' : offer,
        'del_time' : del_time,
        'category' : category,
        'cost_for_one' : cfo,
        'recent_orders_count': roc
    }

    rest_list.append(rest_item)     #dictionsry will be appended to this rest_list

    column += 1 # column is incremented
    
df = pd.DataFrame(rest_list)    #dataframe creation from the list
df.to_csv("Zomato_rest_df.csv", index = False) #Saving as CSV file

In [27]:
df

Unnamed: 0,rest_name,rating,pro_offer,offer,del_time,category,cost_for_one,recent_orders_count
0,Taco Bell,3.8,,60% OFF,35 min,"Mexican, Fast Food, Wraps, Beverages",₹150 for one,1200+ orders placed from here recently
1,HMR - Grand Kitchen,4.1,,40% OFF,29 min,"South Indian, Hyderabadi, Chinese, Biryani, De...",₹150 for one,4125+ orders placed from here recently
2,Barbeque Nation,3.9,,50% OFF,45 min,"BBQ, Chinese, North Indian, Beverages, Desserts",₹150 for one,550+ orders placed from here recently
3,Sree Annapoorna Sree Gowrishankar,4.2,,20% OFF,29 min,"South Indian, North Indian, Chinese, Beverages...",₹150 for one,5150+ orders placed from here recently
4,China Valley Chinese Restaurant,3.7,Pro extra 15% OFF,50% OFF,38 min,"Chinese, Seafood",₹150 for one,1675+ orders placed from here recently
...,...,...,...,...,...,...,...,...
565,Little Owl's Cafe,-,,,,"Fast Food, Desserts, Beverages, Shake",₹100 for one,
566,Zwarma,3.9,,20% OFF,,"Lebanese, BBQ",₹100 for one,75+ orders placed from here recently
567,Gt 12 To 12,3.2,,50% OFF,41 min,"Fast Food, Mughlai, Lebanese, Chinese",₹100 for one,225+ orders placed from here recently
568,A.R.C Sasi's Chat & More,4.0,,,,Fast Food,₹100 for one,300+ orders placed from here recently


In [28]:
df.to_csv("rest_df_12_10_PM.csv", index = False)

## Converting into scalable code using classes

In [1]:
import time         #for using sleep in order to load website
from selenium import webdriver    #webdriver for loading webpages
from selenium.webdriver.common.by import By   # for locating elements in the webpage
from selenium.common.exceptions import NoSuchElementException # For handling exceptions
import pandas as pd # Pandas data manipulation library for storing data as CSV file


class zomato_scrap:
    def __init__(self, city) -> None:
        self.city = city
        

    def open_browser(self):
        self.driver = webdriver.Firefox()  #instance of Firefox WebDriver is created
        
    def load_url(self):
        self.open_browser()
        self.driver.get(f"https://www.zomato.com/{self.city}")    #The driver.get method will navigate to a page given by the URL
        time.sleep(2)  # Allow 2 seconds for the web page to open

    def infinite_scroll(self):
        self.load_url()
        scroll_pause_time = 1 # You can set your own pause time. My laptop is a bit slow so I use 1 sec
        screen_height = self.driver.execute_script("return window.screen.height;")   # get the screen height of the web
        i = 1
        ##### Web scrapper for infinite scrolling page #####
        while True:
            # scroll one screen height each time
            self.driver.execute_script(f"window.scrollTo(0, {screen_height}*{i});")  
            i += 1
            time.sleep(scroll_pause_time)
            # update scroll height each time after scrolled, as the scroll height can change after we scrolled the page
            scroll_height = self.driver.execute_script("return document.body.scrollHeight;")  
            # Break the loop when the height we need to scroll to is larger than the total scroll height
            if (screen_height) * i > scroll_height:
                break

    def extract_data(self):
        self.infinite_scroll()
        ### Extract restaurant details once we reach the end of dynamic Zomato webpage ####
        rest_list = []  ## Restaurant list variable to which dictionary of each restaurant details will be appended

        row = 10 # row variable for iterating Restaurant elements XPATH (***This has to checked manually in the webpage manually before running this script since it will also change dynamically sometimes***)
        column = 1 # column variable for iterating Restaurant elements XPATH

        while True:  #infinite loop initiation
            # Restaurants are diplayed in the website similar to (n-rows x 3-columns) matrix. So basically we are indexing the XPATH link using row and column cariable.
            if column > 3:   #Since 3 restaurants are displayed in 1 row, this condition is necessary to restrict the column within 3.
                row += 1     # Row is also incremented to 1 to move on to the next row when column variable completed 3.
                column = 1   # we also need the reset the column back to 1 

            try:
                rest_name = self.driver.find_element(By.XPATH, f"/html/body/div[1]/div/div[{row}]/div/div[{column}]/div/div/a[2]/div[1]/h4").text  #Returns retaurant name from corresponding XPATH element 
            except NoSuchElementException:  #If the XPATH isn't found this exception is raised and handled
                rest_name = None

            try:
                rating = self.driver.find_element(By.XPATH, f"/html/body/div[1]/div/div[{row}]/div/div[{column}]/div/div/a[2]/div[1]/div/div/div/div/div/div[1]").text #Returns retaurant rating from corresponding XPATH element
            except NoSuchElementException: #If the XPATH isn't found this exception is raised and handled
                rating = None

            try:
                offer_ = self.driver.find_elements(By.XPATH, f"/html/body/div[1]/div/div[{row}]/div/div[{column}]/div/div/a[1]/div[3]/p") ##Returns list of Offers from corresponding XPATH elements
                if len(offer_) > 1:  #checking the length if there are any multiple offers
                    pro_offer = offer_[0].text #Zomato offer for only pro users
                    offer = offer_[1].text # Zomato offer for all users
                else:                      
                    pro_offer = None
                    offer = self.driver.find_element(By.XPATH, f"/html/body/div[1]/div/div[{row}]/div/div[{column}]/div/div/a[1]/div[3]/p").text #Returns offer from corresponding XPATH element
            except NoSuchElementException: #If the XPATH isn't found this exception is raised and handled
                pro_offer = None
                offer = None

            try:
                del_time = self.driver.find_element(By.XPATH, f"/html/body/div[1]/div/div[{row}]/div/div[{column}]/div/div/a[1]/p").text #Returns delivery time from corresponding XPATH element
            except NoSuchElementException: #If the XPATH isn't found this exception is raised and handled
                del_time = None

            try:
                category = self.driver.find_element(By.XPATH,f"/html/body/div[1]/div/div[{row}]/div/div[{column}]/div/div/a[2]/div[2]/p[1]").text #Returns restaurant category from corresponding XPATH element
            except NoSuchElementException:  #If the XPATH isn't found this exception is raised and handled
                category = None

            try:
                cfo = self.driver.find_element(By.XPATH,f"/html/body/div[1]/div/div[{row}]/div/div[{column}]/div/div/a[2]/div[2]/p[2]").text #Returns *cost for one* from corresponding XPATH element
            except NoSuchElementException: #If the XPATH isn't found this exception is raised and handled
                cfo = None

            try:
                roc = self.driver.find_element(By.XPATH,f"/html/body/div[1]/div/div[{row}]/div/div[{column}]/div/div/a[2]/div[4]/div/div/div/p").text #Returns *recent orders count* from corresponding XPATH element
            except NoSuchElementException:  #If the XPATH isn't found this exception is raised and handled
                roc = None

            if (rest_name or rating or offer or del_time or category or cfo or roc) == None:     #This is the infinite loop breaker. When all the variables is none, loop breaks
                break


            rest_item = {                         #dictionary for storing the restaurant feature variables
                'rest_name' : rest_name,
                'rating' : rating,
                'pro_offer': pro_offer,
                'offer' : offer,
                'del_time' : del_time,
                'category' : category,
                'cost_for_one' : cfo,
                'recent_orders_count': roc
            }

            rest_list.append(rest_item)     #dictionsry will be appended to this rest_list

            column += 1 # column is incremented

        return rest_list    


    def convert_csv(self):
        df = pd.DataFrame(self.extract_data())    #dataframe creation from the list
        df.to_csv(f"Zomato_rest_{self.city}_df.csv", index = False) #Saving as CSV file


covai = zomato_scrap("coimbatore")
covai.convert_csv()   


### Restaurant coordinates extraction

In [25]:
import time         #for using sleep in order to load website
from selenium import webdriver    #webdriver for loading webpages
from selenium.webdriver.common.by import By   # for locating elements in the webpage
from selenium.common.exceptions import NoSuchElementException # For handling exceptions
import pandas as pd # Pandas data manipulation library for storing data as CSV file
from datetime import datetime
from pytz import timezone 


driver = webdriver.Firefox()
url = "https://www.zomato.com/coimbatore"
driver.get(url)  #The driver.get method will navigate to a page given by the URL
time.sleep(2)  # Allow 2 seconds for the web page to open

scroll_pause_time = 1 # You can set your own pause time. My laptop is a bit slow so I use 1 sec
screen_height = driver.execute_script("return window.screen.height;")   # get the screen height of the web
i = 1

while True:
    # scroll one screen height each time
    driver.execute_script(f"window.scrollTo(0, {screen_height}*{i});")  
    i += 1
    time.sleep(scroll_pause_time)
    # update scroll height each time after scrolled, as the scroll height can change after we scrolled the page
    scroll_height = driver.execute_script("return document.body.scrollHeight;")  
    # Break the loop when the height we need to scroll to is larger than the total scroll height
    if (screen_height) * i > scroll_height:
        break




In [26]:
### Extract restaurant details once we reach the end of dynamic Zomato webpage ####
restaurant_list = []  ## Restaurant list variable to which dictionary of each restaurant details will be appended

row = 10 # row variable for iterating Restaurant elements XPATH (***This has to checked manually in the webpage manually before running this script since it will also change dynamically sometimes***)
column = 1 # column variable for iterating Restaurant elements XPATH

while True:  #infinite loop initiation
# Restaurants are diplayed in the website similar to (n-rows x 3-columns) matrix. So basically we are indexing the XPATH link using row and column cariable.
    if column > 3:   #Since 3 restaurants are displayed in 1 row, this condition is necessary to restrict the column within 3.
        row += 1     # Row is also incremented to 1 to move on to the next row when column variable completed 3.
        column = 1   # we also need the reset the column back to 1 

    try:
        restaurant_name = driver.find_element(By.XPATH, f"/html/body/div[1]/div/div[{row}]/div/div[{column}]/div/div/a[2]/div[1]/h4").text  #Returns retaurant name from corresponding XPATH element 
    except NoSuchElementException:  #If the XPATH isn't found this exception is raised and handled
        restaurant_name = None

    try:
        href_element = driver.find_element(By.XPATH, f"/html/body/div[1]/div/div[{row}]/div/div[{column}]/div/div/a[2]")
        restaurant_url = href_element.get_attribute('href')
    except NoSuchElementException:
        restaurant_url = None

    
    if (restaurant_name or restaurant_url) == None:
        break
    
    restaurant_item =  {'rest_name': restaurant_name,
                        'restaurant_url': restaurant_url}

    restaurant_list.append(restaurant_item)
    column += 1

df = pd.DataFrame(restaurant_list)    

In [50]:
df.head()

Unnamed: 0,rest_name,restaurant_url,location_url
0,Barbeque Nation,https://www.zomato.com/coimbatore/barbeque-nat...,https://www.google.com/maps/dir/?api=1&destina...
1,HMR - Grand Kitchen,https://www.zomato.com/coimbatore/hmr-grand-ki...,https://www.google.com/maps/dir/?api=1&destina...
2,China Valley Chinese Restaurant,https://www.zomato.com/coimbatore/china-valley...,https://www.google.com/maps/dir/?api=1&destina...
3,Sree Annapoorna Sree Gowrishankar,https://www.zomato.com/coimbatore/sree-annapoo...,https://www.google.com/maps/dir/?api=1&destina...
4,Richy Rich,https://www.zomato.com/coimbatore/richy-rich-r...,https://www.google.com/maps/dir/?api=1&destina...


In [28]:
map_url = []

for url in df['restaurant_url']:
    driver.get(url)
    time.sleep(2)
    try:
        direction_element = driver.find_element(By.XPATH, f"/html/body/div[1]/div/main/div/section[3]/div[1]/section/a")
        location_url = direction_element.get_attribute('href')
    except:
        location_url = None
    
    map_url.append(location_url)

In [30]:
df['location_url'] = map_url

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 535 entries, 0 to 534
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   rest_name       535 non-null    object
 1   restaurant_url  535 non-null    object
 2   location_url    534 non-null    object
dtypes: object(3)
memory usage: 12.7+ KB


In [51]:
df = df.dropna()

In [52]:
df["location_url"][2]

'https://www.google.com/maps/dir/?api=1&destination=10.9981130168,76.9825409353'

In [55]:
def url_to_gps(location_url):
    coord = location_url.split('=')[2].split(',')
    lat = coord[0]
    long = coord[1]
    return lat,long

In [57]:
df['lat'],df['long'] = np.vectorize(url_to_gps)(df["location_url"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [60]:
df = df.drop(['restaurant_url', 'location_url'], axis = 1)

In [63]:
df.to_csv('zomato_restaurant_gps_coordinates.csv',index=False)

https://www.zomato.com/coimbatore/china-valley-chinese-restaurant-ramanathapuram/order
https://www.zomato.com/coimbatore/hmr-grand-kitchen-gandhipuram/order
https://www.zomato.com/coimbatore/hangout-saibaba-colony/order
https://www.zomato.com/coimbatore/sree-annapoorna-sree-gowrishankar-1-gandhipuram/order
https://www.zomato.com/coimbatore/bakingo-saibaba-colony/order
https://www.zomato.com/coimbatore/sree-subbu-mess-gandhipuram/order
https://www.zomato.com/coimbatore/nin-hao-saibaba-colony/order
https://www.zomato.com/coimbatore/abitha-biriyani-hotel-ukkadam/order
https://www.zomato.com/coimbatore/cafe-quilon-ramanathapuram/order
https://www.zomato.com/coimbatore/haribhavanam-gandhipuram/order
https://www.zomato.com/coimbatore/haribhavanam-peelamedu/order
https://www.zomato.com/coimbatore/hotel-maa-race-course/order
https://www.zomato.com/coimbatore/ss-hotel-gandhipuram/order
https://www.zomato.com/coimbatore/hotel-chola-1-gandhipuram/order
https://www.zomato.com/coimbatore/shree-anan

In [46]:
import numpy as np
pd.DataFrame(
    np.random.randn(1000, 2) / [50, 50] + [37.76, -122.4],
    columns=['lat', 'lon'])

Unnamed: 0,lat,lon
0,37.783697,-122.425911
1,37.750346,-122.422913
2,37.762464,-122.355515
3,37.747327,-122.414125
4,37.785296,-122.394791
...,...,...
995,37.741782,-122.404050
996,37.750200,-122.416463
997,37.738763,-122.371150
998,37.800133,-122.390398


In [47]:
np.random.randn(1000, 2) / [50, 50] + [37.76, -122.4]

array([[  37.7510167 , -122.40383203],
       [  37.76164541, -122.42768226],
       [  37.75976577, -122.40345264],
       ...,
       [  37.77018811, -122.38357172],
       [  37.74881518, -122.41823268],
       [  37.80165202, -122.39253011]])