In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import requests
import pandas as pd
import time
import os
from pathlib import Path

In [2]:
def scrape_page(url, headers):
    try:
        
        with requests.Session() as session:
            response = session.get(url, headers=headers)
            if response.status_code == 200:
                return response.content
            else:
                print("Failed to retrieve the page. Status code:", response.status_code)
                return None
    except Exception as e:
        print("An error occurred:", e)
        return None

In [3]:
def find_areas(links_df_path):
    links_df = pd.read_csv(links_df_path)    
    print(links_df.head())
    print(type(links_df))
    areas = []
    links_having_error =[]

    for row in links_df.iterrows():
            restaurant_link = row['restaurant_links']
            print(restaurant_link)
            try:
                driver = webdriver.Chrome()
                driver.get(restaurant_link)
                headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36' }
                content = scrape_page(restaurant_link, headers)
                soup = BeautifulSoup(content, 'html.parser')
                name = soup.find('h1',class_="sc-7kepeu-0 sc-iSDuPN fwzNdh").text.strip()

                cuisine_s = soup.find('div', class_="sc-fgfRvd gBMRZZ").find_all('span')
                cuisine = [span.text.strip() for span in cuisine_s[::2]]
                cuisine = [cos[:-1] for cos in cuisine]
                cuisine.append(cuisine_s[-1].text.strip())
                cuisine = ', '.join(map(str, cuisine))

                wait = WebDriverWait(driver, 10)
                wait.until(EC.presence_of_element_located((By.XPATH, "//a[@class='sc-clNaTc vNCcy']")))
                area_element = driver.find_element(By.XPATH, "//a[@class='sc-clNaTc vNCcy']")
                area = area_element.text.split(',')[-2]

                sub = restaurant_link[:-6]
                sub_driver = webdriver.Chrome()
                sub_driver.get(sub)

                address=sub_driver.find_elements(By.XPATH,"//p[contains(@class,'clKRrC')]")
                addresstext=[a1.text for a1 in address]
                addresstext=str(addresstext[0])

                for_call_loc = sub_driver.find_elements(By.XPATH, "//p[contains(@class, 'fanwIZ')]")
                calls = [element.text for element in for_call_loc]
                if len(calls) == 1:
                    calls=str(calls[0])
                    calls="'"+ calls
                else:
                    calls=', '.join(map(str, calls))

                areas.append({        
                    'Restaurant URL': restaurant_link,
                    'Name' : name,
                    'Address': addresstext,
                    'Cuisine' : cuisine, 
                    'Area': area,
                    'Contact Number' : calls
                })

                driver.quit()

            except Exception as e:
                print(f"Error processing restaurant link: {restaurant_link}, Error: {str(e)}")
                links_having_error.append({
                    'URL' : restaurant_link,
                    'Error' : str(e)
                })
                try:
                    driver.quit()
                except:
                    pass
                continue 

    error_df = pd.DataFrame(links_having_error)
    error_df.to_csv('errors.csv')
    areas_df = pd.DataFrame(areas)

    grouped = areas_df.groupby('Area')

    for area, group in grouped:
        area_filename = f'zomato_links_{links_df_path[:-4]}{area}.csv'
        group.to_csv(area_filename, index=False)
        print(f"Links for area '{area}' have been saved to {area_filename}")

In [5]:
find_areas('cities\pune\pune.csv')

                                    restaurant_links
0  https://www.zomato.com/pune/pizza-hut-shukrawa...
1  https://www.zomato.com/pune/kayani-bakery-east...
2  https://www.zomato.com/pune/wadeshwar-fc-road/...
3  https://www.zomato.com/pune/joshi-wadewale-shi...
4  https://www.zomato.com/pune/burger-king-1-sena...
<class 'pandas.core.frame.DataFrame'>
https://www.zomato.com/pune/pizza-hut-shukrawar-peth/order
https://www.zomato.com/pune/kayani-bakery-east-street/order
https://www.zomato.com/pune/wadeshwar-fc-road/order
https://www.zomato.com/pune/joshi-wadewale-shivaji-nagar/order
https://www.zomato.com/pune/burger-king-1-senapati-bapat-road/order
https://www.zomato.com/pune/anna-fc-road/order
https://www.zomato.com/pune/naadbramha-idli-shaniwar-peth/order
https://www.zomato.com/pune/oye-kiddan-kothrud/order
https://www.zomato.com/pune/wendys-burgers-karve-nagar/order
Links for area '['East Street', ' Pune']' have been saved to zomato_links_ Pune_East Street.csv
Links for area '['FC 

In [4]:
# Read the input CSV file and limit to first 2 rows for testing
df = pd.read_csv('all_cities_link.csv')
df = df.loc[2:3]

# Iterate over each link in the DataFrame
for index, row in df.iterrows():
    try:
        all_restaurant_links = []
        temp = row['URLs'][23:-1]
        os.makedirs(f"cities/{temp}", exist_ok=True)
        file_name = 'cities/'+temp+'/'+temp + '.csv'

        print(file_name)
        link = row['URLs'] + 'delivery'
        print(f"Processing link: {link}")
        
        # Initialize the WebDriver
        driver = webdriver.Chrome()
        driver.get(link)
        
        # Scroll down to the bottom of the page to load all content
        scroll_pause_time = 3
        last_height = driver.execute_script("return document.body.scrollHeight")

        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause_time)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Wait until the elements are present
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_all_elements_located((By.XPATH, "//a[contains(@href, '/order')]")))

        # Find elements and extract the links
        elements = driver.find_elements(By.XPATH, "//a[contains(@href, '/order')]")
        restaurant_links = [element.get_attribute('href') for element in elements]
        restaurant_links = restaurant_links[:len(restaurant_links):2]

        # Append each link to the list
        all_restaurant_links.extend(restaurant_links)
        print(f"Found {len(restaurant_links)} links for {link}")
        
        # Create a DataFrame for the current city with each link in a separate row
        df_each = pd.DataFrame(all_restaurant_links, columns=['restaurant_links'])
        df_each.to_csv(file_name, index=False)
        print(f"Links saved to {file_name}")
        find_areas(f'{file_name}')

        # Quit the driver
        driver.quit()

    except Exception as e:
        print(f"Error processing link: {link}, Error: {str(e)}")
        try:
            driver.quit()
        except:
            pass
        continue  

print("Processing complete.")


cities/mumbai/mumbai.csv
Processing link: https://www.zomato.com/mumbai/delivery
Found 9 links for https://www.zomato.com/mumbai/delivery
Links saved to cities/mumbai/mumbai.csv
                                    restaurant_links
0  https://www.zomato.com/mumbai/pizza-hut-dadar-...
1  https://www.zomato.com/mumbai/mcdonalds-dadar-...
2  https://www.zomato.com/mumbai/theobroma-matung...
3         https://www.zomato.com/SubwayMatunga/order
4  https://www.zomato.com/mumbai/burger-king-dada...
<class 'pandas.core.frame.DataFrame'>
https://www.zomato.com/mumbai/pizza-hut-dadar-west/order
https://www.zomato.com/mumbai/mcdonalds-dadar-west/order
Error processing restaurant link: https://www.zomato.com/mumbai/mcdonalds-dadar-west/order, Error: Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=125.0.6422.77)
Stacktrace:
	GetHandleVerifier [0x00007FF60A331F22+60322]
	(No symbol) [0x00007FF60A2ACE99]
	(No symbol) [0x00007FF60A167EBA]
	(No symbol) [0x00007FF60A15FEA2]