In [None]:
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def start_driver(headless=True):
    """Create and return a Chrome WebDriver with a 1920x1080 window.

    Args:
        headless: When truthy, Chrome is launched with ``--headless=new``.

    Returns:
        The ``webdriver.Chrome`` instance. Nothing here quits it, so the
        caller is responsible for calling ``quit()``.
    """
    # Keep the same argument order as before: headless flag first, then size.
    chrome_args = ["--window-size=1920,1080"]
    if headless:
        chrome_args.insert(0, "--headless=new")

    opts = webdriver.ChromeOptions()
    for arg in chrome_args:
        opts.add_argument(arg)

    # ChromeDriverManager().install() supplies the chromedriver path for Service.
    chromedriver_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chromedriver_service, options=opts)


def _text_or_none(tag):
    """Return the stripped text of a parsed tag, or None when the tag is missing."""
    return tag.get_text(strip=True) if tag else None


def _fetch_reviews(driver, detail_url, max_reviews=5, min_length=10):
    """Load a detail page and return the text of its leading review blocks.

    Only the first ``max_reviews`` matching blocks are inspected, and a block
    is kept only when its text is longer than ``min_length`` characters.
    """
    driver.get(detail_url)
    time.sleep(1.5)  # fixed pause before reading the page source
    detail_soup = BeautifulSoup(driver.page_source, "html.parser")

    review_blocks = detail_soup.select(".rvs-inner, .reviewdata, .jrev")
    texts = (block.get_text(" ", strip=True) for block in review_blocks[:max_reviews])
    return [text for text in texts if len(text) > min_length]


def scrape_justdial(url, city_name, pages=3):
    """Scrape business listings (and a few reviews each) from listing pages.

    Page 1 is fetched from ``url`` itself; every later page is fetched from
    ``f"{url}-p{page}"``. For each card matched by ``.store-details`` or
    ``.cntanr`` the name, address and rating are read, then the card's first
    ``<a>`` link (if any) is opened to collect up to five review texts.

    Args:
        url: Listing URL of the first page.
        city_name: Label stored in the ``city`` column and used in log output.
        pages: Number of listing pages to visit, starting at page 1.

    Returns:
        A ``pandas.DataFrame`` with one row per card and the columns
        ``city``, ``business_name``, ``address``, ``rating`` and ``reviews``
        (a list of strings, empty when no reviews could be collected).

    Raises:
        Whatever the driver raises while loading a *listing* page. Failures
        on individual detail pages are reported and skipped instead.
    """
    from urllib.parse import urljoin, urlparse

    all_data = []
    driver = start_driver()
    try:
        for page in range(1, pages + 1):
            print(f"[{city_name}] Scraping page {page}...")

            final_url = url if page == 1 else f"{url}-p{page}"
            driver.get(final_url)
            time.sleep(2)

            # The listing is parsed into a static tree up front, so navigating
            # the driver to detail pages below does not invalidate the cards.
            soup = BeautifulSoup(driver.page_source, "html.parser")

            for card in soup.select(".store-details, .cntanr"):
                review_texts = []

                link_tag = card.select_one("a")
                href = link_tag.get("href") if link_tag else None
                if href:
                    # Resolve relative links against the listing page; an
                    # already-absolute href is returned unchanged by urljoin.
                    detail_url = urljoin(final_url, href)
                    # Skip non-web targets such as "javascript:" or "tel:".
                    if urlparse(detail_url).scheme in ("http", "https"):
                        try:
                            review_texts = _fetch_reviews(driver, detail_url)
                        except Exception as exc:
                            # Best effort: keep the listing row and report the
                            # failure. KeyboardInterrupt is not caught here,
                            # so the run can still be interrupted.
                            print(
                                f"[{city_name}] Could not load reviews from "
                                f"{detail_url}: {exc}"
                            )

                all_data.append({
                    "city": city_name,
                    "business_name": _text_or_none(card.select_one(".lng_cont_name")),
                    "address": _text_or_none(card.select_one(".cont_fl_addr")),
                    "rating": _text_or_none(card.select_one(".green-box")),
                    "reviews": review_texts,
                })
    finally:
        # Always shut the browser down, even when a page load raises.
        driver.quit()

    return pd.DataFrame(all_data)


In [None]:
# Restaurants: scrape 5 listing pages per city, merge the results, save to CSV.
url_ahm, url_ban = (
    "https://www.justdial.com/Ahmedabad/Restaurants",
    "https://www.justdial.com/Bangalore/Restaurants",
)

df_ahm = scrape_justdial(url=url_ahm, city_name="Ahmedabad", pages=5)
df_ban = scrape_justdial(url=url_ban, city_name="Bangalore", pages=5)

# Stack both cities into one frame with a fresh 0..n-1 index.
df_combined = pd.concat(objs=[df_ahm, df_ban], ignore_index=True)

df_combined.to_csv(
    "../Local Business Review Project NT/data/processed/restaurants_ahm_ban_combined.csv",
    index=False,
)

# Preview the first rows of the combined frame.
df_combined.head()


In [None]:
# Salons: scrape 5 listing pages per city, merge the results, save to CSV.
url_ahm, url_ban = (
    "https://www.justdial.com/Ahmedabad/Salons",
    "https://www.justdial.com/Bangalore/Salons",
)

df_ahm = scrape_justdial(url=url_ahm, city_name="Ahmedabad", pages=5)
df_ban = scrape_justdial(url=url_ban, city_name="Bangalore", pages=5)

# Stack both cities into one frame with a fresh 0..n-1 index.
df_combined = pd.concat(objs=[df_ahm, df_ban], ignore_index=True)

df_combined.to_csv(
    "../Local Business Review Project NT/data/processed/salons_ahm_ban_combined.csv",
    index=False,
)

# Preview the first rows of the combined frame.
df_combined.head()


In [None]:
# Gyms: scrape 5 listing pages per city, merge the results, save to CSV.
url_ahm, url_ban = (
    "https://www.justdial.com/Ahmedabad/Gyms",
    "https://www.justdial.com/Bangalore/Gyms",
)

df_ahm = scrape_justdial(url=url_ahm, city_name="Ahmedabad", pages=5)
df_ban = scrape_justdial(url=url_ban, city_name="Bangalore", pages=5)

# Stack both cities into one frame with a fresh 0..n-1 index.
df_combined = pd.concat(objs=[df_ahm, df_ban], ignore_index=True)

df_combined.to_csv(
    "../Local Business Review Project NT/data/processed/gyms_ahm_ban_combined.csv",
    index=False,
)

# Preview the first rows of the combined frame.
df_combined.head()
