In [2]:
import os
import json
import time
import logging
import numpy as np
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromiumService
from selenium.common.exceptions import TimeoutException, WebDriverException
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

In [3]:
# Browser-like request headers (Chromium 136 / Brave on Windows, per the
# sec-ch-ua and user-agent values below).
# NOTE(review): the cookie value is hardcoded in the notebook; consider loading
# it from the environment instead of committing it.
# NOTE(review): in the cells visible here nothing reads this dict --
# scrape_sites builds its own local `headers`. Confirm it is still needed.
headers = {
    # Content negotiation
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate, br, zstd',
    'accept-language': 'en-US,en;q=0.6',
    'cache-control': 'no-cache',
    'content-type': 'application/x-protobuf',
    # Session / origin
    'cookie': '_GRECAPTCHA=09AMNxLB8XaXxNkOyNW3EHzAfcM_Xw2f03cxPCxNgyj0fitf-42B0ttj0FevJvOQ5MTQz-HK9O_0SFadw2i9JYoBE',
    'origin': 'https://www.google.com',
    'pragma': 'no-cache',
    # Client hints
    'sec-ch-ua': '"Chromium";v="136", "Brave";v="136", "Not.A/Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    # Fetch metadata
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-storage-access': 'none',
    'sec-gpc': '1',
    # Browser identity
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
}

<!-- Jio -->

In [9]:
import os
import re
import time
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

def is_valid_url(url, base_domain):
    """
    Check if the URL is an http(s) URL on the base domain or one of its subdomains.

    The comparison is done on the parsed hostname (no port, no userinfo,
    case-insensitive) and on a label boundary, so ``evilcardekho.com`` does
    not match ``cardekho.com`` while ``www.cardekho.com:8443`` does.

    Args:
        url: Absolute URL to test.
        base_domain: Registered domain to stay within, e.g. ``"cardekho.com"``.
            A leading dot is ignored. An empty value places no restriction on
            the host (any http/https URL is accepted), as before.

    Returns:
        True when the URL is acceptable, False otherwise (including when the
        URL or domain cannot be parsed).
    """
    try:
        parsed = urlparse(url)
        if parsed.scheme not in ('http', 'https'):
            return False

        domain = base_domain.lower().lstrip('.')
        if not domain:
            return True

        # hostname (unlike netloc) excludes any port / credentials.
        host = (parsed.hostname or '').lower()
    except (ValueError, TypeError, AttributeError):
        # Malformed URL (e.g. unbalanced IPv6 brackets) or non-string input.
        return False

    # Exact match, or a suffix match that starts at a '.' label boundary.
    return host == domain or host.endswith('.' + domain)


def clean_text(row_text: str) -> str:
    """Collapse every whitespace run to one space, trim both ends, and lowercase."""
    return re.sub(r'\s+', ' ', row_text).strip().lower()

def _unique_txt_name(page_title, used_names):
    """Return a filesystem-safe ``.txt`` name for a title, unique within ``used_names``."""
    stem = "".join(c if c.isalnum() else "_" for c in page_title)[:100] or "NoTitle"
    candidate = f"{stem}.txt"
    counter = 2
    # Compare case-insensitively so names cannot collide on case-insensitive filesystems.
    while candidate.lower() in used_names:
        candidate = f"{stem}_{counter}.txt"
        counter += 1
    used_names.add(candidate.lower())
    return candidate


def scrape_sites(base_url, base_domain, output_folder, request_delay=1, max_pages=None):
    """
    Breadth-first crawl of ``base_url``, saving each page's cleaned text to disk.

    For every page that answers with HTTP 200 the visible text is normalised
    with ``clean_text`` and written to ``<output_folder>/<title>.txt``; a
    ``"<url>, <file>"`` line is appended to ``logs.txt`` in the current
    working directory. Links accepted by ``is_valid_url`` are queued for a
    later visit. Errors on individual pages are printed and skipped so that
    one bad page does not stop the crawl.

    Args:
        base_url: URL the crawl starts from.
        base_domain: Domain the crawl is restricted to (passed to ``is_valid_url``).
        output_folder: Directory for the text files; created if missing.
        request_delay: Seconds to sleep before each request.
        max_pages: Stop after this many pages have been saved. ``None``
            (the default) crawls until the queue is empty.

    Returns:
        None. Progress and a final summary are printed.
    """
    # --- Core Configuration ---
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}

    # --- Setup ---
    os.makedirs(output_folder, exist_ok=True)
    visited_urls = set()
    # URLs ever placed on the queue, so the same link is not queued repeatedly.
    queued_urls = {base_url.split('#')[0].rstrip('/')}
    urls_to_visit = deque([base_url])
    # File names written during this run; pages sharing a title get a numeric suffix.
    used_names = set()
    scraped_count = 0

    while urls_to_visit:
        if max_pages is not None and scraped_count >= max_pages:
            break

        current_url = urls_to_visit.popleft()

        # Normalize current URL (drop fragment and trailing slash)
        normalized_url = current_url.split('#')[0].rstrip('/')

        if normalized_url in visited_urls:
            continue

        visited_urls.add(normalized_url)

        try:
            time.sleep(request_delay)
            response = requests.get(normalized_url, headers=headers, timeout=10)

            if response.status_code != 200:
                print(f"[✖] Failed (Status {response.status_code}): {normalized_url}")
                continue

            soup = BeautifulSoup(response.content, 'html.parser')

            # --- Extract & Save Page Content ---
            page_text = clean_text(soup.get_text())
            # get_text() copes with empty or nested <title> tags, where
            # `.string` is None and `.strip()` would raise.
            title_tag = soup.title
            raw_title = title_tag.get_text() if title_tag is not None else ""
            page_title = raw_title.strip() or "NoTitle"
            file_name = _unique_txt_name(page_title, used_names)
            file_full_path = os.path.join(output_folder, file_name)

            with open(file_full_path, "w", encoding="utf-8") as f:
                f.write(page_text)

            # Store file logs
            with open("logs.txt", "a", encoding="utf-8") as f:
                f.write(f"{current_url}, {file_name}\n")

            scraped_count += 1
            print(f"[✔] Scraped: {normalized_url}")

            # --- Find and Queue Internal Links ---
            for a_tag in soup.find_all('a', href=True):
                href = a_tag['href'].strip()

                # Ignore non-navigable links
                if href.startswith(('mailto:', 'tel:', 'javascript:')):
                    continue

                full_url = urljoin(normalized_url, href)
                clean_url = full_url.split('#')[0].rstrip('/')

                if clean_url in queued_urls:
                    continue

                if is_valid_url(clean_url, base_domain):
                    queued_urls.add(clean_url)
                    urls_to_visit.append(clean_url)

        except Exception as e:
            # Deliberately best-effort: report and move on to the next URL.
            print(f"[!] Error scraping {normalized_url}: {str(e)[:100]}")

    # Count only pages that were actually saved, not every URL attempted.
    print(f"\n✅ Scraping completed. Total pages scraped: {scraped_count}")

if __name__ == "__main__":
    # Crawl target: start page, domain the crawl stays within, and the
    # directory that receives one text file per scraped page.
    base_url, base_domain, output_folder = (
        "https://www.cardekho.com/",
        "cardekho.com",
        "CarDekho",
    )
    scrape_sites(base_url, base_domain, output_folder)

[✔] Scraped: https://www.cardekho.com
[✔] Scraped: https://www.cardekho.com/newcars
[✔] Scraped: https://www.cardekho.com/mg/windsor-ev
[✔] Scraped: https://www.cardekho.com/mahindra/be-6
[✔] Scraped: https://www.cardekho.com/mahindra/xev-9e
[✔] Scraped: https://www.cardekho.com/mg/comet-ev
[✔] Scraped: https://www.cardekho.com/tata/curvv-ev
[✔] Scraped: https://www.cardekho.com/tata/nexon-ev
[✔] Scraped: https://www.cardekho.com/electric-cars
[✔] Scraped: https://www.cardekho.com/mahindra/scorpio-n
[✔] Scraped: https://www.cardekho.com/mahindra/thar
[✔] Scraped: https://www.cardekho.com/mahindra/xuv700
[✔] Scraped: https://www.cardekho.com/hyundai/creta


KeyboardInterrupt: 