### Step 1: Install Necessary Packages

Install Selenium and the WebDriver Manager for easy driver management.

pip install selenium
pip install webdriver-manager

Install Additional Libraries
pip install fake-useragent

`fake_useragent` is used to generate random User-Agent headers (alternatively, you can maintain your own list of User-Agent strings).

In [None]:
#from selenium import webdriver
#from selenium.webdriver.chrome.service import Service
#from selenium.webdriver.chrome.options import Options
#from webdriver_manager.chrome import ChromeDriverManager
#from fake_useragent import UserAgent
#import time
#import random
#import os


#### Step 2: Set Up Headless Selenium with Random User-Agent and Delays

Import Libraries:

In [7]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from fake_useragent import UserAgent
import time
import random
import os


Configure Headless Mode and Set Up Random User-Agent

1. Headless Mode: Running Chrome in headless mode lets you scrape without opening a browser window, making it faster and more efficient.
2. Randomized User-Agent: Randomizing the User-Agent string for each session helps mimic different users and makes detection harder.

In [8]:
def get_random_user_agent():
    """Return a random User-Agent string produced by fake_useragent."""
    # A new UserAgent provider is created on every call.
    return UserAgent().random

def setup_browser_options():
    """Build the Chrome options for a headless session with a random User-Agent.

    Returns:
        An ``Options`` object carrying, in order: the headless, GPU-off and
        no-sandbox switches, a ``user-agent=...`` argument taken from
        ``get_random_user_agent()``, and the switch that disables the
        ``AutomationControlled`` blink feature.
    """
    options = Options()

    # Headless run; GPU off (intended to help headless performance);
    # sandbox off (intended for Docker compatibility).
    for switch in ("--headless", "--disable-gpu", "--no-sandbox"):
        options.add_argument(switch)

    # A different User-Agent is picked each time the options are built.
    agent = get_random_user_agent()
    options.add_argument(f"user-agent={agent}")

    # Optional disguise: turn off the automation-controlled blink feature.
    options.add_argument("--disable-blink-features=AutomationControlled")

    return options


#### Step 3: Implement Randomized Delay

Introducing small, random delays between requests can make your actions appear more like real user behavior, helping to avoid detection.

In [9]:
def add_random_delay(min_seconds=2, max_seconds=5):
    """Sleep for a random duration so requests are not evenly spaced.

    Args:
        min_seconds: Lower bound of the pause, in seconds (default 2).
        max_seconds: Upper bound of the pause, in seconds (default 5).

    Returns:
        The number of seconds that were slept.
    """
    delay = random.uniform(min_seconds, max_seconds)
    time.sleep(delay)
    return delay


#### Step 4: Putting It All Together

In [10]:
def fetch_and_save_to_file(url, path):
    """Open ``url`` in Chrome and write the resulting page HTML to ``path``.

    Args:
        url: Address of the page to load.
        path: Destination file. Missing parent directories are created.

    Raises:
        Exceptions raised while preparing the directory, starting Chrome,
        loading the page or writing the file are not caught here and
        propagate to the caller.
    """
    # Prepare the output directory BEFORE launching Chrome: a failure here
    # must not leave a browser process running. dirname() is "" for a bare
    # file name, and os.makedirs("") raises, so that case is skipped.
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Set up Chrome with customized options
    chrome_options = setup_browser_options()
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options,
    )

    try:
        # Open the target URL
        driver.get(url)

        # Pause for a random interval before reading the page source
        add_random_delay()

        # Save the HTML source to a file
        with open(path, "w", encoding="utf-8") as f:
            f.write(driver.page_source)
        print("HTML content saved successfully!")
    finally:
        # Always close the browser session, even if something above failed
        driver.quit()

# Usage example: fetch a single search-results page and store its HTML locally.
url = "https://www.99acres.com/search/property/buy/gurgaon?city=8&preference=S&area_unit=1&budget_min=0&res_com=R&isPreLeased=N"
# Relative output path; fetch_and_save_to_file creates the "Scraped Html" folder if it is missing.
path = "Scraped Html/99acres_2.html"
fetch_and_save_to_file(url, path)

HTML content saved successfully!


In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from fake_useragent import UserAgent
import time
import random
import os

def get_random_user_agent():
    """Pick a random User-Agent string via fake_useragent and return it."""
    provider = UserAgent()  # built fresh on each call
    random_agent = provider.random
    return random_agent

def setup_browser_options():
    """Create Chrome ``Options`` for headless scraping with a random User-Agent.

    Returns:
        The configured ``Options`` object. Arguments are added in this order:
        headless, disable-gpu, no-sandbox, the ``user-agent=...`` value from
        ``get_random_user_agent()``, and the AutomationControlled opt-out.
    """
    options = Options()

    # Switches that do not depend on the chosen User-Agent:
    # run headless, disable the GPU, and disable the sandbox.
    fixed_switches = ("--headless", "--disable-gpu", "--no-sandbox")
    for switch in fixed_switches:
        options.add_argument(switch)

    # Randomized User-Agent for this set of options.
    agent = get_random_user_agent()
    options.add_argument(f"user-agent={agent}")

    # Optional: hide the automation-controlled blink feature.
    options.add_argument("--disable-blink-features=AutomationControlled")

    return options

def add_random_delay():
    """Block for a random duration drawn uniformly from 2 to 5 seconds."""
    time.sleep(random.uniform(2, 5))

def fetch_and_save_to_file(url, path):
    """Open ``url`` in Chrome and write the resulting page HTML to ``path``.

    Args:
        url: Address of the page to load.
        path: Destination file. Missing parent directories are created.

    Raises:
        Exceptions raised while preparing the directory, starting Chrome,
        loading the page or writing the file are not caught here and
        propagate to the caller.
    """
    # Prepare the output directory BEFORE launching Chrome: a failure here
    # must not leave a browser process running. dirname() is "" for a bare
    # file name, and os.makedirs("") raises, so that case is skipped.
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Set up Chrome with customized options
    chrome_options = setup_browser_options()
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options,
    )

    try:
        # Open the target URL
        driver.get(url)

        # Pause for a random interval before reading the page source
        add_random_delay()

        # Save the HTML source to a file
        with open(path, "w", encoding="utf-8") as f:
            f.write(driver.page_source)
        print(f"HTML content saved to {path}")
    finally:
        # Always close the browser session, even if something above failed
        driver.quit()

def scrape_multiple_pages(base_url, total_pages, path_prefix, start_page=1):
    """Scrape consecutive result pages and save each one as an HTML file.

    Pages ``start_page`` through ``total_pages`` (inclusive) are fetched in
    order. The loop stops at the first page for which
    ``fetch_and_save_to_file`` raises, so a later run can resume by passing
    that page number as ``start_page``.

    Args:
        base_url: Search URL; ``&page=N`` is appended, so it is expected to
            already contain a query string.
        total_pages: Number of the last page to scrape.
        path_prefix: Directory in which ``page_N.html`` files are written.
        start_page: Number of the first page to scrape (default 1).
    """
    for page_num in range(start_page, total_pages + 1):
        # Construct the URL with page number (assuming the URL structure changes with the page number)
        url = f"{base_url}&page={page_num}"

        # Generate the path for saving the HTML file
        path = f"{path_prefix}/page_{page_num}.html"

        try:
            # Fetch and save the HTML content for the page
            fetch_and_save_to_file(url, path)
        except Exception as e:
            print(f"Error scraping page {page_num}: {e}")
            break  # Stop on error; resume later via start_page


# Usage example: scraping pages 1 through 1680
base_url = "https://www.99acres.com/search/property/buy/gurgaon?city=8&preference=S&area_unit=1&budget_min=0&res_com=R&isPreLeased=N"
path_prefix = "Scraped Html"
total_pages = 1680  # number of the last page to scrape (inclusive)

scrape_multiple_pages(base_url, total_pages, path_prefix)


HTML content saved to 001_Scraped Html/page_1.html


In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from fake_useragent import UserAgent
import time
import random
import os

def get_random_user_agent():
    """Return a random User-Agent string, printing it for debugging."""
    chosen = UserAgent().random  # new provider on every call
    print("Using User-Agent:", chosen)  # Debug print
    return chosen

def setup_browser_options():
    """Build Chrome ``Options`` for a visible (non-headless) debugging session.

    Headless mode is intentionally not enabled in this variant so the browser
    window can be watched while debugging.

    Returns:
        The configured ``Options`` object. Arguments are added in this order:
        disable-gpu, no-sandbox, the ``user-agent=...`` value from
        ``get_random_user_agent()``, and the AutomationControlled opt-out.
    """
    options = Options()

    # GPU and sandbox are disabled for compatibility.
    for switch in ("--disable-gpu", "--no-sandbox"):
        options.add_argument(switch)

    # Randomized User-Agent for this set of options.
    agent = get_random_user_agent()
    options.add_argument(f"user-agent={agent}")

    # Optional: hide the automation-controlled blink feature.
    options.add_argument("--disable-blink-features=AutomationControlled")

    return options

def add_random_delay():
    """Announce and then sleep for a random pause of 2 to 5 seconds."""
    pause = random.uniform(2, 5)
    print(f"Sleeping for {pause:.2f} seconds")  # Debug print
    time.sleep(pause)

def fetch_and_save_to_file(url, path):
    """Open ``url`` in Chrome and write the page HTML to ``path``, with debug output.

    Args:
        url: Address of the page to load.
        path: Destination file. Missing parent directories are created.

    Raises:
        Any exception raised while loading the page or writing the file is
        printed and then re-raised, so a calling loop can stop or skip the
        page. Failures while preparing the directory or starting Chrome
        propagate as well.
    """
    # Prepare the output directory BEFORE launching Chrome: a failure here
    # must not leave a browser process running. dirname() is "" for a bare
    # file name, and os.makedirs("") raises, so that case is skipped.
    directory = os.path.dirname(path)
    if directory and not os.path.exists(directory):
        os.makedirs(directory, exist_ok=True)
        print(f"Directory created at {directory}")  # Debug print

    # Set up Chrome with customized options
    chrome_options = setup_browser_options()
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options,
    )
    print("ChromeDriver initialized")  # Debug print

    try:
        # Open the target URL
        print(f"Opening URL: {url}")  # Debug print
        driver.get(url)

        # Pause for a random interval before reading the page source
        add_random_delay()

        # Save the HTML source to a file
        with open(path, "w", encoding="utf-8") as f:
            f.write(driver.page_source)
        print(f"HTML content saved to {path}")  # Success message

    except Exception as e:
        print(f"Error fetching URL {url}: {e}")  # Debug print on error
        # Re-raise: swallowing the error here would hide the failure from
        # the caller, whose own error handling could then never trigger.
        raise

    finally:
        # Always close the browser session, even if something above failed
        driver.quit()
        print("Browser session closed")  # Debug print

def scrape_multiple_pages(base_url, total_pages, path_prefix, start_page=1681):
    """Scrape consecutive result pages, printing progress for each page.

    Pages ``start_page`` through ``total_pages`` (inclusive) are fetched in
    order. The loop stops at the first page for which
    ``fetch_and_save_to_file`` raises, so a later run can resume by passing
    that page number as ``start_page``.

    Args:
        base_url: Search URL; ``&page=N`` is appended, so it is expected to
            already contain a query string.
        total_pages: Number of the last page to scrape.
        path_prefix: Directory in which ``page_2_N.html`` files are written.
        start_page: Number of the first page to scrape. Defaults to 1681,
            the value previously hard-coded in the loop.
    """
    for page_num in range(start_page, total_pages + 1):
        # Construct the URL with page number
        url = f"{base_url}&page={page_num}"

        # Generate the path for saving the HTML file
        path = f"{path_prefix}/page_2_{page_num}.html"

        print(f"Scraping page {page_num}")  # Debug print for each page

        try:
            # Fetch and save the HTML content for the page
            fetch_and_save_to_file(url, path)

        except Exception as e:
            print(f"Error scraping page {page_num}: {e}")  # Debug print on error
            break  # Stop on error; resume later via start_page

# Usage example: scraping pages 1681 through 1682 (scrape_multiple_pages starts at page 1681)
base_url = "https://www.99acres.com/search/property/buy/gurgaon?city=8&preference=S&area_unit=1&budget_min=0&res_com=R&isPreLeased=N"
path_prefix = "Scraped Html"
total_pages = 1682  # number of the last page to scrape (inclusive)

scrape_multiple_pages(base_url, total_pages, path_prefix)


Scraping page 1681
Using User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0 Agency/98.8.8188.80
ChromeDriver initialized
Opening URL: https://www.99acres.com/search/property/buy/gurgaon?city=8&preference=S&area_unit=1&budget_min=0&res_com=R&isPreLeased=N&page=1681
Sleeping for 2.93 seconds


In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from fake_useragent import UserAgent
import time
import random
import os

def get_random_user_agent():
    """Return one random User-Agent string from fake_useragent."""
    # The provider is instantiated per call rather than shared.
    return UserAgent().random

def setup_browser_options():
    """Assemble the Chrome ``Options`` used for headless scraping.

    Returns:
        The configured ``Options`` object. Arguments are added in this order:
        headless, disable-gpu, no-sandbox, the ``user-agent=...`` value from
        ``get_random_user_agent()``, and the AutomationControlled opt-out.
    """
    options = Options()
    agent = get_random_user_agent()

    arguments = [
        "--headless",      # run without a browser window
        "--disable-gpu",   # GPU off (intended to help headless performance)
        "--no-sandbox",    # sandbox off (intended for Docker compatibility)
        f"user-agent={agent}",  # randomized User-Agent
        "--disable-blink-features=AutomationControlled",  # optional disguise
    ]
    for argument in arguments:
        options.add_argument(argument)

    return options

def add_random_delay():
    """Pause execution for a uniformly random time between 2 and 5 seconds."""
    seconds = random.uniform(2, 5)
    time.sleep(seconds)

def fetch_and_save_to_file(url, path):
    """Open ``url`` in Chrome and write the resulting page HTML to ``path``.

    Args:
        url: Address of the page to load.
        path: Destination file. Missing parent directories are created.

    Raises:
        Exceptions raised while preparing the directory, starting Chrome,
        loading the page or writing the file are not caught here and
        propagate to the caller.
    """
    # Prepare the output directory BEFORE launching Chrome: a failure here
    # must not leave a browser process running. dirname() is "" for a bare
    # file name, and os.makedirs("") raises, so that case is skipped.
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Set up Chrome with customized options
    chrome_options = setup_browser_options()
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options,
    )

    try:
        # Open the target URL
        driver.get(url)

        # Pause for a random interval before reading the page source
        add_random_delay()

        # Save the HTML source to a file
        with open(path, "w", encoding="utf-8") as f:
            f.write(driver.page_source)
        print(f"HTML content saved to {path}")
    finally:
        # Always close the browser session, even if something above failed
        driver.quit()

def scrape_specific_pages(base_url, page_numbers, path_prefix):
    """Scrape only the listed pages, moving on when a page fails.

    Args:
        base_url: Search URL; ``&page=N`` is appended for each page.
        page_numbers: Iterable of page numbers to fetch, in the given order.
        path_prefix: Directory in which ``page_2_N.html`` files are written.
    """
    for number in page_numbers:
        target_url = f"{base_url}&page={number}"
        out_file = f"{path_prefix}/page_2_{number}.html"

        try:
            fetch_and_save_to_file(target_url, out_file)
        except Exception as err:
            # Report the failure; the loop then proceeds to the next page.
            print(f"Error scraping page {number}: {err}")

# Scraping without changing name 
#


#Scraping after Changing name
#259, 26, 260, 262, 266, 267, 269, 270, 272, 275, 276, 277, 278, 279, 28, 280, 281, 285, 286, 287, 288, 289, 290, 292, 297, 30, 302, 304, 305, 307, 31, 310, 313, 314, 317, 32, 324, 325, 329, 330, 331, 333, 334, 335, 338, 339, 34, 341, 342, 345, 347, 35, 350, 353, 356, 358, 36, 361, 363, 365, 366, 367, 369, 37, 370, 371, 372, 375, 377, 378, 38, 380, 381, 382, 383, 387, 39, 393, 397, 399, 402, 403, 404, 405, 407, 41, 412, 413, 415, 416, 417, 418, 420, 421, 423, 424, 425, 427, 431, 435, 436, 437, 439, 440, 442, 443, 444, 445, 446, 447, 448, 449, 451, 454, 455, 456, 457, 458, 459, 46, 462, 463, 465, 466, 468, 471, 473, 474, 476, 48, 485, 486, 489, 49, 490, 492, 495, 498, 50, 501, 504, 506, 507, 509, 51, 510, 511, 513, 515, 516, 519, 52, 522, 526, 527, 528, 530, 531, 534, 535, 536, 54, 540, 542, 543, 546, 548, 550, 551, 552, 554, 557, 558, 559, 56, 561, 562, 563, 564, 566, 567, 569, 570, 571, 573, 574, 576, 577, 579, 58, 580, 582, 584, 585, 586, 587, 589, 59, 590, 594, 595, 597, 599, 60, 601, 602, 603, 604, 607, 608, 61, 610, 612, 614, 615, 616, 617, 618, 619, 62, 621, 622, 623, 624, 625, 626, 628, 63, 631, 632, 633, 635, 637, 639, 64, 640, 641, 644, 646, 647, 648, 649, 65, 652, 653, 654, 655, 656, 657, 658, 659, 66, 660, 661, 662, 663, 664, 667, 668, 67, 670, 671, 672, 674, 675, 676, 678, 679, 68, 680, 681, 683, 685, 686, 687, 689, 69, 690, 692, 693, 695, 698, 699, 70, 702, 703, 704, 705, 706, 708, 71, 711, 712, 713, 715, 716, 718, 72, 721, 724, 726, 728, 730, 731, 733, 734, 735, 736, 738, 739, 74, 740, 741, 742, 744, 746, 748, 75, 750, 752, 753, 754, 756, 758, 76, 761, 762, 763, 765, 766, 767, 769, 77, 771, 772, 773, 775, 777, 779, 78, 780, 783, 784, 785, 787, 788, 789, 790, 791, 793, 794, 795, 796, 797, 798, 799, 80, 800, 801, 802, 803, 805, 806, 807, 809, 81, 811, 812, 813, 814, 815, 817, 818, 82, 821, 822, 823, 824, 826, 828, 83, 831, 832, 833, 834, 836, 837, 839, 84, 840, 841, 842, 843, 844, 846, 848, 85, 851, 852, 854, 855, 856, 857, 858, 86, 860, 861, 862, 863, 
864, 866, 868, 87, 871, 872, 873, 874, 875, 876, 877, 878, 879, 88, 880, 882, 884, 885, 886, 887, 888, 889, 89, 890, 891, 892, 894, 896, 897, 898, 9, 90,903,905,906,907, 91, 910, 913, 914, 915, 92, 922, 924 ,927 ,928, 93, 930, 933, 934, 935, 936, 938, 939, 94, 941, 943, 945, 946, 947, 948, 951, 952, 954, 955, 959, 961, 962, 963, 966 ,967,968, 97, 970, 971, 972, 973, 975, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 996,997, 998, 999


# Array of page numbers to scrape
page_numbers = [10, 100, 1000, 1002, 1004, 1005, 1006, 101, 1010, 1011, 1012, 1014, 1015, 1019, 102, 1020, 1026, 1027, 1030, 1031, 1032, 1033, 1038, 104, 1041, 1042, 1044, 1045, 1046, 1048, 1049, 105, 1051, 1052, 1053, 1057, 1058, 1059, 106, 1062, 1064, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1075, 1076, 1077, 1080, 1082, 1084, 1088, 1089, 109, 1090, 1091, 1092, 1093, 1094, 1095, 1096, 1098, 11, 110, 1100, 1104, 1105, 1109, 1110, 1113, 1114, 1116, 112, 1121, 1122, 1123, 1125, 1126, 1127, 113, 1130, 1131, 1132, 1136, 1137, 114, 1141, 1144, 1146, 1147, 1148, 1149, 1151, 1152, 1153, 1154, 1157, 1158, 1159, 1160, 1165, 117, 1171, 1173, 1175, 1176, 1177, 1179, 1181, 1183, 1184, 1185, 1186, 1189, 119, 1190, 1191, 1192, 1193, 1196, 1197, 12, 1200, 1201, 1202, 1203, 1204, 1207, 121, 1212, 1213, 1214, 1218, 1225, 1227, 1228, 1230, 1231, 1232, 1235, 1236, 1237, 1238, 1239, 124, 1243, 1245, 1246, 1248, 1249, 125, 1251, 1252, 1254, 1258, 1259, 126, 1261, 1263, 1264, 1265, 127, 1270, 1272, 1274, 1276, 1277, 1278, 1279, 128, 1283, 1284, 1287, 1288, 1289, 129, 1291, 1292, 1293, 1295, 1296, 1297, 1299, 130, 1302, 1303, 1304, 1305, 1306, 1307, 1308, 1309, 1311, 1312, 1313, 1315, 1316, 1317, 1320, 1324, 1325, 1328, 1329, 133, 1331, 134, 135, 1367, 1368, 1370, 1371, 1372, 1374, 1375, 1376, 1377, 1379, 1380, 1382, 1386, 1387, 1388, 1390, 1391, 1392, 1395, 1396, 1397, 1398, 14, 1402, 1406, 1407, 1408, 1409, 141, 1410, 1413, 1414, 1415, 1417, 1419, 142, 1421, 1422, 1423, 1425, 1427, 1428, 1430, 1431, 1432, 1435, 1436, 1437, 1438, 1441, 1447, 1449, 145, 1454, 1456, 1459, 1461, 1466, 1468, 1469, 1473, 1474, 1476, 1477, 1478, 1479, 1480, 1483, 1484, 1485, 1486, 1487, 1488, 149, 1490, 1491, 1492, 1493, 1496, 1497, 1498, 1499, 15, 1500, 1501, 1502, 1509, 151, 1511, 1513, 1515, 1516, 1518, 1519, 152, 1522, 1525, 1527, 1528, 1529, 1530, 1532, 1533, 1534, 1535, 1536, 1537, 1539, 1542, 1543, 1545, 1548, 155, 1553, 1554, 1556, 1558, 156, 1561, 1562, 1564, 157, 1570, 1572, 1574, 1577, 1578, 
1579, 158, 1580, 1582, 1583, 1584, 1585, 1586, 1587, 1588, 1589, 1590, 1591, 1593, 1594, 1595, 1598, 16, 160, 1601, 1602, 1603, 1604, 1606, 1607, 1611, 1612, 1613, 1618, 162, 1621, 1623, 1624, 1625, 1627, 1628, 1629, 1631, 1632, 1633, 1635, 1636, 1637, 1638, 1639, 1640, 1641, 1642, 1644, 1646, 165, 1653, 1654, 1655, 1657, 1659, 166, 1660, 1661, 1662, 1663, 1667, 1669, 1671, 1673, 1676, 168, 1680, 174, 175, 176, 177, 178, 18, 181, 183, 184, 185, 188, 19, 196, 203, 204, 21, 210, 211, 214, 215, 218, 219, 22, 220, 221, 225, 227, 228, 23, 230, 232, 235, 245, 246, 247, 248, 249, 251, 254, 255]

  # Replace with the specific array of pages you need

# Search URL (scrape_specific_pages appends "&page=N" to it) and the folder
# that receives the saved "page_2_N.html" files.
base_url = "https://www.99acres.com/search/property/buy/gurgaon?city=8&preference=S&area_unit=1&budget_min=0&res_com=R&isPreLeased=N"
path_prefix = "Scraped Html"

# Run the scraper for only the pages listed in page_numbers; a page that fails is skipped.
scrape_specific_pages(base_url, page_numbers, path_prefix)


HTML content saved to Scraped Html/page_2_10.html
HTML content saved to Scraped Html/page_2_100.html
HTML content saved to Scraped Html/page_2_1000.html
HTML content saved to Scraped Html/page_2_1002.html
HTML content saved to Scraped Html/page_2_1004.html
HTML content saved to Scraped Html/page_2_1005.html
HTML content saved to Scraped Html/page_2_1006.html
HTML content saved to Scraped Html/page_2_101.html
HTML content saved to Scraped Html/page_2_1010.html
HTML content saved to Scraped Html/page_2_1011.html
HTML content saved to Scraped Html/page_2_1012.html
HTML content saved to Scraped Html/page_2_1014.html
HTML content saved to Scraped Html/page_2_1015.html
HTML content saved to Scraped Html/page_2_1019.html
HTML content saved to Scraped Html/page_2_102.html
HTML content saved to Scraped Html/page_2_1020.html
HTML content saved to Scraped Html/page_2_1026.html
HTML content saved to Scraped Html/page_2_1027.html
HTML content saved to Scraped Html/page_2_1030.html
HTML content save