In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import numpy as np  # Import numpy for NaN value
import time

# Function to scrape amenities from a given flat URL
def scrape_amenities_from_flat(url):
    """Fetch a flat's detail page and return its amenities as one string.

    Args:
        url: Address of the flat's detail page.

    Returns:
        The stripped text of every ``div.txt`` element carrying
        ``itemprop="amenityFeature"`` inside ``div#amenity``, joined with
        ``", "`` (an empty string when the section holds no such entries).
        ``np.nan`` when the request or parsing fails, or when the page has
        no ``div#amenity`` section at all.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # turn 4xx/5xx replies into exceptions
        soup = BeautifulSoup(response.content, 'html.parser')
        amenity_div = soup.find('div', id='amenity')
        if amenity_div is None:
            # Without this branch the function fell through and returned
            # None, so "missing section" and "request failed" produced two
            # different missing-value markers in the same column.
            return np.nan
        amenities = amenity_div.find_all("div", class_="txt", itemprop="amenityFeature")
        return ', '.join(amenity.text.strip() for amenity in amenities)
    except Exception as e:
        # Best-effort boundary: one unreachable flat page must not abort the
        # whole crawl, so report it and hand back a missing value.
        print(f"Failed to scrape amenities from flat: {url}")
        print(e)
        return np.nan

# Function to scrape flat details from a single page
def scrape_page(page_url, page_num):
    print(f"Scraping page {page_num}...")
    try:
        response = requests.get(page_url, timeout=10)  # Increase timeout
        response.raise_for_status()  # Raise an exception for 4xx and 5xx status codes
        soup = BeautifulSoup(response.content, 'html.parser')
        rows = soup.find_all('li', class_='cardholder')
        flat_details = []
        for row in rows:
            flat_url = row.find('a', class_='typelink')['href']
            face_tag = row.find('li', class_='keypoint', title='facing')
            face_value = face_tag.text.strip() if face_tag else "null"
            sq_ft_tag = row.find('td', class_='size').find('span', class_='val')
            sq_ft_value = sq_ft_tag.text.strip() if sq_ft_tag else "null"
            tagn = row.find('div', class_='title-line-wrap')
            bhk = tagn.find('strong').text.strip() if tagn else "null"
            price_tag = row.find('meta', itemprop="price")
            price_value = price_tag['content'] if price_tag else "null"
            furnish_tag = row.find('td', class_='val')
            furnish_value = furnish_tag.text.strip() if furnish_tag else "null"
            deposit_tag = row.find('li', class_='keypoint', title='deposit')
            deposit_value = deposit_tag.text.strip() if deposit_tag else "null"
            bathroom_tag = row.find('li', class_='keypoint', title='bathrooms')
            bathroom_value = bathroom_tag.text.strip() if bathroom_tag else "null"
            location_tag = row.find('a', class_='loclink')
            location_value = location_tag.text.strip() if location_tag else "null"
            amenities = scrape_amenities_from_flat(flat_url)
            flat_details.append([face_value, sq_ft_value, bhk, price_value, furnish_value, deposit_value, bathroom_value, location_value, amenities])
        return flat_details
    except Exception as e:
        print(f"Failed to scrape page: {page_url}")
        print(e)
        return []

# Define the base URL for scraping
base_url = 'https://www.makaan.com/mumbai-residential-property/rent-property-in-mumbai-city'

# Crawl parameters, previously hard-coded inline.
# NOTE(review): the `_` query value looks like a cache-busting timestamp
# copied from a browser session - confirm whether the site requires it.
QUERY_STAMP = '1714155387977'
FIRST_PAGE = 1
LAST_PAGE = 528
MAX_WORKERS = 5  # upper bound on concurrent listing-page downloads
REQUEST_DELAY_SECONDS = 1  # pause each worker takes after finishing a page


def _scrape_page_politely(page_num):
    """Scrape one results page, then pause before the worker takes another.

    The pause used to sit in the loop that collects results, where it only
    stalled the main thread: every task was already queued, so the workers
    kept fetching back to back. Sleeping inside the task is what actually
    spaces out the listing-page requests issued by each worker.
    """
    page_url = f'{base_url}?_={QUERY_STAMP}&page={page_num}'
    flat_details = scrape_page(page_url, page_num)
    time.sleep(REQUEST_DELAY_SECONDS)
    return flat_details


# Initialize an empty list to store the data
all_data = []

# Fetch pages in parallel; executor.map yields results in page order.
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    for flat_details in executor.map(_scrape_page_politely, range(FIRST_PAGE, LAST_PAGE + 1)):
        all_data.extend(flat_details)

# Create a DataFrame from the list of data
columns = ["Facing", "Square Feet", "BHK Apartment", "Price", "Furnish Status", "Deposit Status", "Bathroom Count", "Location", "Amenities"]
data = pd.DataFrame(all_data, columns=columns)

# Save the DataFrame to a CSV file
data.to_csv("mumbai_rental_properties_with_amenities_main_file_10k.csv", index=False)

# Display the DataFrame
print(data)


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Failed to scrape amenities from flat: https://www.makaan.com/mumbai/reputed-builder-green-wood%27s-in-kharghar-22637962/2bhk-2t-1150-sqft-apartment-for-rent
404 Client Error: Not Found for url: https://www.makaan.com/mumbai/reputed-builder-green-wood%27s-in-kharghar-22637962/2bhk-2t-1150-sqft-apartment-for-rent
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Failed to scrape amenities from flat: https://www.makaan.com/mumbai/reputed-builder-green-wood%27s-in-kharghar-22637962/2bhk-2t-1150-sqft-apartment-for-rent
404 Client Error: Not Found for url: ht