In [1]:
import numpy as np
import pandas as pd

import time
import re

import requests
from bs4 import BeautifulSoup

In [2]:
# Fetch the first results page so the card selector can be explored interactively.
# A timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() surfaces HTTP errors instead of silently parsing an error page.
response = requests.get(
    'https://housing.com/rent/apartments-for-rent-in-surat-gujarat-M1P20drlq24mltepds2?page=1',
    timeout=30,
)
response.raise_for_status()
webpage = response.text

soup = BeautifulSoup(webpage, 'lxml')

# One <div> per listing card; the class string is matched exactly as written.
all_divs = soup.find_all('div', class_="t17qvo1u _biqgdtch _1la84ssf _156v13i4 _axkb7n _9s1txw _j61sew T_82a3537f")

In [3]:
# Compiled once and reused for every card (previously compiled but never used,
# while the loop rebuilt the same regex inline on each iteration).
pattern = re.compile(r'(_l8ftgi _vy1osq _c81fwx _gdnqgrho _1q731fwx _4okudlk8 .*? highlights)')

# Text of each card's highlights <div>, or NaN when the card has none.
highlights = []
for card in all_divs:
    highlights_div = card.find("div", class_=pattern)
    # find() returns None when nothing matches; test for that explicitly rather
    # than relying on a bare `except`, which would also hide unrelated errors
    # and swallow KeyboardInterrupt.
    highlights.append(highlights_div.text if highlights_div is not None else np.nan)

In [7]:
# --- Scrape configuration ----------------------------------------------------
LISTING_URL = 'https://housing.com/rent/apartments-for-rent-in-surat-gujarat-M1P20drlq24mltepds2'
FIRST_PAGE = 1
LAST_PAGE = 30          # inclusive
REQUEST_TIMEOUT = 30    # seconds before a stalled request is abandoned
PAGE_DELAY = 6          # seconds to wait between consecutive page requests
OUTPUT_CSV = "apartment.csv"

# CSS class strings used to locate elements inside the page / a listing card.
CARD_CLASS = "t17qvo1u _biqgdtch _1la84ssf _156v13i4 _axkb7n _9s1txw _j61sew T_82a3537f"
TITLE_CLASS = "T_091c165f _sq1l2s _vv1q9c _ks15vq T_3d3547ab _7s5wglyw _5vy24jg8 _blas1v10 new-title"
DESCRIPTION_CLASS = "T_091c165f _sq1l2s _vv1q9c _ks15vq T_3b18a44b _w41hna _7l14la _g3dlk8 _c81fwx"
VALUE_CLASS = "T_091c165f _sq1l2s _vv1q9c _ks15vq T_efe231cd _vy1ipv _7ltvct _g3dlk8 _c81fwx _cs1nn1 value"
NEARBY_PLACE_CLASS = "T_091c165f _sq1l2s _vv1q9c _ks15vq T_7e22db16 _gz14y2 _j3r5k8 _7l5yda _g3dlk8 _c81fwx"
DISTANCE_CLASS = "_c81fwx _g3dlk8 _7l11ef _gz1l7b _vy1ris T_b1c2c114"
# Compiled once instead of once per card.
HIGHLIGHTS_CLASS_RE = re.compile(r'(_l8ftgi _vy1osq _c81fwx _gdnqgrho _1q731fwx _4okudlk8 .*? highlights)')

# Output column order of the per-page and final DataFrames.
COLUMNS = [
    'SocietyName', 'BHK', 'Furnishing', 'BuiltUpArea', 'Locality',
    'NearbyPlace_1', 'DistanceAway_1', 'NearbyPlace_2', 'DistanceAway_2',
    'Description', 'Highlights',
]


def _text_or_nan(node):
    """Return ``node.text``, or NaN when the lookup found nothing (``node`` is None)."""
    return node.text if node is not None else np.nan


def _nth_text_or_nan(nodes, position):
    """Return the text of ``nodes[position]``, or NaN when there are too few nodes."""
    return nodes[position].text if len(nodes) > position else np.nan


def extract_listing(card):
    """Build one output row (a dict keyed by ``COLUMNS``) from a listing card.

    Any element that is absent from the card yields NaN for its column.
    Missing elements are detected with explicit None / length checks rather
    than a bare ``except``, so genuine errors and KeyboardInterrupt are no
    longer silently turned into NaN.
    """
    # The code takes the first two "value" divs as furnishing and built-up area.
    attribute_values = card.find_all("div", class_=VALUE_CLASS)
    nearby_places = card.find_all("div", class_=NEARBY_PLACE_CLASS)
    distances = card.find_all("div", class_=DISTANCE_CLASS)

    return {
        'SocietyName': _text_or_nan(card.find("a", class_="value")),
        'BHK': _text_or_nan(card.find("div", class_=TITLE_CLASS)),
        'Furnishing': _nth_text_or_nan(attribute_values, 0),
        'BuiltUpArea': _nth_text_or_nan(attribute_values, 1),
        # NOTE(review): `card.a` is the first <a> inside the card. In the
        # recorded output this column is identical to BHK (the listing title),
        # so it probably needs its own selector or parsing. Behaviour kept as-is.
        'Locality': _text_or_nan(card.a),
        'NearbyPlace_1': _nth_text_or_nan(nearby_places, 0),
        'DistanceAway_1': _nth_text_or_nan(distances, 0),
        'NearbyPlace_2': _nth_text_or_nan(nearby_places, 1),
        'DistanceAway_2': _nth_text_or_nan(distances, 1),
        'Description': _text_or_nan(card.find("span", class_=DESCRIPTION_CLASS)),
        'Highlights': _text_or_nan(card.find("div", class_=HIGHLIGHTS_CLASS_RE)),
    }


data_frames = []  # one DataFrame per successfully scraped page

for page in range(FIRST_PAGE, LAST_PAGE + 1):
    if page > FIRST_PAGE:
        # Pause between requests only; no pointless wait after the last page.
        time.sleep(PAGE_DELAY)

    try:
        response = requests.get(f'{LISTING_URL}?page={page}', timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Report and skip the page so one transient failure does not discard
        # everything collected so far.
        print(f"Skipping page {page}: {exc}")
        continue

    soup = BeautifulSoup(response.text, 'lxml')
    cards = soup.find_all('div', class_=CARD_CLASS)

    # Build the page frame from row records in one go; `columns` pins the
    # column order and keeps the schema even when a page has no cards.
    page_df = pd.DataFrame([extract_listing(card) for card in cards], columns=COLUMNS)
    data_frames.append(page_df)

if not data_frames:
    # Every request failed: stop before overwriting an existing CSV with nothing.
    raise RuntimeError(f"No pages could be scraped; {OUTPUT_CSV} was not written.")

# Concatenate all DataFrames into a single DataFrame
final_df = pd.concat(data_frames, ignore_index=True)

# Save the final DataFrame to a CSV file
final_df.to_csv(OUTPUT_CSV, index=False)

# Display the first few rows of the final DataFrame
final_df.head()

Unnamed: 0,SocietyName,BHK,Furnishing,BuiltUpArea,Locality,NearbyPlace_1,DistanceAway_1,NearbyPlace_2,DistanceAway_2,Description,Highlights
0,Aston Homes,"3 BHK Flat for rent in Palanpur, Surat",Semi Furnished,2200 sq.ft,"3 BHK Flat for rent in Palanpur, Surat",L P Savani School,is 0.2 km away,BACHPAN Children hospital,is 0.4 km away,Your search for a spacious home at affordable ...,Property Highlights24x7 SecurityChildren Play ...
1,Raghuvir Spectrum,"3 BHK Flat for rent in Bhimrad, Surat",Unfurnished,1775 sq.ft,"3 BHK Flat for rent in Bhimrad, Surat",Agarwal Vidya Vihar,is 0.7 km away,Amritam Multispeciality Hospital,is 0.9 km away,If you are looking for a modern house on rent ...,Property HighlightsClose to AirportClose to AT...
2,Ramaa Residency,"2 BHK Flat for rent in Dahin Nagar, Surat",Semi Furnished,1278 sq.ft,"2 BHK Flat for rent in Dahin Nagar, Surat",The Radiant International School,is 1.3 km away,Ortho Plus Hospital,is 0.9 km away,Its a 2 BHK UNUSED Semi Furnished Flat Availab...,Property Highlights24x7 SecurityGated Communit...
3,Swagat Clifton,"2 BHK Flat for rent in Dundi, Surat",Semi Furnished,1252 sq.ft,"2 BHK Flat for rent in Dundi, Surat",Bhagwan Mahavir International School,is 0.1 km away,Shraddha Clinic - Dr. Khushbu Patel Jani,is 2.5 km away,A spacious home designed for families aspiring...,Property Highlights24x7 SecurityChildren Play ...
4,Shyam Enclave Building A B C,"2 BHK Flat for rent in Dahin Nagar, Surat",Unfurnished,1254 sq.ft,"2 BHK Flat for rent in Dahin Nagar, Surat",Shishukunj Vidya Vihar & L B Contractor School,is 1.5 km away,Ortho Plus Hospital,is 1.4 km away,This is a well-designed 2 BHK Apartment availa...,


In [8]:
# (rows, columns) of the combined scrape result.
final_df.shape

(620, 11)

In [9]:
# Number of rows that exactly repeat an earlier row (first occurrences are not counted).
final_df.duplicated().sum()

55

In [10]:
# Missing-value count per column.
final_df.isna().sum()

SocietyName       237
BHK                 0
Furnishing          0
BuiltUpArea         0
Locality            0
NearbyPlace_1       1
DistanceAway_1      1
NearbyPlace_2       1
DistanceAway_2      1
Description         0
Highlights        483
dtype: int64

In [11]:
# Frequency of each distinct value in the Furnishing column.
final_df['Furnishing'].value_counts()

Furnishing
Fully Furnished    291
Unfurnished        184
Semi Furnished     145
Name: count, dtype: int64

In [12]:
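# Effect of the page-loop upper bound N (the 31 in `range(1, 31)`) on the result.
# shape = number of rows in final_df, duplicated = final_df.duplicated().sum()
# N = 21: shape: 590
#         duplicated: 21

# N = 31: shape: 620
#         duplicated: 55

# N = 41: shape: 650
#         duplicated: 85

# N = 51: shape: 680
#         duplicated: 115

# Each additional 10 pages adds only 30 rows and roughly 30 more duplicates,
# which suggests that pages beyond about 20 mostly repeat earlier listings.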