# Web Scraping Real Estate Data

In [1]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import requests
import os

# driver = webdriver.Chrome(ChromeDriverManager().install())

In [2]:
# Scrape realtor.com listing cards for `city` through the WebScrapingAPI
# proxy and cache the parsed table as CSV, so re-running the notebook reuses
# the file instead of spending another API call.
prices = []
beds = []
baths = []
sizes = []
addresses = []

city = "Georgia"
csv_file = f"{city.lower()}_listings.csv"
# Placeholder stored when a listing card lacks a field; `remove_sub_string`
# later in the notebook maps this exact string to 0, so do not change it.
MISSING = "No display data"


def _text_or_missing(tag):
    """Return the tag's text, or the placeholder if the tag is absent or empty."""
    return tag.text if tag and tag.text else MISSING


if os.path.exists(csv_file):
    df = pd.read_csv(csv_file)
else:
    # Fail early with a clear message instead of sending a keyless request
    # (requests silently drops params whose value is None).
    api_key = os.environ.get("API_KEY")
    if not api_key:
        raise RuntimeError("Set the API_KEY environment variable before scraping.")

    url = "https://api.webscrapingapi.com/v1"
    params = {
        "api_key": api_key,
        "url": f"https://www.realtor.com/realestateandhomes-search/{city}",
    }
    # Bound the wait so a stalled proxy cannot hang the kernel, and surface
    # HTTP errors rather than parsing an error page as if it were listings.
    response = requests.get(url, params=params, timeout=60)
    response.raise_for_status()

    content = response.text
    soup = BeautifulSoup(content, features="html.parser")

    for element in soup.find_all("li", attrs={"class": "component_property-card"}):
        price = element.find("span", attrs={"data-label": "pc-price"})
        bed = element.find("li", attrs={"data-label": "pc-meta-beds"})
        bath = element.find("li", attrs={"data-label": "pc-meta-baths"})
        size = element.find("li", attrs={"data-label": "pc-meta-sqft"})
        address = element.find("div", attrs={"data-label": "pc-address"})

        # Cards without both bed and bath metadata are skipped entirely, which
        # keeps the five parallel lists the same length.
        if not (bed and bath):
            continue
        nr_beds = bed.find("span", attrs={"data-label": "meta-value"})
        nr_baths = bath.find("span", attrs={"data-label": "meta-value"})
        if not (nr_beds and nr_baths):
            continue

        beds.append(nr_beds.text)
        baths.append(nr_baths.text)
        prices.append(_text_or_missing(price))
        sizes.append(_text_or_missing(size))
        addresses.append(_text_or_missing(address))

    # TODO: Add scraping pagination
    df = pd.DataFrame({'Address': addresses, 'Price': prices, 'Beds': beds, 'Baths': baths, 'Sizes': sizes})
    if df.empty:
        # Do not cache an empty result: it would be reloaded on every later
        # run and hide the fact that the scrape found nothing.
        print(f"No listings parsed for {city}; nothing cached. Check the API response or the selectors.")
    else:
        df.to_csv(csv_file, index=False, encoding='utf-8')

In [3]:
# Show a bounded preview of the raw scraped table instead of rendering the whole frame.
df.head(10)

Unnamed: 0,Address,Price,Beds,Baths,Sizes
0,"2935 Bethsaida Rd, Riverdale, GA 30296","$195,900",3,2,No display data
1,"203 Scarborough Rd, Centerville, GA 31028","$159,900",3,2,"1,529sqft"
2,"767 Turtle Cove Trwy, Monticello, GA 31064","$215,000",3,2,"1,834sqft"
3,"19 Peppermill Dr SW, Cartersville, GA 30120","$345,000",3,2,"3,060sqft"
4,"2410 Elizabeth Ann Ln, Young Harris, GA 30582","$224,900",2,2,No display data
5,"130 Countryside Ln, Covington, GA 30016","$253,000",3,2,"1,172sqft"
6,"1480 Honeysuckle Dr NW, Conyers, GA 30012","$230,000",3,2,"1,660sqft"
7,"124 Gifton Thomas Rd, Bethlehem, GA 30620","$221,000",3,1,984sqft
8,"4981 Galbraith Cir, Stone Mountain, GA 30088","$225,000",3,2,No display data
9,"4041 Summit Chase Rd, Gainesville, GA 30506","$279,000",3,2,"1,608sqft"


In [4]:
def remove_sub_string(string, sub):
    """Strip every occurrence of ``sub`` from a scraped cell value.

    Args:
        string: The cell value. Usually a string, but may already be numeric
            (e.g. when the column was parsed from the cached CSV) or missing.
        sub: The substring to remove.

    Returns:
        0 for the "No display data" placeholder, the cleaned string for any
        other string, and any non-string value unchanged.
    """
    # Anything that is not a string (Python or numpy numbers, None, NaN) has
    # nothing to strip; pass it through instead of failing on `.replace`.
    if not isinstance(string, str):
        return string
    if string == "No display data":
        return 0
    return string.replace(sub, "")

In [5]:
# Substrings to strip from each column, applied in the listed order.
cleanup_plan = {
    "Price": ("$", ","),
    "Beds": ("+",),
    "Baths": ("+",),
    "Sizes": (",", "sqft"),
}
for column, tokens in cleanup_plan.items():
    for token in tokens:
        df[column] = df[column].apply(remove_sub_string, args=(token,))
df.head(10)

Unnamed: 0,Address,Price,Beds,Baths,Sizes
0,"2935 Bethsaida Rd, Riverdale, GA 30296",195900,3,2,0
1,"203 Scarborough Rd, Centerville, GA 31028",159900,3,2,1529
2,"767 Turtle Cove Trwy, Monticello, GA 31064",215000,3,2,1834
3,"19 Peppermill Dr SW, Cartersville, GA 30120",345000,3,2,3060
4,"2410 Elizabeth Ann Ln, Young Harris, GA 30582",224900,2,2,0
5,"130 Countryside Ln, Covington, GA 30016",253000,3,2,1172
6,"1480 Honeysuckle Dr NW, Conyers, GA 30012",230000,3,2,1660
7,"124 Gifton Thomas Rd, Bethlehem, GA 30620",221000,3,1,984
8,"4981 Galbraith Cir, Stone Mountain, GA 30088",225000,3,2,0
9,"4041 Summit Chase Rd, Gainesville, GA 30506",279000,3,2,1608


In [6]:
# Target dtype per cleaned column; Baths is cast to float, the rest to int.
column_dtypes = {
    "Price": int,
    "Beds": int,
    "Baths": float,
    "Sizes": int,
}
df = df.astype(column_dtypes)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Address  42 non-null     object 
 1   Price    42 non-null     int64  
 2   Beds     42 non-null     int64  
 3   Baths    42 non-null     float64
 4   Sizes    42 non-null     int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 1.8+ KB


In [7]:
# Preview the first rows of the frame after the dtype conversion.
df.head()

Unnamed: 0,Address,Price,Beds,Baths,Sizes
0,"2935 Bethsaida Rd, Riverdale, GA 30296",195900,3,2.0,0
1,"203 Scarborough Rd, Centerville, GA 31028",159900,3,2.0,1529
2,"767 Turtle Cove Trwy, Monticello, GA 31064",215000,3,2.0,1834
3,"19 Peppermill Dr SW, Cartersville, GA 30120",345000,3,2.0,3060
4,"2410 Elizabeth Ann Ln, Young Harris, GA 30582",224900,2,2.0,0
