In [163]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

# Zillow listings web scraper

The following code is able to scrap data from a zillow listing based on a city name and the number of zillow pages desired.

To get access to Zillow web pages it is necessary to use headers in the URl. These headers bypass the normal web scraping blockers such as Captchas.


In [164]:
req_headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.8',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}

The function below takes in the name of a city, the URL headers, and a conditional parameter for the number of Zillow pages that you want.

The html for each page is append into a list that will then be processed in the next code section.

In [165]:

def get_zillow_pages(city, req_headers, number_of_pages=1):
    with requests.Session() as s:
        response_content = []
        for page in range(number_of_pages):
            if page == 0:
                response_content.append(s.get(f"https://www.zillow.com/homes/for_sale/{city}/", headers=req_headers))
            response_content.append(s.get(f"https://www.zillow.com/homes/for_sale/{city}/{page + 1}_p/", headers=req_headers))

        # parse into BeautifulSoup object
        soups = []
        for r in response_content:
            soups.append(BeautifulSoup(r.content, 'html.parser'))
        return soups

soups = get_zillow_pages("flagstaff", req_headers, number_of_pages=2)

Now that the Zillow page HTML has been grabbed we can take some listing data from it. This data is limited to the listing address, price, beds, baths, square footing, home type, page link and when it was last updated. In some cases there is a bit of extra info such as the name of the broker.

From here you can use the link to grab further data.

In [166]:
def get_info_from_zillow_pages(zillow_soup_pages):
    df = pd.DataFrame()

    address = []
    price = []
    link = []

    for soup in zillow_soup_pages:
        temp = soup.find_all('address')
        adder_texts = [t.text for t in temp]
        address = address.__add__(adder_texts)  # adds the address as string as single elements

        temp = soup.find_all("span", {"data-test":"property-card-price"})
        price_texts = [t.text for t in temp]
        price = price.__add__(price_texts)  # adds the price as string as single elements

        temp = soup.find_all("a", {"data-test":"property-card-link"})
        href_link = []
        last_temp = None
        for t in temp:
            if last_temp is None or last_temp != t.get("href"):
                if t.get("href") is None:
                    continue
                last_temp = t.get("href")
                href_link.append(t.get("href"))
        link = link.__add__(href_link)  # adds the link to more house info




    df['address'] = address
    df['price'] = price
    df['link'] = link

    # numeric conversions
    if df['price'].dtype != "float":
        df['price'] = df['price'].str.replace('$', '')
        df['price'] = df['price'].str.replace(',', '')
        df['price'] = df['price'].astype('float')
    # df['beds'] = pd.to_numeric(df['beds'], errors='coerce')
    # df['baths'] = pd.to_numeric(df['baths'], errors='coerce')
    # if df['sqft'].dtype != "float":
    #     df['sqft'] = df['sqft'].str.replace(',', '')
    #     df['sqft'] = pd.to_numeric(df['sqft'], errors='coerce')
    return df

df_z = get_info_from_zillow_pages(soups)
df_z

  df['price'] = df['price'].str.replace('$', '')


Unnamed: 0,address,price,link
0,"9330 Valerie Way, Flagstaff, AZ 86004",595000.0,https://www.zillow.com/homedetails/9330-Valeri...
1,"3484 Awatobi Ovi, Flagstaff, AZ 86005",485000.0,https://www.zillow.com/homedetails/3484-Awatob...
2,"8470 Selma Ln, Flagstaff, AZ 86004",395000.0,https://www.zillow.com/homedetails/8470-Selma-...
3,"69 Leupp Rd, Flagstaff, AZ 86004",289850.0,https://www.zillow.com/homedetails/69-Leupp-Rd...
4,"6050 E Camden Rd, Flagstaff, AZ 86004",875000.0,https://www.zillow.com/homedetails/6050-E-Camd...
5,"2606 N Main St, Flagstaff, AZ 86004",450000.0,https://www.zillow.com/homedetails/2606-N-Main...
6,"5339 Brackin Ranch Rd, Flagstaff, AZ 86001",1775000.0,https://www.zillow.com/homedetails/5339-Bracki...
7,"2024 W University Ave, Flagstaff, AZ 86001",680000.0,https://www.zillow.com/homedetails/2024-W-Univ...
8,"4962 S Topaz Rd, Flagstaff, AZ 86005",1259000.0,https://www.zillow.com/homedetails/4962-S-Topa...
9,"9330 Valerie Way, Flagstaff, AZ 86004",595000.0,https://www.zillow.com/homedetails/9330-Valeri...


Now if you want to grab further data from the Zillow listings you can do so by grabbing the HTML for the specific listing and parsing that data. In the example below the Zillow house estimate is grabbed and added to the pandas DataFrame. Further data can be collect by adding to the code below as needed.

In [167]:
def grab_listing_data(df, req_headers):
    """
    Currently only setup to grab the zillow estimate

    :param df:
    :return:
    """
    z_estimate = []
    beds = []
    baths = []
    sqft = []

    for l in df["link"]:
        with requests.Session() as s:
            found_estimate = False
            r = s.get(l, headers=req_headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            temp = soup.find(class_="summary-container")
            temp = temp.find_all("span")
            for item in temp:
                if item.text.startswith("Zestimate") and item.text.split(" ")[-1] != 'HelpHelp':
                    found_estimate= True
                    z_estimate.append(item.text.split(" ")[-1])
            if found_estimate is False:
                z_estimate.append("")

            temp = soup.find_all("span", {"data-testid":"bed-bath-item"})
            if "Acres" in temp[0].text:  # handles land lots
                beds.append("")
                baths.append("")
                sqft.append("")
            else:
                beds.append(temp[0].text.split(" ")[0])
                baths.append(temp[1].text.split(" ")[0])
                sqft.append(temp[2].text.split(" ")[0])


    df["Zestimate"] = z_estimate
    df["beds"] = beds
    df["baths"] = baths
    df["sqft"] = sqft


    df['beds'] = pd.to_numeric(df['beds'], errors='coerce')
    df['baths'] = pd.to_numeric(df['baths'], errors='coerce')
    if df['sqft'].dtype != "float":
        df['sqft'] = df['sqft'].str.replace(',', '')
        df['sqft'] = pd.to_numeric(df['sqft'], errors='coerce')
    return df

df_z_with_listings = grab_listing_data(df_z, req_headers)
df_z_with_listings

Unnamed: 0,address,price,link,Zestimate,beds,baths,sqft
0,"9330 Valerie Way, Flagstaff, AZ 86004",595000.0,https://www.zillow.com/homedetails/9330-Valeri...,"$571,291",3.0,2.0,1798.0
1,"3484 Awatobi Ovi, Flagstaff, AZ 86005",485000.0,https://www.zillow.com/homedetails/3484-Awatob...,,3.0,2.0,1348.0
2,"8470 Selma Ln, Flagstaff, AZ 86004",395000.0,https://www.zillow.com/homedetails/8470-Selma-...,"$382,258",3.0,2.0,1456.0
3,"69 Leupp Rd, Flagstaff, AZ 86004",289850.0,https://www.zillow.com/homedetails/69-Leupp-Rd...,,3.0,1.0,1120.0
4,"6050 E Camden Rd, Flagstaff, AZ 86004",875000.0,https://www.zillow.com/homedetails/6050-E-Camd...,,3.0,3.0,2460.0
5,"2606 N Main St, Flagstaff, AZ 86004",450000.0,https://www.zillow.com/homedetails/2606-N-Main...,"$450,001",5.0,3.0,1854.0
6,"5339 Brackin Ranch Rd, Flagstaff, AZ 86001",1775000.0,https://www.zillow.com/homedetails/5339-Bracki...,,3.0,3.0,2793.0
7,"2024 W University Ave, Flagstaff, AZ 86001",680000.0,https://www.zillow.com/homedetails/2024-W-Univ...,"$666,800",3.0,2.0,1874.0
8,"4962 S Topaz Rd, Flagstaff, AZ 86005",1259000.0,https://www.zillow.com/homedetails/4962-S-Topa...,"$1,259,003",4.0,4.0,4199.0
9,"9330 Valerie Way, Flagstaff, AZ 86004",595000.0,https://www.zillow.com/homedetails/9330-Valeri...,"$571,291",3.0,2.0,1798.0
