In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

# Zillow listings web scraper

The following code is able to scrap data from a zillow listing based on a city name and the number of zillow pages desired.

To get access to Zillow web pages it is necessary to use headers in the URl. These headers bypass the normal web scraping blockers such as Captchas.


In [2]:
req_headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.8',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}

The function below takes in the name of a city, the URL headers, and a conditional parameter for the number of Zillow pages that you want.

The html for each page is append into a list that will then be processed in the next code section.

In [3]:

def get_zillow_pages(city, req_headers, number_of_pages=1):
    with requests.Session() as s:
        response_content = []
        for page in range(number_of_pages):
            if page == 0:
                response_content.append(s.get(f"https://www.zillow.com/homes/for_sale/{city}/", headers=req_headers))
            response_content.append(s.get(f"https://www.zillow.com/homes/for_sale/{city}/{page + 1}_p/", headers=req_headers))

        # parse into BeautifulSoup object
        soups = []
        for r in response_content:
            soups.append(BeautifulSoup(r.content, 'html.parser'))
        return soups

soups = get_zillow_pages("flagstaff", req_headers, number_of_pages=2)

Now that the Zillow page HTML has been grabbed we can take some listing data from it. This data is limited to the listing address, price, beds, baths, square footing, home type, page link and when it was last updated. In some cases there is a bit of extra info such as the name of the broker.

From here you can use the link to grab further data.

In [4]:
def get_info_from_zillow_pages(zillow_soup_pages):
    df = pd.DataFrame()

    address = []
    price = []
    beds = []
    baths = []
    square_ft =[]
    home_type = []
    last_updated = []
    extra_info = []
    link = []

    for soup in zillow_soup_pages:
        temp = soup.find_all(class_= 'list-card-addr')
        adder_texts = [t.text for t in temp]
        address = address.__add__(adder_texts)  # adds the address as string as single elements

        temp = soup.find_all(class_='list-card-price', text=True)
        price_texts = [t.text for t in temp]
        price = price.__add__(price_texts)  # adds the address as string as single elements

        details = soup.find_all("ul", class_="list-card-details")

        for d in details:
            temp = d.find_all(class_="")
            if len(temp) == 0:
                beds.append("")  # grab the number, drop the string bds
            else:
                beds.append(temp[0].text.split(" ")[0])  # grab the number, drop the string bds

            if len(temp) < 2:
                baths.append("")  # grab the number, drop the string ba
            else:
                baths.append(temp[1].text.split(" ")[0])  # grab the number, drop the string ba

            if len(temp) < 3:
                square_ft.append("")  # grab the number, drop the string sqft
            else:
                square_ft.append(temp[2].text.split(" ")[0])  # grab the number, drop the string sqft

        temp = soup.find_all('div', {'class': 'list-card-footer'})
        home_texts = [t.text for t in temp if t.text != '']
        home_type = home_type.__add__(home_texts)  # adds the address as string as single elements

        temp = soup.find_all('div', {'class': 'list-card-top'})
        updated_texts = [t.text for t in temp if t.text != '']
        last_updated = last_updated.__add__(updated_texts)  # adds the address as string as single elements

        temp = soup.find_all(class_= 'list-card-link')
        href_link = []
        last_temp = None
        for t in temp:
            if last_temp is None or last_temp != t.get("href"):
                if t.get("href") is None:
                    continue
                last_temp = t.get("href")
                href_link.append(t.get("href"))
        link = link.__add__(href_link)  # adds the address as string as single elements

        temp = soup.find_all(class_='list-card-extra-info')
        extra_text = [t.text for t in temp]
        extra_info = extra_info.__add__(extra_text)  # adds the address as string as single elements


    df['address'] = address
    df['price'] = price
    df['beds'] = beds
    df['baths'] = baths
    df['sqft'] = square_ft
    df['home type'] = home_type
    df['last update'] = last_updated
    df['extra info'] = extra_info
    df['link'] = link

    # numeric conversions
    df['price'] = df['price'].str.replace('$', '')
    df['price'] = df['price'].str.replace(',', '')
    df['price'] = df['price'].astype('float')
    df['beds'] = pd.to_numeric(df['beds'], errors='coerce')
    df['baths'] = pd.to_numeric(df['baths'], errors='coerce')
    df['sqft'] = df['sqft'].str.replace(',', '')
    df['sqft'] = pd.to_numeric(df['sqft'], errors='coerce')
    return df

df_z = get_info_from_zillow_pages(soups)
df_z

  df['price'] = df['price'].str.replace('$', '')


Unnamed: 0,address,price,beds,baths,sqft,home type,last update,extra info,link
0,"1 Indian Rural Route Rd, Flagstaff, AZ 86004",40000.0,10,,,"MLS ID #190170, REALTY ONE GROUP, MOUNTAIN DESERT",9 days on Zillow,"MLS ID #190170, REALTY ONE GROUP, MOUNTAIN DESERT",https://www.zillow.com/homedetails/1-Indian-Ru...
1,"2401 W Route 66 LOT 17, Flagstaff, AZ 86001",188500.0,3,2.0,1152.0,"MLS ID #189872, RE/MAX FINE PROPERTIES",29 days on Zillow,"MLS ID #189872, RE/MAX FINE PROPERTIES",https://www.zillow.com/homedetails/2401-W-Rout...
2,"8275 E Mercury Dr, Flagstaff, AZ 86004",550000.0,3,2.0,1752.0,"MLS ID #190147, REALTY ONE GROUP, MOUNTAIN DESERT","Price cut: $25,000 (Jun 8)","MLS ID #190147, REALTY ONE GROUP, MOUNTAIN DESERT",https://www.zillow.com/homedetails/8275-E-Merc...
3,"15 W Separation Canyon Trl, Flagstaff, AZ 86005",858000.0,3,3.0,1955.0,"MLS ID #190290, RE/MAX FINE PROPERTIES",Open: Sun. 12-3pm,"MLS ID #190290, RE/MAX FINE PROPERTIES",https://www.zillow.com/homedetails/15-W-Separa...
4,"1603 N Center St, Flagstaff, AZ 86004",1075000.0,8,4.0,3200.0,"W AND PARTNERS, LLC",2 days on ZillowListing provided by ARMLS,"W AND PARTNERS, LLC",https://www.zillow.com/homedetails/1603-N-Cent...
5,"1600 N Center St, Flagstaff, AZ 86004",1075000.0,8,4.0,3200.0,"W AND PARTNERS, LLC",2 days on ZillowListing provided by ARMLS,"W AND PARTNERS, LLC",https://www.zillow.com/homedetails/1600-N-Cent...
6,"2714 N Izabel St, Flagstaff, AZ 86004",1075000.0,8,4.0,2900.0,"W AND PARTNERS, LLC",2 days on ZillowListing provided by ARMLS,"W AND PARTNERS, LLC",https://www.zillow.com/homedetails/2714-N-Izab...
7,"3580 Huron, Flagstaff, AZ 86005",575000.0,3,3.0,1583.0,VILLAGE LAND SHOPPE,9 days on ZillowListing provided by ARMLS,VILLAGE LAND SHOPPE,https://www.zillow.com/homedetails/3580-Huron-...
8,"69 Leupp Rd, Flagstaff, AZ 86004",293000.0,3,1.0,1120.0,"MLS ID #190038, CONGRESS REALTY","Price cut: $11,900 (Jun 1)","MLS ID #190038, CONGRESS REALTY",https://www.zillow.com/homedetails/69-Leupp-Rd...
9,"1 Indian Rural Route Rd, Flagstaff, AZ 86004",40000.0,10,,,"MLS ID #190170, REALTY ONE GROUP, MOUNTAIN DESERT",9 days on Zillow,"MLS ID #190170, REALTY ONE GROUP, MOUNTAIN DESERT",https://www.zillow.com/homedetails/1-Indian-Ru...


Now if you want to grab further data from the Zillow listings you can do so by grabbing the HTML for the specific listing and parsing that data. In the example below the Zillow house estimate is grabbed and added to the pandas DataFrame. Further data can be collect by adding to the code below as needed.

In [5]:
def grab_listing_data(df, req_headers):
    """
    Currently only setup to grab the zillow estimate

    :param df:
    :return:
    """
    z_estimate = []

    for l in df["link"]:
        with requests.Session() as s:
            found_estimate = False
            r = s.get(l, headers=req_headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            temp = soup.find(class_="summary-container")
            temp = temp.find_all("span")
            for item in temp:
                if item.text.startswith("Zestimate") and item.text.split(" ")[-1] != 'HelpHelp':
                    found_estimate= True
                    z_estimate.append(item.text.split(" ")[-1])
            if found_estimate is False:
                z_estimate.append("")

    df["Zestimate"] = z_estimate
    return df

df_z_with_listings = grab_listing_data(df_z, req_headers)
df_z_with_listings

Unnamed: 0,address,price,beds,baths,sqft,home type,last update,extra info,link,Zestimate
0,"1 Indian Rural Route Rd, Flagstaff, AZ 86004",40000.0,10,,,"MLS ID #190170, REALTY ONE GROUP, MOUNTAIN DESERT",9 days on Zillow,"MLS ID #190170, REALTY ONE GROUP, MOUNTAIN DESERT",https://www.zillow.com/homedetails/1-Indian-Ru...,
1,"2401 W Route 66 LOT 17, Flagstaff, AZ 86001",188500.0,3,2.0,1152.0,"MLS ID #189872, RE/MAX FINE PROPERTIES",29 days on Zillow,"MLS ID #189872, RE/MAX FINE PROPERTIES",https://www.zillow.com/homedetails/2401-W-Rout...,
2,"8275 E Mercury Dr, Flagstaff, AZ 86004",550000.0,3,2.0,1752.0,"MLS ID #190147, REALTY ONE GROUP, MOUNTAIN DESERT","Price cut: $25,000 (Jun 8)","MLS ID #190147, REALTY ONE GROUP, MOUNTAIN DESERT",https://www.zillow.com/homedetails/8275-E-Merc...,"$594,200"
3,"15 W Separation Canyon Trl, Flagstaff, AZ 86005",858000.0,3,3.0,1955.0,"MLS ID #190290, RE/MAX FINE PROPERTIES",Open: Sun. 12-3pm,"MLS ID #190290, RE/MAX FINE PROPERTIES",https://www.zillow.com/homedetails/15-W-Separa...,"$851,600"
4,"1603 N Center St, Flagstaff, AZ 86004",1075000.0,8,4.0,3200.0,"W AND PARTNERS, LLC",2 days on ZillowListing provided by ARMLS,"W AND PARTNERS, LLC",https://www.zillow.com/homedetails/1603-N-Cent...,
5,"1600 N Center St, Flagstaff, AZ 86004",1075000.0,8,4.0,3200.0,"W AND PARTNERS, LLC",2 days on ZillowListing provided by ARMLS,"W AND PARTNERS, LLC",https://www.zillow.com/homedetails/1600-N-Cent...,
6,"2714 N Izabel St, Flagstaff, AZ 86004",1075000.0,8,4.0,2900.0,"W AND PARTNERS, LLC",2 days on ZillowListing provided by ARMLS,"W AND PARTNERS, LLC",https://www.zillow.com/homedetails/2714-N-Izab...,
7,"3580 Huron, Flagstaff, AZ 86005",575000.0,3,3.0,1583.0,VILLAGE LAND SHOPPE,9 days on ZillowListing provided by ARMLS,VILLAGE LAND SHOPPE,https://www.zillow.com/homedetails/3580-Huron-...,
8,"69 Leupp Rd, Flagstaff, AZ 86004",293000.0,3,1.0,1120.0,"MLS ID #190038, CONGRESS REALTY","Price cut: $11,900 (Jun 1)","MLS ID #190038, CONGRESS REALTY",https://www.zillow.com/homedetails/69-Leupp-Rd...,
9,"1 Indian Rural Route Rd, Flagstaff, AZ 86004",40000.0,10,,,"MLS ID #190170, REALTY ONE GROUP, MOUNTAIN DESERT",9 days on Zillow,"MLS ID #190170, REALTY ONE GROUP, MOUNTAIN DESERT",https://www.zillow.com/homedetails/1-Indian-Ru...,
