In [3]:
import requests
from bs4 import BeautifulSoup as soup

# constructs the query for webscraping, main variable is the url string
def scrapeData(city, province):
    header = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)AppleWebKit/537.36 (KHTML, like Gecko)Chrome/83.0.4103.97 Safari/537.36',        
          'referer':'https://www.zillow.com/homes/for_rent/Manhattan,-New-York,-NY_rb/?searchQueryState=%7B%22pagination'}
 
    # Enter Zillow URL for the city of your preference
    url = 'https://www.zillow.com/homes/for_sale/' + city + ',-' + province + '_rb/'
    html = requests.get(url=url,headers=header)
    html.status_code
    bsobj = soup(html.content,'lxml')
    return bsobj

# webscraping guide for zillow found here https://www.octoparse.com/blog/how-to-scrape-zillow-data#

In [4]:
#grab home price from soup object
def getPrice(bsobj):
    price_list = [] 

    # loop through the price section of the data and extract

    # the text and store it in the list.
    for price in bsobj.findAll('span',{'data-test':'property-card-price'}):
        price = price.get_text()[2::]
        price_list.append(int(price.replace(',','')))
    
    return price_list

In [5]:
def getAddress(bsobj):
    address = []


    # loop through the address section of the data

    # and extract the text and store it in the list.
    for adr in bsobj.findAll('address',{'data-test':'property-card-addr'}):
        address.append(adr.get_text().strip())
    return address

In [6]:
import re
def getMLS(bsobj):
    mls = []


    # loop through the MLS section of the data

    # and extract the text and store it in the list.
    for adr in bsobj.findAll('div',{'class':'StyledPropertyCardDataArea-c11n-8-73-8__sc-yipmu-0 hTcpwx'}):
        try:
            mls.append(re.findall(r"\w+", adr.get_text().strip())[2])
        except:
            mls.append("Null")
    return mls

In [7]:
def getPropertyFeature(bsobj):
    bed = [] 
    bath = []
    footage = []

    # loop through the features of the home (# of bds, ba, and house size) of the data
    for item in bsobj.findAll('span',{'class':'StyledPropertyCardHomeDetails-c11n-8-73-8__sc-1mlc4v9-0 jlVIIO'}):
        x = re.findall(r"\d+", item.get_text().strip().replace(',',''))
        try:
            bed.append(int(x[0]))
        except:
            bed.append(0)
        
        try:
            bath.append(int(x[1]))
        except:
            bath.append(0)
            
        try:
            footage.append(int(x[2]))
        except:
            footage.append(0)
            
    return bed, bath, footage

In [12]:
import pandas as pd
df = pd.DataFrame(columns = ['MLS','City','Province','Address', 'Price', 'NumberOfBeds', 'NumberOfBaths', 'Footage'] )
#some random locations for us to scrape, if scraping 2 cities in the same province, we should switch to a list of tuples

queryLocations = [["SK", "Saskatoon"]
                 ,["AB", "Edmonton"]
                 ,["AB", "Calgary"]
                 ,["ON", "Toronto"]
                 ,["ON", "Mississauga"]
                 ,["NS", "Halifax"]
                 ,["BC", "Vancouver"]
                 ,["MB", "Winnipeg"]]

for i in queryLocations:
    province = i[0]
    city = i[1]
    bsobj = scrapeData(city, province)
    price = getPrice(bsobj)
    address = getAddress(bsobj)
    mls = getMLS(bsobj)
    bed, bath, footage = getPropertyFeature(bsobj)
    
    #build a dataframe for each query
    tempDF = pd.DataFrame()
    tempDF['MLS'] = mls
    tempDF['City'] = city
    tempDF['Province'] = province
    tempDF['Address'] = address
    tempDF['Price'] = price
    tempDF['NumberOfBeds'] = bed
    tempDF['NumberOfBaths'] = bath
    tempDF['Footage'] = footage
    print(tempDF)
    df = df.append(tempDF)

print(df)

        MLS       City Province  \
0  SK913993  Saskatoon       SK   
1  SK914061  Saskatoon       SK   
2  SK914153  Saskatoon       SK   
3  SK914148  Saskatoon       SK   
4  SK914086  Saskatoon       SK   
5  SK914075  Saskatoon       SK   
6  SK914144  Saskatoon       SK   
7  SK914082  Saskatoon       SK   
8  SK914108  Saskatoon       SK   

                                           Address   Price  NumberOfBeds  \
0          207 Lehrer PLACE, Saskatoon, SK S7R 0L4  669900             3   
1        3104 Ortona STREET, Saskatoon, SK S7M 3R4  619900             3   
2      714A Victoria AVENUE, Saskatoon, SK S7N 0Z2  374900             3   
3    246 Frobisher CRESCENT, Saskatoon, SK S7K 4Y7  479900             5   
4         739 Hastings COVE, Saskatoon, SK S7V 0G6  899900             4   
5        334 Crean CRESCENT, Saskatoon, SK S7J 3X2  419900             4   
6  962 Kloppenburg CRESCENT, Saskatoon, SK S7W 0P2  559900             5   
7       2435 Rosewood DRIVE, Saskatoon, S

  df = df.append(tempDF)


        MLS      City Province  \
0  E4320360  Edmonton       AB   
1  E4320364  Edmonton       AB   
2  E4320358  Edmonton       AB   
3  E4320355  Edmonton       AB   
4  E4320309  Edmonton       AB   
5  E4320357  Edmonton       AB   
6  E4320331  Edmonton       AB   
7  E4320291  Edmonton       AB   
8  E4320295  Edmonton       AB   

                                             Address   Price  NumberOfBeds  \
0             14611 95th St NW, Edmonton, AB T5E 3Y7  575000             6   
1            10465 42nd Ave NW, Edmonton, AB T6J 7C7  429900             2   
2            10346 142nd St NW, Edmonton, AB T5N 2P1  393000             5   
3            5512 145th Ave NW, Edmonton, AB T5A 3R3  185000             3   
4            4611 128th Ave NW, Edmonton, AB T5A 2M7  399900             4   
5  7339 S Terwillegar Dr NW #1418, Edmonton, AB T...  130000             1   
6             1711 109th St NW, Edmonton, AB T6J 5Z8  519900             5   
7          13239 Delwood Rd NW, Edm

  df = df.append(tempDF)


Empty DataFrame
Columns: [MLS, City, Province, Address, Price, NumberOfBeds, NumberOfBaths, Footage]
Index: []


  df = df.append(tempDF)


Empty DataFrame
Columns: [MLS, City, Province, Address, Price, NumberOfBeds, NumberOfBaths, Footage]
Index: []


  df = df.append(tempDF)


Empty DataFrame
Columns: [MLS, City, Province, Address, Price, NumberOfBeds, NumberOfBaths, Footage]
Index: []


  df = df.append(tempDF)


Empty DataFrame
Columns: [MLS, City, Province, Address, Price, NumberOfBeds, NumberOfBaths, Footage]
Index: []


  df = df.append(tempDF)


Empty DataFrame
Columns: [MLS, City, Province, Address, Price, NumberOfBeds, NumberOfBaths, Footage]
Index: []


  df = df.append(tempDF)


Empty DataFrame
Columns: [MLS, City, Province, Address, Price, NumberOfBeds, NumberOfBaths, Footage]
Index: []
        MLS       City Province  \
0  SK913993  Saskatoon       SK   
1  SK914061  Saskatoon       SK   
2  SK914153  Saskatoon       SK   
3  SK914148  Saskatoon       SK   
4  SK914086  Saskatoon       SK   
5  SK914075  Saskatoon       SK   
6  SK914144  Saskatoon       SK   
7  SK914082  Saskatoon       SK   
8  SK914108  Saskatoon       SK   
0  E4320360   Edmonton       AB   
1  E4320364   Edmonton       AB   
2  E4320358   Edmonton       AB   
3  E4320355   Edmonton       AB   
4  E4320309   Edmonton       AB   
5  E4320357   Edmonton       AB   
6  E4320331   Edmonton       AB   
7  E4320291   Edmonton       AB   
8  E4320295   Edmonton       AB   

                                             Address   Price NumberOfBeds  \
0            207 Lehrer PLACE, Saskatoon, SK S7R 0L4  669900            3   
1          3104 Ortona STREET, Saskatoon, SK S7M 3R4  619900         

  df = df.append(tempDF)


In [172]:
#df.to_csv('properties.csv')