# Selenium scraper for multiple websites which let you search by ZIP

## pip installs libraries.
note: %%capture suppresses output

In [1]:
%%capture
pip install selenium

In [2]:
%%capture
pip install webdriver-manager

## Define parameters.

In [141]:
STATE_TO_SEARCH = None #'CO'
    # Options include 'CO' and None, which searches all 50 states.

SITES_TO_SEARCH = ['Redfin','Rent.']
    # Options include 'Rent.' and 'Redfin'
SITE_OPTIONS = ['Rent.','Redfin'] # that can be parsed

READ_ONLY = False
    # When true no data is downloaded

PARSE_WHILE_SCRAPING = True
SAVE_RAW_HTML = False # not reccomended
SAVE_ZIP_CSVs = True # highly reccomended
RESCRAPE_EMPTY_ZIPS = False

SAVE_BY_WEEK = True
    # instead of saving day of scrape

STARTING_ZIP = None
    # Lets you search country over multiple days.

ASSUMPTION = {'PRICE':1,
              'SQFT':1,
              'BEDS':'Unknown',
              'BATHS':'Unknown'}
    #When a value is a range, what do we assume is the value?
        # 'Unknown' is an option
        # 0 means minimum
        # 1 means maximum
        # 0.5 would split the difference
    
WAIT_TIME_DICT = {'CONSTANT':{'Realtor':10,'Rent.':0,'other': 0},
                     # whether or not site redirects to CAPTCHA
                  'FAILURE' :{'other':23}
                     # after site redirects to CAPTCHA
            }

## Import libraries.

In [110]:
#scraping
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager 
from webdriver_manager.firefox import GeckoDriverManager
import requests # for headless version

#data storage
import pandas # for reading the txt with ZIPs
from datetime import date # for file names
import os # for file interactions
from bs4 import BeautifulSoup as Soup # for knowing how many pages there are per ZIP
from time import sleep
from tqdm import tqdm # progress bar
from numpy import NaN

LONG_DASH, LONGER_DASH = '–', '—'

### Import ZIPs and MHAs

In [111]:
MHAs = pandas.read_csv("sorted_zipmha22.txt",
                     delimiter = " ", names = ["ZIP","MHA"],
                    dtype = {'ZIP':str})
MHAs['State'] = MHAs.apply(lambda row: row['MHA'][:2], axis = 1)
if STATE_TO_SEARCH:
    assert type(STATE_TO_SEARCH) == str
    MHAs = MHAs[MHAs['State'] == STATE_TO_SEARCH]
ZIPs = list(MHAs['ZIP'])
ZIPs = [str(ZIP).zfill(5) for ZIP in ZIPs]
if STARTING_ZIP:
    startIndex = ZIPs.index(str(STARTING_ZIP))
    ZIPs = ZIPs[startIndex:] + ZIPs[:startIndex]
print("There are",len(ZIPs),"ZIPs to scrape")
MHAs.head()

There are 41070 ZIPs to scrape


Unnamed: 0,ZIP,MHA,State
0,501,NY218,NY
1,544,NY218,NY
2,601,XX499,XX
3,602,XX499,XX
4,603,XX499,XX


### Install Selenium Driver Manager

In [112]:
if not READ_ONLY: PATH = ChromeDriverManager().install()
                    #PATH = GeckoDriverManager().install()

[WDM] - Downloading: 100%|████████████████████████████████████████████████████████| 6.46M/6.46M [00:00<00:00, 21.3MB/s]


# Define functions.

In [113]:
def WAIT_TIME(context, site = 'other'):
    if site in WAIT_TIME_DICT[context].keys():
        return WAIT_TIME_DICT[context][site]
    else:
        return WAIT_TIME_DICT[context]['other']
def wait(context, site = 'other'):
    sleep(WAIT_TIME(context, site = site))

### Generate URL

In [114]:
# /appartments includes houses for rent
def URL(site, ZIP, page = 1):
    assert type(site) == str
    assert type(ZIP ) == str
    return {'Realtor':"https://www.realtor.com/apartments/{}/pg-{}".format(ZIP, page),
            'Rent.'  :"https://www.rent.com/zip-{}-apartments?page={}".format(ZIP, page),
            #'Zillow' :"https://www.zillow.com/homes/{}_r/".format(ZIP),
            'trulia' :"https://www.trulia.com/for_rent/{}_zip/".format(ZIP),
            'Redfin' :"https://www.redfin.com/zipcode/{}/apartments-for-rent/page-{}".format(ZIP, page)
           }[site]

### Create folders and return a file path to write a file to

In [115]:
def weekString(aDate):
    return str(aDate.isocalendar()[:2])

def dateString(aDate):
    if SAVE_BY_WEEK:
        return weekString(aDate)
    else:
        return string(aDate)

STATES_GET_SEPARATE_FOLDERS = False
def makeFilePath(site, ZIP = None, fileType = 'HTML', dateOf = date.today(), page = '1', makeFolders = True):
    for var in [site, page, fileType]: assert type(var) == str, f"{var} is not a string"
    assert fileType in ['HTML','ZIP CSV','Compiled CSV','Processed CSV']
    filePathParts = ['Scraped_Data']
    
    if STATES_GET_SEPARATE_FOLDERS:
        if STATE_TO_SEARCH: # if searching just one state
            assert type(STATE_TO_SEARCH) == str
            filePathParts.append(STATE_TO_SEARCH)
    
    filePathParts.append(fileType+'s') # FILE -> FILEs
    
    filePathParts.append(site.replace('.',''))
    
    if not 'Compiled' in fileType: filePathParts.append(dateString(dateOf))

    if fileType == 'HTML':
        filePathParts.append(ZIP)
        fileName = page.zfill(2)
        if not ('.' in fileName): fileName += '.html'
    elif 'CSV' in fileType:
        if 'ZIP' in fileType:
            assert type(ZIP) == str
            fileName = ZIP
        elif 'Compiled' in fileType: fileName = dateString(dateOf)
        if not ('.' in fileName): fileName += '.CSV'
    
    filePath = ""
    for filePathPart in filePathParts:
        filePath = filePath + filePathPart + '/'
        if makeFolders:
            if not os.path.exists(filePath): os.mkdir(filePath)
    
    filePath += fileName
    return filePath

In [116]:
def HTML_to_file(HTML, filePath):
    # Does not check if file already exists.
        # Only run if file does not exists or should be written over.
    with open(filePath, 'w', encoding="utf-8") as f:
        f.write(HTML)
        f.close()

In [117]:
HTML_MIN_SIZE = 100 # KB
HTML_MIN_SIZE *= 1000 #KB -> B

def isValidHTML(filePath):
    if os.path.exists(filePath):
        if os.path.getsize(filePath) >= HTML_MIN_SIZE:
            return True
    # else:
        return False

In [118]:
def parseRedfinNumPages(pageSoup):
    pageSummary = pageSoup.find(class_="homes summary")
    if not pageSummary: return 1
    pageSummary = pageSummary.text
    if not 'of' in pageSummary: return 1
    propsPerPage = int(pageSummary.split(' ')[0])
    numPropsText = pageSummary.split('of ')[1].split(' ')[0]
    numProps = int(numPropsText)
    numPages = 1 + (numProps - 1) // propsPerPage
    if numPages > 9: numPages = 9
    return numPages
    
def parseRentNumPages(pageSoup):
    propsPerPage = 30
    numPropsTags = pageSoup.find_all(class_ = "truncate text-gray-600")
    if len(numPropsTags) <= 0: return 1
    numPropsText = numPropsTags[0].text
    numPropsText = numPropsText.split(" ")[0] # take only text before space
    numPropsText = numPropsText.replace(',','') # replace commas with nothing
    numProps = int(numPropsText)
    numPages = 1 + (numProps - 1) // propsPerPage
    return numPages

def parseNumPages(site, firstPage):
    if site not in ['Redfin', 'Rent.']: return -1
    pageSoup = Soup(firstPage) if (type(firstPage) == str) else firstPage
    return{'Redfin':parseRedfinNumPages,
            'Rent.':parseRentNumPages
    }[site](pageSoup)

### All interacitons with the Selenium Chrome driver are under the getPage function.

In [119]:
def startNewDriver():
    global driver
    driver = webdriver.Chrome(service = Service(executable_path = PATH))
        #driver = webdriver.Firefox(service = Service(executable_path = PATH))
    return driver

def getPage(URL, headless = False):
    if headless:
        HTML = requests.get(URL).content
        HTML = str(HTML)
    else:
        global driver
        try: driver.get(URL)
        except:
            print("Starting new driver.")
            driver = startNewDriver()
                
        try: HTML = driver.page_source
        except:
            return getPage(URL, headless = headless)
        if len(HTML) < HTML_MIN_SIZE:
            driver.quit()
            wait('FAILURE')
            driver = startNewDriver()
    if len(HTML) < HTML_MIN_SIZE:
        return getPage(URL, headless = headless)
    return HTML

In [120]:
def whichSite(page):
    if type(page) == str: page = Soup(page)
    for site in SITE_OPTIONS:
        if site in page.find("title").text:
            return site

In [121]:
def parsePage(page):
    pageSoup = Soup(page) if (type(page) == str) else page
    site = whichSite(pageSoup)
    if site == 'Rent.':
        columns = pageSoup.find_all(class_ = "flex flex-col gap-y-12")
        allCards = []
        for column in columns:
            someCards = column.find_all(attrs = {'data-tag_section':"free"})
            allCards.extend(someCards)
            someCards = column.find_all(attrs = {'data-tag_section':"paid"})
            allCards.extend(someCards)
        return allCards
    elif site == 'Redfin':
        allCards = []
        for section in pageSoup.find_all(class_="HomeCardsContainer flex flex-wrap"):
            allCards.extend(section)
        return allCards
    else: return -1

In [122]:
def priceUnavailable(apartment):
    for phrase in ['Contact for Price',
                  'Price Unavailable']:
        if phrase in apartment.text:
            return True
    return False # <- else

def priceAvailable(apartment):
    return not priceUnavailable(apartment)

In [123]:
def parseOneRentCard(card):
    details = card.find_all("dd", class_ = "font-normal text-body-color")
    hiddenPrice = "Price Unavailable"
    Sqftage = "Square Footage Unavailable"
    for detail in details:
        if '$' in detail.text:
            hiddenPrice = detail.text
        if 'Sqft' in detail.text: Sqftage = detail.text
    if priceAvailable(card):
        shownPrice  = card.find("p", class_ = "flex flex-1 items-center text-lg font-semibold text-black").text
        if LONG_DASH in hiddenPrice: price = hiddenPrice # price range
        else: price = shownPrice # shown price sometimes includes +
    else:
        price = "Price Unavailable"
    if 'Units Available' in card.text:
        card.find(attrs = {"data-tid":"available-units"}).text.strip().split(' ')[0]
        
    cardData = {
        'address':card.find_all(class_="overflow-hidden overflow-ellipsis whitespace-nowrap font-normal text-body-color")[0].text,
        'beds-baths':card.find_all(attrs = {'data-tid':"beds-baths"})[0].text,
        'price':price, 'square footage':Sqftage
    }
    return cardData

def parseOneRedfinCard(card):
    price = 'Price Unavailable'
    if priceAvailable(card):
        price = card.find('span', class_="homecardV2Price")
        if price:
            price = price.text
    
    beds  = 'Beds Unavailable'
    baths = 'Baths Unavailable'
    SqFt  = 'Square Footage Unavailable'
    stats = card.find_all("div", class_="stats")
    if card.find(class_="VerifiedBadge includeLabel margin-left-smaller padding-top-small"): verified = 'verified'
    else: verified = 'not verified'
    for statSoup in stats:
        stat = statSoup.text
        if 'Bed'     in stat: beds  = stat
        if 'Bath'    in stat: baths = stat
        if 'Sq. Ft.' in stat: SqFt  = stat
    try: homeType = card.find(class_="PropertyTypeDisplay font-size-smaller padding-top-small").find("span").text[1:]
    except: homeType = "Type Unknown"
    cardData = {
        'address':card.find(class_="homeAddressV2").text,
        'beds':beds, 'baths':baths, 'type':homeType,
        'price':price, 'square footage':SqFt, 'verified':verified
    }
    return cardData
def parseOneCard(site, card):
    return{'Rent.':parseOneRentCard,
          'Redfin':parseOneRedfinCard}[site](card)

In [124]:
def isValidRentApartment(card):
    bed_baths = card.find_all(attrs = {'data-tid':"beds-baths"})
    if bed_baths: return True
    else: return False
def isValidRedfinApartment(card):
    if card.find(class_="homeAddressV2"):
        return True
    else: return False
def isValidApartment(site, card):
    return{'Rent.':isValidRentApartment,
          'Redfin':isValidRedfinApartment}[site](card)

In [125]:
def parseRentCards(cards):
    parseCards('Rent.',cards)
def parseRedfinCards(cards):
    parseCards('Redfin',cards)
def parseCards(site, cards):
    apartmentData = []
    for card in cards:
        if isValidApartment(site, card):
            apartmentData.append(parseOneCard(site, card))
    #return apartmentData
    return pandas.json_normalize(apartmentData)

In [126]:
def handlePage(site, ZIP, HTML, pageSoup = None, page = '1'):
    if not pageSoup: pageSoup = Soup(HTML)
    if SAVE_RAW_HTML:
        filePath = makeFilePath(site, ZIP, page = page, fileType = 'HTML')
        HTML_to_file(HTML, filePath)
    if PARSE_WHILE_SCRAPING:
        cards = parsePage(pageSoup)
        if cards != -1:
            return parseCards(site, cards)
        else: return pandas.DataFrame()

def downloadZIP(site, ZIP, headless = False):
    if READ_ONLY: return pandas.DataFrame()
    # Page 1
    HTML = getPage(URL(site, ZIP), headless)
    pageSoup = Soup(HTML)
    
    DFs = [handlePage(site, ZIP, HTML, pageSoup = pageSoup)]

    numPages = parseNumPages(site, pageSoup)
    if not (numPages == -1):
        if numPages > 1:
            for page in range(2,numPages+1):
                # For pages after page 1
                page = str(page)
                HTML = getPage(URL(site, ZIP, page), headless)
                DF = handlePage(site, ZIP, HTML, page = page)
                DFs.append(DF)

    wait('CONSTANT',site = site)
    if PARSE_WHILE_SCRAPING:
        DF = pandas.concat(DFs)
        if SAVE_ZIP_CSVs:
            filePath = makeFilePath(site, ZIP, fileType = 'ZIP CSV')
            DF.to_csv(filePath, index = False)
        return DF

In [127]:
downloadZIP('Redfin','16801')

Starting new driver.


Unnamed: 0,address,beds,baths,type,price,square footage,verified
0,"819 Wheatfield Dr, State College, PA 16801",4 Beds,2 Baths,Other,"$2,100/mo","2,271 Sq. Ft.",not verified
1,"415 W College Ave, State College, PA 16801",1 Bed,1 Bath,Apartment,"$1,250/mo",525 Sq. Ft.,not verified
2,"Southgate Apartments | 713 Southgate Dr, State...",1-3 Bed,1-2 Bath,Apartment,"$1,299-$5,322/mo","728-1,559 Sq. Ft.",not verified
3,"1730 Bristol Ave, State College, PA 16801",2 Beds,2 Baths,Apartment,"$1,675/mo","1,204 Sq. Ft.",not verified
4,"Lion's Gate Apartments | 424 Wauplelani Dr, St...",0-2 Beds,1-2 Bath,Apartment,"$975-$1,500/mo",323-884 Sq. Ft.,not verified
5,"419 Keller St, State College, PA 16801",1 Bed,1 Bath,Apartment,"$1,080/mo",—Sq. Ft.,not verified
6,"148 W Hamilton Ave, State College, PA 16801",3 Beds,1 Bath,Other,"$1,915/mo",—Sq. Ft.,not verified
7,"131 N Sparks St, State College, PA 16801",3 Beds,2 Baths,Apartment,"$2,840/mo",768 Sq. Ft.,not verified
8,"219 S Sparks St, State College, PA 16801",1 Bed,1 Bath,Apartment,$920/mo,—Sq. Ft.,not verified
9,"113 S Fraser St, State College, PA 16801",1 Bed,1 Bath,Apartment,"$1,155/mo",—Sq. Ft.,not verified


# Download and save data for this week

In [142]:
def dateNow(aDate): # determines based on granularity of time whether today falls in the same bucket as another
    # dateString accounts for granularity of time parameter
    return dateString(dateOf) == dateString(date.today())

def getZIP(site, ZIP, dateOf = date.today()):
    filePath = makeFilePath(site, ZIP = ZIP, fileType = 'ZIP CSV', makeFolders = False, dateOf = dateOf)
    if os.path.exists(filePath):
        try:
            DF = pandas.read_csv(filePath)
        except:
        # Exception is raised when reading bland DataFrame
        # which happens when there is nothing for rent.
            if RESCRAPE_EMPTY_ZIPS:
                os.remove(filePath)
                if dateNow(dateOf): # if same day/week
                    return downloadZIP(site, ZIP)
            # else
            return pandas.DataFrame()
        else: # if DF is read from CSV
            badCSV = False # assume good
            for column in DF.columns:
                if 'Unnamed' in column:
                    badCSV = True
                    DF.pop(column)
            if badCSV:
                os.remove(filePath)
                DF.to_csv(filePath, index = False)
            return DF
    else: # if no local file
        # Note that no attempt is made to find a downloaded HTML
        if dateNow(dateOf):
            return downloadZIP(site, ZIP)
    return DF

In [143]:
def readSiteData(site, dateOf = date.today()): # also scrapes data for now if not done yet
    compiledFilePath = makeFilePath(site, fileType = 'Compiled CSV', dateOf = dateOf)
    if os.path.exists(compiledFilePath):
        print("Reading pre-compiled CSV")
        compiledDF = pandas.read_csv(compiledFilePath)
    else: # no pre-existing compiled data
        DFs = [] # each DF will represent one ZIP
        for ZIP in tqdm(ZIPs):
            DF = getZIP(site, ZIP)
            DFs.append(DF)
        compiledDF = pandas.concat(DFs, axis = 0)
        compiledDF.to_csv(compiledFilePath, index = False)
    return compiledDF

In [None]:
if not READ_ONLY: driver = startNewDriver()
dateOf = date(2022, 12, 1)
dateOf = date.today()
thisWeekDFs = {}
for site in SITES_TO_SEARCH:
    readSiteData(site, dateOf = dateOf)
    thisWeekDFs[site] = compiledDF
if not READ_ONLY: driver.quit()

 19%|█████████████▉                                                            | 7752/41070 [23:56<91:49:03,  9.92s/it]

In [None]:
for DF in thisWeekDFs.values():
    print(len(DF))
    print(DF.columns)

In [None]:
RedfinDF = thisWeekDFs['Redfin']
RentDF = thisWeekDFs['Rent.']

# Derive numbers from text in DataFrame

### functions

In [None]:
def removeStudio(string):
    return string.replace('Studio–','')
def parseBedsBaths(row, output):
    assert output in ['beds', 'baths']
    string = row['beds-baths']
    string = removeStudio(string)
    if output == 'beds':
        if 'Bed' in string: return string.split('Bed')[0].strip()
        else: return('Beds Unknown')
    elif output == 'baths':
        if 'Bath' in string: return string.split('Bath')[0].split('•')[1].strip()
        else: return('Baths Unknown')
def getBeds(row):  return parseBedsBaths(row, 'beds')
def getBaths(row): return parseBedsBaths(row, 'baths')

def getZIP(row):
    address = row['address']
    if type(address) == float: return NaN
    return address.split(' ')[-1]

for value in ASSUMPTION.values(): assert (value == 'Unknown') or ((value >= 0) and (value <= 1))
def parseRange(string, key):
    if not string: return NaN
    if type(string) == float: return string
    if ('Unavailable' in string) or (LONGER_DASH in string): return NaN
    for subString in [',', '$', '+', ' Sqft', '/mo', ' Sq. Ft.']:
        string = string.replace(subString, '')
    for dash in ['-', LONG_DASH]:
        if (dash in string):
            [minStr,maxStr] = string.split(dash)
            minVal = int(minStr)
            maxVal = int(maxStr)
            valRange = maxVal - minVal
            return minVal + valRange*ASSUMPTION[key]
    #else
    return int(string)
def getPrice(row): return parseRange(row['price'],          'PRICE')
def getSqft (row): return parseRange(row['square footage'], 'SQFT' )
def getBeds (row): return parseRange(row['beds'],           'BEDS')
def getBaths(row): return parseRange(row['baths'],          'BATHS')

### apply to DataFrame

In [None]:
for DF in thisWeekDFs.values():
    if 'beds-baths' in DF.columns:
        DF['beds'] = DF.apply(getBeds, axis = 1)
        DF['baths'] = DF.apply(getBaths, axis = 1)
        DF.pop('beds-baths')
        print("beds-baths column split")
    else:
        print("no column to parse")

    DF['price as number'] = DF.apply(getPrice, axis = 1)
    DF['sqft as number'] = DF.apply(getSqft, axis = 1)

In [None]:
mergedDF = RedfinDF.merge(RentDF, on = 'address', suffixes = (' from Redfin',' from Rent.'))
DF = mergedDF
DF['ZIP'] = DF.apply(getZIP, axis = 1)
print(len(DF))
print(DF.columns)

In [None]:
priceDelta = DF['price as number from Redfin'] - DF['price as number from Rent.']

In [None]:
priceDelta = abs(priceDelta)
priceDelta.mean()

In [None]:
DF = DF.merge(MHAs, on = ['ZIP'], how = 'inner').sort_values('ZIP')

In [None]:
DF.columns

In [None]:
DF.where(DF['ZIP'] == '16801').dropna().groupby(['MHA','beds']).median()

In [None]:
DF.where(DF['ZIP'] == '16801').dropna()