# In [1]:
# This is my second personal project
# I'm always interested in scraping information from websites
# For this project, I want to scrape the listings from StreetEasy and compare the housing prices in different neighborhoods of Brooklyn
# Jesse Steinweg-Woods's blog "Web Scraping Indeed for Key Data Science Job Skills" gave me the motivation and inspiration on this project
# His website also has other projects that are well worth reading for a data science novice like me
# https://jessesw.com/Data-Science-Skills/
# I also visited https://github.com/ChrisMuir
# His project on web scraping helped me complete this project

# Let's import the necessary libraries
import re # Regular Expression
from bs4 import BeautifulSoup as bs # To parse HTML
from selenium import webdriver # To launch a browser window
from selenium.webdriver.common.by import By # Go to def get_html(driver):
from selenium.webdriver.support.ui import WebDriverWait # To prevent from overloading the website
from selenium.webdriver.support import expected_conditions as EC # Go to def get_html(driver):
from selenium.common.exceptions import TimeoutException # Exception handling
from selenium.common.exceptions import NoSuchElementException # Exception handling
from fake_useragent import UserAgent # To generate useragent info for browser window
import time # wait time

# Create a random user agent.
# `ua.random` is read once, when this module is executed, and the result is
# kept in the module-level name `userAgent`, which is intended for the
# "--user-agent" browser argument built in init_driver().
# NOTE(review): UserAgent() presumably loads its browser database on
# construction -- confirm whether that needs network access.
ua = UserAgent()
userAgent = ua.random

# open a chrome browser window that is controlled by the function
def init_driver(file_path):
    """Start a Chrome window configured for scraping and return its driver.

    Args:
        file_path: Path to the chromedriver executable.

    Returns:
        The Chrome webdriver, with an extra ``wait`` attribute holding a
        ``WebDriverWait`` that times out after 10 seconds.
    """
    # Before using the Webdriver, you need to download a HEX editor.
    # Inside the HEX editor, open chromedriver.exe and change the '$cdc_'
    # variable to 'xxxx'.
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    # Without a user agent, the CAPTCHA system will immediately recognize the
    # browser as a bot.  The value has to be interpolated explicitly: the
    # previous plain string sent the literal text "{userAgent}" to Chrome.
    options.add_argument("--user-agent={}".format(userAgent))
    # NOTE(review): content-setting value 2 presumably means "block cookies"
    # -- confirm against the Chrome preferences reference.
    options.add_experimental_option(
        "prefs", {"profile.default_content_settings.cookies": 2})
    # NOTE(review): `executable_path` / `chrome_options` are the Selenium 3
    # keyword names; they are kept as-is so the installed version keeps working.
    driver = webdriver.Chrome(executable_path=file_path,
                              chrome_options=options)
    # Shared explicit wait used when clicking through result pages.
    driver.wait = WebDriverWait(driver, 10)
    return driver

# navigate the browser window to the website
def navigate_to_website(driver, site):
    """Load ``site`` in the browser, then handle a CAPTCHA page if one shows.

    Args:
        driver: Webdriver controlling the browser window.
        site: URL to open.
    """
    driver.get(site)
    # The request may have been answered with a CAPTCHA instead of results.
    check_for_captcha(driver)

# check whether or not a web element is present and displayed on the website
def _is_element_displayed(driver, elem_text, elem_type):
    """Report whether the element identified by ``elem_text`` is displayed.

    Args:
        driver: Webdriver controlling the browser window.
        elem_text: Class name or CSS selector of the element to look for.
        elem_type: ``"class"`` to look up by class name, ``"css"`` to look up
            by CSS selector.

    Returns:
        The element's ``is_displayed()`` result, or ``False`` when the lookup
        raises ``NoSuchElementException`` or ``TimeoutException``.

    Raises:
        ValueError: If ``elem_type`` is neither ``"class"`` nor ``"css"``.
    """
    # Pick the lookup method first so both kinds share one try/except.
    if elem_type == "class":
        locate = driver.find_element_by_class_name
    elif elem_type == "css":
        locate = driver.find_element_by_css_selector
    else:
        raise ValueError("arg 'elem_type' must be either 'class' or 'css'")

    try:
        return locate(elem_text).is_displayed()
    except (NoSuchElementException, TimeoutException):
        return False

# pause the scraping process and ask the user to manually complete the CAPTCHA test before continuing
def _pause_for_captcha(driver):
    """Block until the CAPTCHA element is no longer displayed.

    Sleeps 30 seconds before every check, so the function always waits at
    least 30 seconds, giving the user time to solve the CAPTCHA by hand.

    Args:
        driver: Webdriver controlling the browser window.
    """
    time.sleep(30)
    while _is_element_displayed(driver, "recaptcha2", "class"):
        time.sleep(30)

# check if the CAPTCHA mechanism is triggered
def check_for_captcha(driver):
    """Pause scraping when a CAPTCHA is displayed on the current page.

    If an element with class ``recaptcha2`` is displayed, a notice is printed
    and the function blocks until that element is gone.

    Args:
        driver: Webdriver controlling the browser window.
    """
    if not _is_element_displayed(driver, "recaptcha2", "class"):
        return

    notice = ("\nCAPTCHA!\n"
              "Manually complete the captcha requirements.\n"
              "Once that's done, it should resume scraping after ~30 seconds.")
    print(notice)
    _pause_for_captcha(driver)

# collect the HTML code of the current page and go to the next page
def get_html(driver):
    """Collect the page source of every result page, following "next" links.

    Starting from the page currently loaded, the page source is stored and the
    element with class ``next`` is clicked, until no such element is displayed
    or waiting for it to become clickable times out.

    Args:
        driver: Webdriver carrying a ``wait`` attribute (a ``WebDriverWait``).

    Returns:
        A list with one page-source string per page visited.
    """
    pages = []
    while True:
        # Grab the HTML of the page currently shown.
        try:
            pages.append(driver.page_source)
        except TimeoutException:
            pass

        # No "next page" link means this was the last page.
        if not _is_element_displayed(driver, "next", "class"):
            break

        try:
            next_link = driver.wait.until(
                EC.element_to_be_clickable((By.CLASS_NAME, "next")))
            next_link.click()
            time.sleep(3)
            # The new page may be a CAPTCHA instead of results.
            check_for_captcha(driver)
        except TimeoutException:
            break
    return pages

# check if you mistyped the location name
def test_for_no_results(driver):
    """Report whether the current page shows the site's error message.

    The error-message check runs first; a CAPTCHA check follows before the
    result is returned.

    Args:
        driver: Webdriver controlling the browser window.

    Returns:
        ``True`` if an element with class ``error-message`` is displayed,
        otherwise ``False``.
    """
    error_shown = _is_element_displayed(driver, "error-message", "class")
    check_for_captcha(driver)
    return error_shown

# Split the raw page source into segments, one for each home listing.
def get_listings(list_obj):
    """Cut raw page sources into one HTML fragment per listing.

    Each page is split on the literal text ``<article``; the part before the
    first occurrence is discarded and the remaining pieces are kept in order.

    Args:
        list_obj: Iterable of page-source strings.

    Returns:
        A flat list of fragments, each starting right after ``<article``.
    """
    return [
        fragment
        for page in list_obj
        for fragment in page.split('<article')[1:]
    ]

# check if an object is empty
def _is_empty(obj):
    """Return ``True`` when ``obj`` has length zero or equals ``"null"``.

    Args:
        obj: Any object supporting ``len()``.

    Returns:
        ``True`` or ``False``.
    """
    if len(obj) == 0:
        return True
    return bool(obj == "null")

# Not all listings will have both bedrooms and bathrooms listed
# We need this function to consistently extract the info on bedrooms, bathrooms and sqft
def get_card_info(soup_obj):
    """Return the listing's details list as HTML fragments, one per item.

    The ``<ul class="details_info details-info-flex">`` element is rendered to
    a string and split on ``</li>``.

    Args:
        soup_obj: Parsed listing exposing a BeautifulSoup-style ``find``.

    Returns:
        A list of HTML fragment strings, or the string ``"NA"`` when the
        details list cannot be found.
    """
    try:
        details = soup_obj.find(
            "ul", {"class": "details_info details-info-flex"})
    except (ValueError, AttributeError):
        return "NA"
    # A missing element must be caught before str(): str(None) is the text
    # "None", which used to come back as ['None'] instead of "NA".
    if details is None:
        return "NA"
    # str.split with a separator never returns an empty list, so no further
    # emptiness check is needed.
    return str(details).split("</li>")

# Extract the address info
def get_street_address(soup_obj):
    """Return the listing's street address text.

    The address is the stripped text of the ``<a>`` element whose
    ``se:clickable:target`` attribute is ``"true"``.

    Args:
        soup_obj: Parsed listing exposing a BeautifulSoup-style ``find``.

    Returns:
        The address string, or ``"NA"`` when the link is missing or its text
        is empty.
    """
    try:
        link = soup_obj.find("a", {"se:clickable:target": "true"})
        address = link.get_text().strip()
    except (ValueError, AttributeError):
        return "NA"
    return "NA" if _is_empty(address) else address

# extract the neighborhood info
def get_neighborhood(soup_obj):
    """Return the neighborhood named in the listing's details line.

    The first ``<li class="details_info">`` element is read and everything
    after the first standalone " in " is returned.
    NOTE(review): assumes the text reads like "Condo in Park Slope" --
    confirm against the live markup.

    Args:
        soup_obj: Parsed listing exposing a BeautifulSoup-style ``find``.

    Returns:
        The neighborhood string, or ``"NA"`` when the element is missing, the
        text has no " in " separator, or nothing follows it.
    """
    try:
        raw_text = soup_obj.find("li", {"class": "details_info"}).get_text()
        # Collapse whitespace runs so the separator is matched reliably.
        line = " ".join(raw_text.split())
    except (ValueError, AttributeError):
        return "NA"

    # Split on the word " in " once.  Deleting every "in" substring (the old
    # approach) also damaged names such as "Windsor Terrace" or "Kensington",
    # and indexing the split result raised IndexError on short text.
    _, separator, neighborhood = line.partition(" in ")
    if not separator or not neighborhood or neighborhood == "null":
        return "NA"
    return neighborhood

# extract the price info
def get_price(soup_obj):
    """Return the listing price as a float.

    Every character other than the digits 0-9 is removed from the text of the
    ``<span class="price">`` element before conversion.

    Args:
        soup_obj: Parsed listing exposing a BeautifulSoup-style ``find``.

    Returns:
        The price as a ``float``, or ``"NA"`` when the element is missing or
        its text holds no digits.
    """
    try:
        price_text = soup_obj.find('span', {'class': 'price'}).get_text()
        digits_only = re.sub("[^0-9]", "", price_text)
        return float(digits_only)
    except (ValueError, AttributeError):
        return "NA"

# shared parser for the numeric fields of the details card
def _first_number_for(list_obj, keyword):
    """Return the first number shown in a fragment that contains ``keyword``.

    HTML tags are removed before the number is searched for, so digits inside
    tag attributes are ignored; thousands separators and a decimal part are
    understood ("1,200" -> 1200.0, "1.5" -> 1.5).

    Args:
        list_obj: Iterable of HTML fragment strings.
        keyword: Text a fragment must contain to be considered.

    Returns:
        The number as a ``float``, or ``"NA"`` when no matching fragment
        contains a number.
    """
    for fragment in list_obj:
        if keyword not in fragment:
            continue
        # Replace tags with a space so neighbouring values do not run together.
        visible_text = re.sub(r"<[^>]*>", " ", fragment)
        match = re.search(r"[0-9][0-9,]*(?:\.[0-9]+)?", visible_text)
        if match:
            return float(match.group(0).replace(",", ""))
    return "NA"


# extract the bedroom info
def get_bedrooms(list_obj):
    """Return the number of bedrooms as a float, or ``"NA"`` if not listed.

    Args:
        list_obj: Iterable of HTML fragment strings for one listing.
    """
    return _first_number_for(list_obj, "bed")


# extract the bathroom info
def get_bathrooms(list_obj):
    """Return the number of bathrooms as a float, or ``"NA"`` if not listed.

    Fractional counts such as "1.5 baths" are returned as 1.5.

    Args:
        list_obj: Iterable of HTML fragment strings for one listing.
    """
    return _first_number_for(list_obj, "bath")


# extract the sqft info
def get_sqft(list_obj):
    """Return the square footage as a float, or ``"NA"`` if not listed.

    Args:
        list_obj: Iterable of HTML fragment strings for one listing.
    """
    return _first_number_for(list_obj, " ft")

# extract the sale type info
def get_saleType(soup_obj):
    """Return the first word of the listing's details line.

    The text of the first ``<li class="details_info">`` element is split on
    whitespace and its first token is returned.
    NOTE(review): presumably this token is the property/sale type (e.g.
    "Condo") -- confirm against the live markup.

    Args:
        soup_obj: Parsed listing exposing a BeautifulSoup-style ``find``.

    Returns:
        The first word, or ``"NA"`` when the element is missing, its text is
        blank, or the word is the literal ``"null"``.
    """
    try:
        words = soup_obj.find(
            "li", {"class": "details_info"}).get_text().split()
    except (ValueError, AttributeError):
        return "NA"
    # Blank text yields no words; indexing [0] directly raised an uncaught
    # IndexError in that case.
    if not words or words[0] == "null":
        return "NA"
    return words[0]

# extract the url info
def get_url(soup_obj):
    """Return the absolute StreetEasy URL of the listing.

    The ``href`` of the ``<a data-gtm-regular-listing="true">`` element is
    appended to ``https://www.streeteasy.com``.

    Args:
        soup_obj: Parsed listing exposing a BeautifulSoup-style ``find``.

    Returns:
        The URL string, or ``"NA"`` when the link or its ``href`` is missing.
    """
    try:
        href = soup_obj.find(
            'a', {'data-gtm-regular-listing': 'true'})['href']
        return "https://www.streeteasy.com" + href
    # Only the failures a missing link can cause are handled: no `find`
    # (AttributeError), no matching element (indexing None -> TypeError), no
    # href attribute (KeyError).  A bare `except:` also swallowed
    # KeyboardInterrupt and SystemExit.
    except (AttributeError, KeyError, TypeError):
        return "NA"
        
# terminate driver connection
def close_connection(driver):
    """Shut the browser session down by calling ``driver.quit()``.

    Args:
        driver: Webdriver to terminate.
    """
    driver.quit()