In [2]:
# import library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import requests

from IPython.core.display import display, HTML
import time, os
import os.path
import re

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
    
user_agent = {'User-agent': 'Mozilla/5.0'}
chromedriver = "chromedriver.exe" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver

In [3]:
#function: grab house title info 
def get_house_values(soup, property_name=None):
    
    '''Grab a value from Zillow HTML title
    '''
    if property_name:
        obj = soup.find("meta", property = re.compile(property_name))
        if not obj: 
            return None

        return (obj.get("content", 0))
    return soup.find("meta")

#function: stringInt to Integeor
def str_to_int(intString):
    try: 
        if intString.find('$') != -1:
            intString = intString.replace('$', '').replace(',', '')
        elif intString.find(' '):
            intString = intString.split('sqft')[0].replace(',', '')
        else: intString = intString.split(' ')[0].replace(',', '')
    finally:
        if intString.isnumeric(): return int(intString)
    
    return intString

# function: change string to data 
import dateutil.parser 
def to_date(datestring):
    date = dateutil.parser.parse(datestring)
    return date.date()


# Fnction: Getting response from zillow.com
def get_response(url):
    response = requests.get(url)
    if response.status_code != 200:
        return None
    else: 
        return response
    
#### grab geo info
def get_script_info(soup, property_url):
    re_latitude = re.compile('latitude')
    for e in soup.find_all(text=re_latitude):
        scriptInfo =  json.loads(e)
        try:
            if property_url.find(scriptInfo['url']):
                latitude = scriptInfo['geo']['latitude']
                longitude = scriptInfo['geo']['longitude']
        except IndexError:
            print("Invalid string")
    return (latitude, longitude) 


def save_urls(soup, house_urls):
    for link_info in soup.find_all('a', class_='list-card-link list-card-link-top-margin'):
        house_urls.append(link_info['href'])
    return house_urls.copy()

# read area zipcodes
def read_area_zipcodes(area):
    print('zip_{0}.csv'.format(area))
    zip_df = pd.read_csv('zip_{0}.csv'.format(area), sep ='(')
    zip_df.CITY = zip_df.CITY.apply( lambda x: x.split(')')[0])
    return zip_df
    

In [4]:
feature_dict = {}

key_list = ['Addr',  'City', 'State', 'Zipcode', 'SoldPrice', 'SoldDate','HomeType', 'Bedrooms','Bathrooms','FullBath','HalfBath','Flooring',
            'Heating','Cooling','Laundry', 'HomeSize','ParkingLotNum','ParkingType',
           'ExtType', 'LotSize','LotType','ParcelNum','Zoning','SpecialCondition','Materials','Roof','YearBuilt', 'SewerInfo',
           'WaterInfo', 'UtilitiesType', 'Region','RoomKitchenFeatures','ListPriceLow','min_school_dist',
            'avg_school_dist','FoundationDetails','ParkingFeatures','latitude','longitude', 'URL']

feature_dict = dict.fromkeys(key_list)

#grab detail info
def get_house_dict(soup, property_url):
    try: 
        feature_dict['URL'] = property_url
        # grab title info
        title_string = soup.find('title').text.split('|')[0]
        feature_dict['Addr'] = title_string.split(',')[0]
        feature_dict['City'] = title_string.split(',')[1]
        feature_dict['State']   = title_string.split(',')[2].split(' ')[1]
        feature_dict['Zipcode'] = title_string.split(',')[2].split(' ')[2]
        # grab sold info
        for e in soup.find('p', class_='Text-c11n-8-48-0__sc-aiai24-0 StyledParagraph-c11n-8-48-0__sc-18ze78a-0 ghsFfg'):
            if e.text.find('Sold:')!=-1:
                feature_dict['SoldPrice'] = str_to_int(e.text.split(' ')[1])
            elif e.text.find('Sold on')!=-1:
                feature_dict['SoldDate'] = to_date(e.text.split(' ')[2])
            else:continue

        # grabing basic features
        re_features = re.compile('Text-c11n-8-48-0__sc-aiai24-0')
        for feature in soup.find_all('span', class_=re_features):
            e = feature.text.split(':')
            if len(e) == 2:
                e[1]=e[1].strip()
                if e[0] == 'Bedrooms': feature_dict['Bedrooms'] = str_to_int(e[1])
                elif e[0] == 'Bathrooms': feature_dict['Bathrooms'] = str_to_int(e[1])
                elif e[0] == 'Full bathrooms': feature_dict['FullBath'] = str_to_int(e[1])
                elif e[0] == '1/2 bathrooms': feature_dict['HalfBath'] = str_to_int(e[1])
                elif e[0] == 'Flooring': feature_dict['Flooring'] = e[1]
                elif e[0] == 'Heating features': feature_dict['Heating'] = e[1]
                elif e[0] == 'Cooling features': feature_dict['Cooling'] = e[1]
                elif e[0] == 'Laundry features': feature_dict['Laundry'] = e[1]
                elif e[0] == 'Total interior livable area': feature_dict['HomeSize'] = str_to_int(e[1])
                elif e[0] == 'Total spaces': feature_dict['ParkingLotNum'] = e[1]
                elif e[0] == 'Parking features': feature_dict['ParkingType'] = e[1]
                elif e[0] == 'Exterior features': feature_dict['ExtType'] = e[1]
                elif e[0] == 'Lot size': feature_dict['LotSize'] = str_to_int(e[1])
                elif e[0] == 'Lot features': feature_dict['LotType'] = e[1]
                elif e[0] == 'Parcel number': feature_dict['ParcelNum'] = e[1]
                elif e[0] == 'Zoning': feature_dict['Zoning'] = e[1]
                elif e[0] == 'Special conditions': feature_dict['SpecialCondition'] = e[1]
                elif e[0] == 'Construction materials': feature_dict['Materials'] = e[1]
                elif e[0] == 'Roof': feature_dict['Roof'] = e[1]
                elif e[0] == 'Year built': feature_dict['YearBuilt'] = e[1]
                elif e[0] == 'Sewer information': feature_dict['SewerInfo'] = e[1]
                elif e[0] == 'Water information': feature_dict['WaterInfo'] = e[1]
                elif e[0] == 'Utilities for property': feature_dict['UtilitiesType'] = e[1]
                elif e[0] == 'Region': feature_dict['Region'] = e[1]
                elif e[0] == 'RoomKitchenFeatures': feature_dict['RoomKitchenFeatures'] = e[1]
                elif e[0] == 'FoundationDetails': feature_dict['FoundationDetails'] = e[1]
                elif e[0] == 'ParkingFeatures': feature_dict['ParkingFeatures'] = e[1]
                elif e[0] == 'ListPriceLow': feature_dict['ListPriceLow'] = str_to_int(e[1])
                elif e[0] == 'Home type': feature_dict['HomeType'] = e[1]
                else: continue
            else: continue

        # grab school info
        re_features = re.compile('ds-nearby-schools-list')
        school_dist = []
        for e in soup.find('ul', id=re_features).find_all('span', class_='Text-c11n-8-48-0__sc-aiai24-0 dQezUG'):
            if e.text.find('mi')!=-1:
                dist = float(e.text.split(' ')[0])
                school_dist.append(dist)
            else: continue  
        feature_dict['min_school_dist'] = np.min(school_dist)
        feature_dict['avg_school_dist'] = np.round(np.mean(school_dist),2)  
        
        # grab geo info
        latitude, longitude = get_script_info(soup, property_url)
        feature_dict['latitude'] = latitude
        feature_dict['longitude'] = longitude
    except: 
        print("An exception occurred")
    finally:
        return feature_dict.copy()
    
    
    
#### parse info per house

def parse_urls(url, property_url):
    #  generate a random user agent to send request
   # url = 'https://www.zillow.com'+property_url
    response = get_response(url)

    if not response:
        print("Failed to fetch the page, please check `response.html` to see the response received from zillow.com.")
    else:
        soup_robot = BeautifulSoup(response.text,'html5lib')
        if soup_robot.find('div', class_='error-text-content').find('h5').text =="Please verify you're a human to continue.": 
            print(f"robot test:{0}".format(zipcode))
            
        chromedriver = "chromedriver.exe" # path to the chromedriver executable
        driver = webdriver.Chrome(chromedriver)
        print(url)
        driver.get(url)
        for i in range(5):  # scroll the pages
            driver.execute_script( "window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)

        soup = BeautifulSoup(driver.page_source, 'html5lib')
        house_dict = get_house_dict(soup, property_url)
        return house_dict
        

In [6]:
import pyautogui

def collect_house_links(zipcode):
    
    zillow = 'https://www.zillow.com/san-jose-ca-{0}/'.format(zipcode)
    house_urls=[]
    response = get_response(zillow)
    re_NoFound = re.compile('No matching results')
    re_robot = re.compile("Please verify you're a human to continue.")
    df=[]
    if not response:
        print("skip zipcode:{0}".format(zipcode))
        return False
    
        #### test
        soup_robot = BeautifulSoup(response.text,'html5lib')
        
        if soup_robot.find('div', class_='error-text-content').find('h5').text =="Please verify you're a human to continue.": 
            print(f"robot test:{0}".format(zipcode))
    
    try:
        driver = webdriver.Chrome(chromedriver)    
        driver.maximize_window()
        driver.implicitly_wait(10)
        driver.get(zillow)

        time.sleep(3)
        filter_ = driver.find_element_by_xpath('//*[(@id = "more")]')
        filter_.send_keys(Keys.RETURN)

        selector_ = driver.find_element_by_xpath('//select[@aria-label="Sold in last"]')
        selector_.click()
        selector_.send_keys('12 months')
        selector_.click()

        filter_ = driver.find_element_by_xpath('//*[(@id = "more")]')
        filter_.click()
      
        for i in range(3,5,1):
            for i in range(5):
                pyautogui.moveTo(3813,2010,duration=0.5)
                pyautogui.click(3813,2010,20)

            time.sleep(2)
            soup = BeautifulSoup(driver.page_source, 'html5lib')
            re_NoFound = re.compile('No matching results')
            if soup.find(text = re_NoFound): break
            house_urls=save_urls(soup, house_urls)
            
    
            if driver.find_element_by_xpath('//*[@id="grid-search-results"]/div[2]/nav/ul/li[{0}]/a'.format(i)):  
                next_ = driver.find_element_by_xpath('//*[@id="grid-search-results"]/div[2]/nav/ul/li[{0}]/a'.format(i))
                next_.click()
                time.sleep(3)
                df = pd.DataFrame(house_urls,columns=['URL'])
                df.to_csv('./data/{0}.csv'.format(zipcode), index=False)
            else: break
    except NoSuchElementException as e1 :
        print(e1)
        return True
    except Exception as e2:
        print(e2)
     #   return False
    except:
        print("Grapping error in {0}".format(zipcode))
        return False
    finally:
        df = pd.DataFrame(house_urls,columns=['URL'])
        df.to_csv('./data/{0}.csv'.format(zipcode), index=False)
        return True

In [7]:
# collect all urls in one file

import os
import glob
import pandas as pd

def write_to_files(fname='house_data.csv'):
    extension = 'csv'
    all_filenames = [i for i in glob.glob('df_part*.{}'.format(extension))]
    print(fname)
    with open("house_data.csv", "w") as outfile:
        for filename in all_filenames:
            with open(filename) as infile:
                contents = infile.read()
                outfile.write(contents)

def get_zip_links():
    areas = ['alameda', 'sanjose']
    zips_df=[]
    for a in areas:
        zips_df = read_area_zipcodes(a)

    err_cnt = 0 
    for index, row in zips_df.iterrows():  
        if err_cnt >= 5: 
            print('robot checking')
            break

        zipcode = row.iloc[0]
        city = row.iloc[1]
        if os.path.exists('./data/{0}.csv'.format(zipcode)):
            continue

        print (index,zipcode,city)

        if collect_house_links(row.iloc[0]) == False:
            err_cnt = err_cnt + 1 
            time.sleep(2)
            print("skip...")

def get_house_info():
    urls_df = pd.read_csv('house_urls_testRL.csv')
    urls_df.drop_duplicates()
    urls_df
    urls_df = urls_df[::-1]
    house_data_list=[]
    prior_df = pd.read_csv('house_data.csv', index_col=None)
    collected_df_=[]
    files = [  'df_part1.csv', 'df_part2.csv', 'df_part3.csv']
    for i in range(3):
        try:
            for index, row in urls_df.iterrows():
                if index % 7-i*2 ==0:
                    if row[0].find('URL') == -1: 
                        zillow_id = row[0].split('com')[1]
                        if zillow_id not in prior_df.URL:
                            url=row[0]
                            print(url)
                            print('---')
                            home_dict = parse_urls(url, zillow_id)
                            house_data_list.append(home_dict)

                    collected_df_ = pd.DataFrame.from_dict(house_data_list)
        except Exception as e:
            print(e)
        finally:
            #collected_df_.to_csv(files[i])
            prior_df.append(collected_df_)

    collected_df_
    pd.DataFrame(collected_df_).to_csv('df_partX.csv')

In [9]:
if __name__ == "__main__":
  #  get_zip_links()
    get_house_info()
 #   write_to_files('house_testRL.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'house_urls_testRL.csv'

In [706]:
    urls_df = pd.read_csv('house_urls.csv')
    urls_df.drop_duplicates()
    urls_df
    urls_df = urls_df[::-1]
    house_data_list=[]
    prior_df = pd.read_csv('house_data.csv', index_col=None)
    collected_df_=[]
    files = [  'df_part1.csv', 'df_part2.csv', 'df_part3.csv']
    for i in range(3):
        try:
            for index, row in urls_df.iterrows():
                if index % 7-i*2 ==0:
                    if row[0].find('URL') == -1: 
                        zillow_id = row[0].split('com')[1]
                        if zillow_id not in prior_df.URL:
                            url=row[0]
                            print(url)
                            print('---')
                            home_dict = parse_urls(url, zillow_id)
                            house_data_list.append(home_dict)

                    collected_df_ = pd.DataFrame.from_dict(house_data_list)
        except Exception as e:
            print(e)
        finally:
            #collected_df_.to_csv(files[i])
            prior_df.append(collected_df_)

    collected_df_
    pd.DataFrame(collected_df_).to_csv('df_partX.csv')

https://www.zillow.com/homedetails/3023-Moonstar-Ct-San-Jose-CA-95148/19730994_zpid/
---
robot test:0
https://www.zillow.com/homedetails/3023-Moonstar-Ct-San-Jose-CA-95148/19730994_zpid/
Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=93.0.4577.82)

https://www.zillow.com/homedetails/3076-Balgray-Ct-San-Jose-CA-95148/19804072_zpid/
---
robot test:0
https://www.zillow.com/homedetails/3076-Balgray-Ct-San-Jose-CA-95148/19804072_zpid/


KeyboardInterrupt: 