In [1]:
import requests, bs4, re, time, random, csv
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd

In [2]:
# HTTP headers sent with every request: a desktop Chrome user-agent string.
user_agent = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/99.0.4844.51 Safari/537.36'
    ),
}

# Search-results URL used for the sample request (filter string: include=sold-6mo).
sample_url = 'https://www.redfin.com/city/11961/CA/Menlo-Park/filter/include=sold-6mo'

In [3]:
# Request the sample search-results page and parse it when the request succeeds.
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(sample_url, headers=user_agent, timeout=30)
status = response.status_code

if status == 200:
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers are installed (this also silences bs4's parser warning).
    page = bs(response.text, 'html.parser')
    print('Success. Sample URL Scraped.')
else:
    # Reset `page` so no later cell can silently reuse a stale document.
    page = None
    print(f'WARNING. Response Code {status}.')

Success. Sample URL Scraped.


In [4]:
# SCRAPE FOR DESIRED DATA
# Column names for the output CSV; the scraped row is assembled in this order.
data_features = ['Sold Price', 'Beds', 'Baths', 'Floors', 'Garage Spaces', 'Lot Size (sq ft)', 'Home Size (sq ft)'
              , 'Year Built', 'School Score Avg', 'Walk Score', 'Transit Score', 'Bike Score', 'Laundry'
              , 'Heating', 'Air Conditioning', 'Pool', 'Address', 'City', 'County', 'Zip Code'
              , 'Property Type', 'Sold Status', 'URL']

# newline='' is what the csv module requires of file objects it writes to
# (otherwise Windows output gets blank rows between records).
with open('redfin_data.csv', 'a', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)

    # In append mode the stream starts at end-of-file, so position 0 means the
    # file is new or empty. Writing the header only then keeps re-runs of this
    # cell from inserting duplicate header rows into the data.
    if f.tell() == 0:
        writer.writerow(data_features)

# Listing page to scrape. The request reuses the `user_agent` headers defined
# in the configuration cell near the top of the notebook.
test_url = 'https://www.redfin.com/CA/Palo-Alto/1062-Los-Robles-Ave-94306/home/1077975'

# The timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(test_url, headers=user_agent, timeout=30)
status = response.status_code

if status == 200:
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers are installed (this also silences bs4's parser warning).
    page = bs(response.text, 'html.parser')
    print(f'Success. Response Code {status}')
else:
    # Without this reset `page` would still hold the previously fetched
    # search-results document, and the cells below would scrape the wrong page.
    page = None
    print(f'WARNING. Response Code {status}')

Success. Response Code 200


In [5]:
# home stats
# Defaults keep these names defined even when the stats banner is missing.
home_stats = None
home_summary = []

# The lookups live inside the try so that a missing banner (or a missing
# `page`) yields NaN like every other field instead of aborting the notebook.
try:
    home_stats = page.find('div', class_='home-main-stats-variant')
    home_summary = [div.text for div in home_stats.find_all('div', class_='statsValue')]

    # The code reads price, beds and baths from the first three stat values.
    sold_price = int(home_summary[0].replace('$', '').replace(',', ''))
    beds = int(home_summary[1])
    baths = float(home_summary[2])
except Exception:
    # `Exception` rather than a bare except: Ctrl-C must still interrupt the cell.
    sold_price = beds = baths = np.nan

In [6]:
# Home size (sq ft): value inside the sqft stat block, thousands separators removed.
# Any failure along the way leaves NaN.
try:
    _sqft_block = page.find('div', class_='stat-block sqft-section')
    _sqft_text = _sqft_block.find('span', class_='statsValue').text
    home_sqft = int(_sqft_text.replace(',', ''))
except:
    home_sqft = np.nan

In [7]:
# floors
# Two page layouts are handled: a '# of Stories: ' label, or a 'Stories' entry
# in the facts table. Each label is looked up once and reused.
try:
    _stories_label = page.find(text='# of Stories: ')
    if _stories_label:
        floors = int(_stories_label.findNext().text)
    else:
        _facts_label = page.find('div', class_='facts-table').find(text='Stories')
        if _facts_label:
            floors = int(_facts_label.next.text)
        else:
            # Neither label is present. Previously `floors` was left unassigned
            # here, so the row either raised NameError or reused a stale value.
            floors = np.nan
except Exception:
    # `Exception` rather than a bare except: Ctrl-C must still interrupt the cell.
    floors = np.nan

In [8]:
# Garage spaces: try each known label in order and parse the element after the
# first one found. 0 when no label is on the page, NaN when parsing fails.
try:
    garage = 0
    for _garage_label in ('Garage (Maximum): ', 'Garage Spaces: '):
        _garage_node = page.find(text=_garage_label)
        if _garage_node:
            garage = int(_garage_node.findNext().text)
            break
except:
    garage = np.nan

In [9]:
# lot sq ft
# The page gives the lot size in acres; convert to square feet.
SQFT_PER_ACRE = 43560

try:
    # Look the container up once, then try both label spellings in order.
    _amenities = page.find('div', class_='amenities-container')
    _acres_label = (_amenities.find(text='Lot Acres: ')
                    or _amenities.find(text='Lot Size Acres: '))

    if _acres_label:
        lot_sqft = float(_acres_label.findNext().text) * SQFT_PER_ACRE
    else:
        # Neither label is present. Previously `lot_sqft` was left unassigned
        # here, so the row either raised NameError or reused a stale value.
        lot_sqft = np.nan
except Exception:
    # `Exception` rather than a bare except: Ctrl-C must still interrupt the cell.
    lot_sqft = np.nan

In [10]:
# Year built: integer in the element following the 'Year Built' label; NaN on failure.
try:
    _year_label = page.find(text='Year Built')
    _year_text = _year_label.findNext().text
    year_built = int(_year_text)
except:
    year_built = np.nan

In [11]:
# School score average: sum of the rating numbers shown in the schools section
# divided by the number of school cards. NaN if the section is missing, a
# rating is not an integer, or there are no cards.
try:
    school_content = page.find('div', class_='schools-content')
    schools_score = school_content.find_all('span', class_='rating-num font-size-base font-weight-bold')

    total_score = 0
    for value in schools_score:
        total_score += int(value.text)

    school_sum = len(school_content.find_all('div', class_='school-card-component'))

    school_score_avg = total_score / school_sum
except:
    school_score_avg = np.nan

In [12]:
# Transportation scores: text of the first <span> inside each 'percentage' div.
# The first three are read as walk, transit and bike; all NaN on any failure.
try:
    scores = []
    for _pct_div in page.find_all('div', class_='percentage'):
        scores.append(_pct_div.find('span').text)

    walk_score = int(scores[0])
    transit_score = int(scores[1])
    bike_score = int(scores[2])
except:
    walk_score = transit_score = bike_score = np.nan

In [13]:
# Laundry flag: True when any text node on the page contains 'Laundry',
# False when none does, NaN when the page cannot be searched.
try:
    laundry = bool(page.find(text=re.compile('Laundry')))
except:
    laundry = np.nan

In [14]:
# Heating flag: True when any text node on the page contains 'Heating:',
# False when none does, NaN when the page cannot be searched.
try:
    _heating_node = page.find(text=re.compile('Heating:'))
    heating = True if _heating_node else False
except:
    heating = np.nan

In [15]:
# Air-conditioning flag: True when any text node on the page contains 'Cooling:',
# False when none does, NaN when the page cannot be searched.
try:
    aircond = bool(page.find(text=re.compile('Cooling:')))
except:
    aircond = np.nan

In [16]:
# Pool flag: True when any text node on the page contains 'Pool',
# False when none does, NaN when the page cannot be searched.
try:
    _pool_node = page.find(text=re.compile('Pool'))
    pool = True if _pool_node else False
except:
    pool = np.nan

In [17]:
# Address: the part of the <title> text before the first '|' and before the
# first ',', upper-cased. NaN when the title cannot be read.
try:
    _title_text = page.find('head').find('title').text
    _location = _title_text.partition('|')[0]
    address = _location.partition(',')[0].upper()
except:
    address = np.nan

In [18]:
# City: second comma-separated field of the <title> text before the first '|',
# stripped and upper-cased. NaN when the title is missing or has too few fields.
try:
    _title_text = page.find('head').find('title').text
    _location_parts = _title_text.split('|')[0].split(',')
    city = _location_parts[1].strip().upper()
except:
    city = np.nan

In [19]:
# County: upper-cased text of the element following the 'County' label; NaN on failure.
try:
    _county_label = page.find(text='County')
    _county_value = _county_label.findNext()
    county = _county_value.text.upper()
except:
    county = np.nan

In [20]:
# Zip code: take the third comma-separated field of the <title> text before the
# first '|', split it on single spaces and keep the item at index 2.
# NaN when the title does not have that shape.
try:
    _title_text = page.find('head').find('title').text
    _third_field = _title_text.split('|')[0].split(',')[2]
    zipcode = _third_field.split(' ')[2]
except:
    zipcode = np.nan

In [21]:
# Property type: upper-cased text of the element following the 'Property Type'
# label; NaN on failure.
try:
    _type_label = page.find(text='Property Type')
    _type_value = _type_label.findNext()
    property_type = _type_value.text.upper()
except:
    property_type = np.nan

In [22]:
# Sold status: text of the element following the 'Status' label, case unchanged;
# NaN on failure.
try:
    _status_label = page.find(text='Status')
    _status_value = _status_label.findNext()
    sold_status = _status_value.text
except:
    sold_status = np.nan

In [23]:
# Assemble the scraped values into one CSV row, in the same order as the
# column names in `data_features`.
data = [
    # price and basic stats
    sold_price, beds, baths, floors, garage,
    # sizes and age
    lot_sqft, home_sqft, year_built,
    # neighbourhood scores
    school_score_avg, walk_score, transit_score, bike_score,
    # amenity flags
    laundry, heating, aircond, pool,
    # location
    address, city, county, zipcode,
    # listing details and source URL
    property_type, sold_status, test_url,
]

In [24]:
# Append the scraped row to the CSV. The csv module requires file objects to be
# opened with newline='' so that it controls line endings itself; without it,
# output written on Windows gets a blank line after every row.
with open('redfin_data.csv', 'a', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(data)

In [25]:
# Read redfin_data.csv back into a DataFrame; as the last expression in the
# cell, the frame is rendered as the cell's output.
pd.read_csv('redfin_data.csv')

Unnamed: 0,Sold Price,Beds,Baths,Floors,Garage Spaces,Lot Size (sq ft),Home Size (sq ft),Year Built,School Score Avg,Walk Score,...,Heating,Air Conditioning,Pool,Address,City,County,Zip Code,Property Type,Sold Status,URL
0,3500000,5,3.0,1,2,10123.344,1941,1955,7.666667,27,...,True,True,True,1062 LOS ROBLES AVE,PALO ALTO,SANTA CLARA COUNTY,94306,SINGLE FAMILY RESIDENTIAL,Closed,https://www.redfin.com/CA/Palo-Alto/1062-Los-R...
