In [1]:
# Import libraries
import requests, bs4, re, time, random, csv
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd

In [2]:
# Redfin caps a search query at 9 result pages (40 results per page), so the
# search is split into one query per city to capture all of the desired data.
# Each entry is the "<region id>/<state>/<city name>/" part of a Redfin city URL.
cities = [
    '11961/CA/Menlo-Park/',
    '14325/CA/Palo-Alto/',
    '12739/CA/Mountain-View/',
    '19457/CA/Sunnyvale/',
    '4561/CA/Cupertino/',
    '17675/CA/Santa-Clara/',
    '11018/CA/Los-Altos/',
]

# One search URL per city: houses sold on Redfin within the last 6 months.
query_urls = [
    f'https://www.redfin.com/city/{city}filter/property-type=house,include=sold-6mo'
    for city in cities
]
query_urls

['https://www.redfin.com/city/11961/CA/Menlo-Park/filter/property-type=house,include=sold-6mo',
 'https://www.redfin.com/city/14325/CA/Palo-Alto/filter/property-type=house,include=sold-6mo',
 'https://www.redfin.com/city/12739/CA/Mountain-View/filter/property-type=house,include=sold-6mo',
 'https://www.redfin.com/city/19457/CA/Sunnyvale/filter/property-type=house,include=sold-6mo',
 'https://www.redfin.com/city/4561/CA/Cupertino/filter/property-type=house,include=sold-6mo',
 'https://www.redfin.com/city/17675/CA/Santa-Clara/filter/property-type=house,include=sold-6mo',
 'https://www.redfin.com/city/11018/CA/Los-Altos/filter/property-type=house,include=sold-6mo']

In [3]:
# Building upon search query URLs generated above
# Prepare additional URLs to include all pages result URLs, by city

# Browser-like User-Agent header sent with every request.
user_agent = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'}

query_list = []

# city_total is the 1-based position of the city in query_urls, so success and
# warning messages always name the city that was actually requested (a counter
# that only advances on success drifts after the first failure).
for city_total, query_url in enumerate(query_urls, start=1):

    # Pause 2-3 seconds between requests.
    time.sleep(2 + 1*random.random())

    try:
        # A timeout keeps one stalled connection from hanging the whole cell.
        response = requests.get(query_url, headers=user_agent, timeout=30)
    except requests.RequestException as err:
        print(f'WARNING. Request failed on query #{city_total}: {err}')
        continue

    status = response.status_code
    if status != 200:
        print(f'WARNING. Response Code {status} on query #{city_total}')
        continue

    # Name the parser explicitly so the parse does not depend on which
    # optional parser libraries happen to be installed.
    page = bs(response.text, 'html.parser')

    # The last space-separated token of the pager text is read as the number
    # of result pages.
    try:
        pager_text = page.find('div', class_='viewingPage').find('span', class_='pageText').text
        page_num = int(pager_text.split(' ')[-1])
    except (AttributeError, ValueError):
        # NOTE(review): assumes a missing/unreadable pager means the results
        # fit on a single page -- confirm against a small city.
        page_num = 1
        print(f'NOTE. No page count found for city #{city_total}; assuming a single page')

    query_list.extend(query_url + '/page-' + str(num) for num in range(1, page_num + 1))

    print(f'Success. All page results URLs for city #{city_total} added')

print(f'Collected a total of {len(query_list)} URLs')

Success. All page results URLs for city #1 added
Success. All page results URLs for city #2 added
Success. All page results URLs for city #3 added
Success. All page results URLs for city #4 added
Success. All page results URLs for city #5 added
Success. All page results URLs for city #6 added
Success. All page results URLs for city #7 added
Collected a total of 44 URLs


In [4]:
# Building upon results page URLs generated above
# Scrape for individual listing URLs on each returned results page, for a total of 1500+ URLs
listing_urls = []

# url_page is the 1-based position of the results page in query_list, so a
# warning identifies the page that actually failed.
for url_page, each_url in enumerate(query_list, start=1):

    # Pause 2-3 seconds between requests.
    time.sleep(2 + 1*random.random())

    try:
        # A timeout keeps one stalled connection from hanging the whole cell.
        response = requests.get(each_url, headers=user_agent, timeout=30)
    except requests.RequestException as err:
        print(f'WARNING. Request failed on Search Results Page #{url_page}: {err}')
        continue

    status = response.status_code
    if status != 200:
        print(f'WARNING. Response Code {status} on Search Results Page #{url_page}')
        continue

    # Name the parser explicitly so the parse does not depend on which
    # optional parser libraries happen to be installed.
    page_results = bs(response.text, 'html.parser')

    # Each <a class="slider-item"> carries a site-relative link to a listing.
    for obj in page_results.find_all('a', {'class': 'slider-item'}):
        href = obj.get('href')
        # Skip anchors without an href instead of raising KeyError.
        if href:
            listing_urls.append('https://www.redfin.com' + href)

    print(f'Search Results Page #{url_page} added')

print(f'Collected a total of {len(listing_urls)} URLs')

Search Results Page #1 added
Search Results Page #2 added
Search Results Page #3 added
Search Results Page #4 added
Search Results Page #5 added
Search Results Page #6 added
Search Results Page #7 added
Search Results Page #8 added
Search Results Page #9 added
Search Results Page #10 added
Search Results Page #11 added
Search Results Page #12 added
Search Results Page #13 added
Search Results Page #14 added
Search Results Page #15 added
Search Results Page #16 added
Search Results Page #17 added
Search Results Page #18 added
Search Results Page #19 added
Search Results Page #20 added
Search Results Page #21 added
Search Results Page #22 added
Search Results Page #23 added
Search Results Page #24 added
Search Results Page #25 added
Search Results Page #26 added
Search Results Page #27 added
Search Results Page #28 added
Search Results Page #29 added
Search Results Page #30 added
Search Results Page #31 added
Search Results Page #32 added
Search Results Page #33 added
Search Results Page

In [5]:
# For each URL above, scrape for desired data and write to CSV file
data_features = ['Sold Price', 'Beds', 'Baths', 'Floors', 'Garage Spaces', 'Lot Size (sq ft)', 'Home Size (sq ft)'
              , 'Year Built', 'School Score Avg', 'Walk Score', 'Transit Score', 'Bike Score', 'Laundry'
              , 'Heating', 'Air Conditioning', 'Pool', 'Address', 'City', 'County', 'Zip Code'
              , 'Property Type', 'Sold Status', 'URL']

# Errors that mean "this field is missing or unparsable on this page".
# Deliberately narrower than a bare `except:` so Ctrl-C (KeyboardInterrupt)
# still stops a long scrape.
PARSE_ERRORS = (AttributeError, IndexError, KeyError, TypeError, ValueError, ZeroDivisionError)


def _or_nan(extract):
    """Return extract(), or NaN when the field is missing or unparsable."""
    try:
        return extract()
    except PARSE_ERRORS:
        return np.nan


def _first_labelled_text(scope, labels):
    """Return the text of the tag following the first of `labels` found in `scope`.

    Labels are tried in order and matched against whole text nodes. Returns
    None when no label is present. Raises AttributeError when `scope` is None
    or no tag follows the label.
    """
    for label in labels:
        node = scope.find(string=label)
        if node:
            return node.find_next().text
    return None


def _mentions(page, pattern):
    """Return True when any text node on the page matches the regex `pattern`."""
    return bool(page.find(string=re.compile(pattern)))


def scrape_listing(page, url):
    """Build one CSV row, ordered like `data_features`, from a parsed listing page.

    Parameters:
        page: BeautifulSoup document of the listing page.
        url: listing URL, stored as the last column.

    Returns:
        list of 23 values; any field that cannot be found or parsed is NaN.
    """
    # home stats: the three values become NaN together if any one of them
    # (or the stats container itself) is missing or unparsable
    try:
        home_stats = page.find('div', class_='home-main-stats-variant')
        home_summary = [div.text for div in home_stats.find_all('div', class_='statsValue')]
        sold_price = int(home_summary[0].replace('$', '').replace(',', ''))
        beds = int(home_summary[1])
        baths = float(home_summary[2])
    except PARSE_ERRORS:
        sold_price = beds = baths = np.nan

    # floors
    try:
        stories_label = page.find(string='# of Stories: ')
        if stories_label:
            floors = int(stories_label.find_next().text)
        else:
            facts_label = page.find('div', class_='facts-table').find(string='Stories')
            # NaN when neither label exists; leaving the variable unassigned
            # would silently reuse the previous listing's value.
            floors = int(facts_label.next_element.text) if facts_label else np.nan
    except PARSE_ERRORS:
        floors = np.nan

    # garage: 0 when the page has neither garage label
    try:
        garage_text = _first_labelled_text(page, ('Garage (Maximum): ', 'Garage Spaces: '))
        garage = int(garage_text) if garage_text is not None else 0
    except PARSE_ERRORS:
        garage = np.nan

    # lot sq ft: the page reports acres; 1 acre = 43,560 sq ft
    try:
        amenities = page.find('div', class_='amenities-container')
        lot_acres = _first_labelled_text(amenities, ('Lot Acres: ', 'Lot Size Acres: '))
        # NaN when neither label exists (same stale-value hazard as floors).
        lot_sqft = float(lot_acres) * 43560 if lot_acres is not None else np.nan
    except PARSE_ERRORS:
        lot_sqft = np.nan

    # home sq ft
    home_sqft = _or_nan(lambda: int(
        page.find('div', class_='stat-block sqft-section')
            .find('span', class_='statsValue').text.replace(',', '')))

    # year built
    year_built = _or_nan(lambda: int(page.find(string='Year Built').find_next().text))

    # school score avg: sum of the displayed ratings divided by the number of school cards
    # NOTE(review): a card without a rating still counts in the divisor -- confirm intended.
    try:
        school_content = page.find('div', class_='schools-content')
        ratings = school_content.find_all('span', class_='rating-num font-size-base font-weight-bold')
        school_cards = school_content.find_all('div', class_='school-card-component')
        school_score_avg = sum(int(rating.text) for rating in ratings) / len(school_cards)
    except PARSE_ERRORS:
        school_score_avg = np.nan

    # transportation scores: read positionally as walk, transit, bike;
    # all three become NaN together if any one is missing
    try:
        scores = [div.find('span').text for div in page.find_all('div', class_='percentage')]
        walk_score = int(scores[0])
        transit_score = int(scores[1])
        bike_score = int(scores[2])
    except PARSE_ERRORS:
        walk_score = transit_score = bike_score = np.nan

    # amenity flags: True when the keyword appears in any text node on the page
    # NOTE(review): substring match, so e.g. 'Pool' is True for any text containing "Pool".
    laundry = _mentions(page, 'Laundry')
    heating = _mentions(page, 'Heating:')
    aircond = _mentions(page, 'Cooling:')
    pool = _mentions(page, 'Pool')

    # address, city and zip code are the comma-separated parts of the <title> text before the first '|'
    try:
        title_parts = page.find('head').find('title').text.split('|')[0].split(',')
    except PARSE_ERRORS:
        title_parts = []  # every title-derived field below falls back to NaN

    address = _or_nan(lambda: title_parts[0].upper())
    city = _or_nan(lambda: title_parts[1].strip().upper())
    zipcode = _or_nan(lambda: title_parts[2].split(' ')[2])

    # county, property type, sold status: value tag following the label text
    county = _or_nan(lambda: page.find(string='County').find_next().text.upper())
    property_type = _or_nan(lambda: page.find(string='Property Type').find_next().text.upper())
    sold_status = _or_nan(lambda: page.find(string='Status').find_next().text)

    return [sold_price, beds, baths, floors, garage, lot_sqft, home_sqft
            , year_built, school_score_avg, walk_score, transit_score
            , bike_score, laundry, heating, aircond, pool, address
            , city, county, zipcode, property_type, sold_status, url]


# newline='' is what the csv module expects; without it extra blank rows can
# appear on platforms that translate line endings.
with open('redfin_data.csv', 'a', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)

    # Append mode starts at the end of the file, so position 0 means the file
    # is new or empty; re-running the cell no longer adds a second header row.
    if f.tell() == 0:
        writer.writerow(data_features)

    # url_count is the 1-based position of the listing in listing_urls, so a
    # warning identifies the URL that actually failed.
    for url_count, url in enumerate(listing_urls, start=1):

        # Pause 1-2 seconds between requests.
        time.sleep(1 + 1*random.random())

        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(url, headers=user_agent, timeout=30)
        except requests.RequestException as err:
            print(f'WARNING. Request failed on URL No. {url_count}: {err}')
            continue

        status = response.status_code
        if status != 200:
            print(f'WARNING. Response Code {status} on URL No. {url_count}')
            continue

        # Name the parser explicitly so the parse does not depend on which
        # optional parser libraries happen to be installed.
        page = bs(response.text, 'html.parser')

        writer.writerow(scrape_listing(page, url))
        # Flush per row so an interrupted run keeps everything scraped so far.
        f.flush()

        print(f'Success. URL No. {url_count} added')

Success. URL No. 1 added
Success. URL No. 2 added
Success. URL No. 3 added
Success. URL No. 4 added
Success. URL No. 5 added
Success. URL No. 6 added
Success. URL No. 7 added
Success. URL No. 8 added
Success. URL No. 9 added
Success. URL No. 10 added
Success. URL No. 11 added
Success. URL No. 12 added
Success. URL No. 13 added
Success. URL No. 14 added
Success. URL No. 15 added
Success. URL No. 16 added
Success. URL No. 17 added
Success. URL No. 18 added
Success. URL No. 19 added
Success. URL No. 20 added
Success. URL No. 21 added
Success. URL No. 22 added
Success. URL No. 23 added
Success. URL No. 24 added
Success. URL No. 25 added
Success. URL No. 26 added
Success. URL No. 27 added
Success. URL No. 28 added
Success. URL No. 29 added
Success. URL No. 30 added
Success. URL No. 31 added
Success. URL No. 32 added
Success. URL No. 33 added
Success. URL No. 34 added
Success. URL No. 35 added
Success. URL No. 36 added
Success. URL No. 37 added
Success. URL No. 38 added
Success. URL No. 39 a

Success. URL No. 309 added
Success. URL No. 310 added
Success. URL No. 311 added
Success. URL No. 312 added
Success. URL No. 313 added
Success. URL No. 314 added
Success. URL No. 315 added
Success. URL No. 316 added
Success. URL No. 317 added
Success. URL No. 318 added
Success. URL No. 319 added
Success. URL No. 320 added
Success. URL No. 321 added
Success. URL No. 322 added
Success. URL No. 323 added
Success. URL No. 324 added
Success. URL No. 325 added
Success. URL No. 326 added
Success. URL No. 327 added
Success. URL No. 328 added
Success. URL No. 329 added
Success. URL No. 330 added
Success. URL No. 331 added
Success. URL No. 332 added
Success. URL No. 333 added
Success. URL No. 334 added
Success. URL No. 335 added
Success. URL No. 336 added
Success. URL No. 337 added
Success. URL No. 338 added
Success. URL No. 339 added
Success. URL No. 340 added
Success. URL No. 341 added
Success. URL No. 342 added
Success. URL No. 343 added
Success. URL No. 344 added
Success. URL No. 345 added
S

Success. URL No. 613 added
Success. URL No. 614 added
Success. URL No. 615 added
Success. URL No. 616 added
Success. URL No. 617 added
Success. URL No. 618 added
Success. URL No. 619 added
Success. URL No. 620 added
Success. URL No. 621 added
Success. URL No. 622 added
Success. URL No. 623 added
Success. URL No. 624 added
Success. URL No. 625 added
Success. URL No. 626 added
Success. URL No. 627 added
Success. URL No. 628 added
Success. URL No. 629 added
Success. URL No. 630 added
Success. URL No. 631 added
Success. URL No. 632 added
Success. URL No. 633 added
Success. URL No. 634 added
Success. URL No. 635 added
Success. URL No. 636 added
Success. URL No. 637 added
Success. URL No. 638 added
Success. URL No. 639 added
Success. URL No. 640 added
Success. URL No. 641 added
Success. URL No. 642 added
Success. URL No. 643 added
Success. URL No. 644 added
Success. URL No. 645 added
Success. URL No. 646 added
Success. URL No. 647 added
Success. URL No. 648 added
Success. URL No. 649 added
S

Success. URL No. 917 added
Success. URL No. 918 added
Success. URL No. 919 added
Success. URL No. 920 added
Success. URL No. 921 added
Success. URL No. 922 added
Success. URL No. 923 added
Success. URL No. 924 added
Success. URL No. 925 added
Success. URL No. 926 added
Success. URL No. 927 added
Success. URL No. 928 added
Success. URL No. 929 added
Success. URL No. 930 added
Success. URL No. 931 added
Success. URL No. 932 added
Success. URL No. 933 added
Success. URL No. 934 added
Success. URL No. 935 added
Success. URL No. 936 added
Success. URL No. 937 added
Success. URL No. 938 added
Success. URL No. 939 added
Success. URL No. 940 added
Success. URL No. 941 added
Success. URL No. 942 added
Success. URL No. 943 added
Success. URL No. 944 added
Success. URL No. 945 added
Success. URL No. 946 added
Success. URL No. 947 added
Success. URL No. 948 added
Success. URL No. 949 added
Success. URL No. 950 added
Success. URL No. 951 added
Success. URL No. 952 added
Success. URL No. 953 added
S

Success. URL No. 1213 added
Success. URL No. 1214 added
Success. URL No. 1215 added
Success. URL No. 1216 added
Success. URL No. 1217 added
Success. URL No. 1218 added
Success. URL No. 1219 added
Success. URL No. 1220 added
Success. URL No. 1221 added
Success. URL No. 1222 added
Success. URL No. 1223 added
Success. URL No. 1224 added
Success. URL No. 1225 added
Success. URL No. 1226 added
Success. URL No. 1227 added
Success. URL No. 1228 added
Success. URL No. 1229 added
Success. URL No. 1230 added
Success. URL No. 1231 added
Success. URL No. 1232 added
Success. URL No. 1233 added
Success. URL No. 1234 added
Success. URL No. 1235 added
Success. URL No. 1236 added
Success. URL No. 1237 added
Success. URL No. 1238 added
Success. URL No. 1239 added
Success. URL No. 1240 added
Success. URL No. 1241 added
Success. URL No. 1242 added
Success. URL No. 1243 added
Success. URL No. 1244 added
Success. URL No. 1245 added
Success. URL No. 1246 added
Success. URL No. 1247 added
Success. URL No. 124

Success. URL No. 1506 added
Success. URL No. 1507 added
Success. URL No. 1508 added
Success. URL No. 1509 added
Success. URL No. 1510 added
Success. URL No. 1511 added
Success. URL No. 1512 added
Success. URL No. 1513 added
Success. URL No. 1514 added
Success. URL No. 1515 added
Success. URL No. 1516 added
Success. URL No. 1517 added
Success. URL No. 1518 added
Success. URL No. 1519 added
Success. URL No. 1520 added
Success. URL No. 1521 added
Success. URL No. 1522 added
Success. URL No. 1523 added
Success. URL No. 1524 added
Success. URL No. 1525 added
Success. URL No. 1526 added
Success. URL No. 1527 added
Success. URL No. 1528 added
Success. URL No. 1529 added
Success. URL No. 1530 added
Success. URL No. 1531 added
Success. URL No. 1532 added
Success. URL No. 1533 added
Success. URL No. 1534 added
Success. URL No. 1535 added
Success. URL No. 1536 added
Success. URL No. 1537 added
Success. URL No. 1538 added
Success. URL No. 1539 added
Success. URL No. 1540 added
Success. URL No. 154