In [None]:
import requests, bs4, re, time, random, csv
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd

In [None]:
# Smoke test: fetch one search-results page and collect the listing URLs it links to.
test_url = 'https://www.redfin.com/city/11961/CA/Menlo-Park/filter/property-type=house,include=sold-6mo'
user_agent = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'}

# Start from an empty list so the cells below always see a defined value,
# rather than a NameError (fresh kernel) or a stale list (re-run) when the request fails.
listing_urls = []

# Without a timeout a stalled connection would block the kernel indefinitely.
response = requests.get(test_url, headers=user_agent, timeout=30)
status = response.status_code

if status == 200:
    # Name the parser explicitly: otherwise bs4 picks whichever parser happens to be
    # installed (and warns), so the parse tree could differ between machines.
    test_page = bs(response.text, 'html.parser')

    # Listing links are the <a class="slider-item"> anchors; their hrefs are site-relative.
    tags_of_interest = test_page.find_all('a', {'class': 'slider-item'})
    listing_urls = ['https://www.redfin.com' + item['href'] for item in tags_of_interest]

    print(f'Success. Test results page scraped. {len(listing_urls)} URLs collected')

else:
    print(f'WARNING. Response code {status} received.')

In [None]:
# Display the collected listing URLs for a visual check before scraping them.
listing_urls

In [None]:
# SCRAPE FOR DESIRED DATA
# Visits every URL in `listing_urls`, extracts one row of features per listing and
# appends it to redfin_data.csv immediately, so an interrupted run keeps its progress.
# Every field is parsed best-effort: a field that cannot be found or converted is
# recorded as NaN instead of aborting the run.

# Column order must match the `data` row assembled at the bottom of the loop.
data_features = ['Sold Price', 'Beds', 'Baths', 'Floors', 'Garage Spaces', 'Lot Size (sq ft)', 'Home Size (sq ft)'
              , 'Year Built', 'School Score Avg', 'Walk Score', 'Transit Score', 'Bike Score', 'Laundry'
              , 'Heating', 'Air Conditioning', 'Pool', 'Address', 'City', 'County', 'Zip Code'
              , 'Property Type', 'Sold Status', 'URL']

SQFT_PER_ACRE = 43560   # lot sizes are read in acres and stored in square feet
REQUEST_TIMEOUT = 30    # seconds; without it a stalled connection would hang the loop


def _value_after(scope, label):
    """Return the text of the first tag following the text node equal to `label`.

    `scope` is the soup (or a tag) to search in. Returns None when `scope` is
    None or holds no such text node, so callers can tell "label absent" apart
    from a value that failed to parse.
    """
    if scope is None:
        return None
    node = scope.find(string=label)
    return node.find_next().text if node is not None else None


# newline='' is what the csv module expects; it prevents blank rows on Windows.
with open('redfin_data.csv', 'a', encoding='UTF8', newline='') as f:
    # Append mode positions the stream at end-of-file, so 0 means the file is new or
    # empty. Writing the header only then keeps re-runs from inserting header rows
    # in the middle of the data.
    if f.tell() == 0:
        csv.writer(f).writerow(data_features)

# url_count is the 1-based position in listing_urls, advanced for failures too,
# so every message identifies the URL it is actually about.
for url_count, url in enumerate(listing_urls, start=1):

    # Randomised 3-6 s pause between requests.
    time.sleep(3 + 3*random.random())

    try:
        response = requests.get(url, headers=user_agent, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        # One unreachable listing should not discard the rest of the run.
        print(f'WARNING. Request failed on URL No. {url_count}: {err}')
        continue

    status = response.status_code

    if status != 200:
        print(f'WARNING. Response Code {status} on URL No. {url_count}')
        continue

    # Explicit parser: keeps the parse tree independent of which parsers are installed.
    page = bs(response.text, 'html.parser')

    # Collect variables:

    # home stats -- the first three statsValue entries are read as price, beds, baths.
    # All three fall back together if any one of them is missing or malformed.
    try:
        home_stats = page.find('div', class_='home-main-stats-variant')
        home_summary = [div.text for div in home_stats.find_all('div', class_='statsValue')]

        sold_price = int(home_summary[0].replace('$', '').replace(',', ''))
        beds = int(home_summary[1])
        baths = float(home_summary[2])
    except Exception:
        sold_price = beds = baths = np.nan

    # floors -- two page layouts are handled; NaN when neither label is present
    # (never the previous listing's value).
    try:
        stories = _value_after(page, '# of Stories: ')
        if stories is None:
            facts_table = page.find('div', class_='facts-table')
            stories_label = facts_table.find(string='Stories') if facts_table is not None else None
            stories = stories_label.next_element.text if stories_label is not None else None
        floors = int(stories) if stories is not None else np.nan
    except Exception:
        floors = np.nan

    # garage -- a page with no garage label at all is recorded as 0 spaces
    try:
        spaces = _value_after(page, 'Garage (Maximum): ')
        if spaces is None:
            spaces = _value_after(page, 'Garage Spaces: ')
        garage = int(spaces) if spaces is not None else 0
    except Exception:
        garage = np.nan

    # lot sq ft -- NaN when neither acreage label is present
    try:
        amenities = page.find('div', class_='amenities-container')
        acres = _value_after(amenities, 'Lot Acres: ')
        if acres is None:
            acres = _value_after(amenities, 'Lot Size Acres: ')
        lot_sqft = float(acres) * SQFT_PER_ACRE if acres is not None else np.nan
    except Exception:
        lot_sqft = np.nan

    # home sq ft
    try:
        sqft_block = page.find('div', class_='stat-block sqft-section')
        home_sqft = int(sqft_block.find('span', class_='statsValue').text.replace(',', ''))
    except Exception:
        home_sqft = np.nan

    # year built
    try:
        year_built = int(_value_after(page, 'Year Built'))
    except Exception:
        year_built = np.nan

    # school score avg -- sum of the displayed ratings divided by the number of school cards.
    # NOTE(review): a card without a rating still counts in the denominator, which
    # lowers the average -- confirm that this is intended.
    try:
        school_content = page.find('div', class_='schools-content')
        schools_score = school_content.find_all('span', class_='rating-num font-size-base font-weight-bold')
        total_score = sum(int(value.text) for value in schools_score)
        school_sum = len(school_content.find_all('div', class_='school-card-component'))
        school_score_avg = total_score / school_sum
    except Exception:
        school_score_avg = np.nan

    # transportation scores -- read positionally as walk, transit, bike
    try:
        scores = [div.find('span').text for div in page.find_all('div', class_='percentage')]
        walk_score = int(scores[0])
        transit_score = int(scores[1])
        bike_score = int(scores[2])
    except Exception:
        walk_score = transit_score = bike_score = np.nan

    # amenity flags -- True when the pattern occurs in any text node of the page
    laundry = bool(page.find(string=re.compile('Laundry')))
    heating = bool(page.find(string=re.compile('Heating:')))
    aircond = bool(page.find(string=re.compile('Cooling:')))
    pool = bool(page.find(string=re.compile('Pool')))

    # address / city / zip code -- comma-separated parts of the <title> text before the first '|'.
    # Each part falls back to NaN independently of the others.
    try:
        title_parts = page.find('head').find('title').text.split('|')[0].split(',')
    except Exception:
        title_parts = []

    try:
        address = title_parts[0].upper()
    except Exception:
        address = np.nan

    try:
        city = title_parts[1].strip().upper()
    except Exception:
        city = np.nan

    try:
        # third token of the third part once that part is split on single spaces
        zipcode = title_parts[2].split(' ')[2]
    except Exception:
        zipcode = np.nan

    # county
    try:
        county = _value_after(page, 'County').upper()
    except Exception:
        county = np.nan

    # property type
    try:
        property_type = _value_after(page, 'Property Type').upper()
    except Exception:
        property_type = np.nan

    # sold status
    try:
        sold_status = _value_after(page, 'Status')
        if sold_status is None:
            sold_status = np.nan
    except Exception:
        sold_status = np.nan

    data = [sold_price, beds, baths, floors, garage, lot_sqft, home_sqft
            , year_built, school_score_avg, walk_score, transit_score
            , bike_score, laundry, heating, aircond, pool, address
            , city, county, zipcode, property_type, sold_status, url]

    # Reopen per row so each listing is on disk before the next request is made.
    with open('redfin_data.csv', 'a', encoding='UTF8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(data)

    print(f'Success. URL No. {url_count} Scrapped')


In [None]:
# Sanity check: (rows, columns) of the CSV accumulated by the scraping cell.
pd.read_csv('redfin_data.csv').shape