# Scrape Redfin

In [1]:
from bs4 import BeautifulSoup as bs
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
import requests
import re
import pandas as pd
import time


In [2]:
# open browser
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)


### Start Scraping and load data into `homecard_list`

In [3]:
homecard_list = []
sub_dict= {}

# get first 10 pages
for i in range(10):
    
    url = f"https://www.redfin.com/county/321/CA/Los-Angeles-County/Page-{i+1}"
    browser.visit(url)
    time.sleep(2)
    
    # create soup object
    html = browser.html
    soup = bs(html, "html5lib")

    # get list of house cards on the first page
    homecards = soup.find_all("div","HomeCardContainer")
    
    for homecard in homecards:

        sub_dict= {}
        # store values into sub_dict
        sub_dict["beds"] = homecard.find("div","HomeStatsV2").find_all("div","stats")[0].text # numer of beds
        sub_dict["baths"] = homecard.find("div","HomeStatsV2").find_all("div","stats")[1].text # number of baths
        sub_dict["area"]  = homecard.find("div","HomeStatsV2").find_all("div","stats")[2].text # area
        sub_dict["price"] = homecard.find("span","homecardV2Price").text # house price
        sub_dict["address"] = homecard.find("div","homeAddressV2").span.text # address
        sub_dict["link"] = url+homecard.a["href"]

        # append dict to list
        homecard_list.append(sub_dict)


        


### Save `homecard_list` into a pandas dataframe

In [4]:
housedata = pd.DataFrame(columns = ["beds","baths","area","price","address","link"],data = homecard_list)
housedata.head()

Unnamed: 0,beds,baths,area,price,address,link
0,3 Beds,1.5 Baths,"1,290 Sq. Ft.","$879,000","2620 E Glenoaks Blvd, Glendale, CA 91206",https://www.redfin.com/county/321/CA/Los-Angel...
1,4 Beds,2 Baths,"1,628 Sq. Ft.","$599,000","11037 Chadsey Dr, Whittier, CA 90604",https://www.redfin.com/county/321/CA/Los-Angel...
2,2 Beds,2 Baths,"1,073 Sq. Ft.","$379,900","25925 Oak St #112, Lomita, CA 90717",https://www.redfin.com/county/321/CA/Los-Angel...
3,3 Beds,1 Bath,"1,034 Sq. Ft.","$849,000","2716 182nd St, Redondo Beach, CA 90278",https://www.redfin.com/county/321/CA/Los-Angel...
4,4 Beds,2 Baths,"1,920 Sq. Ft.","$664,900","1515 Temple Ave, Long Beach, CA 90804",https://www.redfin.com/county/321/CA/Los-Angel...


### Clean up the data

In [5]:
housedata["beds"] = [re.split("\s", bed)[0] if re.split("\s", bed)[0].isnumeric() else "0" for bed in housedata["beds"]]
housedata["baths"] = [re.split("\s", bath)[0] if bath!="-Baths" else "0" for bath in housedata["baths"]]
housedata["area"] = [re.split("\s", a)[0].replace(",","") for a in housedata["area"]]
housedata["price"] = [p.replace("$","").replace(",","") for p in housedata["price"]]
housedata["zip_code"] = [re.split("\s",add)[-1] for add in housedata["address"]]
housedata.head()



Unnamed: 0,beds,baths,area,price,address,link,zip_code
0,3,1.5,1290,879000,"2620 E Glenoaks Blvd, Glendale, CA 91206",https://www.redfin.com/county/321/CA/Los-Angel...,91206
1,4,2.0,1628,599000,"11037 Chadsey Dr, Whittier, CA 90604",https://www.redfin.com/county/321/CA/Los-Angel...,90604
2,2,2.0,1073,379900,"25925 Oak St #112, Lomita, CA 90717",https://www.redfin.com/county/321/CA/Los-Angel...,90717
3,3,1.0,1034,849000,"2716 182nd St, Redondo Beach, CA 90278",https://www.redfin.com/county/321/CA/Los-Angel...,90278
4,4,2.0,1920,664900,"1515 Temple Ave, Long Beach, CA 90804",https://www.redfin.com/county/321/CA/Los-Angel...,90804


In [6]:
# rename the column and reorder
redfin_data_table = (housedata.loc[:,["price","zip_code","address","beds","baths","link"]]
                        .rename(columns = {"price":"house_price","address":"house_address","link":"house_link"}))
redfin_data_table.head()


Unnamed: 0,house_price,zip_code,house_address,beds,baths,house_link
0,879000,91206,"2620 E Glenoaks Blvd, Glendale, CA 91206",3,1.5,https://www.redfin.com/county/321/CA/Los-Angel...
1,599000,90604,"11037 Chadsey Dr, Whittier, CA 90604",4,2.0,https://www.redfin.com/county/321/CA/Los-Angel...
2,379900,90717,"25925 Oak St #112, Lomita, CA 90717",2,2.0,https://www.redfin.com/county/321/CA/Los-Angel...
3,849000,90278,"2716 182nd St, Redondo Beach, CA 90278",3,1.0,https://www.redfin.com/county/321/CA/Los-Angel...
4,664900,90804,"1515 Temple Ave, Long Beach, CA 90804",4,2.0,https://www.redfin.com/county/321/CA/Los-Angel...


In [None]:
# export redfin data table to csv
redfin_data_table.to_csv("Redfin_Data_Tabele.csv",index = False)


In [8]:
# create zip code file
city = [re.split(",",city)[1].replace(" ","") for city in redfin_data_table["house_address"]]
zip_code = redfin_data_table["zip_code"]

zip_code_table = pd.DataFrame(columns = ["city","zip_code"], data = zip(city, zip_code))
zip_code_table["state"] = "CA"
zip_code_table = zip_code_table.drop_duplicates()

zip_code_table.head()

Unnamed: 0,city,zip_code,state
0,Glendale,91206,CA
1,Whittier,90604,CA
2,Lomita,90717,CA
3,RedondoBeach,90278,CA
4,LongBeach,90804,CA


In [None]:
zip_code_table.to_csv("Zip_Code.csv",index= False)