# Beautiful Soup Web Scraping Project
Using beautiful soup and selenium to collect data from trulia. Getting information about houses for sale in San Diego and saving to a csv file.
Will add more later to collect data in diffrent cities.

## Imports

In [2]:
import json
from bs4 import BeautifulSoup
import requests
import pandas as pd
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import selenium.webdriver.support.expected_conditions as EC
import urllib.parse

## HTML Call
### using selenium webdriver to bypass blocks

In [3]:
driver = webdriver.Edge()
response = driver.get("https://www.trulia.com/CA/San_Diego/")
WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'ul[data-testid="search-result-list-container"]')))
print(response)
#should have a window pop up.

None


In [23]:
#previous way that I was getting the html call, had to switch to 
#using web driver because the website was blocking my calls
# result = requests.get("https://www.trulia.com/CA/San_Diego/")
# print(result.status_code)
# #200 works, 404 not working
# src = result.content

## Create Soup Object

In [13]:
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [14]:
print(soup)

<html lang="en"><head><style>.LGLeeN-keyboard-shortcuts-view{display:-webkit-box;display:-webkit-flex;display:-moz-box;display:-ms-flexbox;display:flex}.LGLeeN-keyboard-shortcuts-view table,.LGLeeN-keyboard-shortcuts-view tbody,.LGLeeN-keyboard-shortcuts-view td,.LGLeeN-keyboard-shortcuts-view tr{background:inherit;border:none;margin:0;padding:0}.LGLeeN-keyboard-shortcuts-view table{display:table}.LGLeeN-keyboard-shortcuts-view tr{display:table-row}.LGLeeN-keyboard-shortcuts-view td{-moz-box-sizing:border-box;box-sizing:border-box;display:table-cell;color:#000;padding:6px;vertical-align:middle;white-space:nowrap}.LGLeeN-keyboard-shortcuts-view td .VdnQmO-keyboard-shortcuts-view--shortcut-key{background-color:#e8eaed;border-radius:2px;border:none;-moz-box-sizing:border-box;box-sizing:border-box;color:inherit;display:inline-block;font-family:Google Sans Text,Roboto,Arial,sans-serif;line-height:16px;margin:0 2px;min-height:20px;min-width:20px;padding:2px 4px;position:relative;text-align:c

## Results

In [15]:
result_container = soup.find_all('li', {'class':'sc-fc01d244-0'})
len(result_container)

42

## Filtering out the elements that have the attribute "data-testid' from the result container

In [16]:
results_update = []

for results in result_container: 
    if results.has_attr('data-testid'):
        results_update.append(results)

In [17]:
len(results_update)
#should be 40 results

40

## Extracting links for the results

Want to extract links from the results in the results_update list. Will have links for all 40 elements. This will be used to extract data from the property page, such as the house details and pricing.

In [18]:
url_1 = 'https://www.trulia.com'
url_2 = []
count = 0
#loop through the results in the results_update list for item
for item in results_update:
    for link in item.find_all('div', {'data-testid':'property-card-details'}):
        #print(link.find('a').get('href'))
        url_2.append(link.find('a').get('href'))
        #loop through the results_update and find all the divs with the 
        #attribute of property card details 
        #then get the 'a' link and add to url_2 list

In [19]:
print(url_2)
#checking if we got some links
len(url_2)

['/p/ca/san-diego/5418-harvest-run-dr-san-diego-ca-92130--1064193329', '/p/ca/san-diego/4368-42nd-st-3-san-diego-ca-92105--2079727955', '/p/ca/san-diego/6415-benson-ave-san-diego-ca-92114--2079818824', '/p/ca/san-diego/16326-avenida-venusto-b-san-diego-ca-92128--2079970422', '/p/ca/san-diego/3820-3820-quarter-mile-dr-san-diego-ca-92130--2704030360', '/p/ca/san-diego/700-w-east-st-1901-san-diego-ca-92101--2116268561', '/p/ca/coronado/50-montego-ct-coronado-ca-92118--1001808762', '/p/ca/san-diego/6296-lisieux-ter-san-diego-ca-92120--2079888064', '/p/ca/san-diego/1328-s-58th-st-san-diego-ca-92114--2116275116', '/p/ca/la-jolla/617-westbourne-st-la-jolla-ca-92037--2079479402', '/p/ca/san-diego/700-w-harbor-dr-w-2902-san-diego-ca-92101--2124227014', '/p/ca/san-diego/725-727-san-fernando-pl-san-diego-ca-92109--2189546261', '/p/ca/coronado/1750-avenida-del-mundo-1108-coronado-ca-92118--2121524015', '/p/ca/san-diego/7980-mission-center-ct-f-san-diego-ca-92108--2079754406', '/p/ca/san-diego/8334

40

## Join URL 1 and URL 2 together to make one whole link

In [20]:
url_joined = []

for link_2 in url_2: 
    url_joined.append(urllib.parse.urljoin(url_1, link_2))
    
url_joined

['https://www.trulia.com/p/ca/san-diego/5418-harvest-run-dr-san-diego-ca-92130--1064193329',
 'https://www.trulia.com/p/ca/san-diego/4368-42nd-st-3-san-diego-ca-92105--2079727955',
 'https://www.trulia.com/p/ca/san-diego/6415-benson-ave-san-diego-ca-92114--2079818824',
 'https://www.trulia.com/p/ca/san-diego/16326-avenida-venusto-b-san-diego-ca-92128--2079970422',
 'https://www.trulia.com/p/ca/san-diego/3820-3820-quarter-mile-dr-san-diego-ca-92130--2704030360',
 'https://www.trulia.com/p/ca/san-diego/700-w-east-st-1901-san-diego-ca-92101--2116268561',
 'https://www.trulia.com/p/ca/coronado/50-montego-ct-coronado-ca-92118--1001808762',
 'https://www.trulia.com/p/ca/san-diego/6296-lisieux-ter-san-diego-ca-92120--2079888064',
 'https://www.trulia.com/p/ca/san-diego/1328-s-58th-st-san-diego-ca-92114--2116275116',
 'https://www.trulia.com/p/ca/la-jolla/617-westbourne-st-la-jolla-ca-92037--2079479402',
 'https://www.trulia.com/p/ca/san-diego/700-w-harbor-dr-w-2902-san-diego-ca-92101--2124227

## Get Data From Links

Want to get address, bedrooms, bathrooms, sqft, year built, parking, and price

In [179]:
first_link = url_joined[0]
print(first_link)

https://www.trulia.com/p/ca/san-diego/3820-3820-quarter-mile-dr-san-diego-ca-92130--2704030360


### Get request using a new driver and creating a new soup object

In [181]:
#get request using a new driver and creating soup object
driver2 = webdriver.Edge()
response = driver2.get(first_link)
print(response)

None


In [182]:
soup2 = BeautifulSoup(driver2.page_source, 'html.parser')
print(soup2)

<html lang="en"><head><meta charset="utf-8"/><title>3820-3820 Quarter Mile Dr, San Diego, CA 92130 | MLS# 230009666 | Trulia</title><meta content="3820-3820 Quarter Mile Dr, San Diego, CA 92130 is a 1,921 sqft, 3 bed, 3 bath Condo listed for $1,539,000. View 19 photos, review home and neighborhood details, and contact an agent to learn more" name="description"/><meta content="width=device-width, initial-scale=1.0, maximum-scale=5.0, viewport-fit=cover" name="viewport"/><meta content="5953837487" property="fb:admins"/><meta content="183577541666001" property="fb:app_id"/><meta content="Trulia" property="application-name"/><meta content="https://www.trulia.com" property="msapplication-starturl"/><meta content="Trulia: Real Estate Search" property="msapplication-tooltip"/><meta content="/browserconfig.xml" property="msapplication-config"/><meta content="en_US" property="og:locale"/><meta content="Trulia Real Estate Search" property="og:site_name"/><meta content="website" property="og:type

### Getting Address

In [185]:
soup2.find('span',{'data-testid':'home-details-summary-headline'}).get_text()

'3820-3820 Quarter Mile Dr'

### Bedrooms

In [187]:
soup2.find('li', {'data-testid':'bed'}).get_text()

'3 Beds'

### Bathrooms

In [188]:
soup2.find('li', {'data-testid':'bath'}).get_text()

'3 Baths'

### Sqft

In [189]:
soup2.find('li', {'data-testid':'floor'}).get_text()

'1,921 sqft'

### Year Built

In [206]:
soup2.find('div', string = 'Year Built').findNext('div').findNext('div').get_text()

'Year Built: 2000'

### Parking

In [195]:
soup2.find('div', string = 'Parking').findNext('div').findNext('div').get_text()

'2 Car Garage'

### Price

In [197]:
soup2.find('h3', {'data-testid':'on-market-price-details'}).get_text()

'$1,539,000'

## Loop through all links and grab all 7 data points

In [198]:
# create lists with data 
address = []
bedrooms = []
bathrooms = []
area = []
year_built = []
parking = []
price = []

#loop through all joined links
for link in url_joined:
    #start new driver with link
    driver3 = webdriver.Edge()
    response = driver3.get(link)
    
    #create soup object
    soup3 = BeautifulSoup(driver3.page_source, 'html.parser')
    
    try:
        address.append(soup3.find('span',{'data-testid':'home-details-summary-headline'}).get_text())
    except:
        address.append('')
    
    try:
        bedrooms.append(soup3.find('li', {'data-testid':'bed'}).get_text())
    except:
        bedrooms.append('')
    
    try:
        bathrooms.append(soup3.find('li', {'data-testid':'bath'}).get_text())
    except:
        bathrooms.append('')
    
    try:
        area.append(soup3.find('li', {'data-testid':'floor'}).get_text())
    except: 
        area.append('')
    
    try:
        year_built.append(soup3.find('div', string = 'Year Built').findNext('div').findNext('div').get_text())
    except: 
        year_built.append('')
        
    try:
        parking.append(soup3.find('div', string = 'Parking').findNext('div').findNext('div').get_text())
    except:
        parking.append('')
    
    try:
        price.append(soup3.find('h3', {'data-testid':'on-market-price-details'}).get_text())
    except:
        price.append('')
        
    output = {'Address':address, 'Bedrooms':bedrooms, 'Bathrooms':bathrooms, 'Area':area, 
             'Year Built':year_built, 'Parking':parking, 'Price':price}
    
    #closing the driver at the end
    driver3.close()
    

In [199]:
#show output 
output

{'Address': ['3820-3820 Quarter Mile Dr',
  '700 W  East St #1901',
  '50 Montego Ct',
  '6296 Lisieux Ter',
  '1328 S  58th St',
  '617 Westbourne St',
  '700 W  Harbor Dr   W  #2902',
  '7980 Mission Center Ct #F',
  '8334 Glen Vista Ct',
  '721-723 San Fernando Pl',
  '1307 Caminito Gabaldon #E',
  '3050 Rue D Orleans #321',
  '3340 Del Sol Blvd #12',
  '13522 Calais Dr',
  '5159 Marlborough Dr',
  '211 Almazon St   #211',
  '3907 Georgia St #26',
  '3749 Kingsley St',
  '1806 McKee St #A3',
  '3851 Curtis St',
  '6711 Kenmar Way',
  '8909 Fallwood Ave',
  '823 9th St',
  '3244-50 Reynard Way',
  '4686 Mount Alifan Dr',
  '3246 Geronimo Ave',
  '6391 Caminito Marcial',
  '700 W  Harbor Dr   W  #2003',
  '12539 El Camino Real #C',
  '9328 Song Bird Way',
  '10209 W  River Bluff Dr',
  '9300 Song Bird Way',
  '17287 Regalo Ln',
  '211 Almazon St',
  '9534 Vervain St',
  '1565 Northrim Ct #295',
  '4677 Pescadero Ave',
  '6734 University Ave'],
 'Bedrooms': ['3 Beds',
  '3 Beds',
  '3 

In [203]:
#putting output in a DataFrame
df = pd.DataFrame(output)
df

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price
0,3820-3820 Quarter Mile Dr,3 Beds,3 Baths,"1,947 sqft",Year Built: 2004,2 Car Garage,"$1,539,000"
1,700 W East St #1901,3 Beds,2 Baths,"1,315 sqft",Year Built: 2007,Garage,"$1,240,000"
2,50 Montego Ct,3 Beds,3 Baths,"1,492 sqft",Year Built: 1982,Garage,"$1,490,000"
3,6296 Lisieux Ter,3 Beds,2 Baths,"1,762 sqft",Year Built: 1976,Garage,"$1,395,000"
4,1328 S 58th St,4 Beds,2 Baths,"2,178 sqft",Year Built: 1963,Garage,"$859,000"
5,617 Westbourne St,3 Beds,2 Baths,"1,718 sqft",Year Built: 1980,Garage,"$1,495,000"
6,700 W Harbor Dr W #2902,2 Beds,2 Baths,"2,047 sqft",Year Built: 2004,2 Car Garage,"$2,999,900"
7,7980 Mission Center Ct #F,2 Beds,1 Bath,726 sqft,Year Built: 1981,Garage,"$449,000"
8,8334 Glen Vista Ct,4 Beds,2 Baths,"1,986 sqft",Year Built: 1975,2 Car Garage,"$749,995"
9,721-723 San Fernando Pl,6 Beds,6 Baths,"2,870 sqft",Year Built: 1991,4 Car Garage,"$3,750,000"


## Accessing Multiple Pages of Results
Getting multiple pages worth of data.

### Gathering all links from 25 pages

In [28]:
url_1 = 'https://www.trulia.com'
url_joined = []

for i in range(1,26):
    #https://www.trulia.com/CA/San_Diego/2_p/
    website = 'https://www.trulia.com/CA/San_Diego/' + str(i) + '_p/'
    
    #request
    driver4 = webdriver.Edge()
    response = driver4.get(website)
    WebDriverWait(driver4, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'ul[data-testid="search-result-list-container"]')))
    driver4.maximize_window()
    time.sleep(10) 

    #create soup object
    soup4 = BeautifulSoup(driver4.page_source, 'html.parser')
    
    #result container 
    result_container = soup4.find_all('li', {'class':'sc-fc01d244-0'})
    len(result_container)
    
    results_update = []

    for results in result_container: 
        if results.has_attr('data-testid'):
            results_update.append(results)
    
    #relative url
    relative_url = []
    #loop through the results in the results_update list for item
    for item in results_update:
        for link in item.find_all('div', {'data-testid':'property-card-details'}):
            relative_url.append(link.find('a').get('href'))
    len(relative_url)
    #joining urls
    for link_2 in relative_url: 
        url_joined.append(urllib.parse.urljoin(url_1, link_2))
    


In [29]:
len(url_joined)

918

### Going through each link and generating data

In [30]:
# create lists with data 
address = []
bedrooms = []
bathrooms = []
area = []
year_built = []
parking = []
price = []


for link in url_joined:
    driver3 = webdriver.Edge()
    response = driver3.get(link)
    #create soup object
    soup3 = BeautifulSoup(driver3.page_source, 'html.parser')
    
    try:
        address.append(soup3.find('span',{'data-testid':'home-details-summary-headline'}).get_text())
    except:
        address.append('')
        
    try:
        bedrooms.append(soup3.find('li', {'data-testid':'bed'}).get_text())
    except:
        bedrooms.append('')
    
    try:
        bathrooms.append(soup3.find('li', {'data-testid':'bath'}).get_text())
    except:
        bathrooms.append('')
    
    try:
        area.append(soup3.find('li', {'data-testid':'floor'}).get_text())
    except: 
        area.append('')
    
    try:
        year_built.append(soup3.find('div', string = 'Year Built').findNext('div').findNext('div').get_text())
    except: 
        year_built.append('')
        
    try:
        parking.append(soup3.find('div', string = 'Parking').findNext('div').findNext('div').get_text())
    except:
        parking.append('')
    
    try:
        price.append(soup3.find('h3', {'data-testid':'on-market-price-details'}).get_text())
    except:
        price.append('')
        
    output = {'Address':address, 'Bedrooms':bedrooms, 'Bathrooms':bathrooms, 'Area':area, 
             'Year Built':year_built, 'Parking':parking, 'Price':price}
    
    #closing the driver at the end
    driver3.close()
    
    
    

In [31]:
#putting output in a DataFrame
df = pd.DataFrame(output)
df

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price
0,5418 Harvest Run Dr,6 Beds,6 Baths,"4,687 sqft",Year Built: 2001,3 Car Garage,"$3,295,000"
1,4368 42nd St #3,2 Beds,2 Baths,947 sqft,Year Built: 1984,Garage,"$629,000"
2,6415 Benson Ave,5 Beds,3 Baths,"1,789 sqft (on 19,467 acres)",Year Built: 1987,2 Car Garage,"$699,000"
3,16326 Avenida Venusto #B,3 Beds,3 Baths,"1,632 sqft (on 8.45 acres)",Year Built: 1990,2 Car Garage,"$799,000"
4,3820-3820 Quarter Mile Dr,3 Beds,3 Baths,"1,947 sqft",Year Built: 2004,2 Car Garage,"$1,539,000"
...,...,...,...,...,...,...,...
913,8517 Even Seth Cir,4 Beds,3 Baths,"2,217 sqft",Year Built: 2018,3 Car Garage,"$1,199,000"
914,13104 Dana Vista St #350,4 Beds,2 Baths,"1,980 sqft",Year Built: 2006,Carport,"$395,500"
915,3906 Stanford Dr,4 Beds,3 Baths,"2,398 sqft",Year Built: 1992,3 Car Garage,"$975,000"
916,3365 Donna Dr,3 Beds,3 Baths,"1,680 sqft",Year Built: 1976,2 Car Garage,"$1,495,000"


### Output to CSV file

In [32]:
df.to_csv('SanDiego_Data', encoding='utf-8', index=False)
