## Web Scraping with Beautiful Soup
### Colorado Springs Data Collection from Trulia
This is the second part of the project: collecting data from other cities.
Cities: SD, Albuquerque, Colorado Springs, Philadelphia, Indianapolis, Las Vegas, Washington DC, Miami, New York, San Francisco

In [1]:
import json
from bs4 import BeautifulSoup
import requests
import pandas as pd
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import selenium.webdriver.support.expected_conditions as EC
import urllib.parse

### Collecting Links From 25 Pages of Trulia Houses for Sale Results
The results on each page need to be scrolled through so that all the houses load and every link is captured. A 10-second timer was added to ensure there is enough time to scroll through; otherwise, only 7 links per page are scraped when there should be around 40.

In [2]:
# Base URL that the relative listing links are resolved against.
url_1 = 'https://www.trulia.com'
# Absolute URLs of every listing found across all result pages.
url_joined = []

# A single browser session is reused for all 25 result pages and is always
# shut down in the `finally` clause, so no Edge/driver processes are leaked
# (even if a page times out or the run is interrupted).
driver4 = webdriver.Edge()
try:
    driver4.maximize_window()

    for i in range(1, 26):
        # e.g. https://www.trulia.com/CO/Colorado_Springs/3_p/
        website = 'https://www.trulia.com/CO/Colorado_Springs/' + str(i) + '_p/'

        # request: load the page and wait (up to 10 s) for the result list
        driver4.get(website)
        WebDriverWait(driver4, 10).until(
            EC.visibility_of_element_located(
                (By.CSS_SELECTOR, 'ul[data-testid="search-result-list-container"]')
            )
        )
        # Pause so the page can be scrolled and the remaining cards can load
        # before the HTML snapshot is taken.
        time.sleep(10)

        # create soup object
        soup4 = BeautifulSoup(driver4.page_source, 'html.parser')

        # result container
        # NOTE(review): 'sc-fc01d244-0' looks like an auto-generated class
        # name and may change when the site is redeployed - confirm.
        result_container = soup4.find_all('li', {'class': 'sc-fc01d244-0'})

        for results in result_container:
            # Only <li> elements carrying a data-testid attribute are kept.
            if not results.has_attr('data-testid'):
                continue

            for card in results.find_all('div', {'data-testid': 'property-card-details'}):
                anchor = card.find('a')
                href = anchor.get('href') if anchor is not None else None
                # Skip cards without a usable link instead of crashing or
                # recording the bare base URL.
                if not href:
                    continue
                # joining urls: relative href -> absolute URL
                url_joined.append(urllib.parse.urljoin(url_1, href))
finally:
    # quit() ends the whole WebDriver session, not just the current window.
    driver4.quit()
    

In [3]:
# Display how many listing URLs were collected in the cell above.
len(url_joined)

1007

### Collecting Data From the Links Created Above
(This block takes a long time to run, approximately 3-5 hours.)

In [4]:
# create lists with data
# One value is appended to every list per listing, so all columns always
# have the same length ('' marks a value that was not found on the page).
address = []
bedrooms = []
bathrooms = []
area = []
year_built = []
parking = []
price = []

# Built once, before the loop: the dict holds references to the lists above,
# so it fills up as they do and is defined even when url_joined is empty.
output = {'Address': address, 'Bedrooms': bedrooms, 'Bathrooms': bathrooms, 'Area': area,
          'Year Built': year_built, 'Parking': parking, 'Price': price}


def _text(node):
    """Return the text of a BeautifulSoup node, or '' when the node is None."""
    return node.get_text() if node is not None else ''


def _detail_value(soup, label):
    """Return the text of the second <div> following the <div> whose string is `label`.

    Returns '' when the label or either of the following <div>s is missing.
    """
    node = soup.find('div', string=label)
    for _ in range(2):
        if node is None:
            return ''
        node = node.find_next('div')
    return _text(node)


# A single browser session is reused for every listing and is always shut
# down in the `finally` clause, even if a page load fails or the run is
# interrupted part-way through.
driver3 = webdriver.Edge()
try:
    for link in url_joined:
        driver3.get(link)
        # create soup object
        soup3 = BeautifulSoup(driver3.page_source, 'html.parser')

        address.append(_text(soup3.find('span', {'data-testid': 'home-details-summary-headline'})))
        bedrooms.append(_text(soup3.find('li', {'data-testid': 'bed'})))
        bathrooms.append(_text(soup3.find('li', {'data-testid': 'bath'})))
        area.append(_text(soup3.find('li', {'data-testid': 'floor'})))
        year_built.append(_detail_value(soup3, 'Year Built'))
        parking.append(_detail_value(soup3, 'Parking'))
        price.append(_text(soup3.find('h3', {'data-testid': 'on-market-price-details'})))
finally:
    # quit() ends the whole WebDriver session, not just the current window.
    driver3.quit()
    

In [5]:
# Assemble the scraped columns into a DataFrame (one row per listing) and
# display it.
df = pd.DataFrame(data=output)
df

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price
0,2145 Wickes Rd,4 Beds,4 Baths,"2,855 sqft",Year Built: 1991,2 Car Garage,"$595,000"
1,6598 Fielding Ter,5 Beds,4 Baths,"2,843 sqft",Year Built: 2004,2 Car Garage,"$499,900"
2,1057 Acapulco Ct,2 Beds,1 Bath,858 sqft,Year Built: 1983,Garage,"$245,000"
3,4991 Copen Dr,3 Beds,2 Baths,"1,832 sqft",Year Built: 1997,2 Car Garage,"$430,000"
4,7161 Snowbell Ln,3 Beds,3 Baths,"4,346 sqft",Year Built: 2017,2 Car Garage,"$629,900"
...,...,...,...,...,...,...,...
1002,12405 McCune Rd,3 Beds,2 Baths,"1,534 sqft (on 4.77 acres)",Year Built: 1978,2 Car Garage,"$525,000"
1003,2606 Lake Of The Rockies Dr,2 Beds,2 Baths,"3,405 sqft",Year Built: 2019,2 Car Garage,"$625,000"
1004,7356 Colonial Dr,4 Beds,2 Baths,"1,786 sqft",Year Built: 1978,1 Car Garage,"$400,000"
1005,867 Caribou Cir,6 Beds,5 Baths,"5,701 sqft (on 0.91 acres)",Year Built: 1991,3 Car Garage,"$895,000"


In [6]:
# Add a 'Location' column holding the same city name in every row, then
# display the updated DataFrame.
df['Location'] = 'Colorado Springs'
df

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price,Location
0,2145 Wickes Rd,4 Beds,4 Baths,"2,855 sqft",Year Built: 1991,2 Car Garage,"$595,000",Colorado Springs
1,6598 Fielding Ter,5 Beds,4 Baths,"2,843 sqft",Year Built: 2004,2 Car Garage,"$499,900",Colorado Springs
2,1057 Acapulco Ct,2 Beds,1 Bath,858 sqft,Year Built: 1983,Garage,"$245,000",Colorado Springs
3,4991 Copen Dr,3 Beds,2 Baths,"1,832 sqft",Year Built: 1997,2 Car Garage,"$430,000",Colorado Springs
4,7161 Snowbell Ln,3 Beds,3 Baths,"4,346 sqft",Year Built: 2017,2 Car Garage,"$629,900",Colorado Springs
...,...,...,...,...,...,...,...,...
1002,12405 McCune Rd,3 Beds,2 Baths,"1,534 sqft (on 4.77 acres)",Year Built: 1978,2 Car Garage,"$525,000",Colorado Springs
1003,2606 Lake Of The Rockies Dr,2 Beds,2 Baths,"3,405 sqft",Year Built: 2019,2 Car Garage,"$625,000",Colorado Springs
1004,7356 Colonial Dr,4 Beds,2 Baths,"1,786 sqft",Year Built: 1978,1 Car Garage,"$400,000",Colorado Springs
1005,867 Caribou Cir,6 Beds,5 Baths,"5,701 sqft (on 0.91 acres)",Year Built: 1991,3 Car Garage,"$895,000",Colorado Springs


In [7]:
# Write the DataFrame to a UTF-8 CSV file, leaving out the index column.
df.to_csv(
    path_or_buf='ColoradoSprings_Data.csv',
    index=False,
    encoding='utf-8',
)

In [8]:
# Also write an Excel copy of the data, leaving out the index column.
# The `encoding` keyword is not passed: pandas never used it for Excel
# output, deprecated it in 1.5 and removed it in 2.0, where passing it
# raises a TypeError.
df.to_excel('ColoradoSprings_Data.xlsx', index=False)