## Web Scraping with Beautiful Soup
### Philadelphia Data Collection from Trulia
Second part of the project. Collecting data from other cities.
Cities: SD, Albuquerque, Colorado Springs, Philadelphia, Indianapolis, Las Vegas, Washington DC, Miami, New York, San Francisco

In [1]:
import json
from bs4 import BeautifulSoup
import requests
import pandas as pd
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import selenium.webdriver.support.expected_conditions as EC
import urllib.parse

### Collecting Links From 25 Pages of Trulia Houses for Sale Results
We need to scroll through the results on each page so that all the houses load and we get all the links. A 10-second timer was added to ensure that there is enough time to scroll through; otherwise, only 7 links per page are scraped when there should be around 40.

In [2]:
# Base URL that the relative listing links are resolved against.
url_1 = 'https://www.trulia.com'
# Absolute URLs of every listing found on the scraped result pages.
url_joined = []

# Result pages are addressed as .../<page number>_p/; pages 1 through 25 are visited.
for i in range(1, 26):
    #https://www.trulia.com/PA/Philadelphia/
    website = 'https://www.trulia.com/PA/Philadelphia/' + str(i) + '_p/'

    # A fresh browser session per page is kept from the original design.
    # NOTE(review): reusing a single driver would be faster - confirm the site tolerates it.
    driver4 = webdriver.Edge()
    try:
        driver4.get(website)
        # Block (up to 10 s) until the result list is visible; raises on timeout.
        WebDriverWait(driver4, 10).until(
            EC.visibility_of_element_located(
                (By.CSS_SELECTOR, 'ul[data-testid="search-result-list-container"]')
            )
        )
        driver4.maximize_window()
        # Fixed pause so the page can be scrolled and the remaining cards can load
        # before the HTML is captured.
        time.sleep(10)

        #create soup object
        soup4 = BeautifulSoup(driver4.page_source, 'html.parser')
    finally:
        # Always end the session, even on a timeout, so browser processes
        # do not pile up across the 25 iterations.
        driver4.quit()

    #result container
    # NOTE(review): 'sc-fc01d244-0' looks like a generated class name and may
    # change when the site is redeployed - verify if no results are found.
    result_container = soup4.find_all('li', {'class': 'sc-fc01d244-0'})

    # Keep only the list items that carry a data-testid attribute.
    results_update = [
        results for results in result_container if results.has_attr('data-testid')
    ]

    #relative url
    relative_url = []
    for item in results_update:
        for link in item.find_all('div', {'data-testid': 'property-card-details'}):
            anchor = link.find('a')
            href = anchor.get('href') if anchor is not None else None
            # Skip cards without a usable link instead of aborting the whole crawl.
            if href:
                relative_url.append(href)

    #joining urls
    url_joined.extend(urllib.parse.urljoin(url_1, link_2) for link_2 in relative_url)
    

In [3]:
# Sanity check: total number of listing URLs gathered in url_joined.
len(url_joined)

965

### Collecting Data From the Links Created Above
(This block takes a long time to run, approximately 3-5 hours.)

In [4]:
# Column lists; each gets exactly one entry per URL in url_joined
# ('' when the value is not found on the page).
address = []
bedrooms = []
bathrooms = []
area = []
year_built = []
parking = []
price = []


def _testid_text(soup, tag, testid):
    """Return the text of the first <tag data-testid=testid> in soup, or '' if absent."""
    node = soup.find(tag, {'data-testid': testid})
    return node.get_text() if node is not None else ''


def _labelled_text(soup, label):
    """Return the text of the second <div> following the <div> whose string is label.

    Returns '' when the label div or either following div is missing.
    """
    node = soup.find('div', string=label)
    for _ in range(2):
        if node is None:
            return ''
        node = node.find_next('div')
    return node.get_text() if node is not None else ''


for link in url_joined:
    # One browser session per listing, as in the original design.
    driver3 = webdriver.Edge()
    try:
        driver3.get(link)
        #create soup object
        soup3 = BeautifulSoup(driver3.page_source, 'html.parser')
    finally:
        # quit() ends the whole session; runs even if loading the page fails.
        driver3.quit()

    # Missing elements yield '' rather than raising, so the lists stay aligned
    # and Ctrl-C is no longer swallowed by a bare except.
    address.append(_testid_text(soup3, 'span', 'home-details-summary-headline'))
    bedrooms.append(_testid_text(soup3, 'li', 'bed'))
    bathrooms.append(_testid_text(soup3, 'li', 'bath'))
    area.append(_testid_text(soup3, 'li', 'floor'))
    year_built.append(_labelled_text(soup3, 'Year Built'))
    parking.append(_labelled_text(soup3, 'Parking'))
    price.append(_testid_text(soup3, 'h3', 'on-market-price-details'))

# Built once after the loop, so it is also defined when url_joined is empty.
output = {'Address':address, 'Bedrooms':bedrooms, 'Bathrooms':bathrooms, 'Area':area,
         'Year Built':year_built, 'Parking':parking, 'Price':price}
    

In [5]:
# Assemble the scraped column lists into a single table and display it.
df = pd.DataFrame(data=output)
df

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price
0,167 Mifflin St,2 Beds,2 Baths,"1,158 sqft",Year Built: 1905,Open Parking,"$340,000"
1,837 Clyde Ln,3 Beds,2 Baths,"1,562 sqft",Year Built: 1974,3 Open Spaces,"$400,000"
2,1918 E Somerset St,3 Beds,1 Bath,984 sqft,Year Built: 1910,Open Parking,"$209,000"
3,1700 Johnston St,3 Beds,2 Baths,"1,200 sqft",Year Built: 1920,Open Parking,"$285,000"
4,1937 E Letterly St,2 Beds,2 Baths,"1,276 sqft",Year Built: 1875Year Renovated: 2022,Open Parking,"$354,999"
...,...,...,...,...,...,...,...
960,,,,,,,
961,,,,,,,
962,,,,,,,
963,,,,,,,


In [6]:
# Add a constant 'Location' column so every row is labelled with the city
# it was scraped for; the trailing bare `df` displays the updated table.
df['Location'] = 'Philadelphia'
df

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price,Location
0,167 Mifflin St,2 Beds,2 Baths,"1,158 sqft",Year Built: 1905,Open Parking,"$340,000",Philadelphia
1,837 Clyde Ln,3 Beds,2 Baths,"1,562 sqft",Year Built: 1974,3 Open Spaces,"$400,000",Philadelphia
2,1918 E Somerset St,3 Beds,1 Bath,984 sqft,Year Built: 1910,Open Parking,"$209,000",Philadelphia
3,1700 Johnston St,3 Beds,2 Baths,"1,200 sqft",Year Built: 1920,Open Parking,"$285,000",Philadelphia
4,1937 E Letterly St,2 Beds,2 Baths,"1,276 sqft",Year Built: 1875Year Renovated: 2022,Open Parking,"$354,999",Philadelphia
...,...,...,...,...,...,...,...,...
960,,,,,,,,Philadelphia
961,,,,,,,,Philadelphia
962,,,,,,,,Philadelphia
963,,,,,,,,Philadelphia


In [7]:
# Write the table to a UTF-8 CSV file, leaving out the row index.
df.to_csv(path_or_buf='Philadelphia_Data.csv', index=False, encoding='utf-8')

In [8]:
# Also write an Excel copy of the table, without the row index.
# The `encoding` keyword is not passed: DataFrame.to_excel deprecated it in
# pandas 1.5 and removed it in 2.0, where supplying it raises TypeError.
df.to_excel('Philadelphia_Data.xlsx', index=False)