## Web Scraping with Beautiful Soup
### Indianapolis Data Collection from Trulia
Second part of the project. Collecting data from other cities.
Cities: SD, Albuquerque, Colorado Springs, Philadelphia, Indianapolis, Las Vegas, Washington DC, Miami, New York, San Francisco

In [65]:
import json
from bs4 import BeautifulSoup
import requests
import pandas as pd
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import selenium.webdriver.support.expected_conditions as EC
import urllib.parse
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
import fake_useragent
from fake_useragent import UserAgent
import lxml.html
from lxml.html import fromstring
from itertools import cycle

### Collecting Links From 25 Pages of Trulia Houses for Sale Results
Need to scroll through the results on the page so that all the houses load and we get all the links. Added 10 second timer to ensure that there is enough time to scroll through. Otherwise, only 7 links per page are scraped when there should be around 40. 

In [2]:
url_1 = 'https://www.trulia.com'
url_joined = []

for i in range(1,26):
    #https://www.trulia.com/IN/Indianapolis/
    website = 'https://www.trulia.com/IN/Indianapolis/' + str(i) + '_p/'
    
    #request
    driver4 = webdriver.Edge()
    response = driver4.get(website)
    WebDriverWait(driver4, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'ul[data-testid="search-result-list-container"]')))
    driver4.maximize_window()
    time.sleep(10) 

    #create soup object
    soup4 = BeautifulSoup(driver4.page_source, 'html.parser')
    
    #result container 
    result_container = soup4.find_all('li', {'class':'sc-fc01d244-0'})
    len(result_container)
    
    results_update = []

    for results in result_container: 
        if results.has_attr('data-testid'):
            results_update.append(results)
    
    #relative url
    relative_url = []
    #loop through the results in the results_update list for item
    for item in results_update:
        for link in item.find_all('div', {'data-testid':'property-card-details'}):
            relative_url.append(link.find('a').get('href'))
    len(relative_url)
    #joining urls
    for link_2 in relative_url: 
        url_joined.append(urllib.parse.urljoin(url_1, link_2))
    

In [3]:
len(url_joined)

999

In [38]:
url_list_copy = url_joined.copy()
copy2 = url_list_copy[595:999]


In [64]:
url = 'https://free-proxy-list.net/'
response = requests.get(url)
parser = fromstring(response.text)
proxies = set()
for i in parser.xpath('//tbody/tr')[:10]:
    if i.xpath('.//td[7][contains(text(), "yes")]'):
        proxy = ":".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]])
        proxies.add(proxy)
proxies

{'108.187.44.89:3129',
 '185.20.71.38:443',
 '41.217.220.69:32650',
 '88.99.234.110:2021'}

### Collecting Data From the Links Created Above
(This block takes a long time to compile, approx 3-5 hours)

In [68]:
# create lists with data 
address = []
bedrooms = []
bathrooms = []
area = []
year_built = []
parking = []
price = []

proxy_pool = cycle(proxies)

for link in copy2:
    proxy = next(proxy_pool)
    options = Options()
    ua = UserAgent()
    userAgent = ua.random
    options.add_argument(f'user-agent={userAgent}')
    driver3 = webdriver.Chrome(options=options, executable_path=r'C:\WebDrivers\ChromeDriver\chromedriver_win32\chromedriver.exe')
    options.add_argument("--headless")
    options.add_argument('--proxy-server=%s' % proxy)
    
    #driver3 = webdriver.Edge()
    driver3.delete_all_cookies()
    time.sleep(5)
    
    window_size = driver3.execute_script("""
        return [window.outerWidth - window.innerWidth + arguments[0],
          window.outerHeight - window.innerHeight + arguments[1]];
        """, 100, 100)
    driver3.set_window_size(*window_size)
    
    response = driver3.get(link)
    #if there is a captcha, trying to fix
#     if driver3.find_element(By.ID, 'px-captcha'):
#         element = driver3.find_element(By.CSS_SELECTOR, '#px-captcha') 
#         action = ActionChains(driver3)
#         click = ActionChains(driver3)
#         action.click_and_hold(element)
#         action.perform()
#         time.sleep(15)
#         action.release(element)
#         time.sleep(20)
#         response = driver3.get(link)
#     else: 
#         continue
    #create soup object
    
    driver3.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(10)
    #checking if the url contains the word captcha, if it does, wait for 20 seconds and then try again
    if 'captcha' in driver3.current_url:
        time.sleep(20)
        driver3.delete_all_cookies()
        driver3.quit()
        options = Options()
        ua = UserAgent()
        userAgent = ua.random
        options.add_argument(f'user-agent={userAgent}')
        driver3 = webdriver.Chrome(options=options, executable_path=r'C:\WebDrivers\ChromeDriver\chromedriver_win32\chromedriver.exe')
        window_size = driver3.execute_script("""
            return [window.outerWidth - window.innerWidth + arguments[0],
              window.outerHeight - window.innerHeight + arguments[1]];
            """, 100, 100)
        driver3.set_window_size(*window_size)
        response = driver3.get(link)
        
    soup3 = BeautifulSoup(driver3.page_source, 'html.parser')
    
    try:
        address.append(soup3.find('span',{'data-testid':'home-details-summary-headline'}).get_text())
    except:
        address.append('')
        
    try:
        bedrooms.append(soup3.find('li', {'data-testid':'bed'}).get_text())
    except:
        bedrooms.append('')
    
    try:
        bathrooms.append(soup3.find('li', {'data-testid':'bath'}).get_text())
    except:
        bathrooms.append('')
    
    try:
        area.append(soup3.find('li', {'data-testid':'floor'}).get_text())
    except: 
        area.append('')
    
    try:
        year_built.append(soup3.find('div', string = 'Year Built').findNext('div').findNext('div').get_text())
    except: 
        year_built.append('')
        
    try:
        parking.append(soup3.find('div', string = 'Parking').findNext('div').findNext('div').get_text())
    except:
        parking.append('')
    
    try:
        price.append(soup3.find('h3', {'data-testid':'on-market-price-details'}).get_text())
    except:
        price.append('')
        
    output = {'Address':address, 'Bedrooms':bedrooms, 'Bathrooms':bathrooms, 'Area':area, 
             'Year Built':year_built, 'Parking':parking, 'Price':price}
    
    #closing the driver at the end
    driver3.quit()
    time.sleep(50)

    

  driver3 = webdriver.Chrome(options=options, executable_path=r'C:\WebDrivers\ChromeDriver\chromedriver_win32\chromedriver.exe')
  driver3 = webdriver.Chrome(options=options, executable_path=r'C:\WebDrivers\ChromeDriver\chromedriver_win32\chromedriver.exe')


In [69]:
#putting output in a DataFrame
df = pd.DataFrame(output)
df

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price
0,5170 Salter Ct,,,,Year Built: 1993,3 Car Garage,"$939,900"
1,55 W Dudley Ave,,,,Year Built: 1955,2 Car Garage,"$260,000"
2,350 N Meridian St #401,,,,Year Built: 1924,Garage,"$475,000"
3,10844 Penwell Way,,,,Year Built: 2023,2 Car Garage,"$268,900"
4,Richland Plan in Village at New Bethel - Patio...,,,,Year Built: 2023,No Info,"$294,900+"
...,...,...,...,...,...,...,...
399,1948 Winton Dr,,,,Year Built: 2013,3 Car Garage,"$399,900"
400,452 Park Dr,,,,Year Built: 1958,Garage,"$230,000"
401,934 Dreamy St,,,,Year Built: 1971,2 Car Garage,"$325,000"
402,2639 Kilgobbin Cres,,,,Year Built: 2017,2 Car Garage,"$435,000"


In [70]:
#adding new column to the dataframe that specifies the city/location
df['Location'] = 'Indianapolis'
df

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price,Location
0,5170 Salter Ct,,,,Year Built: 1993,3 Car Garage,"$939,900",Indianapolis
1,55 W Dudley Ave,,,,Year Built: 1955,2 Car Garage,"$260,000",Indianapolis
2,350 N Meridian St #401,,,,Year Built: 1924,Garage,"$475,000",Indianapolis
3,10844 Penwell Way,,,,Year Built: 2023,2 Car Garage,"$268,900",Indianapolis
4,Richland Plan in Village at New Bethel - Patio...,,,,Year Built: 2023,No Info,"$294,900+",Indianapolis
...,...,...,...,...,...,...,...,...
399,1948 Winton Dr,,,,Year Built: 2013,3 Car Garage,"$399,900",Indianapolis
400,452 Park Dr,,,,Year Built: 1958,Garage,"$230,000",Indianapolis
401,934 Dreamy St,,,,Year Built: 1971,2 Car Garage,"$325,000",Indianapolis
402,2639 Kilgobbin Cres,,,,Year Built: 2017,2 Car Garage,"$435,000",Indianapolis


In [71]:
#saving the edited csv file
df.to_csv('Indianapolis_Data2.csv', encoding='utf-8', index=False)

In [8]:
#also creating an excel file just in case
df.to_excel('Indianapolis_Data.xlsx',  encoding='utf-8', index=False)

In [72]:
df2 = pd.read_csv('Indianapolis_Data.csv')

In [78]:
dfDrop = df2.drop(df2.index[595:758])

In [79]:
dfDrop

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price,Location
0,2926 Forest Manor Ave,3 Beds,1 Bath,864 sqft,Year Built: 1911,Garage,"$85,000",Indianapolis
1,8820 Yardley Ct #208,2 Beds,2 Baths,"1,070 sqft",Year Built: 1993,No Info,"$159,900",Indianapolis
2,2169 Kildare Ave,2 Beds,1 Bath,"1,544 sqft",Year Built: 1940,Garage,"$152,000",Indianapolis
3,4741 Bridgefield Dr,2 Beds,2 Baths,"1,312 sqft",Year Built: 1998,2 Car Garage,"$180,000",Indianapolis
4,Grandover Plan in Edgewood Farms by D.R. Horto...,3 Beds,3 Baths,"2,141 sqft",Year Built: 2023,No Info,"$400,000+",Indianapolis
...,...,...,...,...,...,...,...,...
590,809 W Edgewood Ave,3 Beds,2 Baths,"1,767 sqft (on 0.55 acres)",Year Built: 1969,1 Car Garage,"$275,000",Indianapolis
591,2245 Brookside Ave,3 Beds,1 Bath,"1,680 sqft",Year Built: 1907,No Info,"$125,000",Indianapolis
592,3818 N Park Ave,3 Beds,2 Baths,"2,034 sqft",Year Built: 1905,Garage,"$269,000",Indianapolis
593,,,,,,,,Indianapolis


In [86]:
combined = pd.concat([dfDrop, df2], ignore_index=True, sort=False)

In [87]:
combined

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Price,Location
0,2926 Forest Manor Ave,3 Beds,1 Bath,864 sqft,Year Built: 1911,Garage,"$85,000",Indianapolis
1,8820 Yardley Ct #208,2 Beds,2 Baths,"1,070 sqft",Year Built: 1993,No Info,"$159,900",Indianapolis
2,2169 Kildare Ave,2 Beds,1 Bath,"1,544 sqft",Year Built: 1940,Garage,"$152,000",Indianapolis
3,4741 Bridgefield Dr,2 Beds,2 Baths,"1,312 sqft",Year Built: 1998,2 Car Garage,"$180,000",Indianapolis
4,Grandover Plan in Edgewood Farms by D.R. Horto...,3 Beds,3 Baths,"2,141 sqft",Year Built: 2023,No Info,"$400,000+",Indianapolis
...,...,...,...,...,...,...,...,...
1347,,,,,,,,Indianapolis
1348,,,,,,,,Indianapolis
1349,,,,,,,,Indianapolis
1350,,,,,,,,Indianapolis


In [88]:
combined.to_csv('Indianapolis_DataFinal.csv', encoding='utf-8', index=False)