## House Data for Active Listings

This script scrapes the details of each active listing, collects them into a single dataframe, and saves the result as a CSV file.

In [None]:
import requests
from bs4 import BeautifulSoup
import regex as re
import numpy as np
import time
import pandas as pd

In [None]:
def get_html_data(url, timeout=30):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before requests gives up.
            Without a timeout a stalled connection blocks the run forever.

    Returns:
        BeautifulSoup object built from the response text with the
        'html.parser' backend.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    page = requests.get(url, timeout=timeout)
    return BeautifulSoup(page.text, 'html.parser')

In [None]:
def get_prop_details(soup):
    """Return the first ``property-details`` element with class 'ember-view'.

    The result is whatever ``soup.find`` yields, so it is None when the page
    contains no such element.
    """
    return soup.find("property-details", class_="ember-view")

In [None]:
def get_MLS(prop_details):
    """Return the MLS value shown in the property-details section.

    Finds the first text node in ``prop_details`` that contains 'MLS' and
    returns the text of the element that follows it.

    Args:
        prop_details: Parsed property-details element, or None when the page
            has no such section.

    Returns:
        The value text, or the string 'None' when the lookup fails.
    """
    try:
        label = prop_details.find(text=re.compile('MLS'))
        return label.find_next().text
    except Exception:
        # Best-effort scrape: a missing section, label or value yields the
        # placeholder. Unlike a bare ``except``, this lets KeyboardInterrupt
        # and SystemExit through so a long run can still be stopped.
        return 'None'

def get_address(soup):
    """Return the listing address as one space-separated string.

    The street, locality and region come from the text of the ``span``
    elements carrying the matching ``itemprop``; the postal code and country
    come from the ``content`` attribute of their ``itemprop`` elements.

    Args:
        soup: Parsed listing page.

    Returns:
        'street locality region postal-code country', or the string 'None'
        when any of the five parts cannot be read.
    """
    try:
        street = soup.find("span", itemprop="streetAddress").text
        locality = soup.find("span", itemprop="addressLocality").text
        region = soup.find("span", itemprop="addressRegion").text
        postal_code = soup.find(itemprop="postalCode").get("content")
        country = soup.find(itemprop="addressCountry").get("content")
        # join() raises TypeError for a missing (None) part, which falls into
        # the placeholder branch below just like a missing element does.
        return " ".join([street, locality, region, postal_code, country])
    except Exception:
        # Best-effort scrape; KeyboardInterrupt and SystemExit still
        # propagate, unlike with a bare ``except``.
        return 'None'
    
def get_price(soup):
    """Return the ``content`` attribute of the element with itemprop 'price'.

    Args:
        soup: Parsed listing page.

    Returns:
        The attribute value as found in the page, or ``np.nan`` when the
        element cannot be read.
    """
    try:
        return soup.find(itemprop="price").get("content")
    except Exception:
        # Best-effort scrape; KeyboardInterrupt and SystemExit still
        # propagate, unlike with a bare ``except``.
        return np.nan
    
def get_beds(soup):
    """Return the bedroom count shown next to the 'beds' label.

    Finds the first text node containing 'beds', takes the element that
    precedes it and reads the number at the start of that element's text.

    Args:
        soup: Parsed listing page.

    Returns:
        The leading run of digits as a string (so '12 beds' gives '12', not
        '1'); the first character when the text does not start with a
        number; ``np.nan`` when the label or its element cannot be read.
    """
    try:
        label = soup.find(text=re.compile('beds'))
        shown = label.find_previous().text
        leading_number = re.match(r'\s*(\d+)', shown)
        if leading_number:
            return leading_number.group(1)
        # Non-numeric text: keep the historical first-character result.
        return shown[0]
    except Exception:
        # Best-effort scrape; KeyboardInterrupt and SystemExit still
        # propagate, unlike with a bare ``except``.
        return np.nan
    
def get_baths(soup):
    """Return the bathroom count shown next to the 'baths' label.

    Finds the first text node containing 'baths', takes the element that
    precedes it and reads the number at the start of that element's text.

    Args:
        soup: Parsed listing page.

    Returns:
        The leading run of digits as a string (so '10 baths' gives '10', not
        '1'); the first character when the text does not start with a
        number; ``np.nan`` when the label or its element cannot be read.
    """
    try:
        label = soup.find(text=re.compile('baths'))
        shown = label.find_previous().text
        leading_number = re.match(r'\s*(\d+)', shown)
        if leading_number:
            return leading_number.group(1)
        # Non-numeric text: keep the historical first-character result.
        return shown[0]
    except Exception:
        # Best-effort scrape; KeyboardInterrupt and SystemExit still
        # propagate, unlike with a bare ``except``.
        return np.nan

def get_parking(soup):
    """Return the parking-spot count shown next to the 'parking' label.

    Finds the first text node containing 'parking', takes the element that
    precedes it and reads the number at the start of that element's text.

    Args:
        soup: Parsed listing page.

    Returns:
        The leading run of digits as a string (so '10 parking' gives '10',
        not '1'); the first character when the text does not start with a
        number; ``np.nan`` when the label or its element cannot be read.
    """
    try:
        label = soup.find(text=re.compile('parking'))
        shown = label.find_previous().text
        leading_number = re.match(r'\s*(\d+)', shown)
        if leading_number:
            return leading_number.group(1)
        # Non-numeric text: keep the historical first-character result.
        return shown[0]
    except Exception:
        # Best-effort scrape; KeyboardInterrupt and SystemExit still
        # propagate, unlike with a bare ``except``.
        return np.nan
    
def get_description(soup):
    """Return the listing description with surrounding whitespace removed.

    Reads the text of the first ``p`` element whose class is
    'description padded'.

    Args:
        soup: Parsed listing page.

    Returns:
        The stripped description, or the string 'None' when the paragraph
        cannot be read.
    """
    try:
        paragraph = soup.find("p", class_="description padded")
        return paragraph.text.strip()
    except Exception:
        # Best-effort scrape; KeyboardInterrupt and SystemExit still
        # propagate, unlike with a bare ``except``.
        return 'None'

def get_Type(prop_details):
    """Return the value that follows the 'Type' label in the details section.

    Args:
        prop_details: Parsed property-details element, or None when the page
            has no such section.

    Returns:
        Text of the element after the first text node containing 'Type', or
        the string 'None' when the lookup fails.
    """
    try:
        label = prop_details.find(text=re.compile('Type'))
        return label.find_next().text
    except Exception:
        # Best-effort scrape; KeyboardInterrupt and SystemExit still
        # propagate, unlike with a bare ``except``.
        return 'None'

def get_Levels(prop_details):
    """Return the value that follows the 'Levels' label in the details section.

    Args:
        prop_details: Parsed property-details element, or None when the page
            has no such section.

    Returns:
        Text of the element after the first text node containing 'Levels',
        or the string 'None' when the lookup fails.
    """
    try:
        label = prop_details.find(text=re.compile('Levels'))
        return label.find_next().text
    except Exception:
        # Best-effort scrape; KeyboardInterrupt and SystemExit still
        # propagate, unlike with a bare ``except``.
        return 'None'

def get_Size(prop_details):
    """Return the interior size without its 'sq. ft.' unit.

    Args:
        prop_details: Parsed property-details element, or None when the page
            has no such section.

    Returns:
        Text of the element after the 'Size' label, with surrounding
        whitespace and a trailing 'sq. ft.' unit removed, or the string
        'None' when the lookup fails.
    """
    try:
        # The lookbehind keeps this from matching the 'Lot Size' label, which
        # get_LotSize reads separately.
        label = prop_details.find(text=re.compile(r'(?<!Lot )Size'))
        size_text = label.find_next().text.strip()
        # Remove the unit as a suffix; str.strip(" sq. ft.") would treat the
        # argument as a set of characters and misses text ending in a newline.
        return re.sub(r'\s*sq\.?\s*ft\.?$', '', size_text)
    except Exception:
        # Best-effort scrape; KeyboardInterrupt and SystemExit still
        # propagate, unlike with a bare ``except``.
        return 'None'

def get_DateAvailable(prop_details):
    """Return the value that follows the 'Date Available' label.

    Args:
        prop_details: Parsed property-details element, or None when the page
            has no such section.

    Returns:
        Text of the element after the first text node containing
        'Date Available', or the string 'None' when the lookup fails.
    """
    try:
        label = prop_details.find(text=re.compile('Date Available'))
        return label.find_next().text
    except Exception:
        # Best-effort scrape; KeyboardInterrupt and SystemExit still
        # propagate, unlike with a bare ``except``.
        return 'None'

def get_DaysActive(prop_details):
    """Return the value that follows the 'Days Active' label.

    Args:
        prop_details: Parsed property-details element, or None when the page
            has no such section.

    Returns:
        Text of the element after the first text node containing
        'Days Active', or ``np.nan`` when the lookup fails.
    """
    try:
        label = prop_details.find(text=re.compile('Days Active'))
        return label.find_next().text
    except Exception:
        # Best-effort scrape; KeyboardInterrupt and SystemExit still
        # propagate, unlike with a bare ``except``.
        return np.nan

def get_LaundryLevel(prop_details):
    """Return the value that follows the 'Laundry Level' label.

    Args:
        prop_details: Parsed property-details element, or None when the page
            has no such section.

    Returns:
        Text of the element after the first text node containing
        'Laundry Level', or the string 'None' when the lookup fails.
    """
    try:
        label = prop_details.find(text=re.compile('Laundry Level'))
        return label.find_next().text
    except Exception:
        # Best-effort scrape; KeyboardInterrupt and SystemExit still
        # propagate, unlike with a bare ``except``.
        return 'None'
    
def get_CentralVac(prop_details):
    """Return the value that follows the 'Central Vac' label.

    Args:
        prop_details: Parsed property-details element, or None when the page
            has no such section.

    Returns:
        Text of the element after the first text node containing
        'Central Vac', or the string 'None' when the lookup fails.
    """
    try:
        label = prop_details.find(text=re.compile('Central Vac'))
        return label.find_next().text
    except Exception:
        # Best-effort scrape; KeyboardInterrupt and SystemExit still
        # propagate, unlike with a bare ``except``.
        return 'None'

def get_Fireplace(prop_details):
    """Return the value that follows the 'Fireplace' label.

    Args:
        prop_details: Parsed property-details element, or None when the page
            has no such section.

    Returns:
        Text of the element after the first text node containing
        'Fireplace', or the string 'None' when the lookup fails.
    """
    try:
        label = prop_details.find(text=re.compile('Fireplace'))
        return label.find_next().text
    except Exception:
        # Best-effort scrape; KeyboardInterrupt and SystemExit still
        # propagate, unlike with a bare ``except``.
        return 'None'

def get_Acreage(prop_details):
    """Return the value that follows the 'Acreage' label.

    Args:
        prop_details: Parsed property-details element, or None when the page
            has no such section.

    Returns:
        Text of the element after the first text node containing 'Acreage',
        or the string 'None' when the lookup fails.
    """
    try:
        label = prop_details.find(text=re.compile('Acreage'))
        return label.find_next().text
    except Exception:
        # Best-effort scrape; KeyboardInterrupt and SystemExit still
        # propagate, unlike with a bare ``except``.
        return 'None'

def get_LotSize(prop_details):
    """Return the value that follows the 'Lot Size' label.

    Args:
        prop_details: Parsed property-details element, or None when the page
            has no such section.

    Returns:
        Text of the element after the first text node containing 'Lot Size',
        or the string 'None' when the lookup fails.
    """
    try:
        label = prop_details.find(text=re.compile('Lot Size'))
        return label.find_next().text
    except Exception:
        # Best-effort scrape; KeyboardInterrupt and SystemExit still
        # propagate, unlike with a bare ``except``.
        return 'None'

def get_garage(prop_details):
    """Return the value that follows the 'Garage' label.

    Args:
        prop_details: Parsed property-details element, or None when the page
            has no such section.

    Returns:
        Text of the element after the first text node containing 'Garage'
        (other than 'Garage Spaces'), or the string 'None' when the lookup
        fails.
    """
    try:
        # The lookahead keeps this from matching the 'Garage Spaces' label,
        # which get_GarageSpaces reads separately.
        label = prop_details.find(text=re.compile(r'Garage(?! Spaces)'))
        return label.find_next().text
    except Exception:
        # Best-effort scrape; KeyboardInterrupt and SystemExit still
        # propagate, unlike with a bare ``except``.
        return 'None'

def get_Exterior(prop_details):
    """Return the value that follows the 'Exterior' label.

    Args:
        prop_details: Parsed property-details element, or None when the page
            has no such section.

    Returns:
        Text of the element after the first text node containing 'Exterior',
        or the string 'None' when the lookup fails.
    """
    try:
        label = prop_details.find(text=re.compile('Exterior'))
        return label.find_next().text
    except Exception:
        # Best-effort scrape; KeyboardInterrupt and SystemExit still
        # propagate, unlike with a bare ``except``.
        return 'None'
    
def get_Age(prop_details):
    """Return the approximate age without its 'years' unit.

    Args:
        prop_details: Parsed property-details element, or None when the page
            has no such section.

    Returns:
        Text of the element after the 'Approx. Age' label, with surrounding
        whitespace and a trailing 'year'/'years' removed (so '6-15 years'
        gives '6-15'), or the string 'None' when the lookup fails.
    """
    try:
        # Escape the dot so it matches a literal '.' rather than any character.
        label = prop_details.find(text=re.compile(r'Approx\. Age'))
        age_text = label.find_next().text.strip()
        # Remove the unit as a suffix; str.strip("years") would treat the
        # argument as a set of characters and leave the separating space.
        return re.sub(r'\s*years?$', '', age_text)
    except Exception:
        # Best-effort scrape; KeyboardInterrupt and SystemExit still
        # propagate, unlike with a bare ``except``.
        return 'None'
    
def get_Basement(prop_details):
    """Return the value that follows the 'Basement' label.

    Args:
        prop_details: Parsed property-details element, or None when the page
            has no such section.

    Returns:
        Text of the element after the first text node containing 'Basement',
        or the string 'None' when the lookup fails.
    """
    try:
        label = prop_details.find(text=re.compile('Basement'))
        return label.find_next().text
    except Exception:
        # Best-effort scrape; KeyboardInterrupt and SystemExit still
        # propagate, unlike with a bare ``except``.
        return 'None'
    
def get_Driveway(prop_details):
    """Return the value that follows the 'Driveway' label.

    Args:
        prop_details: Parsed property-details element, or None when the page
            has no such section.

    Returns:
        Text of the element after the first text node containing 'Driveway',
        or the string 'None' when the lookup fails.
    """
    try:
        label = prop_details.find(text=re.compile('Driveway'))
        return label.find_next().text
    except Exception:
        # Best-effort scrape; KeyboardInterrupt and SystemExit still
        # propagate, unlike with a bare ``except``.
        return 'None'

def get_GarageSpaces(prop_details):
    """Return the value that follows the 'Garage Spaces' label.

    Args:
        prop_details: Parsed property-details element, or None when the page
            has no such section.

    Returns:
        Text of the element after the first text node containing
        'Garage Spaces', or the string "None" when the lookup fails.
    """
    try:
        label = prop_details.find(text=re.compile('Garage Spaces'))
        return label.find_next().text
    except Exception:
        # Best-effort scrape; KeyboardInterrupt and SystemExit still
        # propagate, unlike with a bare ``except``.
        return "None"
    
def get_Heat(prop_details):
    """Return the value that follows the 'Heat' label.

    Args:
        prop_details: Parsed property-details element, or None when the page
            has no such section.

    Returns:
        Text of the element after the first text node containing 'Heat'
        (other than 'Heating Fuel'), or the string "None" when the lookup
        fails.
    """
    try:
        # The lookahead keeps this from matching the 'Heating Fuel' label,
        # which get_fuel reads separately.
        label = prop_details.find(text=re.compile(r'Heat(?!ing Fuel)'))
        return label.find_next().text
    except Exception:
        # Best-effort scrape; KeyboardInterrupt and SystemExit still
        # propagate, unlike with a bare ``except``.
        return "None"
    
def get_AC(prop_details):
    """Return the value that follows the 'A/C' label.

    Args:
        prop_details: Parsed property-details element, or None when the page
            has no such section.

    Returns:
        Text of the element after the first text node containing 'A/C', or
        the string "None" when the lookup fails.
    """
    try:
        label = prop_details.find(text=re.compile('A/C'))
        return label.find_next().text
    except Exception:
        # Best-effort scrape; KeyboardInterrupt and SystemExit still
        # propagate, unlike with a bare ``except``.
        return "None"

def get_fuel(prop_details):
    """Return the value that follows the 'Heating Fuel' label.

    Args:
        prop_details: Parsed property-details element, or None when the page
            has no such section.

    Returns:
        Text of the element after the first text node containing
        'Heating Fuel', or the string "None" when the lookup fails.
    """
    try:
        label = prop_details.find(text=re.compile('Heating Fuel'))
        return label.find_next().text
    except Exception:
        # Best-effort scrape; KeyboardInterrupt and SystemExit still
        # propagate, unlike with a bare ``except``.
        return "None"

In [None]:
def get_house_data(links):
    """Scrape every listing URL in ``links`` into one row of fields.

    Each page is downloaded with get_html_data, its property-details section
    is located with get_prop_details, and the individual get_* extractors are
    called on either the whole page or that section.

    Args:
        links: Iterable of listing URLs.

    Returns:
        A list with one 26-item list per URL, ordered as: MLS, address,
        price, beds, baths, parking spots, description, type, levels, size,
        date available, days active, laundry level, central vac, fireplace,
        acreage, lot size, garage, exterior, age, basement, driveway,
        garage spaces, heating, A/C, fuel.
    """
    rows = []
    for url in links:
        page = get_html_data(url)
        details = get_prop_details(page)
        rows.append([
            get_MLS(details),
            get_address(page),
            get_price(page),
            get_beds(page),
            get_baths(page),
            get_parking(page),
            get_description(page),
            get_Type(details),
            get_Levels(details),
            get_Size(details),
            get_DateAvailable(details),
            get_DaysActive(details),
            get_LaundryLevel(details),
            get_CentralVac(details),
            get_Fireplace(details),
            get_Acreage(details),
            get_LotSize(details),
            get_garage(details),
            get_Exterior(details),
            get_Age(details),
            get_Basement(details),
            get_Driveway(details),
            get_GarageSpaces(details),
            get_Heat(details),
            get_AC(details),
            get_fuel(details),
        ])
    return rows

In [None]:
# Read the saved listing URLs, one per line. The ``with`` block closes the
# file even if reading fails.
with open(r'C:\Users\17059\Desktop\Final Project\zoocasa\house_links\active\active_house_links_2019-11-06_230032.txt', 'r') as f:
    x = f.readlines()

# Drop the trailing newline from each line and skip blank lines, which are
# not fetchable URLs.
links = [line.rstrip('\n') for line in x if line.strip()]

print(links)

In [None]:
# Show how many listing URLs were loaded.
print('%d' % len(links))

In [None]:
#links1 = links[2201:2391]

In [None]:
# Scrape every loaded link; one row of fields is produced per URL.
house_data = get_house_data(links=links)

In [None]:
# Dump the scraped rows for a quick visual check.
print(str(house_data))

In [None]:
# Build the timestamp from a single clock reading so the date and time parts
# of the file name can never come from different moments.
timestamp = time.strftime("%Y-%m-%d_%H%M%S")
file_name = r"C:\Users\17059\Desktop\Final Project\zoocasa\houses_files\active\active_%s.csv" % timestamp

# Column order matches the order of the values in each scraped row.
columns = [
    "MLS", "Address", "Price", "Beds", "Baths", "ParkingSpots",
    "Description", "Type", "Levels", "Size", "DateAvailable", "DaysActive",
    "LaundryLevel", "CentralVac", "Fireplace", "Acreage", "LotSize",
    "Garage", "Exterior", "Age", "Basement", "Driveway", "GarageSpaces",
    "Heating", "AC", "Fuel",
]
pd.DataFrame(house_data, columns=columns).to_csv(file_name, index=False, encoding="UTF-8")