In [4]:
from bs4 import BeautifulSoup as bs
import requests
import re
import pandas as pd
import collections


def soup_maker(url, timeout=30):
    '''
    Download a web page and parse it into a BeautifulSoup object.

    The returned soup is what the other scraping functions search for
    the listing data.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. Without a timeout a single
        unresponsive server would block the whole scrape indefinitely.
        Defaults to 30 seconds.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the 'lxml' parser.

    Raises
    ------
    AssertionError
        If ``url`` is not a string.
    requests.exceptions.RequestException
        Whatever ``requests.get`` raises (connection errors, timeouts, ...).

    Notes
    -----
    The HTTP status code is not checked, so the body of an error page
    (e.g. a 404) is parsed and returned like any other page.
    '''
    assert isinstance(url,str)
    response = requests.get(url, timeout=timeout)
    # Parse the raw bytes so the parser handles the encoding detection.
    return bs(response.content, 'lxml')


def _tag_string(tag, default):
    '''Return ``tag.string`` as a plain ``str``, or ``default`` if the tag or its string is missing.'''
    if tag is None or tag.string is None:
        return default
    # Per the Beautiful Soup docs a NavigableString keeps a reference to the
    # whole parse tree; str() stores just the text.
    return str(tag.string)


def _append_row(d, record, fill=0):
    '''Append ``record`` to the column lists in ``d``, padding with ``fill`` so all columns stay the same length.'''
    rows_before = max((len(column) for column in d.values()), default=0)
    for key, value in record.items():
        # setdefault works for both a plain dict and a defaultdict(list).
        column = d.setdefault(key, [])
        # Back-fill a column that is new (or short) before adding this row.
        column.extend([fill] * (rows_before - len(column)))
        column.append(value)
    # Columns this record has no value for still need an entry for the row.
    for column in d.values():
        column.extend([fill] * (rows_before + 1 - len(column)))


def dwelling_all_details(url, d, reference_day=17853):
    '''
    Scrape one dwelling page and append its details to ``d`` as a new row.

    Parameters
    ----------
    url : str
        URL of the dwelling's detail page.
    d : dict mapping str -> list
        Column-oriented accumulator (column name -> list of values, one
        entry per dwelling). It is modified in place. After the call every
        list in ``d`` has the same length; missing values are filled with 0.
    reference_day : int or float, optional
        Day number, counted in days since the Unix epoch, that the
        'Days Since Price Change' value is measured from. The default
        17853 is the value that used to be hard-coded here (2018-11-18 UTC,
        presumably the day of the original scrape); pass the current day
        number when scraping at a later date.

    Returns
    -------
    dict
        The same ``d`` object that was passed in.

    Raises
    ------
    AssertionError
        If ``url`` is not a string.
    ValueError
        If the page has a different number of <dt> labels and <dd> values,
        so they cannot be paired. ``d`` is left untouched in that case.
    '''

    assert isinstance(url,str)
    soup = soup_maker(url)

    # --- fields stored in dedicated div/span classes ---------------------
    # A missing title/price tag yields 'NA' instead of an AttributeError;
    # a missing price-drop span yields 0.
    record = {
        'Address': _tag_string(soup.title, 'NA'),
        'Price': _tag_string(soup.find('div', {'class': 'price'}), 'NA'),
        'Price Change': _tag_string(soup.find('span', {'class': 'change_down'}), 0),
    }

    # The date is looked up on its own tag: it is independent of whether a
    # 'change_down' span exists.
    change_date = _tag_string(soup.find('span', {'class': 'price_change_date'}), None)
    try:
        # The tag text is treated as a timestamp in seconds (assumed Unix
        # epoch seconds - TODO confirm against the site); 86400 seconds per day.
        record['Days Since Price Change'] = reference_day - int(change_date) / 86400
    except (TypeError, ValueError):
        # No date tag, or its text is not an integer.
        record['Days Since Price Change'] = 'NA'

    # --- fields stored as <dt>label</dt><dd>value</dd> pairs -------------
    labels = []
    values = []
    for section in soup.find_all("div", {'class':'prop-descrip property_detail_specs'}):
        labels.extend(dt.text for dt in section.find_all('dt'))
        values.extend(dd.text for dd in section.find_all('dd'))
    if len(labels) != len(values):
        raise ValueError(
            'cannot pair %d <dt> labels with %d <dd> values for %s'
            % (len(labels), len(values), url)
        )
    # A repeated label keeps its first position but takes the last value.
    specs = dict(zip(labels, values))

    # Attributes expected on every listing default to 0 when absent.
    attribute_list = ['Status','Dwelling Type','Days on Market','Bedrooms', 'Area', 'MLS(R)#','Half baths','Style', 'Community', 'Living Area','Year Built','Total baths', 'Stories']
    for name in attribute_list:
        specs.setdefault(name, 0)

    # Values already taken from the dedicated tags win over a <dt> with the
    # same label (this is what drops a 'Price Change' <dt> entry), so each
    # column receives exactly one value per dwelling.
    for label, value in specs.items():
        record.setdefault(label, value)

    # The row is added in one step, only after everything parsed, so a
    # failure above never leaves ``d`` with columns of different lengths.
    _append_row(d, record)

    return d



In [5]:
### Scrapes the detail page of listing ids 0-99
### Every scraped field is collected in one dictionary: column name -> list of values

LISTING_URL_TEMPLATE = 'https://search.vancitycondoguide.com/search/details/h/{}'

d = collections.defaultdict(list)
for i in range(100):
    url = LISTING_URL_TEMPLATE.format(i)
    # dwelling_all_details fills d in place and hands the same dict back.
    result = dwelling_all_details(url,d)


In [6]:
## Converts the collected columns into a pandas DataFrame and writes it to a CSV file
OUTPUT_CSV = 'dwellings1.csv'
my_df = pd.DataFrame(data=d)
my_df.to_csv(path_or_buf=OUTPUT_CSV, encoding='utf-8-sig')