In [1]:
import requests
import json
import pandas as pd
import time
import re

from selenium import webdriver
from bs4 import BeautifulSoup
from pprint import pprint
from tqdm import tqdm

In [2]:
def get_full_link(project_id):
    return f'https://homes.hdb.gov.sg/home/sbf/details/{project_id}'

In [3]:
class SBFScraper:

    def __init__(self):

        self.driver = webdriver.Chrome(
            executable_path='/home/chewzy/selenium_drivers/chromedriver')

    def load_page(self, project_id):

        url = get_full_link(project_id)
        self.driver.get(url)
        time.sleep(1)

    def get_soup(self):

        # Update soup
        self.soup = BeautifulSoup(self.driver.page_source)

        # Update block and flat soups
        params = {'class': 'form-row'}
        self.block_soup, self.flat_soup = self.soup.find(
            'div', **params).find_all('div', class_='col')
        self.block_param = {'class': ' '.join(
            list(self.block_soup.children)[0].get('class'))}
        self.flat_param = {'class': ' '.join(
            list(self.flat_soup.children)[0].get('class'))}

    def select_blk_num(self, blk_num):

        select_class = self.block_param['class']

        (
            self.driver
            .find_element_by_xpath(f"//select[@class='{select_class}']/option[text()='{blk_num}']")
            .click()
        )

    def select_flat_type(self, flat_type):
        select_class = self.flat_param['class']
        (
            self.driver
            .find_element_by_xpath(f"//select[@class='{select_class}']/option[text()='{flat_type}']")
            .click()
        )

    def get_block_details(self, town, lease, est_comp, proj_id, blk_num, flat_type):

        # Gather block details
        block_details = dict()
        block_details['Town'] = []
        block_details['Remaining_Lease'] = []
        block_details['Est_Completion'] = []
        block_details['proj_id'] = []
        block_details['blk_num'] = []
        block_details['flat_type'] = []
        block_details['floor_num'] = []
        block_details['unit_num'] = []
        block_details['unit_size'] = []
        block_details['unit_price'] = []

        # Get all floor levels
        params = {'class': 'col-12 d-flex flex-wrap'}
        floor_lvls = self.soup.find_all('div', **params)

        for floor in floor_lvls:

            # Get floor level
            params = {'for': 'floor-06'}
            floor_txt = floor.find('label', **params).text

            # Get all unit
            params = {'class': 'flat-grid unit'}
            units = floor.find_all('div', **params)

            for unit in units:
                unit_info_str = str(unit.find('a'))
                unit_info_str = re.sub('<.*?>', ' ', unit_info_str).strip()
                unit_num, floor_size, price = (
                    re.search('(.+) +(\d+ sqm) +(\$\d+,\d+)',
                              unit_info_str).groups()
                )
                block_details['Town'].append(town)
                block_details['Remaining_Lease'].append(lease)
                block_details['Est_Completion'].append(est_comp)
                block_details['proj_id'].append(proj_id)
                block_details['blk_num'].append(blk_num)
                block_details['flat_type'].append(flat_type)
                block_details['floor_num'].append(floor_txt)
                block_details['unit_num'].append(unit_num)
                block_details['unit_size'].append(floor_size)
                block_details['unit_price'].append(price)

        self.df = self.df.append(pd.DataFrame(block_details))

    def get_project_details(self, project_id):

        self.df = pd.DataFrame()
        
        try:
            self.load_page(project_id)
            self.get_soup()
            
            params = {'class':'col-lg-6 col-sm-12 col-md-6 mb-3'}
            sbf_raw = [div.text for div in self.soup.find_all('div', **params)]
            
            town = sbf_raw[0].replace('Town','').strip()
            lease = sbf_raw[1].replace('Remaining Lease','').strip()
            est_comp = sbf_raw[2].replace('Est. Completion Date','').strip()
            
            # Block options
            blk_nums = list(
                option.text
                for option in (
                    self.block_soup
                    .find('select', **self.block_param)
                    .find_all('option')
                )
            )

            for blk_num in blk_nums:

                self.select_blk_num(blk_num)
                self.get_soup()

                # Flat type options
                flat_types = list(
                    option.text
                    for option in (
                        self.flat_soup
                        .find('select', **self.flat_param)
                        .find_all('option')
                    )
                )

                for flat_type in flat_types:

                    self.select_flat_type(flat_type)
                    self.get_soup()
                    self.get_block_details(town, lease, est_comp, project_id, blk_num, flat_type)

            return self.df
        
        except Exception as e:
            print(e)
            print('Failed all retries, returning None')
            return None

In [4]:
scraper = SBFScraper()

In [5]:
df = pd.read_csv('../data/SBF_project_info.csv')

In [10]:
df_sbf_info = pd.DataFrame()

proj_list = df['proj_id'].tolist()

for proj_id in tqdm(proj_list):
    
    temp_df = scraper.get_project_details(proj_id)
    
    if temp_df is not None:
        df_sbf_info = df_sbf_info.append(temp_df)
    else:
        # not a good practice as it mutates the iterator
        # but lets make do for now
        proj_list.append(proj_id)

 16%|█▋        | 69/421 [02:45<48:56,  8.34s/it]

'NoneType' object has no attribute 'find_all'
Failed all retries, returning None


 17%|█▋        | 70/421 [02:47<36:58,  6.32s/it]

'NoneType' object has no attribute 'find_all'
Failed all retries, returning None


 17%|█▋        | 71/421 [02:48<28:21,  4.86s/it]

'NoneType' object has no attribute 'find_all'
Failed all retries, returning None


 17%|█▋        | 72/421 [02:50<22:13,  3.82s/it]

'NoneType' object has no attribute 'find_all'
Failed all retries, returning None


 17%|█▋        | 73/421 [02:51<17:57,  3.10s/it]

'NoneType' object has no attribute 'find_all'
Failed all retries, returning None


 39%|███▉      | 164/421 [07:38<10:38,  2.48s/it] 

'NoneType' object has no attribute 'find_all'
Failed all retries, returning None


 95%|█████████▌| 402/421 [19:02<02:07,  6.69s/it]

'NoneType' object has no attribute 'find_all'
Failed all retries, returning None


428it [20:00,  2.80s/it]                         


In [17]:
df_merged = (
    df_sbf_info
    .merge(df[['proj_id','precinct_name']], on='proj_id')
)

In [18]:
(
    df_merged
    .to_csv('../data/SBF_project_info_v2.csv', index=False)
)

In [19]:
df_merged

Unnamed: 0,Town,Remaining_Lease,Est_Completion,proj_id,blk_num,flat_type,floor_num,unit_num,unit_size,unit_price,precinct_name
0,Bukit Batok,15 - 99 years,1/2022,2021-05_SBF_QkJfTjFDMTVfMTYxOTUwOTg0NzQwMg,Blk 114A,2-room Flexi,#12,516,47 sqm,"$179,600",Sky Vista @ Bukit Batok
1,Bukit Batok,15 - 99 years,1/2022,2021-05_SBF_QkJfTjFDMTVfMTYxOTUwOTg0NzQwMg,Blk 114B,2-room Flexi,#08,508,47 sqm,"$188,100",Sky Vista @ Bukit Batok
2,Bukit Batok,63 years,Keys Available,2021-05_SBF_QkJfTjFDOUFfMTYyMDIxMDYwMDEwNA,Blk 167,4-room,#02,236 *,103 sqm,"$275,000",Bt Batok West Ave 8
3,Bukit Batok,15 - 45 years,Keys Available,2021-05_SBF_QkJfTjJDMjFfMTYyMDQ1MzI0MDE0OA,Blk 230A,2-room Flexi,#10,415 *,47 sqm,"$98,100",Golden Daisy
4,Bukit Batok,15 - 45 years,Keys Available,2021-05_SBF_QkJfTjJDMjFfMTYyMDQ1MzI0MDE0OA,Blk 230A,2-room Flexi,#07,421 *,37 sqm,"$77,100",Golden Daisy
...,...,...,...,...,...,...,...,...,...,...,...
2489,Jurong West,15 - 99 years,9/2022 to 11/2022,2021-05_SBF_SldfTjJDMjBfMTYxOTUwOTg0OTMwMA,Blk 238B,2-room Flexi,#09,277,38 sqm,"$139,900",Boon Lay Glade
2490,Jurong West,15 - 99 years,9/2022 to 11/2022,2021-05_SBF_SldfTjJDMjBfMTYxOTUwOTg0OTMwMA,Blk 238B,3-room,#12,267,68 sqm,"$255,000",Boon Lay Glade
2491,Jurong West,15 - 99 years,9/2022 to 11/2022,2021-05_SBF_SldfTjJDMjBfMTYxOTUwOTg0OTMwMA,Blk 238B,3-room,#11,269,68 sqm,"$257,100",Boon Lay Glade
2492,Jurong West,15 - 99 years,9/2022 to 11/2022,2021-05_SBF_SldfTjJDMjBfMTYxOTUwOTg0OTMwMA,Blk 238B,4-room,#10,261,93 sqm,"$387,400",Boon Lay Glade
