In [1]:
import os
from bs4 import BeautifulSoup
import json
from tqdm import tqdm
import pandas as pd
import re

import sys
sys.path.append('../')
from onemap_client import OneMapClient
import os
from dotenv import load_dotenv
load_dotenv()
tqdm.pandas()

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
# Read the raw extracted listings JSON into `store`.
with open('./search/processed/20231116/raw_extracted.json', mode='r') as raw_file:
    store = json.loads(raw_file.read())

In [3]:
# Fields expected on every scraped record, in output column order.
LISTING_FIELDS = [
    'name',
    'url',
    'street_address',
    'price',
    'num_bedroom',
    'num_bathroom',
    'cost_psf',
    'total_area',
    'walk',
    'tags',
    'recency',
]

# Column-oriented buffer: one list per output column, 'id' first.
store_df = {column: [] for column in ['id', *LISTING_FIELDS]}
for key, value in store.items():
    store_df['id'].append(key)
    # Walk the expected fields rather than the record's own keys so that a
    # record missing a field contributes None. Otherwise that column ends up
    # shorter than 'id' and the DataFrame constructor rejects the whole batch.
    # Keys outside LISTING_FIELDS are ignored.
    for field in LISTING_FIELDS:
        store_df[field].append(value.get(field))

listings = pd.DataFrame(store_df)

In [4]:
def clean_name(name):
    """Return `name` with surrounding whitespace removed, or None when it is None."""
    return None if name is None else name.strip()

def clean_url(url):
    """Return `url` unchanged (None stays None); no normalization is applied."""
    return url

def clean_street_address(address):
    """Strip leading/trailing whitespace from `address`; None is passed through."""
    if address is not None:
        return address.strip()
    return None

def clean_price(price):
    """Extract the first number from a raw price string.

    Args:
        price: Raw price text, or None.

    Returns:
        None when `price` is None; the first number found as a float
        (thousands separators removed); or the string 'Price on ask' when the
        text contains no digits.
    """
    if price is None:
        return None
    # Anchor the match on a digit: a character class of digits-or-commas alone
    # would also match stray punctuation (e.g. the comma in "Price, on ask"),
    # which then becomes float('') and raises ValueError.
    match = re.search(r'\d[\d,]*(?:\.\d+)?', price.strip())
    if match is None:
        return 'Price on ask'
    return float(match.group(0).replace(',', ''))

def clean_num_bedroom(num_bedroom):
    """Convert the raw bedroom count text to an int; None is passed through."""
    if num_bedroom is not None:
        return int(num_bedroom.strip())
    return None

def clean_num_bathroom(num_bathroom):
    """Convert the raw bathroom count text to an int; None is passed through."""
    return None if num_bathroom is None else int(num_bathroom.strip())

def clean_cost_psf(cost_psf):
    """Extract the first number from a raw cost-per-square-foot string.

    Args:
        cost_psf: Raw text containing the figure, or None.

    Returns:
        The first number found as a float (thousands separators removed), or
        None when `cost_psf` is None or contains no digits.
    """
    if cost_psf is None:
        return None
    # Accept any run of digits/commas with an optional decimal part. A strict
    # "1-3 digits then comma groups" pattern truncates ungrouped values
    # ("1234" -> 123) and ignores decimals that are not exactly two digits.
    match = re.search(r'\d[\d,]*(?:\.\d+)?', cost_psf.strip())
    if match is None:
        # No number present: report it as missing rather than failing on None.
        return None
    return float(match.group(0).replace(',', ''))

def clean_total_area(total_area):
    """Parse a raw area string into a dict with 'floor' and 'land' sizes.

    A string without a comma is read as a single floor size. A string with
    commas is split into parts; each part must carry a parenthesised label
    and a number, and the label becomes the dict key for that number.

    Returns:
        None when `total_area` is None, otherwise a dict that starts as
        {'floor': None, 'land': None} and is filled from the text.
    """
    if total_area is None:
        return None

    text = total_area.strip()
    if ',' not in text:
        # Single measurement: the first run of digits is the floor size.
        only_size = re.search(r'(\d+)', text)
        return {'floor': int(only_size.group(0)), 'land': None}

    areas = {'floor': None, 'land': None}
    for part in text.split(','):
        # Label is the text inside the first pair of parentheses.
        label = re.search(r'\((.*?)\)', part).group(1).strip()
        areas[label] = int(re.search(r'(\d+)', part).group(0))
    return areas

def clean_walk(walk):
    """Split a raw walk description into destination, distance and time.

    The text is expected to contain a parenthesised distance, a time before
    the opening parenthesis, and a destination after "to ".

    Returns:
        A dict with keys 'destination', 'distance' and 'time'; all three are
        None when `walk` is None.
    """
    if walk is None:
        return {'destination': None, 'distance': None, 'time': None}

    text = walk.strip()
    destination = re.search(r'to (.*)', text).group(1).strip()
    distance = re.search(r'\((.*?)\)', text).group(1).strip()
    duration = re.search(r'(.*?)\s*\(', text).group(1).strip()
    return {'destination': destination, 'distance': distance, 'time': duration}


def clean_tags(tags):
    """Return a new list with every tag stripped of surrounding whitespace.

    The input list is left untouched, so the raw data the caller still holds
    is not silently rewritten.

    Args:
        tags: List of tag strings, or None.

    Returns:
        A new list of stripped tags, or None when `tags` is None.
    """
    if tags is None:
        return None
    return [tag.strip() for tag in tags]


def clean_recency(recency):
    """Strip surrounding whitespace from the recency text; None is passed through."""
    return recency.strip() if recency is not None else None


In [5]:
# Each scraped column paired with the function that normalizes its raw value.
# Insertion order is the order in which columns are rewritten for every row.
column_cleaners = {
    'name': clean_name,
    'url': clean_url,
    'street_address': clean_street_address,
    'price': clean_price,
    'num_bedroom': clean_num_bedroom,
    'num_bathroom': clean_num_bathroom,
    'cost_psf': clean_cost_psf,
    'total_area': clean_total_area,
    'walk': clean_walk,
    'tags': clean_tags,
    'recency': clean_recency,
}

# Clean the frame cell by cell, writing each result back in place.
for idx, raw_row in tqdm(listings.iterrows()):
    for column, cleaner in column_cleaners.items():
        listings.at[idx, column] = cleaner(raw_row[column])

48729it [00:03, 13257.32it/s]


# Process

In [6]:
# Print a summary of the listings frame: index range, columns, non-null counts and dtypes.
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48729 entries, 0 to 48728
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              48729 non-null  object
 1   name            48729 non-null  object
 2   url             48729 non-null  object
 3   street_address  48729 non-null  object
 4   price           48729 non-null  object
 5   num_bedroom     48304 non-null  object
 6   num_bathroom    48001 non-null  object
 7   cost_psf        47955 non-null  object
 8   total_area      48729 non-null  object
 9   walk            48729 non-null  object
 10  tags            48729 non-null  object
 11  recency         48729 non-null  object
dtypes: object(12)
memory usage: 4.5+ MB


In [7]:
# OneMap credentials come from the environment; a missing variable raises KeyError.
email, password = os.environ['ONE_MAP_API_EMAIL'], os.environ['ONE_MAP_API_PASSWORD']

In [10]:
def process_street_address(address, name, retries=5):
    """Search OneMap for a listing, trying its address, its name, then both.

    Attempts cycle through the candidate queries in that order until one
    response reports at least one match or `retries` attempts are used up.

    Args:
        address: Street address text of the listing (may be None or blank).
        name: Listing name (may be None or blank).
        retries: Maximum number of search attempts.

    Returns:
        The 'results' value of the first successful response, or None when no
        attempt found anything or there was nothing usable to search for.
    """
    # Drop missing/blank terms so neither None nor the literal text "None ..."
    # is sent as a query.
    terms = [term for term in (address, name) if term]
    searches = list(terms)
    if len(terms) == 2:
        searches.append(f'{address} {name}')
    if not searches:
        return None

    for attempt in range(retries):
        # A client is constructed for every attempt, as before.
        # NOTE(review): if OneMapClient authenticates in its constructor this
        # could be hoisted out of the loop; confirm before changing.
        client = OneMapClient(email, password)
        resp = client.search(
            searches[attempt % len(searches)],
            return_geom=True,
            get_addr_details=True,
            page_num=1,
        )
        # Treat a missing response, an error payload, or a payload without a
        # positive 'found' count as a miss and move on to the next query.
        if resp is None or 'error' in resp or not resp.get('found'):
            continue
        return resp.get('results')
    return None

# Look up every listing on OneMap in parallel, one row per call.
listings['one_map_search'] = listings.parallel_apply(
    lambda listing: process_street_address(listing['street_address'], listing['name']),
    axis=1,
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4873), Label(value='0 / 4873'))), …

objc[5792]: +[__NSCFConstantString initialize] may have been in progress in another thread when fork() was called.
objc[5792]: +[__NSCFConstantString initialize] may have been in progress in another thread when fork() was called. We cannot safely call it or ignore it in the fork() child process. Crashing instead. Set a breakpoint on objc_initializeAfterForkError to debug.
objc[5791]: +[__NSCFConstantString initialize] may have been in progress in another thread when fork() was called.
objc[5791]: +[__NSCFConstantString initialize] may have been in progress in another thread when fork() was called. We cannot safely call it or ignore it in the fork() child process. Crashing instead. Set a breakpoint on objc_initializeAfterForkError to debug.
objc[5793]: +[__NSCFConstantString initialize] may have been in progress in another thread when fork() was called.
objc[5793]: +[__NSCFConstantString initialize] may have been in progress in another thread when fork() was called. We cannot safely cal

KeyboardInterrupt: 

In [None]:
def process_one_map_address(x):
    """Pull the address fields out of a OneMap search result.

    Args:
        x: A single result mapping, a list/tuple of result mappings (the
            first one is used), or None.

    Returns:
        A 5-tuple (road_name, building, postal_code, latitude, longitude).
        Each element is None when the key is absent or its value is the
        placeholder 'NIL'; all five are None when there is no result.
    """
    # The search step stores the response's 'results' value. The public OneMap
    # search API returns that as a list of matches, so take the first entry.
    # NOTE(review): confirm OneMapClient.search passes that list through.
    if isinstance(x, (list, tuple)):
        x = x[0] if x else None
    if x is None:
        return None, None, None, None, None

    def field(key):
        # Every field is guarded by its OWN key; guarding POSTAL with BUILDING
        # either raises KeyError or drops a valid postal code.
        value = x.get(key)
        return None if value == 'NIL' else value

    return (
        field('ROAD_NAME'),
        field('BUILDING'),
        field('POSTAL'),
        field('LATITUDE'),
        field('LONGITUDE'),
    )


# Turn the per-row 5-tuples into five flat columns on the listings frame.
address_parts = listings['one_map_search'].apply(process_one_map_address)
(
    listings['road_name'],
    listings['building'],
    listings['postal_code'],
    listings['latitude'],
    listings['longitude'],
) = zip(*address_parts)

In [None]:
# Flatten the parsed area dicts into separate floor and land columns.
parsed_areas = listings['total_area']
listings['floor_area'] = parsed_areas.apply(lambda area: area['floor'])
listings['land_area'] = parsed_areas.apply(lambda area: area['land'])

In [None]:
def process_walk_distance(distance):
    """Convert a "<number> <unit>" walking distance to whole metres.

    Args:
        distance: Text such as '380 m' or '1.2 km', or None.

    Returns:
        The distance as an int (values with unit 'km', case-insensitive, are
        multiplied by 1000), or None when `distance` is None.

    Raises:
        ValueError: If the text is not exactly a number followed by a unit.
    """
    if distance is None:
        return None
    # split() with no argument tolerates repeated or unusual whitespace.
    num, unit = distance.split()
    # Parse as float so fractional values ("1.2 km") do not raise in int().
    metres = float(num)
    if unit.lower() == 'km':
        metres *= 1000
    return int(round(metres))

def process_walk_time(time):
    """Return the leading integer of a "<number> <unit>" time string.

    The text must split on a single space into exactly two parts; the unit
    part is discarded. None is passed through.
    """
    if time is not None:
        amount, _unit = time.split(' ')
        return int(amount)
    return None

# Flatten the parsed walk dicts into destination, distance and time columns.
walk_details = listings['walk']
listings['walk_destination'] = walk_details.apply(lambda walk: walk['destination'])
listings['walk_distance_m'] = walk_details.apply(lambda walk: process_walk_distance(walk['distance']))
listings['walk_time_mins'] = walk_details.apply(lambda walk: process_walk_time(walk['time']))

In [None]:
def process_tags(tags):
    """Derive lease duration, completion year and type from a listing's tags.

    Each tag is classified by case-insensitive keyword:
      * contains 'freehold'               -> lease_duration = 'Freehold'
      * contains 'unknown tenure'         -> ignored
      * contains 'leasehold'              -> lease_duration = first integer in the tag
      * contains 'built' or 'completion'  -> completion = first integer in the tag
      * anything else                     -> type = the tag itself (last one wins)

    Args:
        tags: Iterable of tag strings, or None.

    Returns:
        A 3-tuple (lease_duration, completion, type); elements with no
        matching tag are None.
    """
    lease_duration = None
    completion = None
    listing_type = None

    # Tags may be None for a listing; treat that as "no tags".
    for tag in tags or []:
        lowered = tag.lower()
        if 'freehold' in lowered:
            lease_duration = 'Freehold'
        elif 'unknown tenure' in lowered:
            continue
        elif 'leasehold' in lowered:
            years = re.search(r'\d+', tag)
            # A leasehold tag without a number leaves the duration unset
            # instead of failing on a None match.
            if years is not None:
                lease_duration = int(years.group(0))
        elif 'built' in lowered or 'completion' in lowered:
            year = re.search(r'\d+', tag)
            if year is not None:
                completion = int(year.group(0))
        else:
            listing_type = tag

    return lease_duration, completion, listing_type
    


# Expand the per-row (lease_duration, completion, type) tuples into three columns.
tag_parts = listings['tags'].apply(process_tags)
(
    listings['lease_duration'],
    listings['completion'],
    listings['type'],
) = zip(*tag_parts)

In [None]:
# Write the processed listings (index included) next to the raw extract.
listings.to_csv(path_or_buf='./search/processed/20231116/processed.csv')