# Domain Pre-Processing

In [1]:
import json
import pandas as pd
import re

In [2]:
# Read the landing-zone JSON into `data`. The encoding is pinned to UTF-8
# so decoding does not depend on the platform's default locale encoding.
with open('../data/landing/domain.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [3]:
# from_dict turns each top-level key of `data` into a column; transposing
# makes each of those keys a row instead.
df = pd.DataFrame.from_dict(data).transpose()

# Non-null count per column
df.count()

name           9840
cost_text      9840
rooms          9840
parking        9840
coordinates    9840
desc           9840
dtype: int64

## Extract Some Features

In [4]:
def extract_price(cost_text):

    """
    Extracts the first dollar amount from a text containing prices.

    Parameters
    ----------
    cost_text : any
        Text expected to contain a price such as "$1,250.50 per week".
        Non-string values (e.g. None or NaN) are converted with str()
        before matching, so they yield None instead of raising.

    Returns
    -------
    float or None
        The first "$"-prefixed amount with comma separators removed,
        or None when no amount is found or it cannot be parsed.
    """

    # "$" followed by digits/commas and an optional decimal part. Only the
    # numeric portion is captured, so the dollar sign never reaches float().
    price_pattern = r'\$([\d,]+(?:\.\d+)?)'

    # str() prevents a TypeError from the regex engine on missing values
    first_price = re.search(price_pattern, str(cost_text))
    if first_price is None:
        return None

    try:
        return float(first_price.group(1).replace(',', ''))
    except ValueError:
        # e.g. "$," captures only separators, leaving an empty string
        return None


In [5]:
def extract_bed(rooms):

    """
    Extracts the number of beds from a string containing room information.

    Parameters
    ----------
    rooms : any
        Room information; it is converted with str() before matching, so
        lists and missing values are accepted.

    Returns
    -------
    int or None
        The count in front of the first "<number> Bed" occurrence, or None
        when there is no such occurrence.
    """

    # \d+ captures the whole count; a single \d would read "10 Beds" as 0
    bed_pattern = r'(\d+)\sBed'
    first_bed = re.search(bed_pattern, str(rooms))

    return int(first_bed.group(1)) if first_bed else None

In [6]:
def extract_bath(rooms):

    """
    Extracts the number of baths from a string containing room information.

    Parameters
    ----------
    rooms : any
        Room information; it is converted with str() before matching, so
        lists and missing values are accepted.

    Returns
    -------
    int or None
        The count in front of the first "<number> Bath" occurrence, or None
        when there is no such occurrence.
    """

    # \d+ captures the whole count; a single \d would read "12 Baths" as 2
    bath_pattern = r'(\d+)\sBath'
    first_bath = re.search(bath_pattern, str(rooms))

    return int(first_bath.group(1)) if first_bath else None

In [7]:
def extract_parking(parking):

    """
    Extracts the number of parking spaces from a string containing parking
    information.

    Parameters
    ----------
    parking : any
        Parking information; it is converted with str() before matching,
        so lists and missing values are accepted.

    Returns
    -------
    int
        The count in front of the first "<number> Parking" occurrence, or
        0 when there is no such occurrence.
    """

    # \d+ captures the whole count; a single \d would read "10 Parking" as 0
    parking_pattern = r'(\d+)\sParking'
    first_parking = re.search(parking_pattern, str(parking))

    # A missing match falls back to 0 rather than None
    return int(first_parking.group(1)) if first_parking else 0

In [8]:
# Numeric features parsed from the raw text columns
df['price'] = df['cost_text'].apply(extract_price)
df['bed'] = df['rooms'].apply(extract_bed)
df['bath'] = df['rooms'].apply(extract_bath)
df['park_num'] = df['parking'].apply(extract_parking)

# Postcode is taken as the last four characters of the listing name
df['postcode'] = df['name'].str.slice(start=-4)

# Split each coordinate pair by position: element 0 -> Latitude,
# element 1 -> Longitude. A missing pair (None) stays None.
df['Latitude'] = df['coordinates'].apply(lambda pair: None if pair is None else pair[0])
df['Longitude'] = df['coordinates'].apply(lambda pair: None if pair is None else pair[1])

## Feature Engineering

In [9]:
# Replace the existing index with a default integer index, remove the
# listed columns, then discard every row that still has a missing value
# in ANY remaining column (not only price).
columns_to_drop = ['cost_text', 'rooms', 'parking', 'desc']
df = (
    df
    .reset_index(drop=True)
    .drop(columns=columns_to_drop)
    .dropna(axis=0, how='any')
)
df.head()

Unnamed: 0,name,coordinates,price,bed,bath,park_num,postcode,Latitude,Longitude
0,2010/7 Katherine Place Melbourne VIC 3000,"[-37.8201858, 144.9564628]",675.0,2.0,2.0,0,3000,-37.820186,144.956463
1,1308/68 La Trobe Street Melbourne VIC 3000,"[-37.8083648, 144.9667899]",570.0,2.0,1.0,1,3000,-37.808365,144.96679
2,1307/268 Flinders Street Melbourne VIC 3000,"[-37.8178134, 144.9655688]",400.0,1.0,1.0,0,3000,-37.817813,144.965569
3,407/62-68 Hayward Lane Melbourne VIC 3000,"[-37.8089779, 144.9668783]",350.0,0.0,1.0,0,3000,-37.808978,144.966878
4,403/39 Queen Street Melbourne VIC 3000,"[-37.817888, 144.9617372]",450.0,0.0,1.0,0,3000,-37.817888,144.961737


In [10]:
# Write the frame to CSV; index=False keeps the row index out of the file
output_path = "../data/raw/domain.csv"
df.to_csv(path_or_buf=output_path, index=False)