In [166]:
import pandas as pd
import datetime

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [167]:
data = pd.read_csv('./data/raw_data.csv', header=None, on_bad_lines='skip')

data.columns = column_names = [
"address",
"price",
"gross_tax",
"strata_maintenance_fees",
"bedrooms",
"bathrooms",
"property_type",
"property_age",
"title",
"style",
"heating_type",
"feature",
"amenities",
"appliances",
"community",
"days_on_rew",
"property_views",
"mls®_number",
"source",
'frontage', 
'lot_size', 
'year_built', 
'depth']
data.head()


Unnamed: 0,address,price,gross_tax,strata_maintenance_fees,bedrooms,bathrooms,property_type,property_age,title,style,heating_type,feature,amenities,appliances,community,days_on_rew,property_views,mls®_number,source,frontage,lot_size,year_built,depth
0,address,price,gross_tax,strata_maintenance_fees,bedrooms,bathrooms,property_type,property_age,title,style,heating_type,feature,amenities,appliances,community,days_on_rew,property_views,mls®_number,source,frontage,lot_size,year_built,depth
1,3735 Puget Drive,"$5,498,800","$19,041",,3,4,house,,freehold nonstrata,2 storey w/bsmt.,,,"golf course nearby, recreation nearby",,arbutus,36 days,2410,r2735110,rebgv,57.00 feet,57 ft x 122 ft (6954 ft²),built in 1987 (35 yrs old),122
2,3229 Trutch Street,"$6,800,000","$23,411",,9,10,house,,freehold nonstrata,2 storey w/bsmt.,"natural gas,radiant","pantry, smoke alarm, sprinkler - fire, vacuum ...","air conditioning, recreation nearby, shopping ...",washer/dryer/fridge/stove/dishwasher,arbutus,46 days,547,r2732958,rebgv,45.00 feet,45 ft x 190 ft (8500 ft²),built in 2020 (2 yrs old),190
3,2127 W 21st Avenue,"$4,360,000","$15,988",,7,5,house,,freehold nonstrata,3 storey,,,,,arbutus,109 days,688,r2689450,rebgv,50.00 feet,50 ft x 122 ft (6100 ft²),built in 1989 (33 yrs old),122
4,2268 W 19th Avenue,"$6,880,000","$23,147",,5,8,house,,freehold nonstrata,2 storey w/bsmt.,"heat recovery ventilator, hot water,natural ga...","drapes/window coverings, heat recovery ventila...","air conditioning, central air conditioning, sauna",washer/dryer/fridge/stove/dishwasher,arbutus,109 days,617,r2682157,rebgv,50.00 feet,50 ft x 122 ft (6104 ft²),built in 2017 (5 yrs old),122


In [168]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 402 entries, 0 to 401
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   address                  402 non-null    object
 1   price                    402 non-null    object
 2   gross_tax                150 non-null    object
 3   strata_maintenance_fees  60 non-null     object
 4   bedrooms                 150 non-null    object
 5   bathrooms                150 non-null    object
 6   property_type            150 non-null    object
 7   property_age             1 non-null      object
 8   title                    150 non-null    object
 9   style                    150 non-null    object
 10  heating_type             137 non-null    object
 11  feature                  119 non-null    object
 12  amenities                130 non-null    object
 13  appliances               134 non-null    object
 14  community                150 non-null    o

In [169]:
# Filter Houses Only
houses = data[data['property_type'] == 'house']

In [170]:
# Convert Price column to integer
houses["price"] = houses["price"].str.replace(",", "").str.replace("$", "")
houses["price"] = pd.to_numeric(houses["price"])

In [171]:
# extract the lot width from the lot_size column
houses["lot_width"] = houses["lot_size"].str.extract(r"^(\d+) ft x")

# extract the lot length from the lot_size column
houses["lot_length"] = houses["lot_size"].str.extract(r"(\d+) ft x")

# extract the lot size from the lot_size column
houses["lot_size"] = houses["lot_size"].str.extract(r"\((\d+) ft²\)")



In [172]:
# Convert Gross Tax Income to Integer
houses['gross_tax'] = houses['gross_tax'].astype(str)
houses['gross_tax'] = houses['gross_tax'].str.replace(',', '')
houses['gross_tax'] = houses['gross_tax'].str.replace('$', '')
houses['gross_tax'] = houses['gross_tax'].astype(int)

In [173]:
# Convert House Age to integer - Consider prebuild houses with age 0
# extract the year from the year_built column
houses["year_built"] = houses["year_built"].str.extract(r"(\d+)")

# compute the age of the house using the current year
current_year = datetime.datetime.now().year
houses["age"] = current_year - pd.to_numeric(houses["year_built"])

## remove duplicate based on mls number
houses = houses.drop_duplicates(subset=['mls®_number'])

In [174]:
houses = houses.drop(['property_age'], axis=1)
houses = houses.drop(['strata_maintenance_fees'], axis=1)

In [175]:
# extract bungalow information
houses["bungalow"] = houses["style"].str.contains("bungalow").fillna(-1).astype(int)

# extract storey information
houses["storey"] = houses["style"].str.extract(r"(\d+)").fillna(-1).astype(int)

# extract basement information
houses["basement"] = houses["style"].str.contains(" w/bsmt").fillna(-1).astype(int)

# extract laneway house information
houses["laneway_house"] = houses["style"].str.contains("laneway house").fillna(-1).astype(int)

# Extract garage house information:
houses["garage"] = houses["feature"].str.contains("garage").fillna(-1).astype(int)


# extract split entry information
houses["split_entry"] = houses["style"].str.contains("split entry").fillna(-1).astype(int)
houses = houses.drop(['style'], axis=1)

In [176]:
# these columns can be cleaned more thoroughly, but they are not as important at the moment
houses = houses.drop(['title','amenities','heating_type', 'days_on_rew', 'appliances', 'feature', 'source', 'frontage', 'mls®_number','depth'], axis=1)


In [177]:
# IMPORTANT:
"""
We need to get the GEO Cordinates for address. We will drop the column for now
"""

# houses = houses.drop(['address'], axis=1)

'\nWe need to get the GEO Cordinates for address. We will drop the column for now\n'

In [178]:
pd.options.display.max_columns = None

houses.head()


Unnamed: 0,address,price,gross_tax,bedrooms,bathrooms,property_type,community,property_views,lot_size,year_built,lot_width,lot_length,age,bungalow,storey,basement,laneway_house,garage,split_entry
1,3735 Puget Drive,5498800,19041,3,4,house,arbutus,2410,6954,1987,57,57,36,0,2,1,0,-1,0
2,3229 Trutch Street,6800000,23411,9,10,house,arbutus,547,8500,2020,45,45,3,0,2,1,0,0,0
3,2127 W 21st Avenue,4360000,15988,7,5,house,arbutus,688,6100,1989,50,50,34,0,3,0,0,-1,0
4,2268 W 19th Avenue,6880000,23147,5,8,house,arbutus,617,6104,2017,50,50,6,0,2,1,0,0,0
5,3894 Quesnel Drive,5888000,12189,3,4,house,arbutus,325,4100,1933,44,44,90,0,3,0,0,0,0


In [179]:
houses.to_csv('data/clean_housing_data.csv')