In [80]:
import pandas as pd
import datetime

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [81]:
data = pd.read_csv('./data/sample-data.csv', header=None, on_bad_lines='skip')

data.columns = column_names = [
"address",
"price",
"gross_tax",
"strata_maintenance_fees",
"bedrooms",
"bathrooms",
"property_type",
"property_age",
"title",
"style",
"heating_type",
"feature",
"amenities",
"appliances",
"community",
"days_on_rew",
"property_views",
"mls®_number",
"source",
'frontage', 
'lot_size', 
'year_built', 
'depth']
data.head()


Unnamed: 0,address,price,gross_tax,strata_maintenance_fees,bedrooms,bathrooms,property_type,property_age,title,style,heating_type,feature,amenities,appliances,community,days_on_rew,property_views,mls®_number,source,frontage,lot_size,year_built,depth
0,3735 Puget Drive,"$5,498,800","$19,041",,3,4,house,,freehold nonstrata,2 storey w/bsmt.,,,"golf course nearby, recreation nearby",,arbutus,13 days,1491,r2735110,rebgv,57.00 feet,57 ft x 122 ft (6954 ft²),built in 1987 (35 yrs old),122.0
1,3735 Puget Drive,"$5,498,800","$19,041",,3,4,house,,freehold nonstrata,2 storey w/bsmt.,,,"golf course nearby, recreation nearby",,arbutus,13 days,1491,r2735110,rebgv,57.00 feet,57 ft x 122 ft (6954 ft²),built in 1987 (35 yrs old),122.0
2,302 5118 Cambie Street,"$2,367,900",$0,$925,3,3,apt/condo,,freehold strata,1 storey,"forced air,heat pump","elevator, oven - built in","air conditioning, central air conditioning, re...","in suite laundry, microwave, range top, washer...",cambie,109 days,592,r2712600,rebgv,,,built in 2022 (0 yrs old),
3,438 W 17th Avenue,"$3,888,000","$10,064",,8,4,multifamily,,freehold nonstrata,,,,,,cambie,61 days,1129,r2722725,rebgv,49.70 feet,49 ft x 137 ft (6807 ft²),built in 1930 (92 yrs old),137.5
4,2505 1151 W Georgia Street,"$2,470,000","$5,658","$1,215",2,2,apt/condo,,freehold strata,upper unit,"forced air,heat pump",elevator,"central air conditioning, exercise room, recre...",in suite laundry,coal harbour,57 days,789,r2724260,rebgv,,,built in 2016 (6 yrs old),


In [82]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   address                  38 non-null     object 
 1   price                    38 non-null     object 
 2   gross_tax                38 non-null     object 
 3   strata_maintenance_fees  10 non-null     object 
 4   bedrooms                 38 non-null     int64  
 5   bathrooms                38 non-null     int64  
 6   property_type            38 non-null     object 
 7   property_age             0 non-null      float64
 8   title                    38 non-null     object 
 9   style                    36 non-null     object 
 10  heating_type             23 non-null     object 
 11  feature                  21 non-null     object 
 12  amenities                31 non-null     object 
 13  appliances               23 non-null     object 
 14  community                38 

In [83]:
# Filter Houses Only
houses = data[data['property_type'] == 'house']

In [84]:
# Convert Price column to integer
houses["price"] = houses["price"].str.replace(",", "").str.replace("$", "")
houses["price"] = pd.to_numeric(houses["price"])

In [85]:
# extract the lot width from the lot_size column
houses["lot_width"] = houses["lot_size"].str.extract(r"^(\d+) ft x")

# extract the lot length from the lot_size column
houses["lot_length"] = houses["lot_size"].str.extract(r"(\d+) ft x")

# extract the lot size from the lot_size column
houses["lot_size"] = houses["lot_size"].str.extract(r"\((\d+) ft²\)")



In [86]:
# Convert Gross Tax Income to Integer
houses['gross_tax'] = houses['gross_tax'].astype(str)
houses['gross_tax'] = houses['gross_tax'].str.replace(',', '')
houses['gross_tax'] = houses['gross_tax'].str.replace('$', '')
houses['gross_tax'] = houses['gross_tax'].astype(int)

In [87]:
# Convert House Age to integer - Consider prebuild houses with age 0
# extract the year from the year_built column
houses["year_built"] = houses["year_built"].str.extract(r"(\d+)")

# compute the age of the house using the current year
current_year = datetime.datetime.now().year
houses["age"] = current_year - pd.to_numeric(houses["year_built"])

## remove duplicate based on mls number
houses = houses.drop_duplicates(subset=['mls®_number'])

In [88]:
houses = houses.drop(['property_age'], axis=1)
houses = houses.drop(['strata_maintenance_fees'], axis=1)

In [89]:
# extract bungalow information
houses["bungalow"] = houses["style"].str.contains("bungalow").fillna(1).astype(int)

# extract storey information
houses["storey"] = houses["style"].str.extract(r"(\d+)").fillna(1).astype(int)

# extract basement information
houses["basement"] = houses["style"].str.contains(" w/bsmt").fillna(1).astype(int)

# extract laneway house information
houses["laneway_house"] = houses["style"].str.contains("laneway house").fillna(1).astype(int)

# Extract garage house information:
houses["garage"] = houses["feature"].str.contains("garage").fillna(1).astype(int)


# extract split entry information
houses["split_entry"] = houses["style"].str.contains("split entry").fillna(1).astype(int)
houses = houses.drop(['style'], axis=1)

In [90]:
# these columns can be cleaned more thoroughly, but they are not as important at the moment
houses = houses.drop(['title','amenities','heating_type', 'days_on_rew', 'appliances', 'feature', 'source', 'frontage', 'mls®_number','depth'], axis=1)


In [91]:
# IMPORTANT:
"""
We need to get the GEO Cordinates for address. We will drop the column for now
"""

houses = houses.drop(['address'], axis=1)

In [92]:
pd.options.display.max_columns = None

houses.head()


Unnamed: 0,price,gross_tax,bedrooms,bathrooms,property_type,community,property_views,lot_size,year_built,lot_width,lot_length,age,bungalow,storey,basement,laneway_house,garage,split_entry
0,5498800,19041,3,4,house,arbutus,1491,6954,1987,57,57,36,0,2,1,0,1,0
22,4360000,15988,7,5,house,arbutus,601,6100,1989,50,50,34,0,3,0,0,1,0
23,6880000,23147,5,8,house,arbutus,540,6104,2017,50,50,6,0,2,1,0,0,0
24,5888000,12189,3,4,house,arbutus,284,4100,1933,44,44,90,0,3,0,0,0,0
26,4299000,14265,7,6,house,arbutus,1440,7320,1994,60,60,29,0,2,1,0,1,0


In [93]:
houses.to_csv('data/clean_housing_data.csv')