In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('./data/victoria_house.csv')

In [3]:
df.shape

(105120, 15)

In [4]:
def exploration(df):
    """Print a plain-text overview of a DataFrame.

    Five sections are written to stdout, in this order: shape, column
    names, ``describe()`` summary, per-column null counts and dtypes.
    Consecutive sections are separated by a line of 100 dashes.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarise. It is only read, never modified.

    Returns
    -------
    None
    """
    divider = '-' * 100
    # Each section is wrapped in a callable so it is computed only when its
    # turn to be printed comes, after the previous sections are already out.
    sections = (
        ('SHAPE', lambda: df.shape),
        ('COLUMN NAMES', lambda: list(df.columns)),
        ('DESCRIBE', lambda: df.describe()),
        ('ISNULL', lambda: df.isnull().sum()),
        ('DTYPES', lambda: df.dtypes),
    )
    for position, (label, compute) in enumerate(sections):
        if position > 0:
            print(divider)
        print(f'>{label}:\n{compute()}')
    
exploration(df)

>SHAPE:
(105120, 15)
----------------------------------------------------------------------------------------------------
>COLUMN NAMES:
['latitude', 'longitude', 'streetAddress', 'suburb', 'postcode', 'region', 'bedrooms', 'bathrooms', 'parkingSpaces', 'propertyType', 'price', 'listingId', 'title', 'dateSold', 'modifiedDate']
----------------------------------------------------------------------------------------------------
>DESCRIBE:
            latitude      longitude       postcode      bedrooms  \
count  105119.000000  105119.000000  105120.000000  105120.00000   
mean      -37.588618     144.986669    3375.773735       2.65429   
std         0.612449       0.884748     289.116018       0.79562   
min       -38.661799     141.579392    3000.000000       0.00000   
25%       -37.998847     144.339984    3101.000000       2.00000   
50%       -37.780162     144.962296    3337.000000       3.00000   
75%       -37.581211     145.347311    3629.000000       3.00000   
max       -36.1

In [5]:
df.head(1)

Unnamed: 0,latitude,longitude,streetAddress,suburb,postcode,region,bedrooms,bathrooms,parkingSpaces,propertyType,price,listingId,title,dateSold,modifiedDate
0,-38.276067,144.485488,6 Corymbia Circuit,Barwon Heads,3227,bellarine_peninsula,3,2,2,house,"$1,255,000",129714950,Luxurious Coastal Lifestyle Awaits,2018-11-06,2018-11-06T02:00:19Z


In [6]:
# We drop modifiedDate.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: a second execution no longer raises KeyError because the
# column is already gone.
df = df.drop(columns='modifiedDate', errors='ignore')

##### We see that the price is not numeric and that dateSold is not a datetime, so both need converting. We drop modifiedDate.

In [7]:
df['price'].unique()

array(['$1,255,000', '$725,000', '$670,000', '$600,000', '$520,000',
       '$515,000', '$460,000', '$1,935,000', '$1,500,000', '$1,400,000',
       '$1,300,000', '$990,000', '$920,000', '$885,000', '$855,000',
       '$835,000', '$735,000', '$730,000', '$658,000', '$650,545',
       '$650,000', '$647,000', '$643,000', '$627,500', '$625,000',
       '$615,000', '$580,000', '$577,500', '$577,000', '$570,000',
       '$550,000', '$525,000', '$497,000', '$493,500', '$485,000',
       '$482,000', '$480,000', '$472,000', '$466,000', '$455,000',
       '$447,500', '$410,000', '$401,000', '$398,000', '$390,000',
       '$380,000', '$379,000', '$350,000', '$330,000', '$305,000',
       '$290,000', '$275,000', '$790,000', '$376,000', '$1,530,000',
       'Contact agent', '$1,265,000', '$1,235,000', '$1,067,500',
       '$900,000', '$780,000', '$678,000', '$632,500', '$630,000',
       '$605,000', '$603,500', '$575,000', '$540,000', '$530,000',
       '$500,600', '$487,000', '$445,000', '$430,00

In [8]:
# How many rows have no price?
print('>SHAPE:',df[df['price'] == 'Contact agent'].shape)

>SHAPE: (15, 14)


In [9]:
def get_price(price):
    """Remove the ',' and '$' characters from a price string.

    Parameters
    ----------
    price : str
        Raw price text, e.g. ``'$1,255,000'``.

    Returns
    -------
    str
        The same text with every ',' and '$' removed. Text without those
        characters (e.g. ``'Contact agent'``) comes back unchanged; no
        numeric conversion is done here.
    """
    cleaned = price
    for symbol in (',', '$'):
        cleaned = cleaned.replace(symbol, '')
    return cleaned

In [10]:
# Strip the '$' and ',' formatting from every price; values are still strings here.
df['price'] = df['price'].apply(get_price)

In [11]:
df['price'].unique()

array(['1255000', '725000', '670000', '600000', '520000', '515000',
       '460000', '1935000', '1500000', '1400000', '1300000', '990000',
       '920000', '885000', '855000', '835000', '735000', '730000',
       '658000', '650545', '650000', '647000', '643000', '627500',
       '625000', '615000', '580000', '577500', '577000', '570000',
       '550000', '525000', '497000', '493500', '485000', '482000',
       '480000', '472000', '466000', '455000', '447500', '410000',
       '401000', '398000', '390000', '380000', '379000', '350000',
       '330000', '305000', '290000', '275000', '790000', '376000',
       '1530000', 'Contact agent', '1265000', '1235000', '1067500',
       '900000', '780000', '678000', '632500', '630000', '605000',
       '603500', '575000', '540000', '530000', '500600', '487000',
       '445000', '430000', '400000', '386000', '1453000', '1200000',
       '1100000', '1055000', '1000000', '873000', '865000', '850000',
       '810000', '805000', '800000', '765000', '750

In [12]:
# Keep only rows whose price is made of digits (this removes 'Contact agent').
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning.
df = df[df['price'].str.isnumeric()].copy()

In [13]:
# Check the deletion: no non-numeric prices should remain.
print('>SHAPE:',df[~df['price'].str.isnumeric()].shape)

>SHAPE: (0, 14)


In [14]:
df['price'] = pd.to_numeric(df['price'])

In [15]:
df['dateSold'] = pd.to_datetime(df['dateSold'])

In [16]:
# Check that the types have changed.
df.dtypes

latitude                float64
longitude               float64
streetAddress            object
suburb                   object
postcode                  int64
region                   object
bedrooms                  int64
bathrooms                 int64
parkingSpaces             int64
propertyType             object
price                     int64
listingId                 int64
title                    object
dateSold         datetime64[ns]
dtype: object

In [17]:
df.head(1)

Unnamed: 0,latitude,longitude,streetAddress,suburb,postcode,region,bedrooms,bathrooms,parkingSpaces,propertyType,price,listingId,title,dateSold
0,-38.276067,144.485488,6 Corymbia Circuit,Barwon Heads,3227,bellarine_peninsula,3,2,2,house,1255000,129714950,Luxurious Coastal Lifestyle Awaits,2018-11-06


In [18]:
# We have 5242 rows with no region name.
df['region'].unique()

array(['bellarine_peninsula', 'melbourne___northern_region', nan,
       'south_western_region', 'south_west_melbourne',
       'south_east_melbourne', 'mornington_peninsula',
       'eastern_melbourne', 'western_region', 'western_melbourne',
       'gippsland', 'north_west_melbourne', 'northern_region',
       'inner_east_melbourne', 'north_east_region', 'bayside',
       'melbourne_city___greater_region'], dtype=object)

##### We fill in the missing region names using the postcode.

In [19]:
# Known (postcode, region) pairs: rows with a missing postcode or region are
# discarded and each distinct pair is kept once.
dfdict = df[['postcode', 'region']].dropna(axis=0).drop_duplicates()
# One {'postcode': ..., 'region': ...} dict per remaining pair.
regionDict = dfdict.to_dict('records')

In [20]:
def get_region(postcode, regionDict):
    """Look up the region recorded for a postcode.

    Parameters
    ----------
    postcode
        Postcode to search for; compared with ``==`` against each record.
    regionDict : iterable of dict
        Records that each have a ``'postcode'`` and a ``'region'`` key.

    Returns
    -------
    The ``'region'`` of the first record whose ``'postcode'`` equals
    ``postcode``, or ``None`` when no record matches.
    """
    matching_regions = (
        record['region']
        for record in regionDict
        if record['postcode'] == postcode
    )
    return next(matching_regions, None)

In [21]:
# Fill only the rows whose region is missing. Rows that already have a region
# keep it: regionDict holds distinct (postcode, region) pairs, so if a postcode
# appears with more than one region a blanket overwrite would replace existing
# values with whichever pair get_region finds first.
missing_region = df['region'].isnull()
# Looking up just the missing rows also avoids scanning regionDict for every row.
looked_up = df.loc[missing_region, 'postcode'].apply(lambda x: get_region(x, regionDict))
# fillna aligns on the index; postcodes with no known region stay null.
df = df.assign(region=df['region'].fillna(looked_up))

In [22]:
# There are still some rows with no region, so we delete them.
df.isnull().sum()

latitude          1
longitude         1
streetAddress     0
suburb            0
postcode          0
region           11
bedrooms          0
bathrooms         0
parkingSpaces     0
propertyType      0
price             0
listingId         0
title             0
dateSold          0
dtype: int64

In [23]:
# We drop this row also.
df[df['latitude'].isnull()]

Unnamed: 0,latitude,longitude,streetAddress,suburb,postcode,region,bedrooms,bathrooms,parkingSpaces,propertyType,price,listingId,title,dateSold
34,,,12 Caitlin Street,Lara,3212,south_west_melbourne,4,2,2,house,577000,128340206,BRAND NEW JUST UNPACK AND ENJOY!,2018-11-05


In [24]:
# Drop every row that still has a null in any column (the rows without a region
# and the row without coordinates). Rebinding instead of inplace=True avoids
# mutating a frame that was produced by filtering.
df = df.dropna()

In [25]:
# Checking the deletion.
df.isnull().sum()

latitude         0
longitude        0
streetAddress    0
suburb           0
postcode         0
region           0
bedrooms         0
bathrooms        0
parkingSpaces    0
propertyType     0
price            0
listingId        0
title            0
dateSold         0
dtype: int64

In [26]:
df.shape

(105093, 14)

In [27]:
df.head(5)

Unnamed: 0,latitude,longitude,streetAddress,suburb,postcode,region,bedrooms,bathrooms,parkingSpaces,propertyType,price,listingId,title,dateSold
0,-38.276067,144.485488,6 Corymbia Circuit,Barwon Heads,3227,bellarine_peninsula,3,2,2,house,1255000,129714950,Luxurious Coastal Lifestyle Awaits,2018-11-06
1,-38.238159,144.545576,9 La Bella Street,Ocean Grove,3226,bellarine_peninsula,4,2,2,house,725000,129219294,Laid Back Family Living In An Idyllic Location,2018-11-06
2,-37.638387,144.922701,26 Greensted Grove,Roxburgh Park,3064,melbourne___northern_region,4,2,2,house,670000,129517974,Oversized & park-facing,2018-11-06
3,-38.121844,144.332476,80 Calvert Street,Hamlyn Heights,3215,bellarine_peninsula,3,2,1,house,600000,129640018,Cherished Family Home In Leafy Hamlyn Heights,2018-11-06
4,-38.129071,144.32364,295 Church Street,Herne Hill,3218,bellarine_peninsula,3,1,2,house,520000,129459426,Potential Plus,2018-11-06


##### We save the dataset.

In [28]:
# Write the cleaned frame to the current working directory. With to_csv's
# default index=True, the DataFrame index is saved as the first, unnamed column.
df.to_csv('victoria_house_cln.csv')