In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('./page_in_pages.csv')

In [3]:
df.head()

Unnamed: 0,date,price,address,bedrooms,bathrooms,parking,area,median_price,population,average_age
0,29-Apr-24,480000,"1307/60 A'beckett Street, Melbourne VIC 3000",2 Beds,1 Bath,− Parking,53m²,$550k,47279,20 to 39
1,27-Mar-24,815000,"1103/108 Flinders Street, Melbourne VIC 3000",2 Beds,2 Baths,1 Parking,93m²,$550k,47279,20 to 39
2,29-Feb-24,3350000,"3801/35 Spring Street, Melbourne VIC 3000",3 Beds,2 Baths,2 Parking,175m²,$1.072m,47279,20 to 39
3,29-Feb-24,3350000,"3801/35 Spring Street, Melbourne VIC 3000",3 Beds,2 Baths,2 Parking,175m²,$1.072m,47279,20 to 39
4,ction 8 Dec 2023,590000,"508/181 Exhibition Street, Melbourne VIC 3000",2 Beds,1 Bath,1 Parking,80m²,$550k,47279,20 to 39


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80596 entries, 0 to 80595
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          80450 non-null  object
 1   price         80360 non-null  object
 2   address       80596 non-null  object
 3   bedrooms      80592 non-null  object
 4   bathrooms     75452 non-null  object
 5   parking       75452 non-null  object
 6   area          74408 non-null  object
 7   median_price  77286 non-null  object
 8   population    77286 non-null  object
 9   average_age   77286 non-null  object
dtypes: object(10)
memory usage: 6.1+ MB


In [5]:
df.isna().sum()

date             146
price            236
address            0
bedrooms           4
bathrooms       5144
parking         5144
area            6188
median_price    3310
population      3310
average_age     3310
dtype: int64

## Price

In [6]:
df['price'][1]

'815000'

In [7]:
for i in range(0,10):
    print(df['price'][i])

480000
815000
3350000
3350000
590000
318000
285000
1180000
450000
925000


In [8]:
df = df.dropna(subset=['price'])

In [9]:
df.isna().sum()

date             146
price              0
address            0
bedrooms           4
bathrooms       5140
parking         5140
area            6184
median_price    3306
population      3306
average_age     3306
dtype: int64

In [10]:
df = df.dropna()

In [11]:
df.isna().sum()

date            0
price           0
address         0
bedrooms        0
bathrooms       0
parking         0
area            0
median_price    0
population      0
average_age     0
dtype: int64

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 71010 entries, 0 to 80595
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          71010 non-null  object
 1   price         71010 non-null  object
 2   address       71010 non-null  object
 3   bedrooms      71010 non-null  object
 4   bathrooms     71010 non-null  object
 5   parking       71010 non-null  object
 6   area          71010 non-null  object
 7   median_price  71010 non-null  object
 8   population    71010 non-null  object
 9   average_age   71010 non-null  object
dtypes: object(10)
memory usage: 6.0+ MB


In [13]:
df = df[df['price'] != 'rice Withheld']

In [14]:
df['price'] = df['price'].str.replace(',','')

In [15]:
df['price']

0         480000
1         815000
2        3350000
3        3350000
4         590000
          ...   
80590     720000
80591     617000
80592     718000
80594     588000
80595     582500
Name: price, Length: 71002, dtype: object

In [16]:
df['price'] = df['price'].astype(int)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 71002 entries, 0 to 80595
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          71002 non-null  object
 1   price         71002 non-null  int64 
 2   address       71002 non-null  object
 3   bedrooms      71002 non-null  object
 4   bathrooms     71002 non-null  object
 5   parking       71002 non-null  object
 6   area          71002 non-null  object
 7   median_price  71002 non-null  object
 8   population    71002 non-null  object
 9   average_age   71002 non-null  object
dtypes: int64(1), object(9)
memory usage: 6.0+ MB


## Address

In [18]:
df.head()

Unnamed: 0,date,price,address,bedrooms,bathrooms,parking,area,median_price,population,average_age
0,29-Apr-24,480000,"1307/60 A'beckett Street, Melbourne VIC 3000",2 Beds,1 Bath,− Parking,53m²,$550k,47279,20 to 39
1,27-Mar-24,815000,"1103/108 Flinders Street, Melbourne VIC 3000",2 Beds,2 Baths,1 Parking,93m²,$550k,47279,20 to 39
2,29-Feb-24,3350000,"3801/35 Spring Street, Melbourne VIC 3000",3 Beds,2 Baths,2 Parking,175m²,$1.072m,47279,20 to 39
3,29-Feb-24,3350000,"3801/35 Spring Street, Melbourne VIC 3000",3 Beds,2 Baths,2 Parking,175m²,$1.072m,47279,20 to 39
4,ction 8 Dec 2023,590000,"508/181 Exhibition Street, Melbourne VIC 3000",2 Beds,1 Bath,1 Parking,80m²,$550k,47279,20 to 39


In [19]:
df['zipcode'] = df['address'].str[-4:]

In [20]:
df

Unnamed: 0,date,price,address,bedrooms,bathrooms,parking,area,median_price,population,average_age,zipcode
0,29-Apr-24,480000,"1307/60 A'beckett Street, Melbourne VIC 3000",2 Beds,1 Bath,− Parking,53m²,$550k,47279,20 to 39,3000
1,27-Mar-24,815000,"1103/108 Flinders Street, Melbourne VIC 3000",2 Beds,2 Baths,1 Parking,93m²,$550k,47279,20 to 39,3000
2,29-Feb-24,3350000,"3801/35 Spring Street, Melbourne VIC 3000",3 Beds,2 Baths,2 Parking,175m²,$1.072m,47279,20 to 39,3000
3,29-Feb-24,3350000,"3801/35 Spring Street, Melbourne VIC 3000",3 Beds,2 Baths,2 Parking,175m²,$1.072m,47279,20 to 39,3000
4,ction 8 Dec 2023,590000,"508/181 Exhibition Street, Melbourne VIC 3000",2 Beds,1 Bath,1 Parking,80m²,$550k,47279,20 to 39,3000
...,...,...,...,...,...,...,...,...,...,...,...
80590,1 Sep 2023,720000,"126 Old Forcett Road, Forcett TAS 7173",4 Beds,1 Bath,10 Parking,8012.78m²,-,964,40 to 59,7173
80591,6 Aug 2023,617000,"30 Sandy Point Avenue, Dodges Ferry TAS 7173",3 Beds,1 Bath,6 Parking,"1,429m²",$653k,2474,40 to 59,7173
80592,31 Aug 2023,718000,"87 Grevillea Street, Primrose Sands TAS 7173",3 Beds,2 Baths,3 Parking,612m²,$465k,1054,40 to 59,7173
80594,5 Oct 2023,588000,"15 Brady Street, Midway Point TAS 7171",3 Beds,2 Baths,2 Parking,756m²,$630k,2861,20 to 39,7171


In [21]:
df['state'] = df['address'].str[-8:-5]

In [22]:
df

Unnamed: 0,date,price,address,bedrooms,bathrooms,parking,area,median_price,population,average_age,zipcode,state
0,29-Apr-24,480000,"1307/60 A'beckett Street, Melbourne VIC 3000",2 Beds,1 Bath,− Parking,53m²,$550k,47279,20 to 39,3000,VIC
1,27-Mar-24,815000,"1103/108 Flinders Street, Melbourne VIC 3000",2 Beds,2 Baths,1 Parking,93m²,$550k,47279,20 to 39,3000,VIC
2,29-Feb-24,3350000,"3801/35 Spring Street, Melbourne VIC 3000",3 Beds,2 Baths,2 Parking,175m²,$1.072m,47279,20 to 39,3000,VIC
3,29-Feb-24,3350000,"3801/35 Spring Street, Melbourne VIC 3000",3 Beds,2 Baths,2 Parking,175m²,$1.072m,47279,20 to 39,3000,VIC
4,ction 8 Dec 2023,590000,"508/181 Exhibition Street, Melbourne VIC 3000",2 Beds,1 Bath,1 Parking,80m²,$550k,47279,20 to 39,3000,VIC
...,...,...,...,...,...,...,...,...,...,...,...,...
80590,1 Sep 2023,720000,"126 Old Forcett Road, Forcett TAS 7173",4 Beds,1 Bath,10 Parking,8012.78m²,-,964,40 to 59,7173,TAS
80591,6 Aug 2023,617000,"30 Sandy Point Avenue, Dodges Ferry TAS 7173",3 Beds,1 Bath,6 Parking,"1,429m²",$653k,2474,40 to 59,7173,TAS
80592,31 Aug 2023,718000,"87 Grevillea Street, Primrose Sands TAS 7173",3 Beds,2 Baths,3 Parking,612m²,$465k,1054,40 to 59,7173,TAS
80594,5 Oct 2023,588000,"15 Brady Street, Midway Point TAS 7171",3 Beds,2 Baths,2 Parking,756m²,$630k,2861,20 to 39,7171,TAS


In [23]:
df['address'] = df['address'].str[:-8]

In [24]:
df

Unnamed: 0,date,price,address,bedrooms,bathrooms,parking,area,median_price,population,average_age,zipcode,state
0,29-Apr-24,480000,"1307/60 A'beckett Street, Melbourne",2 Beds,1 Bath,− Parking,53m²,$550k,47279,20 to 39,3000,VIC
1,27-Mar-24,815000,"1103/108 Flinders Street, Melbourne",2 Beds,2 Baths,1 Parking,93m²,$550k,47279,20 to 39,3000,VIC
2,29-Feb-24,3350000,"3801/35 Spring Street, Melbourne",3 Beds,2 Baths,2 Parking,175m²,$1.072m,47279,20 to 39,3000,VIC
3,29-Feb-24,3350000,"3801/35 Spring Street, Melbourne",3 Beds,2 Baths,2 Parking,175m²,$1.072m,47279,20 to 39,3000,VIC
4,ction 8 Dec 2023,590000,"508/181 Exhibition Street, Melbourne",2 Beds,1 Bath,1 Parking,80m²,$550k,47279,20 to 39,3000,VIC
...,...,...,...,...,...,...,...,...,...,...,...,...
80590,1 Sep 2023,720000,"126 Old Forcett Road, Forcett",4 Beds,1 Bath,10 Parking,8012.78m²,-,964,40 to 59,7173,TAS
80591,6 Aug 2023,617000,"30 Sandy Point Avenue, Dodges Ferry",3 Beds,1 Bath,6 Parking,"1,429m²",$653k,2474,40 to 59,7173,TAS
80592,31 Aug 2023,718000,"87 Grevillea Street, Primrose Sands",3 Beds,2 Baths,3 Parking,612m²,$465k,1054,40 to 59,7173,TAS
80594,5 Oct 2023,588000,"15 Brady Street, Midway Point",3 Beds,2 Baths,2 Parking,756m²,$630k,2861,20 to 39,7171,TAS


## Bedrooms

In [25]:
df = df[~df['bedrooms'].str.contains('m²',na=False)]
df = df[~df['bedrooms'].str.contains('ha',na=False)]
df['bedrooms']  = df['bedrooms'].str.replace('− 0 Beds','0')
df['bedrooms'] = df['bedrooms'].str.replace('Beds','')
df['bedrooms'] = df['bedrooms'].str.replace('Bed','')

In [26]:
df['bedrooms'] = df['bedrooms'].astype(int)

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 71002 entries, 0 to 80595
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          71002 non-null  object
 1   price         71002 non-null  int64 
 2   address       71002 non-null  object
 3   bedrooms      71002 non-null  int64 
 4   bathrooms     71002 non-null  object
 5   parking       71002 non-null  object
 6   area          71002 non-null  object
 7   median_price  71002 non-null  object
 8   population    71002 non-null  object
 9   average_age   71002 non-null  object
 10  zipcode       71002 non-null  object
 11  state         71002 non-null  object
dtypes: int64(2), object(10)
memory usage: 7.0+ MB


In [28]:
df.isna().sum()

date            0
price           0
address         0
bedrooms        0
bathrooms       0
parking         0
area            0
median_price    0
population      0
average_age     0
zipcode         0
state           0
dtype: int64

## Bathrooms

In [29]:
df['bathrooms'] = df['bathrooms'].str.replace('− 0 Baths','0')
df['bathrooms'] = df['bathrooms'].str.replace('Baths','')
df['bathrooms'] = df['bathrooms'].str.replace('Bath','')

In [30]:
df['bathrooms'] = df['bathrooms'].astype(int)

## Parking

In [31]:
df['parking'] = df['parking'].str.replace('Parking','')
df['parking'] = df['parking'].str.replace('parking','')
df['parking'] = df['parking'].str.replace('−','0')
df['parking'] = df['parking'].str.replace(',','')

In [32]:
df['parking'] = df['parking'].astype(int)

In [33]:
df.head()

Unnamed: 0,date,price,address,bedrooms,bathrooms,parking,area,median_price,population,average_age,zipcode,state
0,29-Apr-24,480000,"1307/60 A'beckett Street, Melbourne",2,1,0,53m²,$550k,47279,20 to 39,3000,VIC
1,27-Mar-24,815000,"1103/108 Flinders Street, Melbourne",2,2,1,93m²,$550k,47279,20 to 39,3000,VIC
2,29-Feb-24,3350000,"3801/35 Spring Street, Melbourne",3,2,2,175m²,$1.072m,47279,20 to 39,3000,VIC
3,29-Feb-24,3350000,"3801/35 Spring Street, Melbourne",3,2,2,175m²,$1.072m,47279,20 to 39,3000,VIC
4,ction 8 Dec 2023,590000,"508/181 Exhibition Street, Melbourne",2,1,1,80m²,$550k,47279,20 to 39,3000,VIC


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 71002 entries, 0 to 80595
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          71002 non-null  object
 1   price         71002 non-null  int64 
 2   address       71002 non-null  object
 3   bedrooms      71002 non-null  int64 
 4   bathrooms     71002 non-null  int64 
 5   parking       71002 non-null  int64 
 6   area          71002 non-null  object
 7   median_price  71002 non-null  object
 8   population    71002 non-null  object
 9   average_age   71002 non-null  object
 10  zipcode       71002 non-null  object
 11  state         71002 non-null  object
dtypes: int64(4), object(8)
memory usage: 7.0+ MB


## Area

In [35]:
df['area'] = df['area'].str.replace('m²','')
df['area'] = df['area'].str.replace(',','')
df['area'] = df['area'].str.replace('ha','')

In [36]:
df['area'] = df['area'].astype(float)

## Median Price

In [37]:
df['median_price'] = df['median_price'].str.replace('$','')
df['median_price'] = df['median_price'].str.replace('m','000000')
df['median_price'] = df['median_price'].str.replace('k','000')
df['median_price'] = df['median_price'].str.replace('.','')
df = df[~df['median_price'].str.contains('-', na=False)]

In [38]:
df['median_price'] = df['median_price'].astype(int)

## Population

In [39]:
df['population'] = df['population'].astype(int)

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54851 entries, 0 to 80595
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          54851 non-null  object 
 1   price         54851 non-null  int64  
 2   address       54851 non-null  object 
 3   bedrooms      54851 non-null  int64  
 4   bathrooms     54851 non-null  int64  
 5   parking       54851 non-null  int64  
 6   area          54851 non-null  float64
 7   median_price  54851 non-null  int64  
 8   population    54851 non-null  int64  
 9   average_age   54851 non-null  object 
 10  zipcode       54851 non-null  object 
 11  state         54851 non-null  object 
dtypes: float64(1), int64(6), object(5)
memory usage: 5.4+ MB


## Data Preprocessing

In [41]:
from sklearn.preprocessing import LabelEncoder
agee_encoder=  LabelEncoder()
df['average_age'] = age_encoder.fit_transform(df['average_age'])

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54851 entries, 0 to 80595
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          54851 non-null  object 
 1   price         54851 non-null  int64  
 2   address       54851 non-null  object 
 3   bedrooms      54851 non-null  int64  
 4   bathrooms     54851 non-null  int64  
 5   parking       54851 non-null  int64  
 6   area          54851 non-null  float64
 7   median_price  54851 non-null  int64  
 8   population    54851 non-null  int64  
 9   average_age   54851 non-null  int64  
 10  zipcode       54851 non-null  object 
 11  state         54851 non-null  object 
dtypes: float64(1), int64(7), object(4)
memory usage: 5.4+ MB


In [43]:
df.corr(numeric_only=True)

Unnamed: 0,price,bedrooms,bathrooms,parking,area,median_price,population,average_age
price,1.0,0.309295,0.42608,0.011766,0.002356,0.558588,0.13106,-0.08119
bedrooms,0.309295,1.0,0.507762,0.035953,0.017322,0.248636,0.03239,0.134987
bathrooms,0.42608,0.507762,1.0,0.019326,0.012059,0.258647,0.0458,0.023539
parking,0.011766,0.035953,0.019326,1.0,0.002687,0.003022,-0.008667,0.004558
area,0.002356,0.017322,0.012059,0.002687,1.0,-0.007693,-0.005291,0.01096
median_price,0.558588,0.248636,0.258647,0.003022,-0.007693,1.0,0.095193,-0.112824
population,0.13106,0.03239,0.0458,-0.008667,-0.005291,0.095193,1.0,-0.120237
average_age,-0.08119,0.134987,0.023539,0.004558,0.01096,-0.112824,-0.120237,1.0


In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54851 entries, 0 to 80595
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          54851 non-null  object 
 1   price         54851 non-null  int64  
 2   address       54851 non-null  object 
 3   bedrooms      54851 non-null  int64  
 4   bathrooms     54851 non-null  int64  
 5   parking       54851 non-null  int64  
 6   area          54851 non-null  float64
 7   median_price  54851 non-null  int64  
 8   population    54851 non-null  int64  
 9   average_age   54851 non-null  int64  
 10  zipcode       54851 non-null  object 
 11  state         54851 non-null  object 
dtypes: float64(1), int64(7), object(4)
memory usage: 5.4+ MB


In [45]:
df.head()

Unnamed: 0,date,price,address,bedrooms,bathrooms,parking,area,median_price,population,average_age,zipcode,state
0,29-Apr-24,480000,"1307/60 A'beckett Street, Melbourne",2,1,0,53.0,5501000,47279,0,3000,VIC
1,27-Mar-24,815000,"1103/108 Flinders Street, Melbourne",2,2,1,93.0,5501000,47279,0,3000,VIC
2,29-Feb-24,3350000,"3801/35 Spring Street, Melbourne",3,2,2,175.0,10721000000,47279,0,3000,VIC
3,29-Feb-24,3350000,"3801/35 Spring Street, Melbourne",3,2,2,175.0,10721000000,47279,0,3000,VIC
4,ction 8 Dec 2023,590000,"508/181 Exhibition Street, Melbourne",2,1,1,80.0,5501000,47279,0,3000,VIC


In [46]:
df['state'] = df['state'].str.replace('VIC', 'Victoria')
df['state'] = df['state'].str.replace('WA', 'Western Australia')
df['state'] = df['state'].str.replace('SA', 'Southern Australia')
df['state'] = df['state'].str.replace('QLD', 'Queensland')
df['state'] = df['state'].str.replace('TAS', 'Tasmania')
df['state'] = df['state'].str.replace('NT', 'Northern Territory')
df['state'] = df['state'].str.replace('NSW', 'New South Wales')

In [47]:
df.head()

Unnamed: 0,date,price,address,bedrooms,bathrooms,parking,area,median_price,population,average_age,zipcode,state
0,29-Apr-24,480000,"1307/60 A'beckett Street, Melbourne",2,1,0,53.0,5501000,47279,0,3000,Victoria
1,27-Mar-24,815000,"1103/108 Flinders Street, Melbourne",2,2,1,93.0,5501000,47279,0,3000,Victoria
2,29-Feb-24,3350000,"3801/35 Spring Street, Melbourne",3,2,2,175.0,10721000000,47279,0,3000,Victoria
3,29-Feb-24,3350000,"3801/35 Spring Street, Melbourne",3,2,2,175.0,10721000000,47279,0,3000,Victoria
4,ction 8 Dec 2023,590000,"508/181 Exhibition Street, Melbourne",2,1,1,80.0,5501000,47279,0,3000,Victoria


In [48]:
encoder_state = LabelEncoder()
df['state'] = encoder_state.fit_transform(df['state'])

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54851 entries, 0 to 80595
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          54851 non-null  object 
 1   price         54851 non-null  int64  
 2   address       54851 non-null  object 
 3   bedrooms      54851 non-null  int64  
 4   bathrooms     54851 non-null  int64  
 5   parking       54851 non-null  int64  
 6   area          54851 non-null  float64
 7   median_price  54851 non-null  int64  
 8   population    54851 non-null  int64  
 9   average_age   54851 non-null  int64  
 10  zipcode       54851 non-null  object 
 11  state         54851 non-null  int64  
dtypes: float64(1), int64(8), object(3)
memory usage: 5.4+ MB


In [50]:
df['zipcode'] = df['zipcode'].astype(int)

In [51]:
df.head()

Unnamed: 0,date,price,address,bedrooms,bathrooms,parking,area,median_price,population,average_age,zipcode,state
0,29-Apr-24,480000,"1307/60 A'beckett Street, Melbourne",2,1,0,53.0,5501000,47279,0,3000,6
1,27-Mar-24,815000,"1103/108 Flinders Street, Melbourne",2,2,1,93.0,5501000,47279,0,3000,6
2,29-Feb-24,3350000,"3801/35 Spring Street, Melbourne",3,2,2,175.0,10721000000,47279,0,3000,6
3,29-Feb-24,3350000,"3801/35 Spring Street, Melbourne",3,2,2,175.0,10721000000,47279,0,3000,6
4,ction 8 Dec 2023,590000,"508/181 Exhibition Street, Melbourne",2,1,1,80.0,5501000,47279,0,3000,6


In [52]:
df['date'] = df['date'].str.replace('Jan','01')
df['date'] = df['date'].str.replace('Feb','02')
df['date'] = df['date'].str.replace('Mar','03')
df['date'] = df['date'].str.replace('Apr','04')
df['date'] = df['date'].str.replace('May','05')
df['date'] = df['date'].str.replace('Jun','06')
df['date'] = df['date'].str.replace('Jul','07')
df['date'] = df['date'].str.replace('Aug','08')
df['date'] = df['date'].str.replace('Sep','09')
df['date'] = df['date'].str.replace('Oct','10')
df['date'] = df['date'].str.replace('Nov','11')
df['date'] = df['date'].str.replace('Dec','12')

In [53]:
df.head()

Unnamed: 0,date,price,address,bedrooms,bathrooms,parking,area,median_price,population,average_age,zipcode,state
0,29-04-24,480000,"1307/60 A'beckett Street, Melbourne",2,1,0,53.0,5501000,47279,0,3000,6
1,27-03-24,815000,"1103/108 Flinders Street, Melbourne",2,2,1,93.0,5501000,47279,0,3000,6
2,29-02-24,3350000,"3801/35 Spring Street, Melbourne",3,2,2,175.0,10721000000,47279,0,3000,6
3,29-02-24,3350000,"3801/35 Spring Street, Melbourne",3,2,2,175.0,10721000000,47279,0,3000,6
4,ction 8 12 2023,590000,"508/181 Exhibition Street, Melbourne",2,1,1,80.0,5501000,47279,0,3000,6


In [54]:
df['date'] = df['date'].str.replace('ction','')

In [55]:
df.head()

Unnamed: 0,date,price,address,bedrooms,bathrooms,parking,area,median_price,population,average_age,zipcode,state
0,29-04-24,480000,"1307/60 A'beckett Street, Melbourne",2,1,0,53.0,5501000,47279,0,3000,6
1,27-03-24,815000,"1103/108 Flinders Street, Melbourne",2,2,1,93.0,5501000,47279,0,3000,6
2,29-02-24,3350000,"3801/35 Spring Street, Melbourne",3,2,2,175.0,10721000000,47279,0,3000,6
3,29-02-24,3350000,"3801/35 Spring Street, Melbourne",3,2,2,175.0,10721000000,47279,0,3000,6
4,8 12 2023,590000,"508/181 Exhibition Street, Melbourne",2,1,1,80.0,5501000,47279,0,3000,6


In [56]:
# df['date'] = df['date'].str.replace(' ','-')

In [57]:
df.head()

Unnamed: 0,date,price,address,bedrooms,bathrooms,parking,area,median_price,population,average_age,zipcode,state
0,29-04-24,480000,"1307/60 A'beckett Street, Melbourne",2,1,0,53.0,5501000,47279,0,3000,6
1,27-03-24,815000,"1103/108 Flinders Street, Melbourne",2,2,1,93.0,5501000,47279,0,3000,6
2,29-02-24,3350000,"3801/35 Spring Street, Melbourne",3,2,2,175.0,10721000000,47279,0,3000,6
3,29-02-24,3350000,"3801/35 Spring Street, Melbourne",3,2,2,175.0,10721000000,47279,0,3000,6
4,8 12 2023,590000,"508/181 Exhibition Street, Melbourne",2,1,1,80.0,5501000,47279,0,3000,6


In [58]:
df.tail()

Unnamed: 0,date,price,address,bedrooms,bathrooms,parking,area,median_price,population,average_age,zipcode,state
80587,11 09 2023,625000,"12 Boundary Street, Midway Point",3,1,6,1052.0,6301000,2861,0,7171,5
80591,6 08 2023,617000,"30 Sandy Point Avenue, Dodges Ferry",3,1,6,1429.0,6531000,2474,1,7173,5
80592,31 08 2023,718000,"87 Grevillea Street, Primrose Sands",3,2,3,612.0,4651000,1054,1,7173,5
80594,5 10 2023,588000,"15 Brady Street, Midway Point",3,2,2,756.0,6301000,2861,0,7171,5
80595,26 08 2023,582500,"5 Kuneeamee Street, Dodges Ferry",3,1,2,635.0,6531000,2474,1,7173,5


In [59]:
df['date'] = df['date'].str.replace('ction','')

In [60]:
df['date'][30931]

'09-07-20'

In [61]:
df['date'] = df['date'].str.replace('2023', '23')

In [62]:
df.date

0        29-04-24
1        27-03-24
2        29-02-24
3        29-02-24
4         8 12 23
           ...   
80587    11 09 23
80591     6 08 23
80592    31 08 23
80594     5 10 23
80595    26 08 23
Name: date, Length: 54851, dtype: object

In [63]:
df.sample(5)

Unnamed: 0,date,price,address,bedrooms,bathrooms,parking,area,median_price,population,average_age,zipcode,state
22126,07-12-23,380000,"10 The Centre Way, Mildura",3,1,2,454.0,4171000,32739,0,3500,6
31525,30-09-20,1352000,"3 Thomond Street, Hurstville",3,2,1,550.0,1551000000,29829,0,2220,3
70582,15 01 2021,435000,"55 Carruthers Crescent, Gillen",3,2,1,831.0,4501000,4261,0,870,0
5831,05-04-22,524500,"505/9 Watt Street, Newcastle",1,1,0,61.0,6301000,2765,0,2300,3
68336,29 12 2021,475000,"3 Derrinding Way, Kununurra",4,2,2,756.0,5891000,5315,0,6743,2


In [64]:
df['date'] = df['date'].str.replace('-', ' ')

In [65]:
df.date

0        29 04 24
1        27 03 24
2        29 02 24
3        29 02 24
4         8 12 23
           ...   
80587    11 09 23
80591     6 08 23
80592    31 08 23
80594     5 10 23
80595    26 08 23
Name: date, Length: 54851, dtype: object

In [66]:
df['date'] = df['date'].str.replace('2024', '24')
df['date'] = df['date'].str.replace('2022', '22')
df['date'] = df['date'].str.replace('2021', '21')
df['date'] = df['date'].str.replace('2020', '20')
df['date'] = df['date'].str.replace('2019', '19')
df['date'] = df['date'].str.replace('2018', '18')
df['date'] = df['date'].str.replace('2017', '17')
df['date'] = df['date'].str.replace('2016', '16')

In [67]:
df.date

0        29 04 24
1        27 03 24
2        29 02 24
3        29 02 24
4         8 12 23
           ...   
80587    11 09 23
80591     6 08 23
80592    31 08 23
80594     5 10 23
80595    26 08 23
Name: date, Length: 54851, dtype: object

In [81]:
df.to_csv('./cleaned_data.csv')

In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54851 entries, 0 to 80595
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          54851 non-null  object 
 1   price         54851 non-null  int64  
 2   address       54851 non-null  object 
 3   bedrooms      54851 non-null  int64  
 4   bathrooms     54851 non-null  int64  
 5   parking       54851 non-null  int64  
 6   area          54851 non-null  float64
 7   median_price  54851 non-null  int64  
 8   population    54851 non-null  int64  
 9   average_age   54851 non-null  int64  
 10  zipcode       54851 non-null  int64  
 11  state         54851 non-null  int64  
dtypes: float64(1), int64(9), object(2)
memory usage: 7.5+ MB


In [70]:
df.corr(numeric_only=True)

Unnamed: 0,price,bedrooms,bathrooms,parking,area,median_price,population,average_age,zipcode,state
price,1.0,0.309295,0.42608,0.011766,0.002356,0.558588,0.13106,-0.08119,-0.299358,0.107915
bedrooms,0.309295,1.0,0.507762,0.035953,0.017322,0.248636,0.03239,0.134987,0.007798,-0.002308
bathrooms,0.42608,0.507762,1.0,0.019326,0.012059,0.258647,0.0458,0.023539,-0.111766,-0.051442
parking,0.011766,0.035953,0.019326,1.0,0.002687,0.003022,-0.008667,0.004558,0.019774,-0.003075
area,0.002356,0.017322,0.012059,0.002687,1.0,-0.007693,-0.005291,0.01096,0.00274,-0.007574
median_price,0.558588,0.248636,0.258647,0.003022,-0.007693,1.0,0.095193,-0.112824,-0.261424,0.104438
population,0.13106,0.03239,0.0458,-0.008667,-0.005291,0.095193,1.0,-0.120237,-0.375967,0.28127
average_age,-0.08119,0.134987,0.023539,0.004558,0.01096,-0.112824,-0.120237,1.0,0.102529,0.019996
zipcode,-0.299358,0.007798,-0.111766,0.019774,0.00274,-0.261424,-0.375967,0.102529,1.0,-0.014033
state,0.107915,-0.002308,-0.051442,-0.003075,-0.007574,0.104438,0.28127,0.019996,-0.014033,1.0


In [71]:
df.head()

Unnamed: 0,date,price,address,bedrooms,bathrooms,parking,area,median_price,population,average_age,zipcode,state
0,29 04 24,480000,"1307/60 A'beckett Street, Melbourne",2,1,0,53.0,5501000,47279,0,3000,6
1,27 03 24,815000,"1103/108 Flinders Street, Melbourne",2,2,1,93.0,5501000,47279,0,3000,6
2,29 02 24,3350000,"3801/35 Spring Street, Melbourne",3,2,2,175.0,10721000000,47279,0,3000,6
3,29 02 24,3350000,"3801/35 Spring Street, Melbourne",3,2,2,175.0,10721000000,47279,0,3000,6
4,8 12 23,590000,"508/181 Exhibition Street, Melbourne",2,1,1,80.0,5501000,47279,0,3000,6


In [75]:
df.date

0        29 04 24
1        27 03 24
2        29 02 24
3        29 02 24
4         8 12 23
           ...   
80587    11 09 23
80591     6 08 23
80592    31 08 23
80594     5 10 23
80595    26 08 23
Name: date, Length: 54851, dtype: object

In [76]:
df['date'] = df['date'].str.replace(r'(\d{2}) (\d{2}) (\d{2})', r'\1-\2-\3')

In [77]:
df.date

0        29 04 24
1        27 03 24
2        29 02 24
3        29 02 24
4         8 12 23
           ...   
80587    11 09 23
80591     6 08 23
80592    31 08 23
80594     5 10 23
80595    26 08 23
Name: date, Length: 54851, dtype: object

In [78]:
df.head()

Unnamed: 0,date,price,address,bedrooms,bathrooms,parking,area,median_price,population,average_age,zipcode,state
0,29 04 24,480000,"1307/60 A'beckett Street, Melbourne",2,1,0,53.0,5501000,47279,0,3000,6
1,27 03 24,815000,"1103/108 Flinders Street, Melbourne",2,2,1,93.0,5501000,47279,0,3000,6
2,29 02 24,3350000,"3801/35 Spring Street, Melbourne",3,2,2,175.0,10721000000,47279,0,3000,6
3,29 02 24,3350000,"3801/35 Spring Street, Melbourne",3,2,2,175.0,10721000000,47279,0,3000,6
4,8 12 23,590000,"508/181 Exhibition Street, Melbourne",2,1,1,80.0,5501000,47279,0,3000,6


In [79]:
df['state'] = encoder_state.inverse_transform(df['state'])
df['average_age'] = age_encoder.inverse_transform(df['average_age'])

In [80]:
df.head()

Unnamed: 0,date,price,address,bedrooms,bathrooms,parking,area,median_price,population,average_age,zipcode,state
0,29 04 24,480000,"1307/60 A'beckett Street, Melbourne",2,1,0,53.0,5501000,47279,0,3000,Victoria
1,27 03 24,815000,"1103/108 Flinders Street, Melbourne",2,2,1,93.0,5501000,47279,0,3000,Victoria
2,29 02 24,3350000,"3801/35 Spring Street, Melbourne",3,2,2,175.0,10721000000,47279,0,3000,Victoria
3,29 02 24,3350000,"3801/35 Spring Street, Melbourne",3,2,2,175.0,10721000000,47279,0,3000,Victoria
4,8 12 23,590000,"508/181 Exhibition Street, Melbourne",2,1,1,80.0,5501000,47279,0,3000,Victoria


In [None]:
df.to_csv('./cleaned_data.csv')