# Load Data

In [129]:
import numpy as np
import pandas as pd

# Read the Melbourne housing CSV into a DataFrame and preview its first five rows.
melbourne_df = pd.read_csv(filepath_or_buffer='dataset/melbourne.csv')
melbourne_df.head(n=5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,03-09-2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,03-12-2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,04-02-2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,04-02-2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,04-03-2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0


# Check data attributes

In [130]:
# Dimensions of the raw frame as (rows, columns).
print(melbourne_df.shape)
# info() writes its summary to stdout itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
melbourne_df.info()

(23547, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23547 entries, 0 to 23546
Data columns (total 21 columns):
Suburb           23547 non-null object
Address          23547 non-null object
Rooms            23547 non-null int64
Type             23547 non-null object
Price            18396 non-null float64
Method           23547 non-null object
SellerG          23547 non-null object
Date             23547 non-null object
Distance         23546 non-null float64
Postcode         23546 non-null float64
Bedroom2         19066 non-null float64
Bathroom         19063 non-null float64
Car              18921 non-null float64
Landsize         17410 non-null float64
BuildingArea     10018 non-null float64
YearBuilt        11540 non-null float64
CouncilArea      15656 non-null object
Lattitude        19243 non-null float64
Longtitude       19243 non-null float64
Regionname       23546 non-null object
Propertycount    23546 non-null float64
dtypes: float64(12), int64(1), object(8)
memory u

# Count the missing values in each column

In [131]:
# Number of missing values per column (summing the null mask down the rows).
melbourne_df.isnull().sum(axis=0)

Suburb               0
Address              0
Rooms                0
Type                 0
Price             5151
Method               0
SellerG              0
Date                 0
Distance             1
Postcode             1
Bedroom2          4481
Bathroom          4484
Car               4626
Landsize          6137
BuildingArea     13529
YearBuilt        12007
CouncilArea       7891
Lattitude         4304
Longtitude        4304
Regionname           1
Propertycount        1
dtype: int64

In [132]:
# Count the rows in which every single column is missing (expected: none).
melbourne_df.isnull().all(axis='columns').sum()

0

# Remove rows with more than 5 entries missing

# Check the percentage of missing values per column and drop the sparsest columns

In [133]:
# Percentage of missing values per column, rounded to two decimals.
(100 * melbourne_df.isnull().sum() / len(melbourne_df)).round(2)

Suburb            0.00
Address           0.00
Rooms             0.00
Type              0.00
Price            21.88
Method            0.00
SellerG           0.00
Date              0.00
Distance          0.00
Postcode          0.00
Bedroom2         19.03
Bathroom         19.04
Car              19.65
Landsize         26.06
BuildingArea     57.46
YearBuilt        50.99
CouncilArea      33.51
Lattitude        18.28
Longtitude       18.28
Regionname        0.00
Propertycount     0.00
dtype: float64

In [134]:
# Drop the three columns with the highest share of missing values in one call.
melbourne_df = melbourne_df.drop(['BuildingArea', 'YearBuilt', 'CouncilArea'], axis=1)

In [135]:
# Re-check the per-column missing percentage after dropping the sparse columns.
(100 * melbourne_df.isnull().sum() / len(melbourne_df)).round(2)

Suburb            0.00
Address           0.00
Rooms             0.00
Type              0.00
Price            21.88
Method            0.00
SellerG           0.00
Date              0.00
Distance          0.00
Postcode          0.00
Bedroom2         19.03
Bathroom         19.04
Car              19.65
Landsize         26.06
Lattitude        18.28
Longtitude       18.28
Regionname        0.00
Propertycount     0.00
dtype: float64

# Remove rows with more than 5 missing values

In [136]:
# Count the missing entries in each row and keep only rows with at most 5 of them.
missing_per_row = melbourne_df.isnull().sum(axis=1)
melbourne_df_cleaned = melbourne_df[missing_per_row.le(5)]

# Compare the shape before and after the filter.
print(melbourne_df.shape)
print(melbourne_df_cleaned.shape)

# The filter removes only a minority of the rows, so continue with the filtered frame.
melbourne_df = melbourne_df_cleaned

(23547, 18)
(19269, 18)


In [137]:
# Per-column missing percentage after removing the sparsest rows.
(100 * melbourne_df.isnull().sum() / len(melbourne_df)).round(2)

Suburb            0.00
Address           0.00
Rooms             0.00
Type              0.00
Price            21.71
Method            0.00
SellerG           0.00
Date              0.00
Distance          0.00
Postcode          0.00
Bedroom2          1.05
Bathroom          1.07
Car               1.81
Landsize          9.65
Lattitude         0.13
Longtitude        0.13
Regionname        0.00
Propertycount     0.00
dtype: float64

# Remove rows with a missing (NaN) price

In [138]:
# Boolean mask that is True wherever Price is missing.
rows_with_no_price = melbourne_df['Price'].isnull()

# Keep only the rows that do have a price.
melbourne_df = melbourne_df[~rows_with_no_price]

In [139]:
# Per-column missing percentage once the rows without a price are gone.
(100 * melbourne_df.isnull().sum() / len(melbourne_df)).round(2)

Suburb           0.00
Address          0.00
Rooms            0.00
Type             0.00
Price            0.00
Method           0.00
SellerG          0.00
Date             0.00
Distance         0.00
Postcode         0.00
Bedroom2         1.05
Bathroom         1.07
Car              1.76
Landsize         9.83
Lattitude        0.15
Longtitude       0.15
Regionname       0.00
Propertycount    0.00
dtype: float64

In [140]:
# Summary statistics of Landsize (count, mean, spread, quartiles, extremes).
melbourne_df.loc[:, 'Landsize'].describe()

count     13603.000000
mean        558.116371
std        3987.326586
min           0.000000
25%         176.500000
50%         440.000000
75%         651.000000
max      433014.000000
Name: Landsize, dtype: float64

In [141]:
# Boolean mask that is True wherever Landsize is missing.
rows_without_landsize = melbourne_df['Landsize'].isnull()

# Keep only rows with a Landsize value. The explicit .copy() makes the result an
# independent frame, so in-place writes to it (e.g. via .loc) do not raise
# SettingWithCopyWarning.
melbourne_df = melbourne_df[~rows_without_landsize].copy()

In [142]:
# Per-column missing percentage after dropping rows without Landsize.
(100 * melbourne_df.isnull().sum() / len(melbourne_df)).round(2)

Suburb           0.00
Address          0.00
Rooms            0.00
Type             0.00
Price            0.00
Method           0.00
SellerG          0.00
Date             0.00
Distance         0.00
Postcode         0.00
Bedroom2         0.00
Bathroom         0.01
Car              0.46
Landsize         0.00
Lattitude        0.16
Longtitude       0.16
Regionname       0.00
Propertycount    0.00
dtype: float64

In [143]:
# Summary statistics of the two coordinate columns.
melbourne_df[['Lattitude', 'Longtitude']].describe()

Unnamed: 0,Lattitude,Longtitude
count,13581.0,13581.0
mean,-37.809204,144.995221
std,0.079257,0.103913
min,-38.18255,144.43181
25%,-37.85682,144.9296
50%,-37.80236,145.0001
75%,-37.7564,145.05832
max,-37.40853,145.52635


In [144]:
# Fill missing coordinates with the mean of the respective column.
# fillna() with a per-column dict returns a new frame, which avoids writing in
# place into a frame that was produced by boolean filtering (that pattern raises
# SettingWithCopyWarning). Series.mean() skips NaN, so each mean is computed
# from the non-missing values only.
melbourne_df = melbourne_df.fillna({
    'Lattitude': melbourne_df['Lattitude'].mean(),
    'Longtitude': melbourne_df['Longtitude'].mean(),
})

In [147]:
# Per-column missing percentage after imputing the coordinates.
(100 * melbourne_df.isnull().sum() / len(melbourne_df)).round(2)

Suburb           0.00
Address          0.00
Rooms            0.00
Type             0.00
Price            0.00
Method           0.00
SellerG          0.00
Date             0.00
Distance         0.00
Postcode         0.00
Bedroom2         0.00
Bathroom         0.01
Car              0.46
Landsize         0.00
Lattitude        0.00
Longtitude       0.00
Regionname       0.00
Propertycount    0.00
dtype: float64

In [150]:
# Summary statistics of the two columns that still contain missing values.
melbourne_df[['Bathroom', 'Car']].describe()

Unnamed: 0,Bathroom,Car
count,13602.0,13540.0
mean,1.534921,1.610414
std,0.691834,0.962244
min,0.0,0.0
25%,1.0,1.0
50%,1.0,2.0
75%,2.0,2.0
max,8.0,10.0


In [154]:
# Frequency of each distinct Car value, most common first.
melbourne_df.loc[:, 'Car'].value_counts()

2.0     5606
1.0     5515
0.0     1026
3.0      748
4.0      507
5.0       63
6.0       54
8.0        9
7.0        8
10.0       3
9.0        1
Name: Car, dtype: int64

# The most common value of Car is 2, so let's impute the missing values with it.

In [163]:
# Impute missing Car values with the most frequent value of the column.
# The mode is computed from the data instead of being hard-coded; mode() returns
# its results in sorted order, so .iloc[0] picks the smallest value on a tie.
# fillna() returns a new frame, avoiding an in-place write into a filtered frame.
melbourne_df = melbourne_df.fillna({'Car': melbourne_df['Car'].mode().iloc[0]})

In [168]:
# Frequency of each distinct Car value after imputation.
melbourne_df['Car'].value_counts()

2.0     5669
1.0     5515
0.0     1026
3.0      748
4.0      507
5.0       63
6.0       54
8.0        9
7.0        8
10.0       3
9.0        1
Name: Car, dtype: int64

In [171]:
# Frequency of each distinct Bathroom value, most common first.
melbourne_df.loc[:, 'Bathroom'].value_counts()

1.0    7517
2.0    4987
3.0     921
4.0     106
0.0      34
5.0      28
6.0       5
8.0       2
7.0       2
Name: Bathroom, dtype: int64

In [175]:
# Impute missing Bathroom values with the most frequent value of the column.
# The mode is computed from the data instead of being hard-coded; mode() returns
# its results in sorted order, so .iloc[0] picks the smallest value on a tie.
# fillna() returns a new frame, avoiding an in-place write into a filtered frame.
melbourne_df = melbourne_df.fillna({'Bathroom': melbourne_df['Bathroom'].mode().iloc[0]})

In [179]:
# Summary statistics of Car and Bathroom after imputation.
melbourne_df[['Car', 'Bathroom']].describe()

Unnamed: 0,Car,Bathroom
count,13603.0,13603.0
mean,1.612218,1.534882
std,0.960377,0.691824
min,0.0,0.0
25%,1.0,1.0
50%,2.0,1.0
75%,2.0,2.0
max,10.0,8.0


In [201]:
# Scratch demo: exchange rows 2 and 3 of a small integer array.
a = np.array([[4, 3, 1], [5, 7, 0], [9, 9, 3], [8, 2, 4]])

print(a)
# Fancy indexing on the right-hand side builds a copy of both rows before the
# assignment happens, so the rows can be swapped in a single statement without
# a temporary array.
a[[2, 3]] = a[[3, 2]]
print(a)
# A 2x3 array of float zeros, shown as the cell's result.
np.zeros((2, 3))

[[4 3 1]
 [5 7 0]
 [9 9 3]
 [8 2 4]]
[[4 3 1]
 [5 7 0]
 [8 2 4]
 [9 9 3]]


array([[0., 0., 0.],
       [0., 0., 0.]])