<a href="https://colab.research.google.com/github/Rabiul-ds/Melbourne-housing-market-price-prediction/blob/main/DA_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Duplicate Removal
2. Missing Value Treatment
3. Outlier Detection & Handling
4. Feature Scaling (if needed)
5. Perform EDA:
   - Summary Statistics
   - Visualizations
   - Correlation Analysis
   - Interpretation

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Location of the raw Melbourne housing CSV in the project's GitHub repository.
url = 'https://raw.githubusercontent.com/Rabiul-ds/Melbourne-housing-market-price-prediction/refs/heads/main/Melbourne_housing_FULL.csv'

# Read the CSV straight from the URL into the working DataFrame.
df = pd.read_csv(filepath_or_buffer=url, encoding='utf-8')

# Print column names, non-null counts and dtypes as a first look at the data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34857 entries, 0 to 34856
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         34857 non-null  object 
 1   Address        34857 non-null  object 
 2   Rooms          34857 non-null  int64  
 3   Type           34857 non-null  object 
 4   Price          27247 non-null  float64
 5   Method         34857 non-null  object 
 6   SellerG        34857 non-null  object 
 7   Date           34857 non-null  object 
 8   Distance       34856 non-null  float64
 9   Postcode       34856 non-null  float64
 10  Bedroom2       26640 non-null  float64
 11  Bathroom       26631 non-null  float64
 12  Car            26129 non-null  float64
 13  Landsize       23047 non-null  float64
 14  BuildingArea   13742 non-null  float64
 15  YearBuilt      15551 non-null  float64
 16  CouncilArea    34854 non-null  object 
 17  Lattitude      26881 non-null  float64
 18  Longti

# **Part A: Data Preprocessing & EDA**

In [None]:
# Collect the names of all object-dtype columns ...
columns_to_convert = df.select_dtypes(include='object').columns

# ... and cast each of them to the pandas categorical dtype in one astype call.
df = df.astype({name: 'category' for name in columns_to_convert})

# Preview the first rows after the dtype change.
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


## Duplicate Removal

In [None]:
# Display every row that is an exact repeat of an earlier row
# (the first occurrence of each duplicated row is not flagged).
df.loc[df.duplicated(keep='first')]

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
15858,Nunawading,1/7 Lilian St,3,t,,SP,Jellis,17/06/2017,15.4,3131.0,...,3.0,2.0,405.0,226.0,2000.0,Manningham City Council,-37.82678,145.16777,Eastern Metropolitan,4973.0


In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# The original row labels are kept (the index is not reset).
df = df.drop_duplicates(keep='first')

## Missing Value Treatment

In [None]:
# Number of missing (NaN) entries in each column.
df.isnull().sum(axis=0)

Unnamed: 0,0
Suburb,0
Address,0
Rooms,0
Type,0
Price,7609
Method,0
SellerG,0
Date,0
Distance,1
Postcode,1


In [None]:
# Correlation analysis: pairwise Pearson correlation between the numeric columns.
# The matrix is symmetric, so the transpose shows the same values.
df.corr(method='pearson', numeric_only=True).transpose()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
Rooms,1.0,0.465238,0.271513,0.08589,0.946755,0.611875,0.39388,0.037402,0.156231,-0.012743,0.004871,0.103244,-0.071678
Price,0.465238,1.0,-0.211384,0.04495,0.430275,0.429878,0.201803,0.032748,0.100754,-0.333306,-0.215607,0.197874,-0.059017
Distance,0.271513,-0.211384,1.0,0.481567,0.269528,0.126168,0.241831,0.060864,0.076295,0.323035,-0.100414,0.200923,-0.01813
Postcode,0.08589,0.04495,0.481567,1.0,0.089292,0.120079,0.067885,0.040665,0.042436,0.0898,-0.231026,0.362901,0.017111
Bedroom2,0.946755,0.430275,0.269528,0.089292,1.0,0.61494,0.388492,0.037019,0.154159,-0.002017,0.003447,0.106172,-0.053453
Bathroom,0.611875,0.429878,0.126168,0.120079,0.61494,1.0,0.30752,0.036341,0.147554,0.167869,-0.059174,0.106444,-0.032847
Car,0.39388,0.201803,0.241831,0.067885,0.388492,0.30752,1.0,0.03783,0.104371,0.128688,-0.009018,0.0472,-0.009611
Landsize,0.037402,0.032748,0.060864,0.040665,0.037019,0.036341,0.03783,1.0,0.354532,0.044484,0.025317,-0.002579,-0.018197
BuildingArea,0.156231,0.100754,0.076295,0.042436,0.154159,0.147554,0.104371,0.354532,1.0,0.067758,0.017158,-0.00216,-0.024516
YearBuilt,-0.012743,-0.333306,0.323035,0.0898,-0.002017,0.167869,0.128688,0.044484,0.067758,1.0,0.091606,-0.022261,0.022456


In [None]:
# Correlation of each numeric column with the target column, Price.
df.corrwith(df["Price"], axis=0, numeric_only=True)

Unnamed: 0,0
Rooms,0.465238
Price,1.0
Distance,-0.211384
Postcode,0.04495
Bedroom2,0.430275
Bathroom,0.429878
Car,0.201803
Landsize,0.032748
BuildingArea,0.100754
YearBuilt,-0.333306


In [None]:
# Drop the features judged to have too weak a correlation with Price.
# Address is NOT dropped here; it is still present in the frame afterwards.
# NOTE(review): in the correlation output Lattitude (-0.22) and Longtitude (0.20)
# correlate with Price about as strongly as Distance (-0.21), which is kept —
# confirm that dropping them is intended.
df = df.drop(
    columns=[
        "Postcode",
        "Landsize",
        "BuildingArea",
        "Lattitude",
        "Longtitude",
        "Propertycount",
    ]
)

In [None]:
# Re-check the per-column missing-value counts after dropping columns.
df.isnull().sum(axis=0)

Unnamed: 0,0
Suburb,0
Address,0
Rooms,0
Type,0
Price,7609
Method,0
SellerG,0
Date,0
Distance,1
Bedroom2,8217


In [None]:
# Fill the missing values of Bathroom and Car with each column's mode
# (mode()[0] picks the first value when several are tied).
#
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on df[col]: that form acts on an intermediate
# object, and pandas warns it will never update df from pandas 3.0 onward.
df['Bathroom'] = df['Bathroom'].fillna(df['Bathroom'].mode()[0])
df['Car'] = df['Car'].fillna(df['Car'].mode()[0])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Bathroom'].fillna(df['Bathroom'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Car'].fillna(df['Car'].mode()[0], inplace=True)


In [None]:
# Re-check the per-column missing-value counts after the mode imputation.
df.isnull().sum(axis=0)

Unnamed: 0,0
Suburb,0
Address,0
Rooms,0
Type,0
Price,7609
Method,0
SellerG,0
Date,0
Distance,1
Bedroom2,8217


In [None]:
# For every row, the median Bedroom2 among rows sharing the same Rooms value.
bedroom_median_by_rooms = df.groupby('Rooms')['Bedroom2'].transform('median')

# Use that group median wherever Bedroom2 is missing, then round the whole
# column to the nearest whole number.
df['Bedroom2'] = df['Bedroom2'].fillna(bedroom_median_by_rooms).round()

# Confirm the remaining per-column missing-value counts.
df.isna().sum()

Unnamed: 0,0
Suburb,0
Address,0
Rooms,0
Type,0
Price,7609
Method,0
SellerG,0
Date,0
Distance,1
Bedroom2,0
