#Building an Algorithm which can predict the House price based on Multiple factor

Importing the Libraries needed for this Notebook

In [339]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import scipy.stats as stats
from scipy.stats import kurtosis , skew

#ML algorithm

from sklearn.preprocessing import MinMaxScaler,StandardScaler,RobustScaler
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.metrics import mean_squared_error , mean_absolute_error , r2_score , mean_absolute_percentage_error , accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

#style
sns.set_style("darkgrid")
plt.rcParams["figure.figsize"] = (10,6)
plt.rcParams["font.size"] = 14

#color
from termcolor import colored

In [340]:
df = pd.read_csv("Melbourne_housing_FULL.csv")

In [341]:
df

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.80140,144.99580,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.79960,144.99840,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.80790,144.99340,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.81140,145.01160,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.80930,144.99440,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34852,Yarraville,13 Burns St,4,h,1480000.0,PI,Jas,24/02/2018,6.3,3013.0,...,1.0,3.0,593.0,,,Maribyrnong City Council,-37.81053,144.88467,Western Metropolitan,6543.0
34853,Yarraville,29A Murray St,2,h,888000.0,SP,Sweeney,24/02/2018,6.3,3013.0,...,2.0,1.0,98.0,104.0,2018.0,Maribyrnong City Council,-37.81551,144.88826,Western Metropolitan,6543.0
34854,Yarraville,147A Severn St,2,t,705000.0,S,Jas,24/02/2018,6.3,3013.0,...,1.0,2.0,220.0,120.0,2000.0,Maribyrnong City Council,-37.82286,144.87856,Western Metropolitan,6543.0
34855,Yarraville,12/37 Stephen St,3,h,1140000.0,SP,hockingstuart,24/02/2018,6.3,3013.0,...,,,,,,Maribyrnong City Council,,,Western Metropolitan,6543.0


We can check which all columns are numeric and categorical

In [342]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34857 entries, 0 to 34856
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         34857 non-null  object 
 1   Address        34857 non-null  object 
 2   Rooms          34857 non-null  int64  
 3   Type           34857 non-null  object 
 4   Price          27247 non-null  float64
 5   Method         34857 non-null  object 
 6   SellerG        34857 non-null  object 
 7   Date           34857 non-null  object 
 8   Distance       34856 non-null  float64
 9   Postcode       34856 non-null  float64
 10  Bedroom2       26640 non-null  float64
 11  Bathroom       26631 non-null  float64
 12  Car            26129 non-null  float64
 13  Landsize       23047 non-null  float64
 14  BuildingArea   13742 non-null  float64
 15  YearBuilt      15551 non-null  float64
 16  CouncilArea    34854 non-null  object 
 17  Lattitude      26881 non-null  float64
 18  Longti

We can check what are the maximum and minimum value for the numeric column

In [343]:
df.describe().loc[["min","max"]]

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.19043,144.42379,83.0
max,16.0,11200000.0,48.1,3978.0,30.0,12.0,26.0,433014.0,44515.0,2106.0,-37.3902,145.52635,21650.0


We want to rename the name Bedroom2 to Bedroom

In [344]:
df.rename(columns={"Bedroom2" : "Bedroom"},inplace = True)

In [345]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34857 entries, 0 to 34856
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         34857 non-null  object 
 1   Address        34857 non-null  object 
 2   Rooms          34857 non-null  int64  
 3   Type           34857 non-null  object 
 4   Price          27247 non-null  float64
 5   Method         34857 non-null  object 
 6   SellerG        34857 non-null  object 
 7   Date           34857 non-null  object 
 8   Distance       34856 non-null  float64
 9   Postcode       34856 non-null  float64
 10  Bedroom        26640 non-null  float64
 11  Bathroom       26631 non-null  float64
 12  Car            26129 non-null  float64
 13  Landsize       23047 non-null  float64
 14  BuildingArea   13742 non-null  float64
 15  YearBuilt      15551 non-null  float64
 16  CouncilArea    34854 non-null  object 
 17  Lattitude      26881 non-null  float64
 18  Longti

The column names of Latitude and Longitude are wrong

In [346]:

df.rename(columns = {"Lattitude" : "Latitude","Longtitude" : "Longitude"},inplace = True)

Now we are going to create a function missing_values which consist of column name , missing_number , missing_percent

In [347]:
def missing_values(df):
  null_val = df.isna().sum()
  missing_values_df = pd.DataFrame({"column" : null_val.index , "missing_number" : null_val.values , "missing_percent" :((null_val.values) / len(df[null_val.index])*100)})
  return missing_values_df[missing_values_df["missing_number"]>0]

In [348]:
def first_looking(df):
  print("Shape : " , "\n" , df.shape , "\n" ,
      "*" *100, "\n",
        "Info : "  ),"\n",
  print(df.info() , "\n" ,
        "*" *100, "\n" ,
        "No of unique values : " ),"\n",
  print(df.nunique() , "\n" ,
        "*" * 100 , "\n" ,
        "Missing Value Chart : " ) , "\n",
  print(missing_values(df) , "\n" ,
        "*" *100 , "\n" ,
        "Displaying all columns : ") , "\n",
  print(df.columns , "\n" , "*" *100)

In [349]:
first_looking(df)

Shape :  
 (34857, 21) 
 **************************************************************************************************** 
 Info : 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34857 entries, 0 to 34856
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         34857 non-null  object 
 1   Address        34857 non-null  object 
 2   Rooms          34857 non-null  int64  
 3   Type           34857 non-null  object 
 4   Price          27247 non-null  float64
 5   Method         34857 non-null  object 
 6   SellerG        34857 non-null  object 
 7   Date           34857 non-null  object 
 8   Distance       34856 non-null  float64
 9   Postcode       34856 non-null  float64
 10  Bedroom        26640 non-null  float64
 11  Bathroom       26631 non-null  float64
 12  Car            26129 non-null  float64
 13  Landsize       23047 non-null  float64
 14  BuildingArea   13742 non-null  float64
 15  Ye

We need to make the target column at last

In [350]:
df = df[[i for i in df.columns if i!= "Price"] +["Price"]]

We are assigning the dataframe to a new dataframe for assurity

In [351]:
raw_df = df

In [352]:
df = raw_df

In [353]:
df

Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom,...,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Latitude,Longitude,Regionname,Propertycount,Price
0,Abbotsford,68 Studley St,2,h,SS,Jellis,3/09/2016,2.5,3067.0,2.0,...,1.0,126.0,,,Yarra City Council,-37.80140,144.99580,Northern Metropolitan,4019.0,
1,Abbotsford,85 Turner St,2,h,S,Biggin,3/12/2016,2.5,3067.0,2.0,...,1.0,202.0,,,Yarra City Council,-37.79960,144.99840,Northern Metropolitan,4019.0,1480000.0
2,Abbotsford,25 Bloomburg St,2,h,S,Biggin,4/02/2016,2.5,3067.0,2.0,...,0.0,156.0,79.0,1900.0,Yarra City Council,-37.80790,144.99340,Northern Metropolitan,4019.0,1035000.0
3,Abbotsford,18/659 Victoria St,3,u,VB,Rounds,4/02/2016,2.5,3067.0,3.0,...,1.0,0.0,,,Yarra City Council,-37.81140,145.01160,Northern Metropolitan,4019.0,
4,Abbotsford,5 Charles St,3,h,SP,Biggin,4/03/2017,2.5,3067.0,3.0,...,0.0,134.0,150.0,1900.0,Yarra City Council,-37.80930,144.99440,Northern Metropolitan,4019.0,1465000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34852,Yarraville,13 Burns St,4,h,PI,Jas,24/02/2018,6.3,3013.0,4.0,...,3.0,593.0,,,Maribyrnong City Council,-37.81053,144.88467,Western Metropolitan,6543.0,1480000.0
34853,Yarraville,29A Murray St,2,h,SP,Sweeney,24/02/2018,6.3,3013.0,2.0,...,1.0,98.0,104.0,2018.0,Maribyrnong City Council,-37.81551,144.88826,Western Metropolitan,6543.0,888000.0
34854,Yarraville,147A Severn St,2,t,S,Jas,24/02/2018,6.3,3013.0,2.0,...,2.0,220.0,120.0,2000.0,Maribyrnong City Council,-37.82286,144.87856,Western Metropolitan,6543.0,705000.0
34855,Yarraville,12/37 Stephen St,3,h,SP,hockingstuart,24/02/2018,6.3,3013.0,,...,,,,,Maribyrnong City Council,,,Western Metropolitan,6543.0,1140000.0


In [354]:
df.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Method', 'SellerG', 'Date',
       'Distance', 'Postcode', 'Bedroom', 'Bathroom', 'Car', 'Landsize',
       'BuildingArea', 'YearBuilt', 'CouncilArea', 'Latitude', 'Longitude',
       'Regionname', 'Propertycount', 'Price'],
      dtype='object')

In [355]:
df.columns = df.columns.str.replace(r'([a-z])([A-Z])' , r'\1_\2').str.lower()


The default value of regex will change from True to False in a future version.



In [356]:
df

Unnamed: 0,suburb,address,rooms,type,method,seller_g,date,distance,postcode,bedroom,...,car,landsize,building_area,year_built,council_area,latitude,longitude,regionname,propertycount,price
0,Abbotsford,68 Studley St,2,h,SS,Jellis,3/09/2016,2.5,3067.0,2.0,...,1.0,126.0,,,Yarra City Council,-37.80140,144.99580,Northern Metropolitan,4019.0,
1,Abbotsford,85 Turner St,2,h,S,Biggin,3/12/2016,2.5,3067.0,2.0,...,1.0,202.0,,,Yarra City Council,-37.79960,144.99840,Northern Metropolitan,4019.0,1480000.0
2,Abbotsford,25 Bloomburg St,2,h,S,Biggin,4/02/2016,2.5,3067.0,2.0,...,0.0,156.0,79.0,1900.0,Yarra City Council,-37.80790,144.99340,Northern Metropolitan,4019.0,1035000.0
3,Abbotsford,18/659 Victoria St,3,u,VB,Rounds,4/02/2016,2.5,3067.0,3.0,...,1.0,0.0,,,Yarra City Council,-37.81140,145.01160,Northern Metropolitan,4019.0,
4,Abbotsford,5 Charles St,3,h,SP,Biggin,4/03/2017,2.5,3067.0,3.0,...,0.0,134.0,150.0,1900.0,Yarra City Council,-37.80930,144.99440,Northern Metropolitan,4019.0,1465000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34852,Yarraville,13 Burns St,4,h,PI,Jas,24/02/2018,6.3,3013.0,4.0,...,3.0,593.0,,,Maribyrnong City Council,-37.81053,144.88467,Western Metropolitan,6543.0,1480000.0
34853,Yarraville,29A Murray St,2,h,SP,Sweeney,24/02/2018,6.3,3013.0,2.0,...,1.0,98.0,104.0,2018.0,Maribyrnong City Council,-37.81551,144.88826,Western Metropolitan,6543.0,888000.0
34854,Yarraville,147A Severn St,2,t,S,Jas,24/02/2018,6.3,3013.0,2.0,...,2.0,220.0,120.0,2000.0,Maribyrnong City Council,-37.82286,144.87856,Western Metropolitan,6543.0,705000.0
34855,Yarraville,12/37 Stephen St,3,h,SP,hockingstuart,24/02/2018,6.3,3013.0,,...,,,,,Maribyrnong City Council,,,Western Metropolitan,6543.0,1140000.0


In [357]:
def display(df):
  for i in df.columns[:] :
    print("*" * 100)
    print("column name : " , i) , "\n" ,
    print("No of Nan values in the column : " , df[i].isna().sum()),"\n",
    print("What percentage of column consist of Nan values : " , (df[i].isna().sum() / df.isna().sum().sum())*100) , "\n" ,
    print("Value counts for the column : " , "\n" , df[i].value_counts())


# Filling Null values based on the Double stagic using scikit learn

Now we want to know the skewness and kurtosis inorder to use median or more to fill the Null values

In [358]:
def visualize_data(df,column):
  fig = px.histogram(df[column], marginal = "box" , title=f'Histogram of {column}')
  skewness = df[column].skew()
  kurtosis = df[column].kurtosis()
  annotations = [
      dict(x=skewness, y= 1.0 , xref = "x" , yref = "paper" , text = f'Skewness :{skewness:.2f}' , showarrow = True , arrowhead = 2,ax = 0),
      dict(x=kurtosis , y = 0.9, xref = "x" , yref = "paper" , text = f'Kurtosis :{kurtosis:.2f}',showarrow = True , arrowhead = 2,ax=0)]
  fig.update_layout(annotations = annotations)
  fig.show()

We need to create a function which wil briefly say about the column so we can fill the Null values

In [359]:
def first_look(df,column):
  print("Column name : ",column) , "\n"
  print("*" *100) , "\n" ,
  print("Percentage_of_null : ",(df[column].isna().sum()/len(df[column]))*100) , "\n" ,
  print("*" *100) , "\n" ,
  print("Number_of_uniques : ",df[column].nunique()) , "\n" ,
  print("*" *100) , "\n" ,
  print("shape_of_df : ",df.shape) , "\n" ,
  print("*" *100) , "\n" ,
  print("The correlation of particular column : ", "\n" ,df.corr()[column].sort_values(ascending=False)) , "\n" ,
  print("*" *100) , "\n" ,
  print("Value count of the column : ","\n" , df[column].value_counts(dropna=False).sort_values(ascending=False)) , "\n" ,

We are going to do double staging process inorder to fill the null values more accurately

In [360]:
#from sklearn.impute import SimpleImputer
def fill(df,col1,col2,maincol):
  imputer = SimpleImputer(strategy="median")
  df[maincol] = df.groupby([col1 , col2])[maincol].transform(lambda x : x.fillna(x.median()))
  df[maincol] = imputer.fit_transform(df[maincol].values.reshape(-1,1))
  return df[maincol]

#Column Name : Price

In [361]:
first_look(df,"price")





Column name :  price
****************************************************************************************************
Percentage_of_null :  21.83205668875692
****************************************************************************************************
Number_of_uniques :  2871
****************************************************************************************************
shape_of_df :  (34857, 21)
****************************************************************************************************
The correlation of particular column :  
 price            1.000000
rooms            0.465238
bedroom          0.430275
bathroom         0.429878
car              0.201803
longitude        0.197874
building_area    0.100754
postcode         0.044950
landsize         0.032748
propertycount   -0.059017
distance        -0.211384
latitude        -0.215607
year_built      -0.333306
Name: price, dtype: float64
**************************************************************************

In [362]:
df.type.value_counts()

h    23980
u     7297
t     3580
Name: type, dtype: int64

In [363]:
visualize_data(df,"price")

Now we are going to fill the null value using double staging process

In [364]:
fill(df,"rooms","type","price")


Mean of empty slice


Mean of empty slice


Mean of empty slice



0         941000.0
1        1480000.0
2        1035000.0
3         777000.0
4        1465000.0
           ...    
34852    1480000.0
34853     888000.0
34854     705000.0
34855    1140000.0
34856    1020000.0
Name: price, Length: 34857, dtype: float64

We are going to check whether there is any null value in the column price

In [365]:
first_look(df,"price")

Column name :  price
****************************************************************************************************
Percentage_of_null :  0.0
****************************************************************************************************
Number_of_uniques :  2872
****************************************************************************************************
shape_of_df :  (34857, 21)
****************************************************************************************************
The correlation of particular column :  
 price            1.000000
rooms            0.490779
bedroom          0.448058
bathroom         0.416476
car              0.209168
longitude        0.178372
building_area    0.112273
postcode         0.045174
landsize         0.034631
propertycount   -0.057061
distance        -0.157549
latitude        -0.185791
year_built      -0.290315
Name: price, dtype: float64
****************************************************************************************





yeah , now it contains 0 null values

Column name : building_area

In [366]:
first_look(df,"building_area")





Column name :  building_area
****************************************************************************************************
Percentage_of_null :  60.576067934704646
****************************************************************************************************
Number_of_uniques :  740
****************************************************************************************************
shape_of_df :  (34857, 21)
****************************************************************************************************
The correlation of particular column :  
 building_area    1.000000
landsize         0.354530
rooms            0.156229
bedroom          0.154157
bathroom         0.147558
price            0.112273
car              0.104373
distance         0.076301
year_built       0.067811
postcode         0.042437
latitude         0.017155
longitude       -0.002143
propertycount   -0.024523
Name: building_area, dtype: float64
**********************************************************

In [367]:
fill(df,"rooms","price","building_area")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of em

0         98.0
1        132.0
2         79.0
3        119.0
4        150.0
         ...  
34852    180.0
34853    104.0
34854    120.0
34855    121.0
34856    103.0
Name: building_area, Length: 34857, dtype: float64

In [368]:
first_look(df,"building_area")

Column name :  building_area
****************************************************************************************************
Percentage_of_null :  0.0
****************************************************************************************************
Number_of_uniques :  934
****************************************************************************************************
shape_of_df :  (34857, 21)
****************************************************************************************************
The correlation of particular column :  
 building_area    1.000000
rooms            0.190530
bedroom          0.168417
bathroom         0.150144
price            0.134039
car              0.098846
landsize         0.088883
distance         0.068298
year_built       0.041128
postcode         0.037433
latitude         0.009605
longitude        0.007541
propertycount   -0.023238
Name: building_area, dtype: float64
*************************************************************************





We have filled the null values for the column building area

#column : year_built

In [369]:
first_look(df,"year_built")

Column name :  year_built
****************************************************************************************************
Percentage_of_null :  55.38629256677281
****************************************************************************************************
Number_of_uniques :  160
****************************************************************************************************
shape_of_df :  (34857, 21)
****************************************************************************************************
The correlation of particular column :  
 year_built       1.000000
distance         0.323059
bathroom         0.167955
car              0.128702
latitude         0.091592
postcode         0.089805
landsize         0.044474
building_area    0.041128
propertycount    0.022420
bedroom         -0.002022
rooms           -0.012749
longitude       -0.022175
price           -0.290315
Name: year_built, dtype: float64
*****************************************************************





In [370]:
fill(df,"suburb","distance","year_built")


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice



0        1922.5
1        1922.5
2        1900.0
3        1922.5
4        1900.0
          ...  
34852    1950.0
34853    2018.0
34854    2000.0
34855    1950.0
34856    1930.0
Name: year_built, Length: 34857, dtype: float64

Column name : landsize


```



In [371]:
first_look(df,"landsize")

Column name :  landsize
****************************************************************************************************
Percentage_of_null :  33.88128639871475
****************************************************************************************************
Number_of_uniques :  1684
****************************************************************************************************
shape_of_df :  (34857, 21)
****************************************************************************************************
The correlation of particular column :  
 landsize         1.000000
building_area    0.088883
distance         0.060862
postcode         0.040664
car              0.037829
rooms            0.037402
bedroom          0.037019
bathroom         0.036333
price            0.034631
latitude         0.025318
year_built       0.011190
longitude       -0.002582
propertycount   -0.018195
Name: landsize, dtype: float64
********************************************************************





In [372]:
fill(df,"rooms","building_area","landsize")


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of e

0        126.0
1        202.0
2        156.0
3          0.0
4        134.0
         ...  
34852    593.0
34853     98.0
34854    220.0
34855    538.0
34856    250.0
Name: landsize, Length: 34857, dtype: float64

#column name : car

In [373]:
first_look(df,"car")





Column name :  car
****************************************************************************************************
Percentage_of_null :  25.03944688297903
****************************************************************************************************
Number_of_uniques :  15
****************************************************************************************************
shape_of_df :  (34857, 21)
****************************************************************************************************
The correlation of particular column :  
 car              1.000000
rooms            0.393878
bedroom          0.388491
bathroom         0.307518
distance         0.241835
price            0.209168
year_built       0.134169
building_area    0.098846
postcode         0.067886
longitude        0.047213
landsize         0.038384
latitude        -0.009020
propertycount   -0.009617
Name: car, dtype: float64
********************************************************************************

In [374]:
fill(df,"building_area","landsize","car")


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of e

0        1.0
1        1.0
2        0.0
3        1.0
4        0.0
        ... 
34852    3.0
34853    1.0
34854    2.0
34855    2.0
34856    0.0
Name: car, Length: 34857, dtype: float64

#Column name : bathroom

In [375]:
first_look(df,"bathroom")

Column name :  bathroom
****************************************************************************************************
Percentage_of_null :  23.599277046217402
****************************************************************************************************
Number_of_uniques :  11
****************************************************************************************************
shape_of_df :  (34857, 21)
****************************************************************************************************
The correlation of particular column :  
 bathroom         1.000000
bedroom          0.614892
rooms            0.611826
price            0.416476
car              0.305145
year_built       0.152378
building_area    0.150144
distance         0.126201
postcode         0.120080
longitude        0.106531
landsize         0.037623
propertycount   -0.032887
latitude        -0.059183
Name: bathroom, dtype: float64
*********************************************************************





In [376]:
fill(df,"rooms","price","bathroom")


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of e

0        1.0
1        1.0
2        1.0
3        2.0
4        2.0
        ... 
34852    1.0
34853    2.0
34854    1.0
34855    1.0
34856    1.0
Name: bathroom, Length: 34857, dtype: float64

#column name : bedroom

In [377]:
first_look(df,"bedroom")

Column name :  bedroom
****************************************************************************************************
Percentage_of_null :  23.573457268267493
****************************************************************************************************
Number_of_uniques :  15
****************************************************************************************************
shape_of_df :  (34857, 21)
****************************************************************************************************
The correlation of particular column :  
 bedroom          1.000000
rooms            0.946755
bathroom         0.614815
price            0.448058
car              0.386026
distance         0.269524
building_area    0.168417
longitude        0.106164
postcode         0.089292
year_built       0.050611
landsize         0.040602
latitude         0.003447
propertycount   -0.053451
Name: bedroom, dtype: float64
***********************************************************************





In [378]:
fill(df,"rooms","bathroom","bedroom")


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice



0        2.0
1        2.0
2        2.0
3        3.0
4        3.0
        ... 
34852    4.0
34853    2.0
34854    2.0
34855    3.0
34856    2.0
Name: bedroom, Length: 34857, dtype: float64

#Column name : propertycount :

In [379]:
first_look(df,"propertycount")

Column name :  propertycount
****************************************************************************************************
Percentage_of_null :  0.008606592649969878
****************************************************************************************************
Number_of_uniques :  342
****************************************************************************************************
shape_of_df :  (34857, 21)
****************************************************************************************************
The correlation of particular column :  
 propertycount    1.000000
year_built       0.039522
postcode         0.017108
longitude        0.016326
latitude         0.011112
landsize        -0.017908
distance        -0.018140
car             -0.023142
building_area   -0.023238
bathroom        -0.043666
price           -0.057061
rooms           -0.071677
bedroom         -0.071910
Name: propertycount, dtype: float64
********************************************************





In [380]:
fill(df,"suburb","postcode","propertycount")

0        4019.0
1        4019.0
2        4019.0
3        4019.0
4        4019.0
          ...  
34852    6543.0
34853    6543.0
34854    6543.0
34855    6543.0
34856    6543.0
Name: propertycount, Length: 34857, dtype: float64