In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split



In [2]:
#Reading data
# NOTE(review): this is an absolute path that looks like a Google Drive mount
# inside Colab - adjust it when running the notebook anywhere else.
path = '/content/drive/MyDrive/Colab Notebooks/Google Collab ML data repository/Melbourne_dataset/melb_data.csv'

# Load the whole CSV into a DataFrame; later cells read it through `data`.
data = pd.read_csv(filepath_or_buffer=path)


In [3]:
#Overviewing data
# A notebook cell only renders the value of its LAST expression, so bare
# `data.head()` / `data.describe()` calls earlier in the cell are computed and
# then silently thrown away. `display` (provided by the notebook kernel) renders
# each of them explicitly.
display(data.head())       # first rows, to eyeball the columns and values
display(data.describe())   # summary statistics of the numeric columns
data.info()                # prints column dtypes and non-null counts itself


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [4]:
#Dividing data into training and testing sets
# Target is the Price column; every other column stays in the feature frame.
y = data['Price']
X = data.drop(columns=['Price'])

# 80% train / 20% test, with random_state pinned so the split is repeatable.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)


In [5]:
#Selecting categorical columns with low cardinality
# Keep a column when it has fewer than 10 distinct values in the training
# split and its dtype is object ('O').
categorical_col = [
    name
    for name in X_train_full.columns
    if X_train_full[name].nunique() < 10 and X_train_full[name].dtype == 'O'
]
print(categorical_col)

['Type', 'Method', 'Regionname']


In [6]:
#Selecting numerical columns
# Keep every training column whose dtype is int64 or float64.
numerical_col = [
    name
    for name in X_train_full.columns
    if X_train_full[name].dtype in ('int64', 'float64')
]
print(numerical_col)

['Rooms', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude', 'Propertycount']


In [7]:
#Keeping only the selected columns
# Numerical columns first, followed by the low-cardinality categorical ones.
my_col = [*numerical_col, *categorical_col]

# Take copies restricted to the chosen columns.
# Note: X_test_full is rebound here to its reduced version (the name is reused
# rather than introducing a separate X_test), and the prediction cell reads it.
X_train, X_test_full = X_train_full[my_col].copy(), X_test_full[my_col].copy()

X_train.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount,Type,Method,Regionname
12167,1,5.0,3182.0,1.0,1.0,1.0,0.0,,1940.0,-37.85984,144.9867,13240.0,u,S,Southern Metropolitan
6524,2,8.0,3016.0,2.0,2.0,1.0,193.0,,,-37.858,144.9005,6380.0,h,SA,Western Metropolitan
8413,3,12.6,3020.0,3.0,1.0,1.0,555.0,,,-37.7988,144.822,3755.0,h,S,Western Metropolitan
2919,3,13.0,3046.0,3.0,1.0,1.0,265.0,,1995.0,-37.7083,144.9158,8870.0,u,SP,Northern Metropolitan
6043,3,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0,h,S,Western Metropolitan


Pipeline


In [8]:
#Step 1: Defining preprocessing steps
#Numerical columns are only imputed
#Categorical columns are imputed and then one-hot encoded

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# strategy='constant' is used without an explicit fill_value; per the
# scikit-learn documentation the fill then defaults to 0 for numerical data.
numerical_transformer = SimpleImputer(strategy='constant')

# Two stages for categorical columns: replace missing entries with the most
# frequent value, then one-hot encode. handle_unknown='ignore' is passed so
# categories not seen during fit do not raise at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

#Bundle the numerical and categorical preprocessors, each applied to its own column list
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_col),
    ('cat', categorical_transformer, categorical_col),
])




In [9]:
#Step 2: Defining the model
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees; random_state is pinned for repeatability.
model = RandomForestRegressor(random_state=0, n_estimators=100)

In [10]:
#Step 3: Creating the pipeline
# Chains the preprocessing bundle and the model into one estimator, so a single
# fit/predict call runs both stages. Evaluation happens in later cells.
my_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

In [11]:
#Fitting the pipeline and predicting
# fit() returns the fitted pipeline itself, so predict() can be chained on it:
# the pipeline is trained on the training split, then used to predict for the
# held-out rows in X_test_full.
preds = my_pipeline.fit(X_train, y_train).predict(X_test_full)



In [12]:
#Score = mean absolute error
from sklearn.metrics import mean_absolute_error

# Compare the held-out targets with the pipeline's predictions.
score = mean_absolute_error(y_true=y_test, y_pred=preds)
print('Mean_Absolute_Error:', score)

Mean_Absolute_Error: 160679.18917034855
