# #Import Libraries

In [1]:
# !pip install scikit-learn==1.5.1

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# #Import Dataset

In [3]:
dataset = pd.read_csv('House_price.csv')
dataset

Unnamed: 0,HouseAge,Bedroom,FullBath,LotArea,Location,SalePrice
0,2003,3,2,8450,Urban,208500
1,1976,3,2,9600,SubUrban,181500
2,2001,3,2,11250,Rural,223500
3,1915,3,1,9550,Urban,140000
4,2000,4,2,14260,SubUrban,250000
...,...,...,...,...,...,...
1455,1999,3,2,7917,Urban,175000
1456,1978,3,2,13175,SubUrban,210000
1457,1941,4,2,9042,Urban,266500
1458,1950,2,1,9717,SubUrban,142125


# #Sanity Check

In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   HouseAge   1460 non-null   int64 
 1   Bedroom    1460 non-null   int64 
 2   FullBath   1460 non-null   int64 
 3   LotArea    1460 non-null   int64 
 4   Location   1460 non-null   object
 5   SalePrice  1460 non-null   int64 
dtypes: int64(5), object(1)
memory usage: 68.6+ KB


In [5]:
dataset.shape

(1460, 6)

In [6]:
dataset.columns

Index(['HouseAge', 'Bedroom', 'FullBath', 'LotArea', 'Location', 'SalePrice'], dtype='object')

In [7]:
dataset.head()

Unnamed: 0,HouseAge,Bedroom,FullBath,LotArea,Location,SalePrice
0,2003,3,2,8450,Urban,208500
1,1976,3,2,9600,SubUrban,181500
2,2001,3,2,11250,Rural,223500
3,1915,3,1,9550,Urban,140000
4,2000,4,2,14260,SubUrban,250000


In [8]:
dataset.tail()

Unnamed: 0,HouseAge,Bedroom,FullBath,LotArea,Location,SalePrice
1455,1999,3,2,7917,Urban,175000
1456,1978,3,2,13175,SubUrban,210000
1457,1941,4,2,9042,Urban,266500
1458,1950,2,1,9717,SubUrban,142125
1459,1965,3,1,9937,SubUrban,147500


In [9]:
dataset.describe()

Unnamed: 0,HouseAge,Bedroom,FullBath,LotArea,SalePrice
count,1460.0,1460.0,1460.0,1460.0,1460.0
mean,1971.267808,2.866438,1.565068,10516.828082,180921.19589
std,30.202904,0.815778,0.550916,9981.264932,79442.502883
min,1872.0,0.0,0.0,1300.0,34900.0
25%,1954.0,2.0,1.0,7553.5,129975.0
50%,1973.0,3.0,2.0,9478.5,163000.0
75%,2000.0,3.0,2.0,11601.5,214000.0
max,2010.0,8.0,3.0,215245.0,755000.0


In [10]:
dataset.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1455    False
1456    False
1457    False
1458    False
1459    False
Length: 1460, dtype: bool

In [11]:
dataset.drop_duplicates(inplace=True, ignore_index=True)
dataset

Unnamed: 0,HouseAge,Bedroom,FullBath,LotArea,Location,SalePrice
0,2003,3,2,8450,Urban,208500
1,1976,3,2,9600,SubUrban,181500
2,2001,3,2,11250,Rural,223500
3,1915,3,1,9550,Urban,140000
4,2000,4,2,14260,SubUrban,250000
...,...,...,...,...,...,...
1455,1999,3,2,7917,Urban,175000
1456,1978,3,2,13175,SubUrban,210000
1457,1941,4,2,9042,Urban,266500
1458,1950,2,1,9717,SubUrban,142125


In [12]:
dataset.isna().sum()

HouseAge     0
Bedroom      0
FullBath     0
LotArea      0
Location     0
SalePrice    0
dtype: int64

# #split the data into features and target

In [13]:
X = dataset.iloc[:,:-1]
X

Unnamed: 0,HouseAge,Bedroom,FullBath,LotArea,Location
0,2003,3,2,8450,Urban
1,1976,3,2,9600,SubUrban
2,2001,3,2,11250,Rural
3,1915,3,1,9550,Urban
4,2000,4,2,14260,SubUrban
...,...,...,...,...,...
1455,1999,3,2,7917,Urban
1456,1978,3,2,13175,SubUrban
1457,1941,4,2,9042,Urban
1458,1950,2,1,9717,SubUrban


In [14]:
y = dataset.iloc[:,[-1]]
y

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000
...,...
1455,175000
1456,210000
1457,266500
1458,142125


# #data preprocessing

In [15]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [16]:
blum = LabelEncoder()
Location = blum.fit_transform(X['Location'])

In [17]:
Location

array([2, 1, 0, ..., 2, 1, 1])

In [18]:
X['Location'] = Location
X

Unnamed: 0,HouseAge,Bedroom,FullBath,LotArea,Location
0,2003,3,2,8450,2
1,1976,3,2,9600,1
2,2001,3,2,11250,0
3,1915,3,1,9550,2
4,2000,4,2,14260,1
...,...,...,...,...,...
1455,1999,3,2,7917,2
1456,1978,3,2,13175,1
1457,1941,4,2,9042,2
1458,1950,2,1,9717,1


In [19]:
X.describe()

Unnamed: 0,HouseAge,Bedroom,FullBath,LotArea,Location
count,1460.0,1460.0,1460.0,1460.0,1460.0
mean,1971.267808,2.866438,1.565068,10516.828082,1.425342
std,30.202904,0.815778,0.550916,9981.264932,0.653384
min,1872.0,0.0,0.0,1300.0,0.0
25%,1954.0,2.0,1.0,7553.5,1.0
50%,1973.0,3.0,2.0,9478.5,2.0
75%,2000.0,3.0,2.0,11601.5,2.0
max,2010.0,8.0,3.0,215245.0,2.0


In [20]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

In [21]:
numeric_columns = ['HouseAge', 'Bedroom', 'FullBath', 'LotArea', 'Location']
transformer = MinMaxScaler()

In [22]:
preprocess = ColumnTransformer(
transformers=[
    ('numerical', transformer, numeric_columns)
])

In [23]:
# model
base_model = RandomForestRegressor( n_estimators=200, random_state=0)   

In [24]:
model = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', base_model)
])

# split the dataset into train and test set

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [27]:
X_train

Unnamed: 0,HouseAge,Bedroom,FullBath,LotArea,Location
64,1997,3,2,9375,1
682,1996,2,1,2887,2
960,1958,2,1,7207,2
1384,1939,2,1,9060,1
1100,1920,1,1,8400,2
...,...,...,...,...,...
763,1999,3,2,9430,1
835,1950,2,2,9600,1
1216,1978,4,2,8930,2
559,2003,2,2,3196,1


# #Training the Model

In [28]:
# from sklearn.ensemble import RandomForestRegressor

In [29]:
# model = RandomForestRegressor( n_estimators=200, random_state=0)   

In [30]:
model.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [31]:
prediction = model.predict(X_test)
prediction

array([265768.29      , 115744.66666667, 129692.415     , 176541.25      ,
        93328.98214286,  87945.89      , 308943.465     , 126016.5       ,
       500407.3       , 254977.        , 208004.6       , 194330.26666667,
       198417.75      , 136647.75      , 132517.56      , 143835.        ,
       200100.        , 151781.58333333, 135016.165     , 102951.48      ,
       121844.965     , 146587.5       , 129508.25      , 196537.        ,
       158636.5       , 264686.62      , 170099.5       ,  84598.25      ,
       336695.22      , 108854.895     , 151695.95833333, 195224.64      ,
       130604.17166667, 283107.1       , 269641.62      , 281638.05      ,
       185399.41666667, 130498.        , 288278.615     , 245976.8       ,
       146372.25      , 126942.96428571, 202356.265     , 238437.675     ,
       329918.435     , 183438.        , 121975.        , 142594.75      ,
       178563.19166667, 136060.625     , 238325.19      , 143504.75      ,
       167996.        , 1

# #Model Evaluation

In [32]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [33]:
r2 = r2_score(y_test, prediction)
mae = mean_absolute_error(y_test, prediction)
mse = mean_squared_error(y_test, prediction)

print('r2 score: ', r2)
print('mean absolute error: ', mae)
print('mean_square error: ', mse)

r2 score:  0.6317498357323758
mean absolute error:  33193.49806071972
mean_square error:  2500047213.3461466


# #Save the Model

In [34]:
import pickle

In [35]:
pickle_out = open('HousePredict.pkl', 'wb')
pickle.dump(model, pickle_out)
pickle_out.close

<function BufferedWriter.close>

In [36]:
import sklearn
print(sklearn.__version__)

1.5.1
