# House Price Prediction Using Linear Regression and KNN

# Importing Libraries 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error , r2_score

In [2]:
df = pd.read_csv('../input/pakistan-house-price-prediction/Entities.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168446 entries, 0 to 168445
Data columns (total 18 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Unnamed: 0     168446 non-null  int64  
 1   property_id    168446 non-null  int64  
 2   location_id    168446 non-null  int64  
 3   page_url       168446 non-null  object 
 4   property_type  168446 non-null  object 
 5   price          168446 non-null  int64  
 6   location       168446 non-null  object 
 7   city           168446 non-null  object 
 8   province_name  168446 non-null  object 
 9   latitude       168446 non-null  float64
 10  longitude      168446 non-null  float64
 11  baths          168446 non-null  int64  
 12  purpose        168446 non-null  object 
 13  bedrooms       168446 non-null  int64  
 14  date_added     168446 non-null  object 
 15  agency         124375 non-null  object 
 16  agent          124374 non-null  object 
 17  Total_Area     168446 non-nul

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,property_id,location_id,page_url,property_type,price,location,city,province_name,latitude,longitude,baths,purpose,bedrooms,date_added,agency,agent,Total_Area
0,0,237062,3325,https://www.zameen.com/Property/g_10_g_10_2_gr...,Flat,10000000,G-10,Islamabad,Islamabad Capital,33.67989,73.01264,2,For Sale,2,2/4/2019,,,1089.004
1,1,346905,3236,https://www.zameen.com/Property/e_11_2_service...,Flat,6900000,E-11,Islamabad,Islamabad Capital,33.700993,72.971492,3,For Sale,3,5/4/2019,,,15246.056
2,2,386513,764,https://www.zameen.com/Property/islamabad_g_15...,House,16500000,G-15,Islamabad,Islamabad Capital,33.631486,72.926559,6,For Sale,5,7/17/2019,,,2178.008
3,3,656161,340,https://www.zameen.com/Property/islamabad_bani...,House,43500000,Bani Gala,Islamabad,Islamabad Capital,33.707573,73.151199,4,For Sale,4,4/5/2019,,,10890.0
4,4,841645,3226,https://www.zameen.com/Property/dha_valley_dha...,House,7000000,DHA Defence,Islamabad,Islamabad Capital,33.492591,73.301339,3,For Sale,3,7/10/2019,Easy Property,Muhammad Junaid Ceo Muhammad Shahid Director,2178.008


# Missing Values

In [5]:
df.isnull().sum()

Unnamed: 0           0
property_id          0
location_id          0
page_url             0
property_type        0
price                0
location             0
city                 0
province_name        0
latitude             0
longitude            0
baths                0
purpose              0
bedrooms             0
date_added           0
agency           44071
agent            44072
Total_Area           0
dtype: int64

 **Property_id and location_id can be dropped because  it doesn't carry important information**

In [6]:
df.drop(['Unnamed: 0' , 'property_id' , 'location_id'] , axis = 1 , inplace = True)

**Page Url is just for reference so it is also not important**

In [7]:
df.drop(['page_url'] , axis = 1 , inplace = True)

**longitude and latitude is available so we can drop location**

In [8]:
df.drop(['location'],axis=1 , inplace = True)

**'agency' and 'agent' conatin very less data so it can also be dropped**

In [9]:
df.drop(['agency' , 'agent'],axis=1 , inplace = True)

# **Now For Categorical data**

In [10]:
df.city.value_counts()

Karachi       60484
Lahore        41427
Islamabad     37426
Rawalpindi    20976
Faisalabad     8133
Name: city, dtype: int64

In [11]:
dff = pd.get_dummies(df , columns= ['city'] , prefix = 'C')

In [12]:
dff.head()

Unnamed: 0,property_type,price,province_name,latitude,longitude,baths,purpose,bedrooms,date_added,Total_Area,C_Faisalabad,C_Islamabad,C_Karachi,C_Lahore,C_Rawalpindi
0,Flat,10000000,Islamabad Capital,33.67989,73.01264,2,For Sale,2,2/4/2019,1089.004,0,1,0,0,0
1,Flat,6900000,Islamabad Capital,33.700993,72.971492,3,For Sale,3,5/4/2019,15246.056,0,1,0,0,0
2,House,16500000,Islamabad Capital,33.631486,72.926559,6,For Sale,5,7/17/2019,2178.008,0,1,0,0,0
3,House,43500000,Islamabad Capital,33.707573,73.151199,4,For Sale,4,4/5/2019,10890.0,0,1,0,0,0
4,House,7000000,Islamabad Capital,33.492591,73.301339,3,For Sale,3,7/10/2019,2178.008,0,1,0,0,0


In [13]:
dff.province_name.value_counts()

Punjab               70536
Sindh                60484
Islamabad Capital    37426
Name: province_name, dtype: int64

In [14]:
dff1 = pd.get_dummies(dff, columns = ['province_name'] , prefix = 'P')

In [15]:
dff1.head()

Unnamed: 0,property_type,price,latitude,longitude,baths,purpose,bedrooms,date_added,Total_Area,C_Faisalabad,C_Islamabad,C_Karachi,C_Lahore,C_Rawalpindi,P_Islamabad Capital,P_Punjab,P_Sindh
0,Flat,10000000,33.67989,73.01264,2,For Sale,2,2/4/2019,1089.004,0,1,0,0,0,1,0,0
1,Flat,6900000,33.700993,72.971492,3,For Sale,3,5/4/2019,15246.056,0,1,0,0,0,1,0,0
2,House,16500000,33.631486,72.926559,6,For Sale,5,7/17/2019,2178.008,0,1,0,0,0,1,0,0
3,House,43500000,33.707573,73.151199,4,For Sale,4,4/5/2019,10890.0,0,1,0,0,0,1,0,0
4,House,7000000,33.492591,73.301339,3,For Sale,3,7/10/2019,2178.008,0,1,0,0,0,1,0,0


In [16]:
dff1.drop(['purpose'] , axis = 1 , inplace = True)

In [17]:
dff1.head()

Unnamed: 0,property_type,price,latitude,longitude,baths,bedrooms,date_added,Total_Area,C_Faisalabad,C_Islamabad,C_Karachi,C_Lahore,C_Rawalpindi,P_Islamabad Capital,P_Punjab,P_Sindh
0,Flat,10000000,33.67989,73.01264,2,2,2/4/2019,1089.004,0,1,0,0,0,1,0,0
1,Flat,6900000,33.700993,72.971492,3,3,5/4/2019,15246.056,0,1,0,0,0,1,0,0
2,House,16500000,33.631486,72.926559,6,5,7/17/2019,2178.008,0,1,0,0,0,1,0,0
3,House,43500000,33.707573,73.151199,4,4,4/5/2019,10890.0,0,1,0,0,0,1,0,0
4,House,7000000,33.492591,73.301339,3,3,7/10/2019,2178.008,0,1,0,0,0,1,0,0


In [18]:
dff1.property_type.value_counts()

House            105468
Flat              38238
Upper Portion     13774
Lower Portion      9229
Room                685
Farm House          657
Penthouse           395
Name: property_type, dtype: int64

In [19]:
dff2 = pd.get_dummies(dff1, columns = ['property_type'] , prefix = 'P')

In [20]:
dff2.head()

Unnamed: 0,price,latitude,longitude,baths,bedrooms,date_added,Total_Area,C_Faisalabad,C_Islamabad,C_Karachi,...,P_Islamabad Capital,P_Punjab,P_Sindh,P_Farm House,P_Flat,P_House,P_Lower Portion,P_Penthouse,P_Room,P_Upper Portion
0,10000000,33.67989,73.01264,2,2,2/4/2019,1089.004,0,1,0,...,1,0,0,0,1,0,0,0,0,0
1,6900000,33.700993,72.971492,3,3,5/4/2019,15246.056,0,1,0,...,1,0,0,0,1,0,0,0,0,0
2,16500000,33.631486,72.926559,6,5,7/17/2019,2178.008,0,1,0,...,1,0,0,0,0,1,0,0,0,0
3,43500000,33.707573,73.151199,4,4,4/5/2019,10890.0,0,1,0,...,1,0,0,0,0,1,0,0,0,0
4,7000000,33.492591,73.301339,3,3,7/10/2019,2178.008,0,1,0,...,1,0,0,0,0,1,0,0,0,0


In [21]:
dff2['year'] = pd.DatetimeIndex(dff2['date_added']).year

In [22]:
dff2.head()

Unnamed: 0,price,latitude,longitude,baths,bedrooms,date_added,Total_Area,C_Faisalabad,C_Islamabad,C_Karachi,...,P_Punjab,P_Sindh,P_Farm House,P_Flat,P_House,P_Lower Portion,P_Penthouse,P_Room,P_Upper Portion,year
0,10000000,33.67989,73.01264,2,2,2/4/2019,1089.004,0,1,0,...,0,0,0,1,0,0,0,0,0,2019
1,6900000,33.700993,72.971492,3,3,5/4/2019,15246.056,0,1,0,...,0,0,0,1,0,0,0,0,0,2019
2,16500000,33.631486,72.926559,6,5,7/17/2019,2178.008,0,1,0,...,0,0,0,0,1,0,0,0,0,2019
3,43500000,33.707573,73.151199,4,4,4/5/2019,10890.0,0,1,0,...,0,0,0,0,1,0,0,0,0,2019
4,7000000,33.492591,73.301339,3,3,7/10/2019,2178.008,0,1,0,...,0,0,0,0,1,0,0,0,0,2019


In [23]:
dff2.drop(['date_added'] , axis = 1 , inplace = True)

In [24]:
dff2

Unnamed: 0,price,latitude,longitude,baths,bedrooms,Total_Area,C_Faisalabad,C_Islamabad,C_Karachi,C_Lahore,...,P_Punjab,P_Sindh,P_Farm House,P_Flat,P_House,P_Lower Portion,P_Penthouse,P_Room,P_Upper Portion,year
0,10000000,33.679890,73.012640,2,2,1089.004,0,1,0,0,...,0,0,0,1,0,0,0,0,0,2019
1,6900000,33.700993,72.971492,3,3,15246.056,0,1,0,0,...,0,0,0,1,0,0,0,0,0,2019
2,16500000,33.631486,72.926559,6,5,2178.008,0,1,0,0,...,0,0,0,0,1,0,0,0,0,2019
3,43500000,33.707573,73.151199,4,4,10890.000,0,1,0,0,...,0,0,0,0,1,0,0,0,0,2019
4,7000000,33.492591,73.301339,3,3,2178.008,0,1,0,0,...,0,0,0,0,1,0,0,0,0,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168441,26500000,25.029909,67.137192,0,6,26136.096,0,0,1,0,...,0,1,0,0,1,0,0,0,0,2019
168442,12500000,25.017951,67.136393,0,3,2178.008,0,0,1,0,...,0,1,0,0,1,0,0,0,0,2019
168443,27000000,25.015384,67.116330,0,6,26136.096,0,0,1,0,...,0,1,0,0,1,0,0,0,0,2019
168444,11000000,25.013265,67.120818,0,3,21235.578,0,0,1,0,...,0,1,0,0,1,0,0,0,0,2019


In [25]:
x = dff2.drop('price' , axis =1)

In [26]:
x.head()

Unnamed: 0,latitude,longitude,baths,bedrooms,Total_Area,C_Faisalabad,C_Islamabad,C_Karachi,C_Lahore,C_Rawalpindi,...,P_Punjab,P_Sindh,P_Farm House,P_Flat,P_House,P_Lower Portion,P_Penthouse,P_Room,P_Upper Portion,year
0,33.67989,73.01264,2,2,1089.004,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,2019
1,33.700993,72.971492,3,3,15246.056,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,2019
2,33.631486,72.926559,6,5,2178.008,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,2019
3,33.707573,73.151199,4,4,10890.0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,2019
4,33.492591,73.301339,3,3,2178.008,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,2019


In [27]:
y = dff2['price']

In [28]:
y.head()

0    10000000
1     6900000
2    16500000
3    43500000
4     7000000
Name: price, dtype: int64

# Linear Regression 

In [29]:
train_x , test_x , train_y , test_y = train_test_split(x , y , test_size = 0.3)

In [30]:
model = LinearRegression()

In [31]:
model.fit(train_x , train_y)

LinearRegression()

In [32]:
predict = model.predict(test_x)

In [33]:
mean_absolute_error(predict, test_y)

15374571.040111536

In [34]:
mean_squared_error(predict, test_y)

1062608148868978.0

In [35]:
r2_score(predict, test_y)

-4.498797915558164