## Import Libraries

In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Load DataSet

In [30]:
data = pd.read_csv("isb_data.csv")
data

Unnamed: 0.1,Unnamed: 0,price,location,baths,bedrooms,Total_Area
0,0,165.0,G-15,6,5,2178.008
1,1,435.0,Bani Gala,4,4,10890.000
2,2,70.0,DHA Defence,3,3,2178.008
3,3,345.0,Ghauri Town,8,8,87120.000
4,4,270.0,Korang Town,8,8,5445.000
...,...,...,...,...,...,...
9173,11864,23.0,DHA Defence,2,2,1361.255
9174,11865,127.0,Soan Garden,3,3,2722.510
9175,11866,270.0,G-13,4,4,1905.757
9176,11867,90.0,Soan Garden,3,2,1633.506


## Drop Unnecessary Columns
* drop unnecessary column like Unnamed: 0

In [31]:
data.shape

(9178, 6)

In [32]:
data.drop('Unnamed: 0', axis=1, inplace=True)

## Data Exploration and Cleaning

In [33]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9178 entries, 0 to 9177
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   price       9178 non-null   float64
 1   location    9178 non-null   object 
 2   baths       9178 non-null   int64  
 3   bedrooms    9178 non-null   int64  
 4   Total_Area  9178 non-null   float64
dtypes: float64(2), int64(2), object(1)
memory usage: 358.6+ KB


In [34]:
data.describe()

Unnamed: 0,price,baths,bedrooms,Total_Area
count,9178.0,9178.0,9178.0,9178.0
mean,342.793786,5.17019,4.824472,12326.61
std,420.842852,1.800565,1.695079,27414.33
min,15.0,1.0,1.0,272.251
25%,125.0,4.0,4.0,1633.506
50%,200.0,5.0,5.0,2722.51
75%,410.0,6.0,6.0,5445.0
max,3800.0,12.0,12.0,1159785.0


In [35]:
for column in data.columns:
    print(data[column].value_counts())
    print("*"*20)

price
150.0    260
140.0    219
450.0    191
160.0    191
145.0    179
        ... 
585.0      1
60.5       1
16.0       1
770.0      1
438.0      1
Name: count, Length: 486, dtype: int64
********************
location
DHA Defence        1057
Bahria Town         771
G-13                741
Ghauri Town         511
Soan Garden         317
                   ... 
Capital Enclave       1
Thalian               1
Sihala                1
PAF Tarnol            1
Bokra Road            1
Name: count, Length: 167, dtype: int64
********************
baths
6     2259
5     1813
4     1540
3     1089
7      988
2      625
8      480
10     187
9      160
1       32
12       3
11       2
Name: count, dtype: int64
********************
bedrooms
4     2160
5     2144
6     1797
3     1112
2      774
7      685
8      202
9      153
10      88
11      50
1       11
12       2
Name: count, dtype: int64
********************
Total_Area
1361.255      1463
5445.000      1363
2722.510       953
2178.008       89

In [36]:
data.isna().sum()

price         0
location      0
baths         0
bedrooms      0
Total_Area    0
dtype: int64

In [37]:
data['location'].value_counts()

location
DHA Defence        1057
Bahria Town         771
G-13                741
Ghauri Town         511
Soan Garden         317
                   ... 
Capital Enclave       1
Thalian               1
Sihala                1
PAF Tarnol            1
Bokra Road            1
Name: count, Length: 167, dtype: int64

In [38]:
data['price'].value_counts()

price
150.0    260
140.0    219
450.0    191
160.0    191
145.0    179
        ... 
585.0      1
60.5       1
16.0       1
770.0      1
438.0      1
Name: count, Length: 486, dtype: int64

In [39]:
data['bedrooms'].value_counts()

bedrooms
4     2160
5     2144
6     1797
3     1112
2      774
7      685
8      202
9      153
10      88
11      50
1       11
12       2
Name: count, dtype: int64

In [40]:
data['baths'].value_counts()

baths
6     2259
5     1813
4     1540
3     1089
7      988
2      625
8      480
10     187
9      160
1       32
12       3
11       2
Name: count, dtype: int64

In [41]:
data['Total_Area'].value_counts()

Total_Area
1361.255      1463
5445.000      1363
2722.510       953
2178.008       895
1905.757       537
              ... 
206910.000       1
13884.801        1
31853.367        1
32125.618        1
34575.877        1
Name: count, Length: 143, dtype: int64

In [42]:
data = data[(data['Total_Area'] != 0) & (data['baths'] != 0) & (data['bedrooms'] != 0)  & (data['price'] >= 15)]

In [73]:
data[data['Total_Area'] ==1361.255]

Unnamed: 0.1,Unnamed: 0,price,location,baths,bedrooms,Total_Area
10,10,45.0,Bhara kahu,3,3,1361.255
16,16,45.0,Ghauri Town,3,2,1361.255
37,38,90.0,Chatha Bakhtawar,4,4,1361.255
44,45,85.0,Bani Gala,5,5,1361.255
45,46,130.0,G-13,4,3,1361.255
...,...,...,...,...,...,...
9158,11819,135.0,Bahria Town,3,3,1361.255
9165,11826,95.0,PWD Housing Scheme,5,5,1361.255
9169,11858,35.0,DHA Defence,2,2,1361.255
9170,11859,150.0,Park View City,4,4,1361.255


In [43]:
data[data['bedrooms'] == 0]

Unnamed: 0,price,location,baths,bedrooms,Total_Area


In [44]:
data[data['baths'] == 0]

Unnamed: 0,price,location,baths,bedrooms,Total_Area


In [45]:
data[data['location'].isna()]

Unnamed: 0,price,location,baths,bedrooms,Total_Area


In [46]:
data[data['location']== 'DHA Defence']

Unnamed: 0,price,location,baths,bedrooms,Total_Area
2,70.0,DHA Defence,3,3,2178.008
5,500.0,DHA Defence,7,7,5445.000
6,190.0,DHA Defence,3,3,2722.510
7,800.0,DHA Defence,7,7,10890.000
28,65.0,DHA Defence,3,3,2178.008
...,...,...,...,...,...
9135,85.0,DHA Defence,3,3,2178.008
9164,80.0,DHA Defence,4,3,2178.008
9168,80.0,DHA Defence,3,3,2178.008
9169,35.0,DHA Defence,2,2,1361.255


In [47]:
data.location

0              G-15
1         Bani Gala
2       DHA Defence
3       Ghauri Town
4       Korang Town
           ...     
9173    DHA Defence
9174    Soan Garden
9175           G-13
9176    Soan Garden
9177            F-7
Name: location, Length: 9178, dtype: object

In [48]:
data.baths.unique()

array([ 6,  4,  3,  8,  7,  5, 10,  9,  2,  1, 11, 12], dtype=int64)

In [49]:
data.bedrooms.unique()

array([ 5,  4,  3,  8,  7,  6, 10,  2,  9, 11,  1, 12], dtype=int64)

In [50]:
data.Total_Area.unique()

array([2.1780080e+03, 1.0890000e+04, 8.7120000e+04, 5.4450000e+03,
       2.7225100e+03, 3.8115140e+03, 1.3612550e+03, 1.2795797e+04,
       1.0890040e+03, 3.2670120e+03, 4.8460678e+04, 1.1434500e+05,
       1.3612500e+05, 8.1675000e+04, 2.9675359e+04, 1.9057570e+03,
       1.5246056e+04, 1.1979044e+04, 7.0785000e+04, 2.5319343e+04,
       1.9602000e+05, 3.8659642e+04, 9.8010000e+04, 2.9947610e+03,
       8.1675300e+02, 1.8240817e+04, 4.3560160e+03, 1.0345500e+05,
       1.6335060e+03, 5.1727690e+03, 1.1979000e+05, 1.4973805e+04,
       1.9874323e+04, 2.6408347e+04, 4.3287909e+04, 2.7225000e+04,
       2.3413500e+05, 2.7497351e+04, 4.9005180e+03, 1.1434542e+04,
       6.8062750e+03, 2.0418825e+04, 5.9895000e+04, 2.4502590e+03,
       3.1853367e+04, 6.5340000e+04, 1.2251295e+04, 4.0020897e+04,
       4.0837650e+03, 7.6230000e+04, 9.5287850e+03, 9.8010360e+03,
       1.7968500e+05, 2.6136096e+04, 2.6952849e+04, 1.7696315e+04,
       3.3759124e+04, 2.9130857e+04, 3.4848128e+04, 7.6230280e

In [51]:
data.location.unique()

array(['G-15', 'Bani Gala', 'DHA Defence', 'Ghauri Town', 'Korang Town',
       'B-17', 'G-11', 'Bhara kahu', 'Garden Town', 'Koral Town',
       'Soan Garden', 'F-6', 'F-7', 'I-16', 'E-7', 'Emaar Canyon Views',
       'G-13', 'F-17', 'Bahria Town', 'PWD Housing Scheme', 'F-11',
       'Kuri Road', 'Pakistan Town', 'Chatha Bakhtawar', 'E-11', 'F-10',
       'F-8', 'Lehtarar Road', 'G-9', 'E-14', 'I-14', 'Tarlai',
       'Simly Dam Road', 'I-9', 'Burma Town',
       'National Police Foundation O-9', 'Naval Anchorage',
       'Islamabad Highway', 'CBR Town', 'G-10', 'Jhang Syedan', 'I-8',
       'D-17', 'Arsalan Town', 'Kashmir Highway', 'I-10', 'Pir Sohawa',
       'Ali Pur', 'FECHS', 'Shaheen Town', 'Chak Shahzad', 'H-13', 'G-14',
       'National Police Foundation', 'Shah Allah Ditta',
       'Islamabad Expressway', 'Margalla Town', 'Tarnol', 'H-15', 'G-8',
       'D-13', 'Meherban Colony', 'Zaraj Housing Scheme',
       'Multi Residencia & Orchards', 'Jhangi Syedan', 'AGHOSH',
      

In [52]:
data.describe()

Unnamed: 0,price,baths,bedrooms,Total_Area
count,9178.0,9178.0,9178.0,9178.0
mean,342.793786,5.17019,4.824472,12326.61
std,420.842852,1.800565,1.695079,27414.33
min,15.0,1.0,1.0,272.251
25%,125.0,4.0,4.0,1633.506
50%,200.0,5.0,5.0,2722.51
75%,410.0,6.0,6.0,5445.0
max,3800.0,12.0,12.0,1159785.0


In [86]:
data.describe()

Unnamed: 0,price,baths,bedrooms,Total_Area,Total_Area_Sqft
count,9178.0,9178.0,9178.0,9178.0,9178.0
mean,342.793786,5.17019,4.824472,12326.61,7312.068103
std,420.842852,1.800565,1.695079,27414.33,7713.28787
min,15.0,1.0,1.0,272.251,60.040969
25%,125.0,4.0,4.0,1633.506,2754.810818
50%,200.0,5.0,5.0,2722.51,6887.027045
75%,410.0,6.0,6.0,5445.0,10284.627054
max,3800.0,12.0,12.0,1159785.0,550962.163592


In [81]:
print(f"Length of price: {len('price')}")
print(f"Length of location: {len('location')}")
print(f"Length of baths: {len('baths')}")
print(f"Length of bedrooms: {len('bedrooms')}")
print(f"Length of Total_Area: {len('Total_Area')}")
 

Length of price: 5
Length of location: 8
Length of baths: 5
Length of bedrooms: 8
Length of Total_Area: 10


## Outlier detection and removal

In [87]:
data['location'].isnull().sum()

0

In [118]:
print(f"Length of X: {len(X)}")
print(f"Length of y: {len(y)}")


Length of X: 9178
Length of y: 9178


In [83]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

## Save Cleaned Data

In [84]:
data.to_csv("Cleaned_data.csv")
data.isna().sum()

price              0
location           0
baths              0
bedrooms           0
Total_Area         0
Total_Area_Sqft    0
dtype: int64

In [88]:
data.head

<bound method NDFrame.head of       price     location  baths  bedrooms  Total_Area  Total_Area_Sqft
0     165.0         G-15      6         5    2178.008      7575.729749
1     435.0    Bani Gala      4         4   10890.000      3994.490358
2      70.0  DHA Defence      3         3    2178.008      3213.945954
3     345.0  Ghauri Town      8         8   87120.000       396.005510
4     270.0  Korang Town      8         8    5445.000      4958.677686
...     ...          ...    ...       ...         ...              ...
9173   23.0  DHA Defence      2         2    1361.255      1689.617302
9174  127.0  Soan Garden      3         3    2722.510      4664.812985
9175  270.0         G-13      4         4    1905.757     14167.598492
9176   90.0  Soan Garden      3         2    1633.506      5509.621636
9177  140.0          F-7      7         7    5445.000      2571.166208

[9178 rows x 6 columns]>

## Imports Librarys for preprocessing

In [89]:
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [90]:
column_trans=make_column_transformer((OneHotEncoder(sparse=False),['location']),remainder='passthrough')

In [91]:
scaler=StandardScaler()

In [92]:
lr=LinearRegression()

In [93]:
pipe=make_pipeline(column_trans,scaler,lr)

In [100]:
pipe.fit(X_train,y_train)



In [115]:
y_pred_lr=pipe.predict(X_test)

In [113]:
r2_score(y_test,y_pred)

0.7841749217482105

## Preprocessing

In [102]:
# Define the column transformer for preprocessing
preprocessor = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), ['location']),
    remainder=StandardScaler()  # Standard scale the remaining numerical columns
)

## Pipelines for LinearRegression Model

In [103]:
pipelines = {
    'Linear Regression': Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ]),
    'Lasso': Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', Lasso(alpha=1.0))
    ]),
    'Ridge': Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', Ridge(alpha=1.0))
    ])
}

##  Separate features and target variable

In [104]:
X = data.drop('price', axis=1)
y = data['price']

## Data into training and testing

In [112]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Pipeline with the training data and evaluate

In [111]:
results = {}

for name, pipe in pipelines.items():
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    results[name] = r2

## R-squared scores

In [116]:
 
for name, r2 in results.items():
    print(f"{name}: R-squared score = {r2*100:.4f}")


Linear Regression: R-squared score = 78.4489
Lasso: R-squared score = 76.5507
Ridge: R-squared score = 78.4175


In [108]:
import pickle 

In [117]:
pickle.dump(pipe,open('RidgeModel.pkl','wb'))