#  Import Libraries

In [29]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import warnings
warnings.filterwarnings("ignore")

# Data Loading

In [2]:
df = pd.read_csv('Bengaluru_House_Data.csv')

In [3]:
df.head() # Display the first 5 rows of the data set

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
df.info() # Checking the dataset information like columns and it's datatypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [5]:
df.shape # Checking the size of the Dataset like how many rows and columns 9 Columns, 13320 rows

(13320, 9)

In [6]:
df.isnull().sum() # Checking the Null Values Count per each Column

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

# Data Cleaning

In [7]:
df.select_dtypes(include="object").columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft'],
      dtype='object')

In [8]:
# Function for take the numerical values from total_sqft and convert to numerical datatype

def convert_sqft(x):
    try:
        if '-' in x:
            nums = x.split('-')
            return (float(nums[0]) + float(nums[1])) / 2
        else:
            return float(x)
    except:
        return None

df["total_sqft"] = df["total_sqft"].apply(convert_sqft)

In [9]:
df.select_dtypes(include="object").columns

Index(['area_type', 'availability', 'location', 'size', 'society'], dtype='object')

In [10]:
# Create new "bhk" column and Extracting the Numerical Data from size column and convert it to numerical column

df["bhk"] = df["size"].str.split().str[0].astype(float) 

In [11]:
# Dropping the "size" column 

df.drop("size", axis=1, inplace=True)

In [12]:
# Dropping the "availability" Column due loss weightage on these column

df.drop("availability", axis=1, inplace=True)

In [13]:
# Dropping the "society" Column due high null values

df.drop("society", axis=1, inplace=True)

# Handling Categorical Variables

In [14]:
# Checking the Unique Values in "area_type" So it will help in one Hot encoding

df['area_type'].unique()

array(['Super built-up  Area', 'Plot  Area', 'Built-up  Area',
       'Carpet  Area'], dtype=object)

In [15]:
# Checking the Unique Values in "location" So it will help in one Hot encoding

df['location'].unique()

array(['Electronic City Phase II', 'Chikka Tirupathi', 'Uttarahalli', ...,
       '12th cross srinivas nagar banshankari 3rd stage',
       'Havanur extension', 'Abshot Layout'], dtype=object)

In [16]:
# Doing the one hot encoding and dropping the actual columns

df=pd.get_dummies(df, columns=["area_type","location"], drop_first=True)

In [18]:
df.head()

Unnamed: 0,total_sqft,bath,balcony,price,bhk,area_type_Carpet Area,area_type_Plot Area,area_type_Super built-up Area,location_ Banaswadi,location_ Basavangudi,...,location_rr nagar,location_sankeswari,location_sapthagiri Layout,location_sarjapura main road,location_singapura paradise,location_t.c palya,location_tc.palya,location_vinayakanagar,"location_white field,kadugodi",location_whitefiled
0,1056.0,2.0,1.0,39.07,2.0,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2600.0,5.0,3.0,120.0,4.0,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1440.0,2.0,3.0,62.0,3.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1521.0,3.0,1.0,95.0,3.0,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1200.0,2.0,1.0,51.0,2.0,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False


In [19]:
# Checking the Null values

df.isnull().sum()

total_sqft                        46
bath                              73
balcony                          609
price                              0
bhk                               16
                                ... 
location_t.c palya                 0
location_tc.palya                  0
location_vinayakanagar             0
location_white field,kadugodi      0
location_whitefiled                0
Length: 1312, dtype: int64

# Feature & Target Separation

In [20]:
# Selected the target Coulmn as "price" into the y
# Remaining Columns all into X

X=df.drop('price',axis=1)
y=df['price']

# Train-Test Split

In [23]:
X_test,X_train,y_test,y_train = train_test_split(X,y,test_size=0.25)

# Missing Value Imputation

In [24]:
imputer = SimpleImputer(strategy ='mean')

X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Feature Scaling

In [25]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Model Building

- Implementing the KNN Model with default k=5 value

In [26]:
knn = KNeighborsRegressor(n_neighbors=5)

knn.fit(X_train, y_train)

In [30]:
y_pred = knn.predict(X_test)

# Model Evaluation

In [31]:
print("R2 Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))

R2 Score: 0.3708347727599387
MSE: 14314.92900845195


# Cross Validation

In [32]:
for k in range(1, 21):
    knn = KNeighborsRegressor(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, cv=5)
    print(k, scores.mean())

1 0.307944921210026
2 0.36268637009987287
3 0.3625520441435612
4 0.34216507746274083
5 0.3551102810284285
6 0.37163205675829286
7 0.3744413528366507
8 0.3721160021433514
9 0.3670099881904533
10 0.3570184734214818
11 0.35535780455470944
12 0.35307940037117547
13 0.3515948576912777
14 0.34484714267543837
15 0.3390201723367848
16 0.33867685809701503
17 0.337432237927269
18 0.33694717531312357
19 0.3398948169079932
20 0.34124160931693537
