# Boston Dataset - KNN Regression

1. Data Gathering
2. EDA
3. Model Building
4. Prediction
5. Evaluation

- Test Data Evaluation
  - MSE:  58.783447058823526
  - MAE:  5.454313725490197
  - RMSE:  7.66703639347196
  - R2 Score:  0.45610151228174467

- Train Data Evaluation
  - MSE:  23.23600792079208
  - MAE:  3.3059405940594058
  - RMSE:  4.820374251112882
  - R2 Score:  0.7033954963315541

#### Import libraries

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler,MinMaxScaler

from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

import pickle
import json

from sklearn.datasets import load_boston

#### Data Gathering

In [3]:
# Load the Boston housing dataset as a dict-like object (indexed by key below).
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on scikit-learn < 1.2 — confirm the pinned version.
data = load_boston()
# Display the fields returned by the loader.
data.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename', 'data_module'])

In [5]:
# Assemble one frame: the feature matrix labelled with its feature names,
# plus the regression target appended as the PRICE column.
df = pd.DataFrame(
    data=data['data'],
    columns=data['feature_names'],
).assign(PRICE=data['target'])
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


#### EDA

In [6]:
# Summarise column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  PRICE    506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


In [7]:
# Per-column summary statistics; useful for comparing the value ranges of the features.
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


#### Model building

In [10]:
# Feature matrix: every column of df except the PRICE target.
x = df.drop(columns='PRICE')
print(x.shape)
x.head()

(506, 13)


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [12]:
# Target vector: the PRICE column of df.
y = df.loc[:, 'PRICE']
print(y.shape)
y.head()

(506,)


0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: PRICE, dtype: float64

In [13]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=45,
)

# Print the shape of each split, in the order train/test features, then train/test target.
for split in (x_train, x_test, y_train, y_test):
    print(split.shape)

(404, 13)
(102, 13)
(404,)
(102,)


In [17]:
# Fit a KNN regressor with the library's default hyperparameters.
# NOTE(review): x_train holds the raw, unscaled feature columns, and KNN is
# distance-based, so wide-range columns (e.g. TAX, B) likely dominate the
# neighbour search. StandardScaler/MinMaxScaler are imported but unused —
# consider scaling, then re-running the evaluation cells.
knn_reg = KNeighborsRegressor()
knn_reg.fit(X=x_train, y=y_train)

#### Prediction

In [36]:
# Test data: predict on the held-out rows and eyeball a few
# actual-vs-predicted pairs.

y_test_pred = knn_reg.predict(x_test)

preview_rows = range(15, 21)  # positional rows 15..20 of the test split

print("Ya -- Yp")
for row in preview_rows:
    # y_test keeps its original index labels, so use positional .iloc to
    # line up with the plain array of predictions.
    print(y_test.iloc[row], y_test_pred[row])


Ya -- Yp
18.6 16.38
28.7 22.139999999999997
23.7 24.32
13.1 9.02
25.0 32.0
11.9 25.46


In [35]:
# Training data: predict on the rows the model was fitted on and eyeball a
# few actual-vs-predicted pairs.

y_train_pred = knn_reg.predict(x_train)

preview_rows = range(15, 21)  # positional rows 15..20 of the training split

print("Ya -- Yp")
for row in preview_rows:
    # y_train keeps its original index labels, so use positional .iloc to
    # line up with the plain array of predictions.
    print(y_train.iloc[row], y_train_pred[row])

Ya -- Yp
13.6 16.82
14.1 12.16
44.8 37.82
21.2 20.68
21.4 16.7
22.0 21.580000000000002


#### Evaluation

In [37]:
# Testing data: error metrics for the predictions on the held-out split.

mse = mean_squared_error(y_test, y_test_pred)
mae = mean_absolute_error(y_test, y_test_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE computed above
score = r2_score(y_test, y_test_pred)

# Report in the fixed order MSE, MAE, RMSE, R2.
for label, value in (
    ("MSE: ", mse),
    ("MAE: ", mae),
    ("RMSE: ", rmse),
    ("R2 Score: ", score),
):
    print(label, value)

MSE:  58.783447058823526
MAE:  5.454313725490197
RMSE:  7.66703639347196
R2 Score:  0.45610151228174467


In [38]:
# Training data: the same error metrics, computed on the rows the model was
# fitted on, for comparison against the test-split figures.

mse = mean_squared_error(y_train, y_train_pred)
mae = mean_absolute_error(y_train, y_train_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE computed above
score = r2_score(y_train, y_train_pred)

# Report in the fixed order MSE, MAE, RMSE, R2.
for label, value in (
    ("MSE: ", mse),
    ("MAE: ", mae),
    ("RMSE: ", rmse),
    ("R2 Score: ", score),
):
    print(label, value)

MSE:  23.23600792079208
MAE:  3.3059405940594058
RMSE:  4.820374251112882
R2 Score:  0.7033954963315541
