# Boston Dataset -- KNN Regression Final Model 

## Comparison of metrics:

|Metric|Basic|Basic_K=4_P=1|StdScale|StdScale__K=3_P=2|NormScale|NormScale_K=2_P=1|
|--|--|--|--|--|--|--|
|Test_MSE|58.78|47.95|27.19|19.63|35.88|20.83|
|Test_MAE|5.45|4.88|3.49|3.06|3.91|3.21|
|Test_RMSE|7.66|6.92|5.21|4.43|5.99|4.56|
|Test_R2|0.45|0.55|0.74|0.81|0.66|0.80|
|--|--|--|--|--|--|--|
|Train_MSE|23.23|18.46|12.99|7.78|14.97|5.40|
|Train_MAE|3.30|2.82|2.21|1.77|2.40|1.53|
|Train_RMSE|4.82|4.29|3.60|2.79|3.86|2.32|
|Train_R2|0.70|0.76|0.83|0.90|0.80|0.93|

### Conclusion
- Scaling: Normalization
- HP Tuning : K=2, P=1
- The difference between RMSE and MAE is small
- Test and train scores are both good

#### Import libraries

In [4]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler,MinMaxScaler

from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

import pickle
import json

from sklearn.datasets import load_boston

import warnings
# Silences every warning for the rest of the session (no category filter is given),
# which also hides library deprecation notices.
warnings.filterwarnings('ignore')

#### Data Gathering

In [5]:
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older pinned scikit-learn (or a different data source) to run.
dataset = load_boston()
# Show which fields the loaded dataset object exposes.
dataset.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename', 'data_module'])

In [6]:
# Feature matrix as a labelled frame, with the target appended as the PRICE column.
df = pd.DataFrame(
    data=dataset['data'],
    columns=dataset['feature_names'],
).assign(PRICE=dataset['target'])
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


#### EDA

In [7]:
# Column dtypes and non-null counts, used here to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  PRICE    506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


In [8]:
# Summary statistics per column; the min/max rows show how different the feature scales are.
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


#### Scaling

In [9]:
# Independent variables: every column except the target.
x = df.drop(columns=['PRICE'])
x.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [10]:
# Dependent variable: the PRICE column.
y = df.loc[:, 'PRICE']
y.head()

0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: PRICE, dtype: float64

In [11]:
# Min-max normalisation of every feature column.
# NOTE(review): the scaler is fitted on all rows before the train/test split below,
# so the test rows influence the min/max used for scaling -- consider fitting on the
# training split only.
norm_scalar = MinMaxScaler()
arr = norm_scalar.fit_transform(x)

# Wrap the scaled array back into a frame with the original column names.
x_norm = pd.DataFrame(data=arr, columns=x.columns)
x_norm.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.0,0.18,0.067815,0.0,0.314815,0.577505,0.641607,0.269203,0.0,0.208015,0.287234,1.0,0.08968
1,0.000236,0.0,0.242302,0.0,0.17284,0.547998,0.782698,0.348962,0.043478,0.104962,0.553191,1.0,0.20447
2,0.000236,0.0,0.242302,0.0,0.17284,0.694386,0.599382,0.348962,0.043478,0.104962,0.553191,0.989737,0.063466
3,0.000293,0.0,0.06305,0.0,0.150206,0.658555,0.441813,0.448545,0.086957,0.066794,0.648936,0.994276,0.033389
4,0.000705,0.0,0.06305,0.0,0.150206,0.687105,0.528321,0.448545,0.086957,0.066794,0.648936,1.0,0.099338


#### Model Building
HP Tuning : K=2, P=1

In [14]:
# 80/20 split of the normalised features and the target, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x_norm,
    y,
    test_size=0.2,
    random_state=45,
)

# Report the shape of each partition, in the same order as they were unpacked.
for part in (x_train, x_test, y_train, y_test):
    print(part.shape)

(404, 13)
(102, 13)
(404,)
(102,)


In [15]:
# K=2 neighbours with p=1 (Manhattan distance), the tuned setting named in the heading above.
knn_reg = KNeighborsRegressor(p=1, n_neighbors=2)
knn_reg.fit(X=x_train, y=y_train)

#### Prediction

In [16]:
# Test data: predict the held-out rows with the fitted regressor.
y_test_pred = knn_reg.predict(x_test)

# Spot-check six rows (positions 15-20): actual value next to predicted value.
print("Ya -- Yp")
for row in range(15, 21):
    actual, predicted = y_test.iloc[row], y_test_pred[row]
    print(actual, predicted)

Ya -- Yp
18.6 23.4
28.7 22.700000000000003
23.7 25.35
13.1 12.05
25.0 21.4
11.9 21.3


In [17]:
# Train data: predict the rows the regressor was fitted on.
y_train_pred = knn_reg.predict(x_train)

# Spot-check six rows (positions 15-20): actual value next to predicted value.
print("Ya -- Yp")
for row in range(15, 21):
    actual, predicted = y_train.iloc[row], y_train_pred[row]
    print(actual, predicted)

Ya -- Yp
13.6 14.05
14.1 14.2
44.8 47.4
21.2 20.25
21.4 18.0
22.0 22.1


#### Evaluation

In [18]:
# Testing data: error metrics on the held-out split.
mse = mean_squared_error(y_test, y_test_pred)
mae = mean_absolute_error(y_test, y_test_pred)
rmse = np.sqrt(mse)  # RMSE is derived from the MSE computed above
score = r2_score(y_test, y_test_pred)

for label, value in (("MSE: ", mse), ("MAE: ", mae), ("RMSE: ", rmse), ("R2 Score: ", score)):
    print(label, value)

MSE:  20.897500000000004
MAE:  3.2205882352941178
RMSE:  4.571378347938399
R2 Score:  0.8066442303780115


In [19]:
# Training data: the same metrics on the rows the model was fitted on.
# This reuses the names mse/mae/rmse/score, so the test-split values are overwritten here.
mse = mean_squared_error(y_train, y_train_pred)
mae = mean_absolute_error(y_train, y_train_pred)
rmse = np.sqrt(mse)  # RMSE is derived from the MSE computed above
score = r2_score(y_train, y_train_pred)

for label, value in (("MSE: ", mse), ("MAE: ", mae), ("RMSE: ", rmse), ("R2 Score: ", score)):
    print(label, value)

MSE:  5.417326732673267
MAE:  1.5398514851485146
RMSE:  2.327515141233944
R2 Score:  0.9308485557316171


#### Creating Model files for prediction

In [20]:
# Display the fitted regressor that is about to be saved.
knn_reg

In [21]:
# Serialise the fitted KNN regressor so it can be reloaded for prediction.
with open('Boston_KNN_Model.pkl', mode='wb') as model_file:
    pickle.dump(knn_reg, model_file)

In [22]:
# Display the fitted min-max scaler that is about to be saved.
norm_scalar

In [23]:
# Serialise the fitted scaler; new inputs must be transformed with it before prediction.
with open('Boston_Norm_Scalar.pkl', mode='wb') as scaler_file:
    pickle.dump(norm_scalar, scaler_file)

In [25]:
# Feature names in the order used for scaling and model fitting.
list(x.columns)

['CRIM',
 'ZN',
 'INDUS',
 'CHAS',
 'NOX',
 'RM',
 'AGE',
 'DIS',
 'RAD',
 'TAX',
 'PTRATIO',
 'B',
 'LSTAT']

In [26]:
# Metadata saved with the model: the ordered list of feature column names.
encoder = dict(columns=x.columns.to_list())
encoder

{'columns': ['CRIM',
  'ZN',
  'INDUS',
  'CHAS',
  'NOX',
  'RM',
  'AGE',
  'DIS',
  'RAD',
  'TAX',
  'PTRATIO',
  'B',
  'LSTAT']}

In [27]:
# Write the column metadata as JSON next to the pickled model and scaler.
with open('Boston_encoder.json', mode='w') as encoder_file:
    json.dump(encoder, encoder_file)

#### Testing

In [28]:
# Load model: read the pickled regressor back from disk to check the saved artifact.
# pickle should only be used on trusted files; this one was written by this notebook above.
with open('Boston_KNN_Model.pkl', mode='rb') as model_file:
    test_model = pickle.load(model_file)

test_model

In [29]:
# Load scaler: read the pickled min-max scaler back from disk.
# pickle should only be used on trusted files; this one was written by this notebook above.
with open('Boston_Norm_Scalar.pkl', mode='rb') as scaler_file:
    test_scalar = pickle.load(scaler_file)

test_scalar

In [30]:
# Load encoder: read the saved column metadata back from JSON.
with open('Boston_encoder.json', mode='r') as encoder_file:
    test_enc = json.load(encoder_file)

test_enc

{'columns': ['CRIM',
  'ZN',
  'INDUS',
  'CHAS',
  'NOX',
  'RM',
  'AGE',
  'DIS',
  'RAD',
  'TAX',
  'PTRATIO',
  'B',
  'LSTAT']}

In [40]:
# Empty input vector with one slot per saved feature column.
n_features = len(test_enc['columns'])
test_arr = np.zeros(n_features)
test_arr

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [41]:
# Input data: one hand-entered sample, a value for each of the 13 feature columns.
# NOTE(review): CHAS, NOX and AGE below fall outside the min/max shown by df.describe()
# (CHAS 0-1, NOX min 0.385, AGE max 100), so they scale to values outside [0, 1].
CRIM    = 0.13069
ZN      = 0.05
INDUS   = 15.89
CHAS    = 3.0       # above the observed maximum of 1.0
NOX     = 0.25      # below the observed minimum of 0.385
RM      = 7.951
AGE     = 100.8     # above the observed maximum of 100.0
DIS     = 5.8893
RAD     = 5.0
TAX     = 275.0
PTRATIO = 16.4
B       = 376.9
LSTAT   = 17.92

In [42]:
# Fill the input vector by column name instead of by hard-coded position.
# The slot for each value is taken from the saved feature list, so the vector
# order follows the columns the scaler and model were fitted on. A column in
# the saved list with no input value raises KeyError instead of being misplaced.
input_values = {
    'CRIM': CRIM,
    'ZN': ZN,
    'INDUS': INDUS,
    'CHAS': CHAS,
    'NOX': NOX,
    'RM': RM,
    'AGE': AGE,
    'DIS': DIS,
    'RAD': RAD,
    'TAX': TAX,
    'PTRATIO': PTRATIO,
    'B': B,
    'LSTAT': LSTAT,
}

for position, column in enumerate(test_enc['columns']):
    test_arr[position] = input_values[column]

test_arr

array([1.3069e-01, 5.0000e-02, 1.5890e+01, 3.0000e+00, 2.5000e-01,
       7.9510e+00, 1.0080e+02, 5.8893e+00, 5.0000e+00, 2.7500e+02,
       1.6400e+01, 3.7690e+02, 1.7920e+01])

In [43]:
#scaling

test_arr = test_scalar.transform([test_arr])
test_arr

array([[ 1.39788881e-03,  5.00000000e-04,  5.65615836e-01,
         3.00000000e+00, -2.77777778e-01,  8.41157310e-01,
         1.00823893e+00,  4.32821977e-01,  1.73913043e-01,
         1.67938931e-01,  4.04255319e-01,  9.49568813e-01,
         4.46743929e-01]])

In [46]:
#predict

# Predict: the model returns one value per input row; take the first (only) one.
predictions = test_model.predict(test_arr)
predict_price = predictions[0]
predict_price

22.25