In [1]:
import json
import pickle
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and input-validation
# warnings raised by the libraries imported above; consider narrowing it to
# specific categories so real problems stay visible.
warnings.filterwarnings('ignore')

In [2]:
# Load the Boston housing dataset that ships with scikit-learn.
# NOTE(review): `load_boston` is deprecated in scikit-learn 1.0 and is no longer
# available from 1.2 onwards, so this cell presumably only runs on an older
# pinned version -- confirm the environment before re-running.
boston = load_boston()

# List the attributes exposed by the returned object.
dir(boston)

['DESCR', 'data', 'data_module', 'feature_names', 'filename', 'target']

In [3]:
# Combine the feature matrix and the response into one frame;
# `target` is appended as the last column.
DF = (
    pd.DataFrame(data=boston.data, columns=boston.feature_names)
    .assign(target=boston.target)
)
DF

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [4]:
# Count missing values per column.
DF.isna().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
target     0
dtype: int64

In [5]:
# Print the column dtypes, non-null counts and memory footprint of the frame.
DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  target   506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


# Train Test Split

In [6]:
# Separate the response column from the predictors.
y = DF['target']
x = DF.drop(columns='target')

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=11,
)

x_train

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
188,0.12579,45.0,3.44,0.0,0.437,6.556,29.1,4.5667,5.0,398.0,15.2,382.84,4.56
319,0.47547,0.0,9.90,0.0,0.544,6.113,58.8,4.0019,4.0,304.0,18.4,396.23,12.73
21,0.85204,0.0,8.14,0.0,0.538,5.965,89.2,4.0123,4.0,307.0,21.0,392.53,13.83
14,0.63796,0.0,8.14,0.0,0.538,6.096,84.5,4.4619,4.0,307.0,21.0,380.02,10.26
369,5.66998,0.0,18.10,1.0,0.631,6.683,96.8,1.3567,24.0,666.0,20.2,375.33,3.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...
337,0.03041,0.0,5.19,0.0,0.515,5.895,59.6,5.6150,5.0,224.0,20.2,394.81,10.56
91,0.03932,0.0,3.41,0.0,0.489,6.405,73.9,3.0921,2.0,270.0,17.8,393.55,8.20
80,0.04113,25.0,4.86,0.0,0.426,6.727,33.5,5.4007,4.0,281.0,19.0,396.90,5.29
191,0.06911,45.0,3.44,0.0,0.437,6.739,30.8,6.4798,5.0,398.0,15.2,389.71,4.69


# Model Training

In [7]:
# Create a linear regression estimator and fit it on the training split only;
# the test split is kept aside for the evaluation cells that follow.
LR_Model = LinearRegression()
LR_Model.fit(x_train,y_train)

# Model evaluation 

In [8]:
# Error metrics on the held-out test split (regression metrics, not accuracy).
y_pred = LR_Model.predict(x_test)

# Compute every metric first ...
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the target
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# ... then report them in the same order as before.
print("MSE is : ", mse)
print('RMSE is : ', rmse)
print('MAE is : ', mae)
print('R2_score value is  : ', r2)

MSE is :  26.94481224951831
RMSE is :  5.190839262539181
MAE is :  3.56732601899681
R2_score value is  :  0.6859495577055542


In [9]:
# Error metrics on the training split, for comparison with the test-split numbers.
# Note: this rebinds mse / rmse / mae / r2, replacing the values computed on
# the test split in the previous cell.
y_pred_train = LR_Model.predict(x_train)

# Compute every metric first ...
mse = mean_squared_error(y_train, y_pred_train)
rmse = np.sqrt(mse)  # same units as the target
mae = mean_absolute_error(y_train, y_pred_train)
r2 = r2_score(y_train, y_pred_train)

# ... then report them in the same order as before.
print("MSE is : ", mse)
print('RMSE is : ', rmse)
print('MAE is : ', mae)
print('R2_score value is  : ', r2)

MSE is :  20.89839548405952
RMSE is :  4.571476291534226
MAE is :  3.25422014821505
R2_score value is  :  0.7511508467017516


In [None]:
# Train R2 (~0.75) and test R2 (~0.69) are both modest and close to each other: high bias & low variance >>> underfitting

# Single row testing

In [10]:
# Pre-allocate a zero vector with one slot per feature column of `x`.
Test_array = np.zeros(x.shape[1])
Test_array

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [11]:
# Show the first row transposed, so each feature name appears beside its value.
x.head(1).T

Unnamed: 0,0
CRIM,0.00632
ZN,18.0
INDUS,2.31
CHAS,0.0
NOX,0.538
RM,6.575
AGE,65.2
DIS,4.09
RAD,1.0
TAX,296.0


In [12]:
# Hand-picked feature values for one hypothetical observation, named after
# the feature columns of `x`.
# NOTE(review): B = 456 is above every B value visible in the displayed rows
# (largest shown is 396.90) -- confirm this input is within the intended range.
CRIM, ZN, INDUS, CHAS = 0.008, 25, 2.5, 0
NOX, RM, AGE, DIS = 0.8, 6, 47, 5.8
RAD, TAX, PTRATIO = 2, 150, 17.5
B, LSTAT = 456, 3.78

In [13]:
# Fill the pre-allocated vector in place. The values are positional, so their
# order has to follow the column order of `x` (CRIM first, LSTAT last).
Test_array[:] = (
    CRIM, ZN, INDUS, CHAS, NOX, RM, AGE,
    DIS, RAD, TAX, PTRATIO, B, LSTAT,
)

In [14]:
# Record the feature column order as plain strings under the 'columns' key.
Project_data = dict(columns=x.columns.tolist())
Project_data

{'columns': ['CRIM',
  'ZN',
  'INDUS',
  'CHAS',
  'NOX',
  'RM',
  'AGE',
  'DIS',
  'RAD',
  'TAX',
  'PTRATIO',
  'B',
  'LSTAT']}

In [15]:
# Predict the target for the single hand-built row.
# The model was fitted on a DataFrame, so the row is wrapped in a one-row
# DataFrame carrying the same column labels as `x`. Passing a bare list would
# drop the feature names and skip scikit-learn's name/order consistency check
# (the resulting warning is hidden by the global warnings filter).
LR_Model.predict(pd.DataFrame([Test_array], columns=x.columns))[0]

21.566752654462547

In [16]:
# Persist the fitted estimator to 'LR_Model.pkl' in the working directory.
# `pickle` is imported in the import cell at the top of the notebook.
# NOTE(review): a pickled estimator is presumably only safe to reload with a
# compatible scikit-learn version, and pickle files must never be loaded from
# untrusted sources -- confirm how this artifact is consumed.
with open('LR_Model.pkl', 'wb') as f:
    pickle.dump(LR_Model, f)

In [17]:
# Write the feature-column metadata to 'Project_data.json' in the working directory.
# `json` is imported in the import cell at the top of the notebook.
with open('Project_data.json', 'w') as f:
    json.dump(Project_data, f)