In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings('ignore')

# Load Dataset

In [3]:
from sklearn.datasets import fetch_openml

# Load the dataset
boston = fetch_openml(name='boston', version=1)

# Access the data and target
data = boston.data
target = boston.target

# Print the description of the dataset
print(boston.DESCR)

**Author**:   
**Source**: Unknown - Date unknown  
**Please cite**:   

The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980.   N.B. Various transformations are used in the table on
pages 244-261 of the latter.
Variables in order:
CRIM     per capita crime rate by town
ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
INDUS    proportion of non-retail business acres per town
CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
NOX      nitric oxides concentration (parts per 10 million)
RM       average number of rooms per dwelling
AGE      proportion of owner-occupied units built prior to 1940
DIS      weighted distances to five Boston employment centres
RAD      index of accessibility to radial highways
TAX      full-value property-tax rate per $10

# Problem Statement
- In this dataset our goal is to learn about `acccuracy` and `Cofussion Matrix`.

## Saperate feature and label

In [4]:
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33


In [6]:
target.head(2)

0    24.0
1    21.6
Name: MEDV, dtype: float64

# Train Test Split

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
x_train,x_test,y_train,y_test=train_test_split(data,target,test_size=0.2,random_state=43)

In [10]:
x_train.shape

(404, 13)

In [11]:
x_test.shape

(102, 13)

# Standard Scaler

In [13]:
from sklearn.preprocessing import StandardScaler

In [15]:
scale=StandardScaler()
x_train_scale=scale.fit_transform(x_train)
x_test_scale=scale.transform(x_test)

# Logistic Regression

In [18]:
from sklearn.linear_model import LinearRegression

In [19]:
model=LinearRegression()

# Fit the model

In [21]:
model.fit(x_train_scale,y_train)

# prediction

In [25]:
pre=model.predict(x_test_scale)
pre

array([22.60677759, 16.78021435, 20.17105016, 33.48676401, 16.73664042,
       13.55533107, 13.3634463 , 24.16610538, 22.04480685,  9.66672043,
       28.31438091, 26.7259204 , 16.70695604, 21.42902101, 21.93836023,
       11.52759557, 31.64652227, 32.70202084, 15.17603523, 40.66430067,
       21.12162635,  9.25378106, 20.96925951, 31.01625843,  7.12724987,
       25.07650501, 20.03891867, 19.69875104, 19.27762119,  7.59094885,
       22.01713669, 23.83446741, 14.99121063,  9.89951399, 24.44739694,
       20.1548996 , 17.0084432 , 20.86032024, 28.41351254, 26.881437  ,
       20.36772619,  6.08947866, 25.73042356, 18.5806221 , 37.35522973,
       17.95177821, 24.0334408 , 22.63221468, 34.96239484, 17.56297555,
       27.66432492, 36.6593303 , 34.26584485, 22.55380439, 12.28501157,
       36.99577868, 33.05314014, 25.98561314, 28.16471499, 28.77985502,
       21.98988346, 14.50666491, 25.62567138, 27.4265174 , 25.68606585,
       13.75076105, 23.16059033, 36.32337894,  8.63040269, 24.74

# Check `MAE`,`MSE`,`RMSE`

In [26]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

# `MAE`

In [28]:
mae=mean_absolute_error(y_test,pre)
mae

3.291238342730675

# Conclussion
- we can get the `error/loss` of `3.29`
- It show the `loss` of the model but it use `modolus`.
- It use `modolus` and we know the `modolus` is not differential at 0.
- It can handle the `outliers`

# `MSE`

In [31]:
mse=mean_squared_error(y_test,pre)
mse

22.018613449500396

# conclussion
- It can simply calculate the `MAE` and the square them.
- It can also show the `loss` of the model.
- It can use `square` not mod b/c `square` is differentail at 0.
- It can't handle the outliers.

# `RMSE`

In [32]:
rmse=np.sqrt(mean_squared_error(y_test,pre))
rmse

4.692399540693481

# conclussion
- It can simply calculate the `MAE` and the square root them.
- It can also show the `loss` of the model.
- It can use `square` not mod b/c `square` is differentail at 0.
- It can't handle the outliers.

# `R2-Score`

In [35]:
score=r2_score(y_test,pre)
score

0.7612436444752164

# Conclussion
- we can get the score of `0.761`
- It can shoe the performance of the model. 
- It can show how the model is accurate.
- But if we can add irrelevant columns `R2-score` are stuck and `increase` but it is not good.

# `Adjusted R2-Score`

In [36]:
x_train.shape

(404, 13)

In [38]:
adjusted=(1-((1-score)*(404-1)/(404-1-13)))
adjusted

0.7532850992910569

# Conclussion
- we can get the score of `0.761`
- It can shoe the performance of the model. 
- It can show how the model is accurate.
- But if we can add irrelevant columns `R2-score`  decrease and if we can add relevent col `r2-score` is increase.