### Import the Libraries

In [1]:
import pandas as pd

### Import the dataset - load_boston

In [2]:
from sklearn.datasets import load_boston

# Load the Boston housing dataset object; the following cells read its
# .data, .feature_names and .target attributes.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs scikit-learn < 1.2 — confirm the pinned version.
boston_r = load_boston()
# Print the description text bundled with the dataset.
print(boston_r.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

### Build the DataFrame and Feature/Target Arrays

In [3]:
# Tabular view of the dataset: one column per feature name, with the
# target values appended as a trailing "Price" column.
boston = pd.DataFrame(
    data=boston_r.data,
    columns=boston_r.feature_names,
).assign(Price=boston_r.target)

# Plain arrays (features, target) used by the modelling cell below.
x, y = boston_r.data, boston_r.target
print(x.shape, y.shape)

# Show the first rows of the table as the cell's rich output.
boston.head()

(506, 13) (506,)


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


### Linear Regression Model with Cross-Validation (k-fold)

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 5-fold splitter; shuffling with a fixed random_state keeps the folds
# identical from run to run.
skf = KFold(n_splits=5, shuffle=True, random_state=20)

# One metrics dict per fold, appended in fold order.
scores_table = []

for train_idx, test_idx in skf.split(x, y):
    # Rows used for fitting versus rows held out for this fold.
    x_train, y_train = x[train_idx], y[train_idx]
    x_test, y_test = x[test_idx], y[test_idx]

    # A new estimator is created for every fold, then fitted on the
    # training rows and evaluated on the held-out rows.
    model = LinearRegression().fit(x_train, y_train)
    y_pred = model.predict(x_test)

    scores_table.append(
        {
            "MAE": mean_absolute_error(y_test, y_pred),
            "MSE": mean_squared_error(y_test, y_pred),
            "R^2": r2_score(y_test, y_pred),
        }
    )

# Print one fold's metrics per line.
print(*scores_table, sep="\n")

{'MAE': 3.055894153890937, 'MSE': 16.49535197593152, 'R^2': 0.7438826183113556}
{'MAE': 3.587134053611144, 'MSE': 29.185533811642596, 'R^2': 0.7277602099488687}
{'MAE': 3.6425086835565414, 'MSE': 23.521151460585834, 'R^2': 0.7076871876767586}
{'MAE': 3.569127374256797, 'MSE': 25.835724364710398, 'R^2': 0.6566288269770473}
{'MAE': 3.2558482010866867, 'MSE': 24.606201772488, 'R^2': 0.734251260418268}
