In [1]:
from sklearn import datasets
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge



In [3]:
bean = datasets.load_boston()
print (bean.DESCR);

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [4]:
def load_boston():
    scaler = StandardScaler()
    boston = datasets.load_boston()
    X=boston.data
    y=boston.target
    X = scaler.fit_transform(X)
    return train_test_split(X,y)

In [5]:
X_train, X_test, y_train, y_test = load_boston()

In [6]:
X_train.shape

(379, 13)

In [7]:
clf = LinearRegression()
clf.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [19]:
list(zip (y_test, clf.predict(X_test)))

[(24.0, 30.52921266216272),
 (19.0, 20.958181798035767),
 (46.700000000000003, 34.542365937567077),
 (19.300000000000001, 20.899333126985514),
 (16.600000000000001, 15.951981099242454),
 (10.9, 18.883100372102099),
 (23.399999999999999, 23.338051892218239),
 (29.899999999999999, 31.93763517661343),
 (50.0, 43.423542205358729),
 (20.600000000000001, 22.506395957688831),
 (30.100000000000001, 35.684549474769568),
 (15.6, 19.369280330041029),
 (30.100000000000001, 24.753230814122162),
 (41.700000000000003, 38.092029564932659),
 (12.1, 18.641958254717927),
 (22.699999999999999, 22.327281717389869),
 (30.5, 30.336926559643263),
 (17.199999999999999, 14.188961467523333),
 (19.199999999999999, 20.339622897732326),
 (18.0, 19.119300488687497),
 (20.100000000000001, 24.083708458983885),
 (23.699999999999999, 27.961504919123485),
 (31.600000000000001, 32.814319486107678),
 (50.0, 42.148671934733159),
 (12.800000000000001, 13.407009486287331),
 (14.0, 13.326032426481191),
 (19.300000000000001, 21

In [9]:
yLrPre=clf.predict(X_test)

In [10]:
r2Score=r2_score(y_test,yLrPre)

In [20]:
r2Score

0.7819380436928044

R^2 Score on Linear Regressor is 0.7819380436928044

In [21]:
mseValue=mean_squared_error(y_test,yLrPre)

In [22]:
mseValue

17.407549732878042

MSE on Linear Regressor is 17.407549732878042

In [23]:
ridge=Ridge(alpha=0.01)

In [24]:
ridge.fit(X_train,y_train)

Ridge(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [25]:
list(zip(y_test,ridge.predict(X_test)))

[(24.0, 30.529413636922293),
 (19.0, 20.95819351410448),
 (46.700000000000003, 34.542272051642243),
 (19.300000000000001, 20.899544814403377),
 (16.600000000000001, 15.952332090935387),
 (10.9, 18.883046230946139),
 (23.399999999999999, 23.338145225676556),
 (29.899999999999999, 31.937393174474785),
 (50.0, 43.42309493544473),
 (20.600000000000001, 22.506424140510404),
 (30.100000000000001, 35.683983890099086),
 (15.6, 19.370133697061483),
 (30.100000000000001, 24.753148399249177),
 (41.700000000000003, 38.091827267946485),
 (12.1, 18.641878620070528),
 (22.699999999999999, 22.327576609598449),
 (30.5, 30.337157279411652),
 (17.199999999999999, 14.188530363326983),
 (19.199999999999999, 20.339910176068596),
 (18.0, 19.119611396983547),
 (20.100000000000001, 24.083444609782724),
 (23.699999999999999, 27.961352916997651),
 (31.600000000000001, 32.813854938678055),
 (50.0, 42.147726839859061),
 (12.800000000000001, 13.407073024572229),
 (14.0, 13.326534958870791),
 (19.300000000000001, 21

In [26]:
yRPre = ridge.predict(X_test)

In [27]:
r2rScore=r2_score(y_test,yRPre)

In [28]:
r2rScore

0.78193943745052819

R^2 score on the Ridge linear regressor model is 0.78193943745052819

In [29]:
msRScor = mean_squared_error(y_test,yRPre)

In [30]:
msRScor

17.407438471348044

MSE on the Ridge linear regressor model is 17.407438471348044