In [348]:
from sklearn import datasets

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from pprint import pprint


from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

### Read data

In [349]:
boston = datasets.load_boston()

In [350]:
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [351]:
X = boston.data

In [352]:
y = boston.target


### split the data into a training and test data

In [353]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state= 42)
X_train.shape , X_test.shape , y_train.shape , y_test.shape

((354, 13), (152, 13), (354,), (152,))

In [354]:
X_train = pd.DataFrame(X_train, dtype=np.float32)
y_train = pd.DataFrame(y_train)

### scale the data

### construct a sklearn.pipeline.Pipeline that does the scaling and the model

In [355]:
numeric_features = [0,1,2,3,4,5,6,7,8,9,10,11,12]
numeric_transformer = make_pipeline(
    StandardScaler()
    )

In [356]:
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)])

In [357]:
X_train_prepared = preprocessor.fit_transform(X_train)
X_test_prepared = preprocessor.transform(X_test)

### Train with LinearRegression

In [358]:
lin_reg = LinearRegression()
lin_reg.fit(X_train_prepared, y_train)

LinearRegression()

In [359]:
y_test_predictions_lin_reg = lin_reg.predict(X_test_prepared)
lin_mse = mean_squared_error(y_test, y_test_predictions_lin_reg)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

4.6386897118807795

### Train with RandomForest

In [360]:
forest_reg = RandomForestRegressor(random_state = 42)
forest_reg.fit(X_train_prepared, y_train)

  forest_reg.fit(X_train_prepared, y_train)


RandomForestRegressor(random_state=42)

In [361]:
y_train_predictions_forest = forest_reg.predict(X_test_prepared)
forest_mse = mean_squared_error(y_test, y_train_predictions_forest)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

3.103521729422882

### run a grid search with cross-validation to optimize hyperparameters

In [362]:
print('Parameters currently in use:\n')
pprint(forest_reg.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [363]:
hyperparam_grid = {
    'ccp_alpha': [0.0, 1.0],
    'max_features': ['auto', 'sqrt']
}

In [364]:
grid_cv = GridSearchCV(estimator=forest_reg,           
                       param_grid=hyperparam_grid,
                       cv=5,
                       scoring='accuracy') 

In [365]:
grid_cv.fit(X_train_prepared, y_train)

  estimator.fit(X_train, y_train, **fit_params)
Traceback (most recent call last):
  File "/Users/Disalo/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 674, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/Disalo/opt/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "/Users/Disalo/opt/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "/Users/Disalo/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/Users/Disalo/opt/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py", line 202, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "/Users/Disalo/opt/anaconda3/lib/python3.8/sit

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid={'ccp_alpha': [0.0, 1.0],
                         'max_features': ['auto', 'sqrt']},
             scoring='accuracy')

In [367]:
results_df = pd.DataFrame(grid_cv.cv_results_)

In [368]:
grid_cv.best_params_

{'ccp_alpha': 0.0, 'max_features': 'auto'}

In [369]:
cross_acc = cross_val_score(estimator=forest_reg,
                            X=X_train_prepared,
                            y=y_train,
                            cv=5,
                            scoring='r2'
                            )

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


In [370]:
cross_acc.mean()

0.8133315948196949

### Implement Ridge, Lasso and Elastic Net 

In [371]:
m_ridge = Ridge()
m_lasso = Lasso()
m_elastic = ElasticNet()

m_ridge.fit(X_train_prepared, y_train)  
m_lasso.fit(X_train_prepared, y_train)
m_elastic.fit(X_train_prepared, y_train)

 
ypred_ridge = m_ridge.predict(X_test_prepared)
ypred_lasso = m_lasso.predict(X_test_prepared)
ypred_elastic = m_elastic.predict(X_test_prepared)

In [372]:
ridge_mse = mean_squared_error(y_test, ypred_ridge)
ridge_rmse = np.sqrt(ridge_mse)
ridge_rmse

4.642052739131068

In [373]:
lasso_mse = mean_squared_error(y_test, ypred_lasso)
lasso_rmse = np.sqrt(lasso_mse)
lasso_rmse

5.150968709279756

In [374]:
elastic_mse = mean_squared_error(y_test, ypred_elastic)
elastic_rmse = np.sqrt(elastic_mse)
elastic_rmse

5.243103127815043