# Boston Housing Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the headerless CSV, then label its 14 columns.
df = pd.read_csv('housing.csv', header=None)
df.columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]

In [None]:
# preview the first five rows to check that the column labels line up
df.head(n=5)

In [None]:
# scatterplot matrix over a handful of columns (including the MEDV target)
cols = ['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV']
sns.pairplot(data=df.loc[:, cols], height=2.5)
plt.tight_layout()
plt.show()

In [None]:
# correlation coefficients between the selected columns
# (rowvar=False: each column of the array is one variable)
cm = np.corrcoef(df[cols].values, rowvar=False)

# draw the matrix as an annotated heatmap, labelled with the column names
sns.set(font_scale=1.5)
hm = sns.heatmap(
    cm,
    annot=True, fmt='.2f', annot_kws={'size': 15},
    cbar=True, square=True,
    xticklabels=cols, yticklabels=cols,
)
plt.show()

# Implementing OLS Linear Regression

In [None]:
from linearregressiongd import LinearRegressionGD
from sklearn.preprocessing import StandardScaler

# one explanatory column (RM) as a 2D array, MEDV as the 1D target
X = df[['RM']].values
y = df['MEDV'].values

# initialize one scaler per variable and standardize both;
# the target needs a temporary feature axis for the scaler
sc_x = StandardScaler()
sc_y = StandardScaler()
X_std = sc_x.fit_transform(X)
y_std = sc_y.fit_transform(y.reshape(-1, 1)).flatten()

# initialize the gradient-descent regressor and fit it on the standardized data
lr = LinearRegressionGD()
lr.fit(X_std, y_std)

In [None]:
# cost recorded by the fitted model, plotted against the 1-based epoch number
plt.figure(figsize=(10, 10))
plt.plot(np.arange(1, lr.n_iter + 1), lr.cost_)
plt.xlabel('Epoch')
plt.ylabel('SSE')
plt.show()

In [None]:
def lin_regplot(X, y, model):
    """Scatter the samples and overlay the model's predictions as a line.

    Parameters
    ----------
    X : values for the horizontal axis; also passed to ``model.predict``.
    y : observed values for the vertical axis of the scatter.
    model : any object exposing ``predict(X)``.

    Draws through the ``plt`` state interface (no figure is created or shown
    here) and returns None.
    """
    plt.scatter(X, y, c='steelblue', edgecolor='white', s=70)
    predictions = model.predict(X)
    plt.plot(X, predictions, color='black', lw=2)

In [None]:
# standardized samples with the gradient-descent fit drawn on top
plt.figure(figsize=(10, 10))
lin_regplot(X=X_std, y=y_std, model=lr)
plt.ylabel('Price in $1000s [MEDV] (standardized)')
plt.xlabel('Average number of rooms [RM] (standardized)')
plt.show()

In [None]:
# standardize a query of 5.0 rooms with the scaler fitted on RM
num_rooms_std = sc_x.transform(np.array([[5.0]]))
price_std = lr.predict(num_rooms_std)

# Scalers operate on 2D (n_samples, n_features) input, so give the
# predictions a feature axis before undoing the scaling and flatten
# afterwards to get a 1D array back.
# NOTE: despite its name, `pred_price_std` holds the price on the ORIGINAL
# (un-standardized) scale; the name is kept so other cells are unaffected.
pred_price_std = sc_y.inverse_transform(
    np.asarray(price_std).reshape(-1, 1)
).flatten()
print(pred_price_std)

print(f"Price in $1000s: {pred_price_std[0]:.3f}")

In [None]:
# report the fitted weights: w_[1] is printed as the slope and w_[0] as the
# intercept (the model was fitted on the standardized X_std / y_std)
print(f'Slope:\t\t {lr.w_[1]:.3f}')
print(f'Intercept:\t {lr.w_[0]:.3f}')

# Estimating Coefficients of a Regression Model via `scikit-learn`

In [None]:
from sklearn.linear_model import LinearRegression

# ordinary least squares on the unscaled data (fit returns the estimator)
slr = LinearRegression().fit(X, y)

# single feature, so coef_ has one entry
print(f'Slope:\t\t {slr.coef_[0]:.3f}')
print(f'Intercept:\t {slr.intercept_:.3f}')

In [None]:
# unscaled samples with the scikit-learn fit drawn on top
plt.figure(figsize=(10, 10))
lin_regplot(X=X, y=y, model=slr)
plt.ylabel('Price in $1000s [MEDV]')
plt.xlabel('Average number of rooms [RM]')
plt.show()

# Robust Regression Models Using RANSAC

In [None]:
from sklearn.linear_model import RANSACRegressor

# initialize and fit RANSAC regressor
# The loss is deliberately left at its default (absolute error per sample).
# The old spelling loss='absolute_loss' was removed in scikit-learn 1.2 and
# the new spelling 'absolute_error' is unknown to releases before 1.0, so
# relying on the default keeps this cell runnable on both.
ransac = RANSACRegressor(
    LinearRegression(),
    max_trials=100,           # upper bound on random sampling iterations
    min_samples=50,           # samples drawn per trial
    residual_threshold=5.0,   # max residual for a sample to count as inlier
    random_state=0
)
ransac.fit(X, y)

In [None]:
# boolean masks: samples RANSAC kept as inliers, and the complement
inlier_mask = ransac.inlier_mask_
outlier_mask = ~inlier_mask

# evaluate the fitted line on integer room counts 3..9
line_X = np.arange(3, 10)
line_y_ransac = ransac.predict(line_X.reshape(-1, 1))

# inliers and outliers as separate scatter series, then the fitted line
plt.figure(figsize=(10, 10))
plt.scatter(
    X[inlier_mask], y[inlier_mask],
    label='Inliers', marker='o',
    c='steelblue', edgecolor='white',
)
plt.scatter(
    X[outlier_mask], y[outlier_mask],
    label='Outliers', marker='s',
    c='limegreen', edgecolor='white',
)
plt.plot(line_X, line_y_ransac, color='black', lw=2)
plt.xlabel('Average number of rooms [RM]')
plt.ylabel('Price in $1000s [MEDV]')
plt.legend(loc='upper left')
plt.show()

In [None]:
# coefficients of the final estimator held by the fitted RANSAC wrapper
print(f'Slope:\t\t {ransac.estimator_.coef_[0]:.3f}')
print(f'Intercept:\t {ransac.estimator_.intercept_:.3f}')

# Evaluating Performance

In [None]:
from sklearn.model_selection import train_test_split

# every column except the last as features, MEDV as target; hold out 30 %
X = df.iloc[:, :-1].values
y = df['MEDV'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=0,
)

# fit the multivariate linear model on the training portion only
slr = LinearRegression().fit(X_train, y_train)

# predictions for both portions (used by the residual plot and the metrics)
y_train_pred = slr.predict(X_train)
y_test_pred = slr.predict(X_test)

In [None]:
# residuals (prediction minus truth) against the predicted value,
# one scatter series per data portion
plt.figure(figsize=(10, 10))
plt.scatter(
    y_train_pred, y_train_pred - y_train,
    label='Training data',
    marker='o', c='steelblue', edgecolor='white',
)
plt.scatter(
    y_test_pred, y_test_pred - y_test,
    label='Test data',
    marker='s', c='limegreen', edgecolor='white',
)
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
# zero-residual reference line spanning the visible x-range
plt.hlines(y=0, xmin=-10, xmax=50, color='black', lw=2)
plt.xlim(-10, 50)
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error

# mean squared error on the training and held-out portions
print(f'MSE train:\t {mean_squared_error(y_train, y_train_pred):.3f}')
print(f'MSE test:\t {mean_squared_error(y_test, y_test_pred):.3f}')

In [None]:
from sklearn.metrics import r2_score

# coefficient of determination on the training and held-out portions
print(f'R^2 train:\t {r2_score(y_train, y_train_pred):.3f}')
print(f'R^2 test:\t {r2_score(y_test, y_test_pred):.3f}')

# Using Regularized Methods

In [None]:
from sklearn.linear_model import Ridge

# ridge regressor (instantiated only, not fitted in this cell)
ridge = Ridge(
    alpha=1.0,
)

In [None]:
from sklearn.linear_model import Lasso

# lasso regressor (instantiated only, not fitted in this cell)
lasso = Lasso(
    alpha=1.0,
)

In [None]:
from sklearn.linear_model import ElasticNet

# elastic-net regressor (instantiated only, not fitted in this cell)
elanet = ElasticNet(
    alpha=1.0,
    l1_ratio=0.5,
)

# Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# ten hand-written sample points; X becomes a (10, 1) column
X = np.array([
    258.0, 270.0, 294.0, 320.0, 342.0,
    368.0, 396.0, 446.0, 480.0, 586.0,
]).reshape(-1, 1)

y = np.array([
    236.4, 234.4, 252.8, 298.6, 314.2,
    342.2, 360.8, 368.0, 391.2, 390.8,
])

# two separate estimators: one for the raw feature, one for the expanded one
lr = LinearRegression()
pr = LinearRegression()

# degree-2 polynomial expansion of X
quadratic = PolynomialFeatures(degree=2)
X_quad = quadratic.fit_transform(X)

In [None]:
# fit the straight-line model, then evaluate it on a grid 250, 260, ..., 590
lr.fit(X, y)
X_fit = np.arange(250, 600, 10).reshape(-1, 1)
y_lin_fit = lr.predict(X_fit)

In [None]:
# fit polyreg model on the expanded training features
pr.fit(X_quad, y)
# `quadratic` was already fitted on the training X; the evaluation grid is
# only transformed, never used to re-fit the transformer
y_quad_fit = pr.predict(quadratic.transform(X_fit))

In [None]:
# sample points with the linear (dashed) and quadratic (solid) curves
plt.figure(figsize=(10, 10))
plt.scatter(X, y, label='training points')
plt.plot(X_fit, y_lin_fit, linestyle='--', label='linear fit')
plt.plot(X_fit, y_quad_fit, label='quadratic fit')
plt.legend(loc='upper left')
plt.show()

In [None]:
# predict on the training points themselves: the linear model takes the raw
# feature, the polynomial model takes the expanded features
y_lin_pred = lr.predict(X)
y_quad_pred = pr.predict(X_quad)

# training-set MSE and R-squared for both models
print(f'Training MSE linear:\t {mean_squared_error(y, y_lin_pred):.3f}')
print(f'Training MSE quadratic:\t {mean_squared_error(y, y_quad_pred):.3f}')
print(f'Training R^2 linear:\t {r2_score(y, y_lin_pred):.3f}')
print(f'Training R^2 quadratic:\t {r2_score(y, y_quad_pred):.3f}')

In [None]:
# LSTAT as the single feature, MEDV as target
X = df[['LSTAT']].values
y = df['MEDV'].values

# evaluation grid in unit steps from min(X) up to (excluding) max(X)
X_fit = np.arange(X.min(), X.max(), 1).reshape(-1, 1)

# degree-2 and degree-3 expansions of the training feature
quadratic = PolynomialFeatures(degree=2)
X_quad = quadratic.fit_transform(X)
cubic = PolynomialFeatures(degree=3)
X_cubic = cubic.fit_transform(X)

In [None]:
# straight-line model: fit, evaluate on the grid, score on the training data
regr = LinearRegression().fit(X, y)
y_lin_fit = regr.predict(X_fit)
linear_r2 = r2_score(y, regr.predict(X))

In [None]:
# fit quadratic model, make predictions, calculate R-squared
regr = regr.fit(X_quad, y)
# `quadratic` is already fitted on the training X; only transform the grid
y_quad_fit = regr.predict(quadratic.transform(X_fit))
quadratic_r2 = r2_score(y, regr.predict(X_quad))

In [None]:
# fit cubic model, make predictions, calculate R-squared
regr = regr.fit(X_cubic, y)
# `cubic` is already fitted on the training X; only transform the grid
y_cubic_fit = regr.predict(cubic.transform(X_fit))
cubic_r2 = r2_score(y, regr.predict(X_cubic))

In [None]:
# samples in grey with the three fitted curves; each legend entry carries
# the training R^2 of its model
plt.figure(figsize=(10, 10))
plt.scatter(X, y, color='lightgray', label='training points')
plt.plot(
    X_fit, y_lin_fit,
    color='blue', linestyle=':', lw=2,
    label=f'linear (d=1), R^2 = {linear_r2:.2f}',
)
plt.plot(
    X_fit, y_quad_fit,
    color='red', linestyle='-', lw=2,
    label=f'quadratic (d=2), R^2 = {quadratic_r2:.2f}',
)
plt.plot(
    X_fit, y_cubic_fit,
    color='green', linestyle='--', lw=2,
    label=f'cubic (d=3), R^2 = {cubic_r2:.2f}',
)

plt.xlabel('% lower status of the population [LSTAT]')
plt.ylabel('Price in $1000s [MEDV]')
plt.legend(loc='upper right')
plt.show()

In [None]:
# transform features: natural log of the feature, square root of the target
X_log = np.log(X)
y_sqrt = np.sqrt(y)

# fit a fresh linear model on the transformed data (a new estimator is
# created here so the cell does not depend on the state of an earlier `regr`)
# and evaluate it on a grid one unit wider than the data on each side
X_fit = np.arange(X_log.min()-1, X_log.max()+1, 1)[:, np.newaxis]
regr = LinearRegression().fit(X_log, y_sqrt)
y_lin_fit = regr.predict(X_fit)
linear_r2 = r2_score(y_sqrt, regr.predict(X_log))

# plot results
plt.figure(figsize=(10, 10))
plt.scatter(
    X_log, y_sqrt,
    label='training points',
    color='lightgray'
)
plt.plot(
    X_fit, y_lin_fit,
    label=f'linear (d=1), $R^2={linear_r2:.2f}$',
    color='blue', lw=2
)
plt.xlabel('log(% lower status of the population [LSTAT])')
# raw string: the mathtext backslashes (\s, \;, \$) are not valid Python
# escapes and trigger a SyntaxWarning in a normal string literal
plt.ylabel(r'$\sqrt{Price \; in \; \$1000s \; [MEDV]}$')
plt.legend(loc='lower left')
plt.show()

# Dealing with Nonlinear Relationships Using Random Forests

In [None]:
from sklearn.tree import DecisionTreeRegressor

# LSTAT as the single feature, MEDV as target
X = df[['LSTAT']].values
y = df['MEDV'].values

# depth-limited regression tree fitted on the full data
tree = DecisionTreeRegressor(max_depth=3)
tree.fit(X, y)

# order samples by feature value so the prediction line is drawn left to right
sort_idx = np.argsort(X.flatten())

# samples with the tree's predictions drawn on top
plt.figure(figsize=(10, 10))
lin_regplot(X[sort_idx], y[sort_idx], tree)
plt.ylabel('Price in $1000s [MEDV]')
plt.xlabel('% lower status of the population [LSTAT]')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestRegressor

# define features and target and split into train and test sets (40 % held out)
X = df.iloc[:, :-1].values
y = df['MEDV'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.4,
    random_state=1
)

# initialize and fit rf regressor
# The split criterion is deliberately left at its default (squared error).
# The old spelling criterion='mse' was removed in scikit-learn 1.2 and the
# new spelling 'squared_error' is unknown to releases before 1.0, so relying
# on the default keeps this cell runnable on both.
forest = RandomForestRegressor(
    n_estimators=1000,
    random_state=1,
    n_jobs=-1
)
forest.fit(X_train, y_train)

# make predictions
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)

# calculate mse and r2
print(f'MSE train:\t {mean_squared_error(y_train, y_train_pred):.3f}')
print(f'MSE test:\t {mean_squared_error(y_test, y_test_pred):.3f}')
print(f'R^2 train:\t {r2_score(y_train, y_train_pred):.3f}')
print(f'R^2 test:\t {r2_score(y_test, y_test_pred):.3f}')

In [None]:
# residuals (prediction minus truth) against the predicted value,
# one scatter series per data portion
plt.figure(figsize=(10, 10))
plt.scatter(
    y_train_pred, y_train_pred - y_train,
    label='Training data',
    marker='o', s=35, alpha=0.9,
    c='steelblue', edgecolor='white',
)
plt.scatter(
    y_test_pred, y_test_pred - y_test,
    label='Test data',
    marker='s', s=35, alpha=0.9,
    c='limegreen', edgecolor='white',
)
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
# zero-residual reference line spanning the visible x-range
plt.hlines(y=0, xmin=-10, xmax=50, color='black', lw=2)
plt.xlim(-10, 50)
plt.show()