In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, fname)
    for folder, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Column names to assign; header=None below tells pandas the file has no header row.
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
# The file is whitespace-delimited. sep=r"\s+" is the supported spelling of the
# deprecated delim_whitespace=True and parses the file identically.
data_df = pd.read_csv('/kaggle/input/boston-house-prices/housing.csv', header=None, sep=r"\s+", names=names)

In [None]:
# Preview the first rows to confirm the columns were parsed as expected.
data_df.head()

In [None]:
# Dataset dimensions as (rows, columns).
print(data_df.shape)

In [None]:
# Summary of column dtypes and non-null counts.
data_df.info()

In [None]:
# Number of missing values in each column.
data_df.isnull().sum()

In [None]:
# True if at least one row duplicates an earlier row.
data_df.duplicated().any()

In [None]:
# Histograms of the columns; the trailing semicolon suppresses the textual repr of the returned axes.
data_df.hist(bins=12, figsize=(12,10), grid=False);

* CRIM: per capita crime rate by town

* ZN: proportion of residential land zoned for lots over 25,000 sq.ft.

* INDUS: proportion of non-retail business acres per town

* CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)

* NOX: nitric oxides concentration (parts per 10 million)

* RM: average number of rooms per dwelling

* AGE: proportion of owner-occupied units built prior to 1940

* DIS: weighted distances to five Boston employment centres

* RAD: index of accessibility to radial highways

* TAX: full-value property-tax rate per 10,000 USD

* PTRATIO: pupil-teacher ratio by town

* B: 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town

* LSTAT: % lower status of the population

* MEDV: median value of owner-occupied homes, in thousands of USD (our prediction target)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Annotated correlation matrix of all columns on a diverging palette centred at 0.
corr_matrix = data_df.corr()
fig, ax = plt.subplots(figsize=(25, 12))
sns.heatmap(corr_matrix, vmin=-1, vmax=1, center=0, cmap='coolwarm', annot=True, ax=ax)
plt.show()

In [None]:
# Separate the target column (MEDV) from the predictor columns.
y = data_df['MEDV']
X = data_df.drop(columns=['MEDV'])

In [None]:
# Preview the predictor columns.
X.head()

In [None]:
# Preview the target values.
y.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on all predictors, then predict the held-out rows.
lr_all = LinearRegression().fit(X_train, y_train)
y_pred1 = lr_all.predict(X_test)

In [None]:
# Fitted intercept (constant term) of the linear model.
lr_all.intercept_

In [None]:
# Tabulate the fitted coefficient of each feature.
# Building the frame from a dict keeps 'Coefficients' numeric; stacking the names and
# values in a list and transposing would coerce both columns to object dtype.
lr_all_coeffcients = pd.DataFrame({
    'Attribute': list(X_train.columns),
    'Coefficients': lr_all.coef_,
})
lr_all_coeffcients  # display the table

In [None]:
# R^2 (coefficient of determination) of the linear model on the test set.
# This is a regression score, not a classification accuracy.
lr_all.score(X_test, y_test)

* 𝑅^2 : It is a measure of the linear relationship between X and Y. It is interpreted as the proportion of the variance in the dependent variable that is predictable from the independent variable.

* Adjusted 𝑅^2 :The adjusted R-squared compares the explanatory power of regression models that contain different numbers of predictors.

* MAE : It is the mean of the absolute value of the errors. It measures the difference between two continuous variables, here actual and predicted values of y.

* MSE: The mean square error (MSE) is just like the MAE, but squares the difference before summing them all instead of using the absolute value.

* RMSE: The root mean square error (RMSE) is the square root of the MSE. It is expressed in the same units as the target variable, which makes it easier to interpret than the MSE.

In [None]:
# other evaluation metrics
from sklearn import metrics
# Compute each metric once and reuse it for the derived quantities.
r2 = metrics.r2_score(y_test, y_pred1)
mse = metrics.mean_squared_error(y_test, y_pred1)
n_obs, n_features = len(y_test), X_train.shape[1]

print('R^2:', r2)
# Adjusted R^2 penalises R^2 for the number of predictors relative to the sample size.
print('Adjusted R^2:', 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_features - 1))
print('MAE:', metrics.mean_absolute_error(y_test, y_pred1))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

**Ridge Regression**
* Ridge regression is one of the simple techniques to reduce model complexity and prevent over-fitting which may result from linear regression
* The loss function is altered by adding a penalty equivalent to square of the magnitude of the coefficients
* One parameter: Alpha (also called 'lambda')
* higher the alpha value --> more restriction on the coeffs
* lower alpha --> less restriction on the coeffs; as alpha approaches 0, Ridge approaches ordinary linear regression
* Normal practice: alpha>1 (e.g. 150;230)

In [None]:
from sklearn.linear_model import Ridge
# Ridge regression with a strong L2 penalty (alpha=100), fitted on the training split.
ridge = Ridge(alpha=100).fit(X_train, y_train)
y_pred2 = ridge.predict(X_test)

In [None]:
# R^2 of the ridge model on the test set.
ridge.score(X_test, y_test)

**Compare Linear regression vs Ridge(alpha=0.01) vs Ridge(alpha=100)**

In [None]:
from sklearn.linear_model import Ridge
# Fit one weakly and one strongly regularised ridge model for comparison.
rr1 = Ridge(alpha=0.01).fit(X_train, y_train)
rr2 = Ridge(alpha=100).fit(X_train, y_train)

# Report test-set R^2 for each model. The alpha shown in each label is read from
# the fitted estimator, so the printed value always matches the model that was used.
print('Linear regression test score:', lr_all.score(X_test, y_test))
print(f'Ridge regression test score with low alpha({rr1.alpha}):', rr1.score(X_test, y_test))
print(f'Ridge regression test score with high alpha({rr2.alpha}):', rr2.score(X_test, y_test))

* In terms of magnitude of coefficients: Ridge regression with high alpha penalizes the coefficients on CHAS, NOX, and RM a lot

In [None]:
import matplotlib.pyplot as plt
# One marker series per model, so each feature's coefficient can be compared across models.
feature_labels = names[0:13]
coef_series = [
    (lr_all.coef_, 'o', 'green', 'Linear Regression'),
    (rr1.coef_, '*', 'red', r'Ridge;$\alpha=0.01$'),
    (rr2.coef_, 'd', 'blue', r'Ridge;$\alpha=100$'),
]
for coefs, marker, color, label in coef_series:
    plt.plot(feature_labels, coefs, alpha=0.4, linestyle='none', marker=marker, markersize=7, color=color, label=label)
plt.xlabel('Coefficient Index', fontsize=16)
plt.ylabel('Coefficient Magnitude', fontsize=16)
plt.legend(fontsize=13, loc=4)
plt.show()

**Lasso Regression**
* Lasso regression is another simple technique to reduce model complexity and prevent over-fitting which may result from linear regression
* Lasso regression not only helps in reducing over-fitting but it can help us in feature selection
* Normal practice: alpha<1 (e.g. 0.1, 0.03)

In [None]:
from sklearn.linear_model import Lasso
# Lasso regression with alpha=0.8, fitted on the training split.
lasso = Lasso(alpha=0.8).fit(X_train, y_train)
y_pred3 = lasso.predict(X_test)

# Test-set R^2, shown as the cell output.
lasso.score(X_test, y_test)

* Removing the predictors with zero coefficients: CHAS and NOX

In [None]:
# Tabulate the lasso coefficient of each feature (zeros mark features the model dropped).
# Building the frame from a dict keeps 'Coefficients' numeric; stacking the names and
# values in a list and transposing would coerce both columns to object dtype.
lasso_coeffcients = pd.DataFrame({
    'Attribute': list(X_train.columns),
    'Coefficients': lasso.coef_,
})
lasso_coeffcients  # display the table

In [None]:
#Viewing by comparing linear and lasso regression coefficient plots 
import matplotlib.pyplot as plt
# Overlay lasso and plain linear-regression coefficients, one marker series per model.
feature_labels = names[0:13]
coef_series = [
    (lasso.coef_, 'o', 'green', 'Lasso Regression'),
    (lr_all.coef_, 'd', 'blue', 'Linear Regression'),
]
for coefs, marker, color, label in coef_series:
    plt.plot(feature_labels, coefs, alpha=0.4, linestyle='none', marker=marker, markersize=7, color=color, label=label)
plt.xlabel('Coefficient Index', fontsize=16)
plt.ylabel('Coefficient Magnitude', fontsize=16)
plt.legend(fontsize=13, loc=4)
plt.show()

**Hyperparameter tuning**
* Ridge and Lasso regression: Choosing alpha
* Hyperparameters cannot be learned by fitting the model
* Solution: GridSearch/RandomizedSearch

In [None]:
#find best alpha for Ridge Regression
from sklearn.model_selection import GridSearchCV
# Candidate alphas 1, 11, 21, ..., 491.
# np.arange takes (start, stop, step); with stop and step swapped the grid
# collapses to the single value 1 and nothing is actually searched.
param_grid = {'alpha': np.arange(1, 500, 10)}
ridge = Ridge()
# GridSearchCV cross-validates every candidate alpha on the training split.
ridge_best_alpha = GridSearchCV(ridge, param_grid)
ridge_best_alpha.fit(X_train, y_train)

In [None]:
# Report the selected alpha and its score as stored on the fitted grid search.
for label, value in (
    ("Best alpha for Ridge Regression:", ridge_best_alpha.best_params_),
    ("Best score for Ridge Regression with best alpha:", ridge_best_alpha.best_score_),
):
    print(label, value)

In [None]:
#find best alpha for Lasso Regression
from sklearn.model_selection import GridSearchCV
# Candidate alphas 0.1, 0.2, ..., 1.0 (integers 1..10 scaled by 1/10, which avoids
# the uncertain endpoint of a floating-point arange step).
# np.arange takes (start, stop, step); with stop and step swapped the grid collapses
# to the single value 0. The grid starts above 0 because alpha=0 removes the penalty
# entirely, which scikit-learn advises against using with Lasso.
param_grid = {'alpha': np.arange(1, 11) / 10}
lasso = Lasso()
# GridSearchCV cross-validates every candidate alpha on the training split.
lasso_best_alpha = GridSearchCV(lasso, param_grid)
lasso_best_alpha.fit(X_train, y_train)

In [None]:
# Report the selected alpha and its score as stored on the fitted grid search.
for label, value in (
    ("Best alpha for Lasso Regression:", lasso_best_alpha.best_params_),
    ("Best score for Lasso Regression with best alpha:", lasso_best_alpha.best_score_),
):
    print(label, value)