**------------------------------------------------------------------------------------------------------------------------**
# Model Building
**------------------------------------------------------------------------------------------------------------------------**

In [1]:
import pandas as pd
house = pd.read_csv('housing.csv')

In [2]:
X = house.drop(["Id", "SalePrice", "TransformedPrice"], axis=1).values
y = house["TransformedPrice"].values

In [3]:
# split into train and test
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state = 1)

In [4]:
# list of alphas to tune
alphas= [0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0, 20, 50, 100, 1000 ]

In [5]:
# Importing the relevant libraries
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import Lasso, LassoCV
from sklearn.model_selection import GridSearchCV

In [6]:
from sklearn.metrics import mean_squared_error
import numpy as np
def rmse(y_train, y_pred):
    return np.sqrt(mean_squared_error(y_train, y_pred))

---
## Linear Model
---


In [8]:
?LinearRegression

[1;31mInit signature:[0m
[0mLinearRegression[0m[1;33m([0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mfit_intercept[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mcopy_X[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mn_jobs[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mpositive[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m     
Ordinary least squares Linear Regression.

LinearRegression fits a linear model with coefficients w = (w1, ..., wp)
to minimize the residual sum of squares between the observed targets in
the dataset, and the targets predicted by the linear approximation.

Parameters
----------
fit_intercept : bool, default=True
    Whether to calculate the intercept for this model. If set
    to False, no intercept will be used in calculations
    (i.e. data is expected to be centered).

copy_X : bool, default=True
    If True, X will be copie

In [13]:
#Create the linear regression model here
lr = LinearRegression()
lr = lr.fit(X_train, y_train)

In [59]:
print("RMSE on train:",round(rmse(y_train,lr.predict(X_train)),4))
print("RMSE on test:",round(rmse(y_test,lr.predict(X_test)),4))

RMSE on train: 0.1199
RMSE on test: 0.184


---
## Lasso
---
1. Run Lasso with alpha = 1

In [20]:
# Build Lasso regression with alpha = 1
lasso = Lasso(alpha=1)
lasso = lasso.fit(X_train, y_train)

In [21]:
print("RMSE on train:",round(rmse(y_train,lasso.predict(X_train)),3))
print("RMSE on test:",round(rmse(y_test,lasso.predict(X_test)),3))

RMSE on train: 0.4
RMSE on test: 0.399


### LassoCV 
2. Run LassoCV and find best alpha value (from the list of alphas given earlier)
3. In the best model, find % of features that are eliminated
4. RMSE for the best Lasso model on the test set

In [22]:
alphas= [0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0, 20, 50, 100, 1000]

In [28]:
#Train LassoCV for alphas and find best alpha
lassocv = LassoCV(alphas=alphas, cv=5)
lassocv = lassocv.fit(X_train, y_train)

print(f"The best alpha is: {lassocv.alpha_}")

The best alpha is: 0.001


In [25]:
#Find the percentage of coefficients turning 0
zero_coefficients_percentage = np.sum(lassocv.coef_ == 0) / len(lassocv.coef_) * 100
print(f"Percentage of coefficients turning 0: {zero_coefficients_percentage:.2f}%")

Percentage of coefficients turning 0: 15.38%


In [26]:
print("RMSE on train:",round(rmse(y_train,lassocv.predict(X_train)),4))
print("RMSE on test:",round(rmse(y_test,lassocv.predict(X_test)),4))

RMSE on train: 0.1203
RMSE on test: 0.1824


In [27]:
print("Train score:",round(lassocv.score(X_train,y_train),3))
print("Test score:",round(lassocv.score(X_test,y_test),3))

Train score: 0.909
Test score: 0.791


## Best LASSO model

In [29]:
# Build Lasso regression with alpha = 1
lasso_best = Lasso(alpha=lassocv.alpha_)
lasso_best = lasso_best.fit(X_train, y_train)

In [33]:
print("RMSE on train:",round(rmse(y_train,lasso_best.predict(X_train)),4))
print("RMSE on test:",round(rmse(y_test,lasso_best.predict(X_test)),4))

RMSE on train: 0.1203
RMSE on test: 0.1824


---
## Ridge
---
1. Run Ridge with alpha = 1

In [45]:
# Build Ridge regression for alpha = 1
ridge = Ridge(alpha=1)
ridge = ridge.fit(X_train, y_train)

In [46]:
print("RMSE on train:",round(rmse(y_train,ridge.predict(X_train)),3))
print("RMSE on test:",round(rmse(y_test,ridge.predict(X_test)),3))

RMSE on train: 0.12
RMSE on test: 0.184


### RidgeCV

In [47]:
#Train RidgeCV for alphas and find best alpha
ridgecv = RidgeCV(alphas=alphas, cv=5)
ridgecv = ridgecv.fit(X_train, y_train)

print(f"The best alpha is: {ridgecv.alpha_}")

The best alpha is: 100.0


In [52]:
#Find the % difference in the first coefficient between linear regression and Ridge model
# Get the coefficients from both models
coeff_linear = lr.coef_[0]
coeff_ridge = ridgecv.coef_[0]

# Calculate the percentage difference
percentage_difference = ((coeff_ridge - coeff_linear) / np.abs(coeff_linear)) * 100

print(f"The % difference in the first coefficient between linear regression and Ridge model is: {percentage_difference}")

The % difference in the first coefficient between linear regression and Ridge model is: 39.58090868257646


In [49]:
print("RMSE on train:",round(rmse(y_train,ridgecv.predict(X_train)),4))
print("RMSE on test:",round(rmse(y_test,ridgecv.predict(X_test)),4))

RMSE on train: 0.121
RMSE on test: 0.1817


In [50]:
print("Train score:",round(ridgecv.score(X_train,y_train),3))
print("Test score:",round(ridgecv.score(X_test,y_test),3))

Train score: 0.908
Test score: 0.793


### Best Ridge Model

In [55]:
# Build Ridge regression for alpha = 1
ridge_best = Ridge(alpha=ridgecv.alpha_)
ridge_best = ridge_best.fit(X_train, y_train)

In [57]:
print("RMSE on train:",round(rmse(y_train,ridge_best.predict(X_train)),4))
print("RMSE on test:",round(rmse(y_test,ridge_best.predict(X_test)),4))

RMSE on train: 0.121
RMSE on test: 0.1817
