<h2 style='color:blue' align='center'>L1 and L2 Regularization</h2>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

In [2]:
# Suppress all warnings so they do not clutter the notebook output
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
# Load the Melbourne housing listings from the CSV in the working directory
housing_csv = './Melbourne_housing_FULL.csv'
df = pd.read_csv(housing_csv)

In [5]:
# Preview the first two rows of the loaded data
df.head(2)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0


In [7]:
# Count the distinct values in each column
df.nunique()

Suburb             351
Address          34009
Rooms               12
Type                 3
Price             2871
Method               9
SellerG            388
Date                78
Distance           215
Postcode           211
Bedroom2            15
Bathroom            11
Car                 15
Landsize          1684
BuildingArea       740
YearBuilt          160
CouncilArea         33
Lattitude        13402
Longtitude       14524
Regionname           8
Propertycount      342
dtype: int64


In [8]:
# Dimensions of the loaded data as (rows, columns)
df.shape

(34857, 21)

In [10]:
# Keep only the columns used for modelling and discard the rest; 'Price' is the target.
# .copy() makes the subset an explicit, independent frame, so the column
# assignments in the following cells operate on their own data rather than on
# a slice of the originally loaded frame.
cols_to_use = ['Suburb', 'Rooms', 'Type', 'Method', 'SellerG', 'Regionname', 'Propertycount', 
               'Distance', 'CouncilArea', 'Bedroom2', 'Bathroom', 'Car', 'Landsize',
               'BuildingArea', 'Price']
df = df[cols_to_use].copy()

In [11]:
# Preview the frame after the column selection
df.head(2)

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price
0,Abbotsford,2,h,SS,Jellis,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,126.0,,
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,202.0,,1480000.0


In [12]:
# Dimensions after the column selection
df.shape

(34857, 15)

#### Checking for NaN values

In [13]:
# Count the missing (NaN) entries in every column
df.isna().sum()

Suburb               0
Rooms                0
Type                 0
Method               0
SellerG              0
Regionname           3
Propertycount        3
Distance             1
CouncilArea          3
Bedroom2          8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21115
Price             7610
dtype: int64

#### Handling Missing values

In [14]:
# Replace missing values with 0 in these numeric columns only
cols_to_fill_zero = ['Propertycount', 'Distance', 'Bedroom2', 'Bathroom', 'Car']
df = df.fillna(value=dict.fromkeys(cols_to_fill_zero, 0))

In [15]:
# Re-check the missing counts after the zero fill
df.isna().sum()

Suburb               0
Rooms                0
Type                 0
Method               0
SellerG              0
Regionname           3
Propertycount        0
Distance             0
CouncilArea          3
Bedroom2             0
Bathroom             0
Car                  0
Landsize         11810
BuildingArea     21115
Price             7610
dtype: int64

In [16]:
# The remaining continuous features are imputed with their column mean; a simple
# choice, since the focus here is on reducing overfitting with Lasso and Ridge.
for area_col in ('Landsize', 'BuildingArea'):
    df[area_col] = df[area_col].fillna(df[area_col].mean())

In [17]:
# Re-check the missing counts after the mean imputation
df.isna().sum()

Suburb              0
Rooms               0
Type                0
Method              0
SellerG             0
Regionname          3
Propertycount       0
Distance            0
CouncilArea         3
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea        0
Price            7610
dtype: int64

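**Drop every row that still contains a missing value. `Price` is the variable we want to predict, so we do not impute it; the 3 rows with missing `Regionname`/`CouncilArea` are dropped as well.**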

In [18]:
# Discard every row that still has at least one missing value
df = df.dropna(how='any')

In [19]:
# Verify that no missing values remain after dropping rows
df.isna().sum()

Suburb           0
Rooms            0
Type             0
Method           0
SellerG          0
Regionname       0
Propertycount    0
Distance         0
CouncilArea      0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
Price            0
dtype: int64

In [20]:
# Dimensions after dropping the incomplete rows
df.shape

(27244, 15)

#### Let's one hot encode the categorical features

In [21]:
# pd.get_dummies encodes only the text columns and leaves the numeric columns
# untouched, so the whole frame can be passed in at once.
# drop_first=True drops the first level of each encoded feature to avoid the
# dummy-variable trap.
# dtype='uint8' pins the indicator columns to 0/1 integers: newer pandas
# releases default to bool, which no longer matches the 0/1 values shown below
# and can turn the mixed frame into an object array when `.values` is taken.
df = pd.get_dummies(df, drop_first=True, dtype='uint8')

In [22]:
# Preview the frame after one-hot encoding
df.head(2)

Unnamed: 0,Rooms,Propertycount,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price,Suburb_Aberfeldie,...,CouncilArea_Moorabool Shire Council,CouncilArea_Moreland City Council,CouncilArea_Nillumbik Shire Council,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council
1,2,4019.0,2.5,2.0,1.0,1.0,202.0,160.2564,1480000.0,0,...,0,0,0,0,0,0,0,0,1,0
2,2,4019.0,2.5,2.0,1.0,0.0,156.0,79.0,1035000.0,0,...,0,0,0,0,0,0,0,0,1,0


In [23]:
# Dimensions after one-hot encoding
df.shape

(27244, 745)

In [24]:
# Separate the encoded frame into the feature matrix X and the target vector y
target_col = 'Price'
feature_frame = df.drop(columns=[target_col])
X = feature_frame.values
y = df[target_col].values

In [25]:
# Feature matrix dimensions
X.shape

(27244, 744)

In [26]:
# Target vector dimensions
y.shape

(27244,)

In [27]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2,
    test_size=0.3,
)

#### Let's train our Linear Regression model on the training set and check its score (R²) on the test set

In [28]:
from sklearn.linear_model import LinearRegression

# Unregularized linear model, fitted on the training split only
reg = LinearRegression().fit(X_train, y_train)
# Score on the held-out test split (displayed as the cell output)
reg.score(X_test, y_test)

0.13853683161579822

In [30]:
# The score on the test dataset is very low

In [29]:
# Score of the same model on the training split, for comparison with the test score
reg.score(X_train, y_train)

0.6827792395792723

In [31]:
# The score on the training set is much better than on the test set, which indicates overfitting

**Here the training score (R²) is about 68% but the test score is only 13.85%, which is very low**

<h4 style='color:purple'>Normal Regression is clearly overfitting the data, let's try other models</h4>

In [38]:
# underfit y_pred = m1x1 + b0
# overfit y_pred = m1x1 + m2x2^2 + m3x3^3 +....+b0
# correctfit y_pred = m1x1 + m2x2^2 + b0
# x1, x2, x3... are different independent features
# m1, m2, m3... are weights or slopes

# To get the correct fit we somehow have to push m3, m4, ... close to 0;
# the overfit equation then reduces to the correct-fit equation.
# The idea is to shrink the slopes (weights) m3, m4, m5, ... towards zero.

# To do this we change the cost function.
# cost = mean((y - y_pred)^2)   <- the normal cost function of LinearRegression
# Modified cost function for lasso (L1) regression:
# cost = mean((y - y_pred)^2) + lambda * (|m1| + |m2| + |m3| + ...)
# i.e. the penalty is lambda times the sum of the absolute values of the weights.
# Because the penalty increases the cost, the weights have to be optimized again
# (e.g. with gradient descent) against the new cost, and
# all weights (m1, m2, ...) are reduced to their new optimum values.
# After regularization the slope of the fitted curve decreases:
# before regularization the weights (m1, m2, ...) are larger, but
# after regularization they are smaller, and some mi approach (or reach) 0.
# If lambda is very high the penalty dominates the cost, almost all weights approach 0 and
# the curve becomes almost horizontal,
# so the lambda value itself has to be tuned.
# In general, simpler equations are better for ML predictions.

# In L2 (ridge): cost = mean((y - y_pred)^2) + lambda * (m1^2 + m2^2 + ...); everything else is the same as for lasso

#### Using Lasso (L1 Regularized) Regression Model

In [32]:
from sklearn.linear_model import Lasso

# L1-regularized linear model; alpha sets the strength of the penalty
lasso_settings = {'alpha': 50, 'max_iter': 100, 'tol': 0.1}
lasso_reg = Lasso(**lasso_settings)
lasso_reg.fit(X_train, y_train)

Lasso(alpha=50, max_iter=100, tol=0.1)

In [33]:
# Lasso score on the held-out test split
lasso_reg.score(X_test, y_test)

0.6636111369404489

In [None]:
# Plain LinearRegression scored only about 0.14 (R²) on the test set; with Lasso regularization the test score rises to about 0.66

In [34]:
# Lasso score on the training split, for comparison with the test score
lasso_reg.score(X_train, y_train)

0.6766985624766824

#### Using Ridge (L2 Regularized) Regression Model

In [35]:
from sklearn.linear_model import Ridge

# L2-regularized linear model, configured with the same settings as the Lasso model
ridge_settings = {'alpha': 50, 'max_iter': 100, 'tol': 0.1}
ridge_reg = Ridge(**ridge_settings)
ridge_reg.fit(X_train, y_train)

Ridge(alpha=50, max_iter=100, tol=0.1)

In [36]:
# Ridge score on the held-out test split
ridge_reg.score(X_test, y_test)

0.6670848945194958

In [37]:
# Ridge score on the training split, for comparison with the test score
ridge_reg.score(X_train, y_train)

0.6622376739684328