1) Build a machine learning model for house price prediction analysis using lasso and ridge regression 

In [267]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df = pd.read_csv("housingprices.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Address,Zip,Price,Area,Room,Lon,Lat
0,1,"Blasiusstraat 8 2, Amsterdam",1091 CR,685000.0,64,3,4.907736,52.356157
1,2,"Kromme Leimuidenstraat 13 H, Amsterdam",1059 EL,475000.0,60,3,4.850476,52.348586
2,3,"Zaaiersweg 11 A, Amsterdam",1097 SM,850000.0,109,4,4.944774,52.343782
3,4,"Tenerifestraat 40, Amsterdam",1060 TH,580000.0,128,6,4.789928,52.343712
4,5,"Winterjanpad 21, Amsterdam",1036 KN,720000.0,138,5,4.902503,52.410538


In [268]:
df.isna().sum()

Unnamed: 0    0
Address       0
Zip           0
Price         4
Area          0
Room          0
Lon           0
Lat           0
dtype: int64

In [269]:
df.shape

(924, 8)

In [270]:
df.nunique()

Unnamed: 0    924
Address       919
Zip           834
Price         226
Area          193
Room           13
Lon           894
Lat           886
dtype: int64

In [271]:
df.dropna(inplace=True)
df.isna().sum()

Unnamed: 0    0
Address       0
Zip           0
Price         0
Area          0
Room          0
Lon           0
Lat           0
dtype: int64

In [272]:
df.drop(['Address','Zip','Unnamed: 0'],axis = 1,inplace= True)
df.head()

Unnamed: 0,Price,Area,Room,Lon,Lat
0,685000.0,64,3,4.907736,52.356157
1,475000.0,60,3,4.850476,52.348586
2,850000.0,109,4,4.944774,52.343782
3,580000.0,128,6,4.789928,52.343712
4,720000.0,138,5,4.902503,52.410538


In [273]:
x= df.drop('Price',axis=1)
y=df['Price']
x.describe()

Unnamed: 0,Area,Room,Lon,Lat
count,920.0,920.0,920.0,920.0
mean,95.607609,3.56413,4.888652,52.363271
std,56.849699,1.57103,0.053118,0.024054
min,21.0,1.0,4.644819,52.291519
25%,60.0,3.0,4.855834,52.351925
50%,83.0,3.0,4.886818,52.364499
75%,113.0,4.0,4.922337,52.377545
max,623.0,14.0,5.029122,52.423805


In [274]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.3,random_state=2)

In [275]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
model = reg.fit(xtrain,ytrain)
print("training score ",model.score(xtrain,ytrain),"\ntesting score ",model.score(xtest,ytest))

training score  0.7168398868441106 
testing score  0.6675785725805965


In [276]:
ypred = model.predict(xtest)

In [277]:
from sklearn.metrics import mean_absolute_error as mae
print("mean absolute error = ",mae(ytest,ypred))

mean absolute error =  181035.8857391994


In [278]:
from sklearn.linear_model import Lasso
lass_reg = Lasso()
lass_reg.get_params()

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': 1000,
 'normalize': False,
 'positive': False,
 'precompute': False,
 'random_state': None,
 'selection': 'cyclic',
 'tol': 0.0001,
 'warm_start': False}

In [279]:
alphas = np.array([500,0.01,0.05,0.1,0.5,1,10,50,100,])
iter = np.array([1000,100,10])

In [280]:
import warnings 
warnings.filterwarnings('ignore')

In [281]:
from sklearn.model_selection import GridSearchCV as gscv
grid = gscv(estimator = lass_reg, param_grid={'alpha' : alphas , 'max_iter':iter},cv = 5)
grid.fit(xtrain,ytrain)
print('for lasso regularization',grid.best_params_)
print(grid.best_score_)

for lasso regularization {'alpha': 0.01, 'max_iter': 10}
0.6680042881114936


In [282]:
resultdf = pd.DataFrame(grid.cv_results_)
print(resultdf[['param_max_iter','param_alpha','mean_test_score']])

   param_max_iter param_alpha  mean_test_score
0            1000       500.0         0.666845
1             100       500.0         0.666845
2              10       500.0         0.666855
3            1000        0.01         0.667947
4             100        0.01         0.667947
5              10        0.01         0.668004
6            1000        0.05         0.667946
7             100        0.05         0.667946
8              10        0.05         0.668004
9            1000         0.1         0.667946
10            100         0.1         0.667946
11             10         0.1         0.668004
12           1000         0.5         0.667944
13            100         0.5         0.667944
14             10         0.5         0.668002
15           1000         1.0         0.667941
16            100         1.0         0.667941
17             10         1.0         0.667999
18           1000        10.0         0.667894
19            100        10.0         0.667894
20           

In [283]:
from sklearn.linear_model import Ridge
ridg_reg = Ridge()
ridg_reg.get_params()

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': None,
 'normalize': False,
 'random_state': None,
 'solver': 'auto',
 'tol': 0.001}

In [284]:
grid = gscv(estimator = ridg_reg, param_grid={'alpha' : alphas , 'max_iter':iter},cv = 5)
grid.fit(xtrain,ytrain)
print('for ridge regularization\n',grid.best_params_)
print(grid.best_score_)

for ridge regularization
 {'alpha': 1.0, 'max_iter': 1000}
0.6688064394658662


In [287]:
models = {
    'lasso' : {
        'model' : lass_reg,
        'params' : {
            'alpha' : alphas , 'max_iter':iter
        }
    } ,
    'ridge' : {
        'model' : ridg_reg,
        'params' : {
            'alpha' : alphas , 'max_iter': iter
        }
    }
}


scores = []


for modelname, mp in models.items():
    grid = gscv(mp['model'],mp['params'],cv = 5)
    grid.fit(xtrain,ytrain)
    scores.append(
        {
            'model' : modelname,
            'best_score' : grid.best_score_,
            'best_parameters' : grid.best_params_
        }
    )
    
scores

[{'model': 'lasso',
  'best_score': 0.6680042881114936,
  'best_parameters': {'alpha': 0.01, 'max_iter': 10}},
 {'model': 'ridge',
  'best_score': 0.6688064394658662,
  'best_parameters': {'alpha': 1.0, 'max_iter': 1000}}]

In [290]:
finaldf = pd.DataFrame(scores, columns=['model','best_score','best_parameters'])
finaldf

Unnamed: 0,model,best_score,best_parameters
0,lasso,0.668004,"{'alpha': 0.01, 'max_iter': 10}"
1,ridge,0.668806,"{'alpha': 1.0, 'max_iter': 1000}"
