# Notebook 3 - Lasso & Ridge

### Running Imports

In [16]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Lasso, LassoCV

### Importing data set and cleaning column titles

In [34]:
# Read the training data and normalise the column titles in one chain:
# every space in a column name becomes an underscore so the columns are
# easier to reference later.
dftrain = (
    pd.read_csv("./datasets/train.csv")
    .rename(columns=lambda title: title.replace(' ', '_'))
)

# Preview the first rows to confirm the load and the renamed columns.
dftrain.head()

Unnamed: 0,Id,PID,MS_SubClass,MS_Zoning,Lot_Frontage,Lot_Area,Street,Alley,Lot_Shape,Land_Contour,...,Screen_Porch,Pool_Area,Pool_QC,Fence,Misc_Feature,Misc_Val,Mo_Sold,Yr_Sold,Sale_Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


Unnamed: 0,Id,PID,MS_SubClass,MS_Zoning,Lot_Frontage,Lot_Area,Street,Alley,Lot_Shape,Land_Contour,...,Screen_Porch,Pool_Area,Pool_QC,Fence,Misc_Feature,Misc_Val,Mo_Sold,Yr_Sold,Sale_Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


### Correlation & Cleaning

In [11]:
# Size of the training frame as (rows, columns).
dftrain.shape

(2051, 81)

In [8]:
# Column dtypes: shows which columns are numeric and which are object (string).
dftrain.dtypes

Id                int64
PID               int64
MS_SubClass       int64
MS_Zoning        object
Lot_Frontage    float64
                 ...   
Misc_Val          int64
Mo_Sold           int64
Yr_Sold           int64
Sale_Type        object
SalePrice         int64
Length: 81, dtype: object

In [9]:
# Count the missing values in each column.
dftrain.isna().sum()

Id                0
PID               0
MS_SubClass       0
MS_Zoning         0
Lot_Frontage    330
               ... 
Misc_Val          0
Mo_Sold           0
Yr_Sold           0
Sale_Type         0
SalePrice         0
Length: 81, dtype: int64

In [10]:
# Pearson correlation of every numeric column with SalePrice.
# The frame also holds object (string) columns; pandas >= 2.0 no longer drops
# them silently inside corr() and raises instead, so restrict the frame to
# numeric columns first. This form works on older and newer pandas alike.
dftrain.select_dtypes(include='number').corr()['SalePrice']

Id                -0.051398
PID               -0.255052
MS_SubClass       -0.087335
Lot_Frontage       0.341842
Lot_Area           0.296566
Overall_Qual       0.800207
Overall_Cond      -0.097019
Year_Built         0.571849
Year_Remod/Add     0.550370
Mas_Vnr_Area       0.512230
BsmtFin_SF_1       0.423519
BsmtFin_SF_2       0.016255
Bsmt_Unf_SF        0.190210
Total_Bsmt_SF      0.628925
1st_Flr_SF         0.618486
2nd_Flr_SF         0.248452
Low_Qual_Fin_SF   -0.041594
Gr_Liv_Area        0.697038
Bsmt_Full_Bath     0.283662
Bsmt_Half_Bath    -0.045328
Full_Bath          0.537969
Half_Bath          0.283001
Bedroom_AbvGr      0.137067
Kitchen_AbvGr     -0.125444
TotRms_AbvGrd      0.504014
Fireplaces         0.471093
Garage_Yr_Blt      0.533922
Garage_Cars        0.648220
Garage_Area        0.650270
Wood_Deck_SF       0.326490
Open_Porch_SF      0.333476
Enclosed_Porch    -0.135656
3Ssn_Porch         0.048732
Screen_Porch       0.134581
Pool_Area          0.023106
Misc_Val          -0

In [12]:
# Correlation of each numeric column with SalePrice, used to pick features.
# Non-numeric columns are excluded explicitly: corr() on a frame that still
# contains string columns raises on pandas >= 2.0.
dftrain.select_dtypes(include='number').corr()['SalePrice']

Id                -0.051398
PID               -0.255052
MS_SubClass       -0.087335
Lot_Frontage       0.341842
Lot_Area           0.296566
Overall_Qual       0.800207
Overall_Cond      -0.097019
Year_Built         0.571849
Year_Remod/Add     0.550370
Mas_Vnr_Area       0.512230
BsmtFin_SF_1       0.423519
BsmtFin_SF_2       0.016255
Bsmt_Unf_SF        0.190210
Total_Bsmt_SF      0.628925
1st_Flr_SF         0.618486
2nd_Flr_SF         0.248452
Low_Qual_Fin_SF   -0.041594
Gr_Liv_Area        0.697038
Bsmt_Full_Bath     0.283662
Bsmt_Half_Bath    -0.045328
Full_Bath          0.537969
Half_Bath          0.283001
Bedroom_AbvGr      0.137067
Kitchen_AbvGr     -0.125444
TotRms_AbvGrd      0.504014
Fireplaces         0.471093
Garage_Yr_Blt      0.533922
Garage_Cars        0.648220
Garage_Area        0.650270
Wood_Deck_SF       0.326490
Open_Porch_SF      0.333476
Enclosed_Porch    -0.135656
3Ssn_Porch         0.048732
Screen_Porch       0.134581
Pool_Area          0.023106
Misc_Val          -0

In [13]:
# Target plus the numeric columns most strongly correlated with it, pulled
# into a small frame for inspection.
tempfeat = dftrain.loc[:, [
    'SalePrice',
    'Overall_Qual',
    'Year_Built',
    'Total_Bsmt_SF',
    '1st_Flr_SF',
    'Gr_Liv_Area',
    'Garage_Cars',
    'Garage_Area',
]]

In [14]:
# Confirm that every selected column is numeric before modelling.
tempfeat.dtypes

SalePrice          int64
Overall_Qual       int64
Year_Built         int64
Total_Bsmt_SF    float64
1st_Flr_SF         int64
Gr_Liv_Area        int64
Garage_Cars      float64
Garage_Area      float64
dtype: object

In [23]:
# Replace every missing value with 0 so the model features contain no NaNs.
# The name is rebound instead of using inplace=True: the result is the same
# for the cells below, but the frame is no longer mutated behind the back of
# earlier cells and re-running the cell stays harmless.
# NOTE(review): this also writes 0 into missing cells of categorical (object)
# columns; only numeric feature columns are selected for the models below,
# so the fits are unaffected, but confirm 0 is the intended placeholder.
dftrain = dftrain.fillna(0)

In [24]:
# Target: the sale price of each house.
y = dftrain.SalePrice

# Predictors: the seven numeric columns chosen from the correlation table.
X = dftrain.loc[:, [
    'Overall_Qual',
    'Year_Built',
    'Total_Bsmt_SF',
    '1st_Flr_SF',
    'Gr_Liv_Area',
    'Garage_Cars',
    'Garage_Area',
]]

In [25]:
# Hold out a quarter of the rows for evaluation (0.25 is scikit-learn's
# default test fraction, written out here); the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


In [26]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both splits so the test set does not influence it.
sc = StandardScaler().fit(X_train)
Z_train = sc.transform(X_train)
Z_test = sc.transform(X_test)

### Ridge Model

In [27]:
# Ridge regression with the default penalty strength, stated explicitly.
ridge = Ridge(alpha=1.0)

In [28]:
# Fit the ridge model on the standardised training features.
ridge.fit(X=Z_train, y=y_train)

Ridge()

In [29]:
# R^2 on the training split, then on the held-out test split (one per line).
print(
    ridge.score(Z_train, y_train),
    ridge.score(Z_test, y_test),
    sep='\n',
)

0.7712262564085136
0.832996730843366


### LASSO Model

In [30]:
# Standardise the features for the LASSO model: a fresh scaler is fitted on
# the training rows and then applied to both splits.
sc = StandardScaler().fit(X_train)
Z_lassotrain = sc.transform(X_train)
Z_lassotest = sc.transform(X_test)

In [31]:
# LASSO estimator that picks its own penalty strength by cross-validation;
# every setting is left at the scikit-learn default.
lasso_cv = LassoCV()

In [32]:
# Fit the cross-validated LASSO on the standardised training features.
lasso_cv.fit(X=Z_lassotrain, y=y_train)

LassoCV()

In [35]:
# Penalty strength selected by the cross-validation performed during fit.
lasso_cv.alpha_

678.6238001413606

In [36]:
# R^2 on the training split, then on the held-out test split (one per line).
print(
    lasso_cv.score(Z_lassotrain, y_train),
    lasso_cv.score(Z_lassotest, y_test),
    sep='\n',
)

0.7710963235769791
0.8315358794185387


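On the held-out test set, the LASSO model accounts for about 83% of the variability in sale price (R² ≈ 0.83), essentially the same as the Ridge model; both score about 0.77 on the training set.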