In [3]:
import numpy as np
import pandas as pd
%matplotlib inline 
# Never truncate the column list when a DataFrame is rendered.
pd.set_option('display.max_columns', None)

# Load the raw Ames data; the first CSV column is used as the row index.
housing = pd.read_csv('Ames_HousePrice.csv', index_col=0)
housing.shape

(2580, 81)

In [4]:
# --- Encoding tables ---------------------------------------------------------
# Each table maps the raw labels of a column to the value used for modelling.
# Most tables produce numbers (ordinal scores or 0/1 flags); MSSubClass,
# LotConfig and Foundation are only regrouped into coarser string categories
# and are one-hot encoded further down.
qual_dict = {'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5}
mssubclass_dict = {20:'1story',30:'1story',40:'1story',
                   45:'1.5story',50:'1.5story',
                   60:'2story',70:'2story',75:'2story',
                   80:'split',85:'split',
                   120:'1story_dev',150:'1story_dev',
                   160:'2story_dev',180:'2story_dev',
                   90:'duplex',190:'duplex',}
mszoning_dict = {'RL':0,'RM':1,'FV':1,'RH':1,'C (all)':1,'I (all)':1,'A (agr)':1}
lotshape_dict = {'Reg':0,'IR1':1,'IR2':1,'IR3':1}
landcontour_dict = {'Lvl':0,'HLS':1,'Bnk':1,'Low':1}
# Only FR2/FR3 are merged; every other LotConfig label is left untouched.
lotconfig_dict = {'FR2':'FR','FR3':'FR'}
landslope_dict = {'Gtl':0,'Mod':1,'Sev':1}
roofstyle_dict = {'Gable':0,'Hip':1,'Gambrel':1,'Flat':1,'Mansard':1,'Shed':1}
foundation_dict = {'CBlock':'Cinder','PConc':'Concrete','BrkTil':'Other','Slab':'Other','Stone':'Other','Wood':'Other'}
bsmtfintype_dict = {'Unf':1,'LwQ':2,'Rec':3,'BLQ':4,'ALQ':5,'GLQ':6}
centralair_dict = {'N':0,'Y':1}
electrical_dict = {'SBrkr':0,'FuseA':1,'FuseF':1,'FuseP':1}
functional_dict = {'Sal':0,'Sev':1,'Maj2':2,'Maj1':3,'Mod':4,'Min2':5,'Min1':6,'Typ':7}
garagetype_dict = {'Attchd':2,'Detchd':1,'BuiltIn':1,'Basment':1,'2Types':1,'CarPort':1}
garagefinish_dict = {'Unf':1,'RFn':2,'Fin':3}
paveddrive_dict = {'N':0,'P':0,'Y':1}
fence_dict = {'MnWw':1,'GdWo':2,'MnPrv':3,'GdPrv':4}

# Column name -> encoding table, in the nested form DataFrame.replace expects.
cleanup_dict = {'MSSubClass':mssubclass_dict,
                'MSZoning':mszoning_dict,
                'LotShape':lotshape_dict,
                'LandContour':landcontour_dict,
                'LotConfig':lotconfig_dict,
                'LandSlope':landslope_dict,
                'RoofStyle':roofstyle_dict,
                'ExterQual':qual_dict,
                'ExterCond':qual_dict,
                'Foundation':foundation_dict,
                'BsmtQual':qual_dict,
                'BsmtCond':qual_dict,
                'BsmtFinType1':bsmtfintype_dict,
                'BsmtFinType2':bsmtfintype_dict,
                'HeatingQC':qual_dict,
                'CentralAir':centralair_dict,
                'Electrical':electrical_dict,
                'KitchenQual':qual_dict,
                'Functional':functional_dict,
                'FireplaceQu':qual_dict,
                'GarageType':garagetype_dict,
                'GarageFinish':garagefinish_dict,
                'GarageQual':qual_dict,
                'GarageCond':qual_dict,
                'PavedDrive':paveddrive_dict,
                'Fence':fence_dict
               }

# Columns whose missing values become 0 (mostly "the house does not have
# this feature").
zero_fill_cols = ['MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtFinType1',
                  'BsmtFinType2', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
                  'TotalBsmtSF', 'FireplaceQu', 'GarageFinish', 'GarageQual',
                  'GarageCond', 'Fence', 'LotFrontage', 'Electrical',
                  'GarageCars', 'GarageArea', 'GarageType', 'BsmtFullBath',
                  'BsmtHalfBath']

# (source column, dummy prefix, baseline dummy dropped to avoid collinearity)
dummy_specs = [('MSSubClass', 'Class', 'Class_1story'),
               ('LotConfig', 'Lot', 'Lot_Inside'),
               ('Foundation', 'Foundation', 'Foundation_Cinder')]

# Columns removed once they have been encoded, combined, or judged unused.
drop_cols = ['PID','Street','Alley','Utilities','Neighborhood','Condition1',
             'Condition2','BldgType','HouseStyle', 'RoofMatl',
             'Exterior1st','Exterior2nd','MasVnrType','BsmtExposure',
             'BsmtFinType1','BsmtFinType2','Heating','1stFlrSF','2ndFlrSF','PoolQC',
             'MiscFeature','MiscVal','YrSold','MoSold','SaleType','SaleCondition',
             'BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','MSSubClass',
             'LotConfig','Foundation','GarageYrBlt']

# --- Clean data --------------------------------------------------------------
# Nested-dict replacement relies on the default `value`; labels that are not
# listed in a column's table are left unchanged.
housing = housing.replace(to_replace=cleanup_dict)

# One frame-level fillna instead of `housing.<col>.fillna(..., inplace=True)`:
# the chained in-place form acts on a temporary Series and stops updating the
# frame once pandas Copy-on-Write is enabled.
housing = housing.fillna({col: 0 for col in zero_fill_cols})

# Total bathrooms, counting each half bath as 0.5.
housing['Baths'] = (housing['BsmtFullBath'] + (housing['BsmtHalfBath']/2)
                    + housing['FullBath'] + (housing['HalfBath']/2))

# One-hot encode the regrouped string categories, leaving out one baseline
# level per feature.
for dummy_source, dummy_prefix, dummy_baseline in dummy_specs:
    dummies = pd.get_dummies(data=housing[dummy_source], prefix=dummy_prefix)
    dummies = dummies.drop(columns=[dummy_baseline])
    housing = pd.concat([housing, dummies], axis=1)

# NOTE: this cell consumes the raw frame (encoded source columns are dropped),
# so it must be run exactly once after the load cell.
housing = housing.drop(columns=drop_cols)


In [5]:
# Summarise the cleaned frame: column names, non-null counts and dtypes.
housing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2580 entries, 1 to 763
Data columns (total 59 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   GrLivArea            2580 non-null   int64  
 1   SalePrice            2580 non-null   int64  
 2   MSZoning             2580 non-null   int64  
 3   LotFrontage          2580 non-null   float64
 4   LotArea              2580 non-null   int64  
 5   LotShape             2580 non-null   int64  
 6   LandContour          2580 non-null   int64  
 7   LandSlope            2580 non-null   int64  
 8   OverallQual          2580 non-null   int64  
 9   OverallCond          2580 non-null   int64  
 10  YearBuilt            2580 non-null   int64  
 11  YearRemodAdd         2580 non-null   int64  
 12  RoofStyle            2580 non-null   int64  
 13  MasVnrArea           2580 non-null   float64
 14  ExterQual            2580 non-null   int64  
 15  ExterCond            2580 non-null   in

In [6]:
# Persist the cleaned frame; the row index is written as the first CSV column.
housing.to_csv('Ames_HousePrice_cleaned.csv')

---

### LassoCV

In [7]:
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split

In [8]:
# Response is SalePrice; every other column of the cleaned frame is a predictor.
y = housing['SalePrice']
X = housing.drop(columns='SalePrice')

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [32]:
# Fit a cross-validated Lasso on the full-feature training split.
# NOTE(review): the `normalize=` argument was deprecated in scikit-learn 1.0
# and removed in 1.2 -- confirm the pinned scikit-learn version before re-running.
lcv = LassoCV(normalize=True).fit(X_train,y_train)


In [33]:
# Score the fitted model on both splits to compare train and test fit.
train_r2 = lcv.score(X_train, y_train)
test_r2 = lcv.score(X_test, y_test)
print('train score:', train_r2)
print('test score:', test_r2)

train score: 0.8863079452581605
test score: 0.8860497099905316


In [35]:
# Intercept of the fitted Lasso model (same units as the training target).
lcv.intercept_

-715965.5443329096

In [11]:
# Label each coefficient with its feature name and show the 15 largest.
full_coefs = pd.Series(lcv.coef_, index=X.columns)
full_coefs.sort_values(ascending=False).head(15)

ExterQual              11035.415269
OverallQual            10553.101636
LandSlope               8052.863370
KitchenQual             7237.322169
OverallCond             5646.887409
Fireplaces              5624.597025
Foundation_Concrete     5383.238127
Foundation_Other        5301.397729
Lot_CulDSac             5277.942342
Functional              5181.704652
BsmtQual                4794.939067
LotShape                2883.470084
Electrical              2425.920316
RoofStyle               2146.529501
HeatingQC               1388.286218
dtype: float64

## LassoCV on log-transformed SalePrice (`np.log`)

In [12]:
# Same predictors as before, but the response is the natural log of SalePrice.
target = np.log(housing['SalePrice'])
features = housing.drop(columns='SalePrice')
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Cross-validated Lasso on the log target.
lcvlog = LassoCV(normalize=True)
lcvlog.fit(X_train1, y_train1)

train_r2_log = lcvlog.score(X_train1, y_train1)
test_r2_log = lcvlog.score(X_test1, y_test1)
print("The train set R^2 is %.3f" % train_r2_log)
print("The test set R^2 is %.3f" % test_r2_log)

The train set R^2 is 0.906
The test set R^2 is 0.920


In [13]:
# Coefficients of the log-target model, labelled with the columns of the
# frame it was actually fitted on (`features`) rather than the unrelated `X`
# from the raw-price cells, so this cell does not depend on that cell's state.
log_coefs = pd.Series(lcvlog.coef_, index=features.columns)
log_coefs.sort_values(ascending=False).head(15)

OverallQual            0.069848
CentralAir             0.053958
OverallCond            0.042618
LandSlope              0.035645
Fireplaces             0.025993
Foundation_Concrete    0.025569
PavedDrive             0.024161
KitchenQual            0.023048
GarageCars             0.021539
Functional             0.020298
ExterQual              0.017122
Baths                  0.016717
HeatingQC              0.013476
LotShape               0.012492
Class_1.5story         0.011383
dtype: float64

---

In [38]:
# Hand-picked subset of 18 predictors for a smaller model.
selected_columns = [
    'GrLivArea', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
    'YearBuilt', 'ExterQual', 'BsmtFinSF1',
    'TotalBsmtSF', 'KitchenAbvGr', 'KitchenQual', 'Fireplaces', 'GarageArea',
    'ScreenPorch', 'Class_2story_dev', 'Class_duplex', 'Lot_CulDSac',
    'Foundation_Concrete',
]
feats = housing[selected_columns]
targs = housing['SalePrice']

# Same 80/20 split settings as the other models.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    feats, targs, test_size=0.2, random_state=42
)

In [39]:
# Fit the reduced-feature Lasso.
# NOTE: this rebinds `lcv`, replacing the model fitted on the full feature
# set; any cell executed after this one that reads `lcv` sees the reduced model.
lcv = LassoCV(normalize=True)
lcv.fit(X_train2, y_train2)

subset_train_r2 = lcv.score(X_train2, y_train2)
subset_test_r2 = lcv.score(X_test2, y_test2)
print('train score: ', subset_train_r2)
print('test score: ', subset_test_r2)

train score:  0.8677470652518091
test score:  0.8685712807631175


In [40]:
# All coefficients of the reduced model, largest first.
subset_coefs = pd.Series(lcv.coef_, index=feats.columns)
subset_coefs.sort_values(ascending=False).head(18)

ExterQual              15115.150914
OverallQual            11278.667359
KitchenQual             8966.072474
Lot_CulDSac             8113.718520
Fireplaces              7040.967812
Foundation_Concrete     6495.444084
OverallCond             4684.012011
YearBuilt                246.304700
LotFrontage              106.933281
GrLivArea                 49.919106
ScreenPorch               42.669911
GarageArea                29.914517
BsmtFinSF1                23.295081
TotalBsmtSF               21.144795
LotArea                    0.532227
Class_duplex           -7235.465526
Class_2story_dev       -9200.715252
KitchenAbvGr           -9738.832873
dtype: float64

---

### Overall Features that have an effect

In [15]:
# Absolute coefficient sizes of the full-feature model, largest first.
# By this point in the notebook `lcv` has been rebound to the 18-feature
# model, so pairing `lcv.coef_` with the 58 columns of `X` raises a length
# mismatch on a top-to-bottom run. The full model is refit here under its own
# name so the cell gives the same result whichever `lcv` cell ran last.
lcv_full = LassoCV(normalize=True).fit(X_train, y_train)
coefs_abs = pd.Series(np.abs(lcv_full.coef_), index=X_train.columns).sort_values(ascending=False)

## positive features

## Feature Analysis

### ExterQual

 - The average home exterior makeover costs around \$7,700.
     - This includes exterior painting, landscaping, door and window updates, porch railings, and decorations.
 
 - The exterior is one of the cheaper parts of a home to revamp, and the value it adds once the work is done makes the investment well worth it.

https://www.brickandbatten.com/how-much-does-a-home-exterior-makeover-cost/

### Landslope

Average cost to reslope a lawn: \$1,925 (typical range \$400–\$5,100)
https://www.homeadvisor.com/cost/landscape/reslope-a-lawn/

### Kitchen Quality

- \$100–\$250 per square foot
- Typical total: \$12,800–\$21,200
- Average cost of \$16,600, or \$150 per square foot

https://homeguide.com/costs/kitchen-remodel-cost

### Fireplaces

National Fireplace Costs
- Range from \$390 to \$2,000

A fireplace can boost a home's value by as much as \$15,000 in certain parts of the country.

https://www.improvenet.com/r/costs-and-prices/fireplace-remodeling-cost

In [16]:
# The 15 most negative coefficients of the full-feature model.
# `lcv` holds the 18-feature model at this point in a top-to-bottom run, so
# its coefficients cannot be labelled with the 58 columns of `X`. The full
# model is refit here so the cell is self-contained. `nsmallest` already
# returns its result ordered, so no separate sort is needed.
lcv_full = LassoCV(normalize=True).fit(X_train, y_train)
pd.Series(lcv_full.coef_, index=X_train.columns).nsmallest(15)

Class_1story_dev   -14164.910740
Class_2story_dev   -12961.891234
KitchenAbvGr       -10188.844928
Class_duplex       -10181.674908
BsmtCond            -7571.701243
BedroomAbvGr        -6418.576118
CentralAir          -4372.219006
Lot_FR              -4139.564086
GarageType          -3380.887071
Class_2story        -3351.448101
GarageCond          -2974.123697
MSZoning            -2776.190058
Class_split         -2598.481713
Lot_Corner          -1606.511316
ExterCond           -1605.202508
dtype: float64

In [17]:
from sklearn.feature_selection import RFE