In [1]:
import os
os.chdir('../..')

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('datasets/encoded-unscaled.csv')
df.head()

Unnamed: 0,MS SubClass,Lot Frontage,Lot Area,Land Contour,Utilities,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,...,ConLw,New,Oth,VWD,WD,AdjLand,Alloca,Family,Normal,Partial
0,20,81.0,14267,3,1,6,6,1958,1958,108.0,...,0,0,0,0,1,0,0,0,1,0
1,60,74.0,13830,3,1,5,5,1997,1998,0.0,...,0,0,0,0,1,0,0,0,1,0
2,60,78.0,9978,3,1,6,6,1998,1998,20.0,...,0,0,0,0,1,0,0,0,1,0
3,120,41.0,4920,3,1,8,5,2001,2001,0.0,...,0,0,0,0,1,0,0,0,1,0
4,120,43.0,5005,2,1,8,5,1992,1992,0.0,...,0,0,0,0,1,0,0,0,1,0


In [4]:
df.isnull().sum().any()

np.False_

In [5]:
# Separate features (X) and target variable (y)
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
from sklearn.metrics import mean_squared_error, r2_score

def build_model(model_module, name: str, x_train: pd.DataFrame, y_train: pd.DataFrame, x_test: pd.DataFrame, y_test: pd.DataFrame):
    model = model_module()
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = round(np.sqrt(mse), 2)

    r2 = r2_score(y_test, y_pred)
    r2 = round(r2, 2)

    print(f'Model: {name} \nmse: {mse}, rmse: {rmse}, r2: {r2}')
    return model

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso

linear_model = build_model(LinearRegression, 'Linear', X_train, y_train, X_test, y_test)
ridge_model = build_model(Ridge, 'Ridge', X_train, y_train, X_test, y_test)
lasso_model = build_model(Lasso, 'Lasso', X_train, y_train, X_test, y_test)

Model: Linear 
mse: 337900794.4714504, rmse: 18382.08, r2: 0.88
Model: Ridge 
mse: 331329978.3004672, rmse: 18202.47, r2: 0.89
Model: Lasso 
mse: 333971061.3629474, rmse: 18274.88, r2: 0.89


In [8]:
importances = linear_model.coef_

# Create a DataFrame for feature importance
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})

# Sort by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
feature_importance_df

Unnamed: 0,Feature,Importance
52,GrnHill,66794.843214
125,PreCast.1,38855.694361
109,PreCast,38855.694361
205,Po.6,38476.304247
206,TA.7,29975.625927
...,...,...
170,OthW,-31007.053822
202,TA.6,-31443.348753
199,Fa.6,-31567.819007
201,Po.5,-59838.700492


### Using scaled data

In [9]:
pd.set_option('display.max_columns', None)

In [10]:
df.head()

Unnamed: 0,MS SubClass,Lot Frontage,Lot Area,Land Contour,Utilities,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,TotRms AbvGrd,Fireplaces,Garage Yr Blt,Garage Cars,Garage Area,Wood Deck SF,Open Porch SF,Mo Sold,Yr Sold,SalePrice,BsmtFin SF 2_0,Low Qual Fin SF_0,Street_pave,Lot Shape_Reg,CulDSac,FR2,FR3,Inside,Mod,Sev,Blueste,BrDale,BrkSide,ClearCr,CollgCr,Crawfor,Edwards,Gilbert,Greens,GrnHill,IDOTRR,MeadowV,Mitchel,NAmes,NPkVill,NWAmes,NoRidge,NridgHt,OldTown,SWISU,Sawyer,SawyerW,Somerst,StoneBr,Timber,Veenker,Feedr,Norm,PosA,PosN,RRAe,RRAn,RRNe,RRNn,Feedr.1,Norm.1,PosN.1,RRNn.1,2fmCon,Duplex,Twnhs,TwnhsE,1.5Unf,1Story,2.5Fin,2.5Unf,2Story,SFoyer,SLvl,Gable,Gambrel,Hip,Mansard,Shed,Tar&Grv,WdShake,WdShngl,AsphShn,BrkComm,BrkFace,CBlock,CemntBd,HdBoard,ImStucc,MetalSd,Plywood,PreCast,Stone,Stucco,VinylSd,Wd Sdng,WdShing,AsphShn.1,Brk Cmn,BrkFace.1,CBlock.1,CmentBd,HdBoard.1,ImStucc.1,MetalSd.1,Other,Plywood.1,PreCast.1,Stone.1,Stucco.1,VinylSd.1,Wd Sdng.1,Wd Shng,BrkFace.2,Stone.2,Unknown,Fa,Gd,TA,Fa.1,Gd.1,Po,TA.1,CBlock.2,PConc,Slab,Stone.3,Wood,Fa.2,Gd.2,Po.1,TA.2,Fa.3,Gd.3,Po.2,TA.3,Gd.4,Mn,No,BLQ,GLQ,LwQ,Rec,Unf,BLQ.1,GLQ.1,LwQ.1,Rec.1,Unf.1,GasA,GasW,Grav,OthW,Wall,Fa.4,Gd.5,Po.3,TA.4,Y,FuseF,FuseP,Mix,SBrkr,Fa.5,Gd.6,Po.4,TA.5,Maj2,Min1,Min2,Mod.1,Sal,Sev.1,Typ,Attchd,Basment,BuiltIn,CarPort,Detchd,RFn,Unf.2,Fa.6,Gd.7,Po.5,TA.6,Fa.7,Gd.8,Po.6,TA.7,P,Y.1,CWD,Con,ConLD,ConLI,ConLw,New,Oth,VWD,WD,AdjLand,Alloca,Family,Normal,Partial
0,20,81.0,14267,3,1,6,6,1958,1958,108.0,923.0,0.0,406.0,1329.0,1329,0,0,1329,0.0,0.0,1,1,3,1,6,0,1958.0,1.0,312.0,393,36,6,2010,172000,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1,60,74.0,13830,3,1,5,5,1997,1998,0.0,791.0,0.0,137.0,928.0,928,701,0,1629,0.0,0.0,2,1,3,1,6,1,1997.0,2.0,482.0,212,34,3,2010,189900,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
2,60,78.0,9978,3,1,6,6,1998,1998,20.0,602.0,0.0,324.0,926.0,926,678,0,1604,0.0,0.0,2,1,3,1,7,1,1998.0,2.0,470.0,360,36,6,2010,195500,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
3,120,41.0,4920,3,1,8,5,2001,2001,0.0,616.0,0.0,722.0,1338.0,1338,0,0,1338,1.0,0.0,2,0,2,1,6,0,2001.0,2.0,582.0,0,0,4,2010,213500,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,120,43.0,5005,2,1,8,5,1992,1992,0.0,263.0,0.0,1017.0,1280.0,1280,0,0,1280,0.0,0.0,2,0,2,1,5,0,1992.0,2.0,506.0,0,82,1,2010,191500,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0


In [11]:
df['BsmtFin SF 2'].unique()

array([   0., 1120.,  168.,   78.,  119.,  121.,  859.,   46., 1029.,
        290.,  132.,  713.,  162.,  362.,  240.,  258.,  906.,  486.,
        350., 1073.,  692.,   12.,  159.,  712.,  474.,  453.,  684.,
        387.,  688.,  127.,  252.,  232.,  480.,  284.,  276.,  472.,
        239.,  180.,  622.,  495.,  113.,  360.,  774.,  364.,  596.,
        884.,  311.,  590.,   42.,  147., 1127.,  466.,  630.,  201.,
        345.,  512.,  230.,  247.,  661.,  620.,  202.,  483.,  690.,
        105.,   60.,  352.,  102.,   95.,  465.,   63.,  262.,  500.,
        670.,  393.,  286.,  450.,  344.,   72.,  144.,  243.,  420.,
        210.,  694.,  875.,  507.,  250.,  116.,  273.,   76.,  270.,
        110.,  411.,  228.,   93.,  438.,  613.,  294.,  852.,  555.,
        841.,  799.,  811.,  842.,  382.,  456.,   64.,  539.,  336.,
        306.,  872.,  108.,   52.,  196.,  128.,  319.,  532.,  106.,
        169.,  608.,   41.,  606.,  645.,  492.,  181.,  956.,  391.,
        380.,  723.,

In [12]:
all_colnames = list(df.columns)
featnames_to_scale = all_colnames[:3]
set2 = all_colnames[all_colnames.index('Year Built'):all_colnames.index('Bsmt Half Bath')+1]
set3 = all_colnames[all_colnames.index('Garage Yr Blt'):all_colnames.index('SalePrice')]
featnames_to_scale.extend(set2)
featnames_to_scale.extend(set3)
featnames_to_scale

['MS SubClass',
 'Lot Frontage',
 'Lot Area',
 'Year Built',
 'Year Remod/Add',
 'Mas Vnr Area',
 'BsmtFin SF 1',
 'BsmtFin SF 2',
 'Bsmt Unf SF',
 'Total Bsmt SF',
 '1st Flr SF',
 '2nd Flr SF',
 'Low Qual Fin SF',
 'Gr Liv Area',
 'Bsmt Full Bath',
 'Bsmt Half Bath',
 'Garage Yr Blt',
 'Garage Cars',
 'Garage Area',
 'Wood Deck SF',
 'Open Porch SF',
 'Mo Sold',
 'Yr Sold']

In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_feats = scaler.fit_transform(df[featnames_to_scale])
scaled_feats_df = pd.DataFrame(scaled_feats, columns=featnames_to_scale)

In [14]:
scaled_feats_df

Unnamed: 0,MS SubClass,Lot Frontage,Lot Area,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Garage Yr Blt,Garage Cars,Garage Area,Wood Deck SF,Open Porch SF,Mo Sold,Yr Sold
0,-0.877598,0.837288,1.679885,-0.366544,-1.157339,0.486636,1.359138,-0.298662,-0.296696,1.027969,0.854808,-0.772963,-0.092566,-0.107014,-0.783950,-0.255825,-0.764746,-0.907220,-0.633065,3.174519,0.154802,-0.051694,1.661067
1,0.055757,0.437229,1.542818,0.937564,0.734213,-0.600891,1.019002,-0.298662,-0.972405,-0.108883,-0.491743,1.089631,-0.092566,0.685214,-0.783950,-0.255825,0.760971,0.511831,0.283891,1.358455,0.102870,-1.175792,1.661067
2,0.055757,0.665834,0.334626,0.971003,0.734213,-0.399497,0.531990,-0.298662,-0.502674,-0.114553,-0.498459,1.028519,-0.092566,0.619195,-0.783950,-0.255825,0.800092,0.511831,0.219164,2.843414,0.154802,-0.051694,1.661067
3,1.455790,-1.448765,-1.251833,1.071319,0.876080,-0.600891,0.568065,-0.298662,0.497075,1.053485,0.885030,-0.772963,-0.092566,-0.083247,1.174159,-0.255825,0.917455,0.511831,0.823276,-0.768649,-0.779980,-0.801093,1.661067
4,1.455790,-1.334462,-1.225173,0.770371,0.450480,-0.600891,-0.341540,-0.298662,1.238095,0.889052,0.690267,-0.772963,-0.092566,-0.236411,-0.783950,-0.255825,0.565367,0.511831,0.413343,-0.768649,1.349245,-1.925190,1.661067
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2213,0.522435,-1.677370,-0.305541,0.502861,0.072170,-0.600891,1.091152,-0.298662,-0.854345,0.103745,-0.239894,-0.772963,-0.092566,-0.967902,1.174159,-0.255825,0.252399,0.511831,0.855639,0.435372,-0.779980,-1.175792,-1.378114
2214,-0.877598,0.164308,-0.008198,0.469423,0.024881,-0.600891,-0.243622,1.774437,-0.716188,-0.290326,-0.579051,-0.772963,-0.092566,-1.234619,1.174159,-0.255825,0.213278,0.511831,0.294678,0.876846,-0.779980,-0.051694,-1.378114
2215,0.639104,-0.248587,0.479847,0.770371,0.450480,-0.600891,-0.150858,-0.298662,0.127821,-0.154244,-0.350708,-0.772963,-0.092566,-1.055047,-0.783950,3.740085,1.073939,-2.326271,-2.315949,0.034032,0.050937,0.323005,-1.378114
2216,-0.877598,0.608683,0.344662,0.168475,-0.353430,-0.600891,1.740502,0.488348,-0.826713,1.198072,1.056287,-0.772963,-0.092566,0.051432,1.174159,-0.255825,-0.099690,0.511831,-0.061316,1.639393,0.206734,-0.801093,-1.378114


In [15]:
scaled_df = pd.concat([df.drop(featnames_to_scale, axis=1), scaled_feats_df], axis=1)
scaled_df

Unnamed: 0,Land Contour,Utilities,Overall Qual,Overall Cond,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,TotRms AbvGrd,Fireplaces,SalePrice,BsmtFin SF 2_0,Low Qual Fin SF_0,Street_pave,Lot Shape_Reg,CulDSac,FR2,FR3,Inside,Mod,Sev,Blueste,BrDale,BrkSide,ClearCr,CollgCr,Crawfor,Edwards,Gilbert,Greens,GrnHill,IDOTRR,MeadowV,Mitchel,NAmes,NPkVill,NWAmes,NoRidge,NridgHt,OldTown,SWISU,Sawyer,SawyerW,Somerst,StoneBr,Timber,Veenker,Feedr,Norm,PosA,PosN,RRAe,RRAn,RRNe,RRNn,Feedr.1,Norm.1,PosN.1,RRNn.1,2fmCon,Duplex,Twnhs,TwnhsE,1.5Unf,1Story,2.5Fin,2.5Unf,2Story,SFoyer,SLvl,Gable,Gambrel,Hip,Mansard,Shed,Tar&Grv,WdShake,WdShngl,AsphShn,BrkComm,BrkFace,CBlock,CemntBd,HdBoard,ImStucc,MetalSd,Plywood,PreCast,Stone,Stucco,VinylSd,Wd Sdng,WdShing,AsphShn.1,Brk Cmn,BrkFace.1,CBlock.1,CmentBd,HdBoard.1,ImStucc.1,MetalSd.1,Other,Plywood.1,PreCast.1,Stone.1,Stucco.1,VinylSd.1,Wd Sdng.1,Wd Shng,BrkFace.2,Stone.2,Unknown,Fa,Gd,TA,Fa.1,Gd.1,Po,TA.1,CBlock.2,PConc,Slab,Stone.3,Wood,Fa.2,Gd.2,Po.1,TA.2,Fa.3,Gd.3,Po.2,TA.3,Gd.4,Mn,No,BLQ,GLQ,LwQ,Rec,Unf,BLQ.1,GLQ.1,LwQ.1,Rec.1,Unf.1,GasA,GasW,Grav,OthW,Wall,Fa.4,Gd.5,Po.3,TA.4,Y,FuseF,FuseP,Mix,SBrkr,Fa.5,Gd.6,Po.4,TA.5,Maj2,Min1,Min2,Mod.1,Sal,Sev.1,Typ,Attchd,Basment,BuiltIn,CarPort,Detchd,RFn,Unf.2,Fa.6,Gd.7,Po.5,TA.6,Fa.7,Gd.8,Po.6,TA.7,P,Y.1,CWD,Con,ConLD,ConLI,ConLw,New,Oth,VWD,WD,AdjLand,Alloca,Family,Normal,Partial,MS SubClass,Lot Frontage,Lot Area,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Garage Yr Blt,Garage Cars,Garage Area,Wood Deck SF,Open Porch SF,Mo Sold,Yr Sold
0,3,1,6,6,1,1,3,1,6,0,172000,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,-0.877598,0.837288,1.679885,-0.366544,-1.157339,0.486636,1.359138,-0.298662,-0.296696,1.027969,0.854808,-0.772963,-0.092566,-0.107014,-0.783950,-0.255825,-0.764746,-0.907220,-0.633065,3.174519,0.154802,-0.051694,1.661067
1,3,1,5,5,2,1,3,1,6,1,189900,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0.055757,0.437229,1.542818,0.937564,0.734213,-0.600891,1.019002,-0.298662,-0.972405,-0.108883,-0.491743,1.089631,-0.092566,0.685214,-0.783950,-0.255825,0.760971,0.511831,0.283891,1.358455,0.102870,-1.175792,1.661067
2,3,1,6,6,2,1,3,1,7,1,195500,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0.055757,0.665834,0.334626,0.971003,0.734213,-0.399497,0.531990,-0.298662,-0.502674,-0.114553,-0.498459,1.028519,-0.092566,0.619195,-0.783950,-0.255825,0.800092,0.511831,0.219164,2.843414,0.154802,-0.051694,1.661067
3,3,1,8,5,2,0,2,1,6,0,213500,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1.455790,-1.448765,-1.251833,1.071319,0.876080,-0.600891,0.568065,-0.298662,0.497075,1.053485,0.885030,-0.772963,-0.092566,-0.083247,1.174159,-0.255825,0.917455,0.511831,0.823276,-0.768649,-0.779980,-0.801093,1.661067
4,2,1,8,5,2,0,2,1,5,0,191500,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1.455790,-1.334462,-1.225173,0.770371,0.450480,-0.600891,-0.341540,-0.298662,1.238095,0.889052,0.690267,-0.772963,-0.092566,-0.236411,-0.783950,-0.255825,0.565367,0.511831,0.413343,-0.768649,1.349245,-1.925190,1.661067
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2213,3,1,6,6,1,0,3,1,6,0,142500,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0.522435,-1.677370,-0.305541,0.502861,0.072170,-0.600891,1.091152,-0.298662,-0.854345,0.103745,-0.239894,-0.772963,-0.092566,-0.967902,1.174159,-0.255825,0.252399,0.511831,0.855639,0.435372,-0.779980,-1.175792,-1.378114
2214,1,1,5,5,1,0,2,1,5,0,131000,1,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,-0.877598,0.164308,-0.008198,0.469423,0.024881,-0.600891,-0.243622,1.774437,-0.716188,-0.290326,-0.579051,-0.772963,-0.092566,-1.234619,1.174159,-0.255825,0.213278,0.511831,0.294678,0.876846,-0.779980,-0.051694,-1.378114
2215,3,1,5,5,1,0,3,1,6,0,132000,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0.639104,-0.248587,0.479847,0.770371,0.450480,-0.600891,-0.150858,-0.298662,0.127821,-0.154244,-0.350708,-0.772963,-0.092566,-1.055047,-0.783950,3.740085,1.073939,-2.326271,-2.315949,0.034032,0.050937,0.323005,-1.378114
2216,3,1,5,5,1,0,2,1,6,1,170000,1,1,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,-0.877598,0.608683,0.344662,0.168475,-0.353430,-0.600891,1.740502,0.488348,-0.826713,1.198072,1.056287,-0.772963,-0.092566,0.051432,1.174159,-0.255825,-0.099690,0.511831,-0.061316,1.639393,0.206734,-0.801093,-1.378114


In [16]:
scaled_df.to_csv('datasets/scaled.csv')

In [18]:
X = scaled_df.drop('SalePrice', axis=1)
y = scaled_df['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

linear_model_s = build_model(LinearRegression, 'Linear', X_train, y_train, X_test, y_test)
ridge_model_s = build_model(Ridge, 'Ridge', X_train, y_train, X_test, y_test)
lasso_model_s = build_model(Lasso, 'Lasso', X_train, y_train, X_test, y_test)

Model: Linear 
mse: 337897759.5212353, rmse: 18382.0, r2: 0.88
Model: Ridge 
mse: 331253878.97031134, rmse: 18200.38, r2: 0.89
Model: Lasso 
mse: 333975286.3599646, rmse: 18274.99, r2: 0.89


In [20]:
df['SalePrice'].describe()

count      2218.000000
mean     160837.552750
std       55049.109861
min       12789.000000
25%      125000.000000
50%      150000.000000
75%      189000.000000
max      426000.000000
Name: SalePrice, dtype: float64

- The mean SalePrice is around 160000.
- Mean error is around 18000 - Not much bad

### Only using important features

In [28]:
X_train.head()

Unnamed: 0,Land Contour,Utilities,Overall Qual,Overall Cond,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,TotRms AbvGrd,Fireplaces,BsmtFin SF 2_0,Low Qual Fin SF_0,Street_pave,Lot Shape_Reg,CulDSac,FR2,FR3,Inside,Mod,Sev,Blueste,BrDale,BrkSide,ClearCr,CollgCr,Crawfor,Edwards,Gilbert,Greens,GrnHill,IDOTRR,MeadowV,Mitchel,NAmes,NPkVill,NWAmes,NoRidge,NridgHt,OldTown,SWISU,Sawyer,SawyerW,Somerst,StoneBr,Timber,Veenker,Feedr,Norm,PosA,PosN,RRAe,RRAn,RRNe,RRNn,Feedr.1,Norm.1,PosN.1,RRNn.1,2fmCon,Duplex,Twnhs,TwnhsE,1.5Unf,1Story,2.5Fin,2.5Unf,2Story,SFoyer,SLvl,Gable,Gambrel,Hip,Mansard,Shed,Tar&Grv,WdShake,WdShngl,AsphShn,BrkComm,BrkFace,CBlock,CemntBd,HdBoard,ImStucc,MetalSd,Plywood,PreCast,Stone,Stucco,VinylSd,Wd Sdng,WdShing,AsphShn.1,Brk Cmn,BrkFace.1,CBlock.1,CmentBd,HdBoard.1,ImStucc.1,MetalSd.1,Other,Plywood.1,PreCast.1,Stone.1,Stucco.1,VinylSd.1,Wd Sdng.1,Wd Shng,BrkFace.2,Stone.2,Unknown,Fa,Gd,TA,Fa.1,Gd.1,Po,TA.1,CBlock.2,PConc,Slab,Stone.3,Wood,Fa.2,Gd.2,Po.1,TA.2,Fa.3,Gd.3,Po.2,TA.3,Gd.4,Mn,No,BLQ,GLQ,LwQ,Rec,Unf,BLQ.1,GLQ.1,LwQ.1,Rec.1,Unf.1,GasA,GasW,Grav,OthW,Wall,Fa.4,Gd.5,Po.3,TA.4,Y,FuseF,FuseP,Mix,SBrkr,Fa.5,Gd.6,Po.4,TA.5,Maj2,Min1,Min2,Mod.1,Sal,Sev.1,Typ,Attchd,Basment,BuiltIn,CarPort,Detchd,RFn,Unf.2,Fa.6,Gd.7,Po.5,TA.6,Fa.7,Gd.8,Po.6,TA.7,P,Y.1,CWD,Con,ConLD,ConLI,ConLw,New,Oth,VWD,WD,AdjLand,Alloca,Family,Normal,Partial,MS SubClass,Lot Frontage,Lot Area,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Garage Yr Blt,Garage Cars,Garage Area,Wood Deck SF,Open Porch SF,Mo Sold,Yr Sold
23,3,1,9,5,2,1,3,1,10,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0.055757,2.037466,1.237947,1.338828,1.301679,1.030399,-1.019235,-0.298662,2.677432,1.767916,1.855487,1.105574,-0.092566,2.546951,-0.78395,-0.255825,1.230423,1.930882,1.734838,0.676176,2.673519,-1.92519,1.661067
1892,3,1,8,5,1,0,1,1,4,1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1.45579,-1.505916,-1.579287,0.302229,-0.211563,-0.600891,0.660829,3.066924,-1.195967,0.73596,0.508936,-0.772963,-0.092566,-0.379012,1.174159,-0.255825,0.017673,0.511831,0.294678,0.565807,0.77799,0.323005,-1.378114
427,3,1,7,6,2,1,3,1,7,2,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0.055757,0.164308,1.198427,0.168475,-0.400718,-0.600891,0.423765,-0.029927,-0.93975,-0.607851,0.707057,1.304853,-0.092566,1.841868,-0.78395,-0.255825,-0.138811,0.511831,1.912836,2.211303,0.258666,-0.051694,0.901272
552,3,1,6,7,1,0,4,1,8,0,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0.289096,-0.36289,0.253703,-1.9716,0.923368,-0.600891,-1.019235,-0.298662,0.544802,-0.639037,-0.988724,1.195914,-0.092566,0.400012,-0.78395,-0.255825,-1.077714,1.930882,1.13612,-0.768649,-0.77998,1.072403,0.901272
869,3,1,5,5,1,0,3,1,5,0,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,3.089162,-0.077133,-0.144637,-0.032157,-0.684451,-0.600891,1.274104,-0.298662,-1.31654,-0.216615,-0.619346,-0.772963,-0.092566,-1.266308,1.174159,-0.255825,-0.295294,-0.90722,-0.654641,-0.768649,-0.77998,-0.801093,0.141477


In [None]:
lr_ceofs = linear_model.coef_
ridge_ceofs = ridge_model.coef_
lasso_ceofs = lasso_model.coef_


feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Linear Coefs': lr_ceofs, 'Ridge Coefs': ridge_ceofs, 'Lasso Coefs': lasso_ceofs})

pd.set_option('display.max_rows', None)
feature_importance_df

In [30]:
df.drop('Utilities', axis=1, inplace=True)
scaled_df.drop('Utilities', axis=1, inplace=True)

In [32]:
features_corr = df.corrwith(df['SalePrice'])
type(features_corr)

pandas.core.series.Series

In [58]:
correlation = 0.4

-0.3 <= correlation <= 0.3

False

In [75]:
def filter_correlation(correlation, threshold=0.3):
  if -threshold <= correlation <= threshold: 
    return None
  else:
    return correlation

In [88]:
features_corr_copy = {}
for feature, correlation in features_corr.items():
    correlation = filter_correlation(correlation)
    features_corr_copy[feature] = correlation


In [89]:
important_features = [feature for feature, correlation in features_corr_copy.items() if correlation is not None]
print(len(important_features))
important_features

39


['Overall Qual',
 'Year Built',
 'Year Remod/Add',
 'Mas Vnr Area',
 'BsmtFin SF 1',
 'Total Bsmt SF',
 '1st Flr SF',
 'Gr Liv Area',
 'Full Bath',
 'TotRms AbvGrd',
 'Fireplaces',
 'Garage Yr Blt',
 'Garage Cars',
 'Garage Area',
 'Wood Deck SF',
 'Open Porch SF',
 'SalePrice',
 'NridgHt',
 'VinylSd',
 'VinylSd.1',
 'Stone.2',
 'Unknown',
 'Gd',
 'TA',
 'CBlock.2',
 'PConc',
 'Gd.2',
 'TA.2',
 'GLQ',
 'TA.4',
 'Y',
 'Gd.6',
 'TA.5',
 'Attchd',
 'Detchd',
 'Unf.2',
 'Y.1',
 'New',
 'Partial']

#### Using only important features to train the model

In [90]:
important_features.append('SalePrice')

In [91]:
df_with_imp_feats = scaled_df[important_features]
df_with_imp_feats.head()

Unnamed: 0,Overall Qual,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,Total Bsmt SF,1st Flr SF,Gr Liv Area,Full Bath,TotRms AbvGrd,Fireplaces,Garage Yr Blt,Garage Cars,Garage Area,Wood Deck SF,Open Porch SF,SalePrice,NridgHt,VinylSd,VinylSd.1,Stone.2,Unknown,Gd,TA,CBlock.2,PConc,Gd.2,TA.2,GLQ,TA.4,Y,Gd.6,TA.5,Attchd,Detchd,Unf.2,Y.1,New,Partial,SalePrice.1
0,6,-0.366544,-1.157339,0.486636,1.359138,1.027969,0.854808,-0.107014,1,6,0,-0.764746,-0.90722,-0.633065,3.174519,0.154802,172000,0,0,0,0,0,0,1,1,0,0,1,0,1,1,1,0,1,0,1,1,0,0,172000
1,5,0.937564,0.734213,-0.600891,1.019002,-0.108883,-0.491743,0.685214,2,6,1,0.760971,0.511831,0.283891,1.358455,0.10287,189900,0,1,1,0,1,0,1,0,1,1,0,1,0,1,0,1,1,0,0,1,0,0,189900
2,6,0.971003,0.734213,-0.399497,0.53199,-0.114553,-0.498459,0.619195,2,7,1,0.800092,0.511831,0.219164,2.843414,0.154802,195500,0,1,1,0,0,0,1,0,1,0,1,1,0,1,1,0,1,0,0,1,0,0,195500
3,8,1.071319,0.87608,-0.600891,0.568065,1.053485,0.88503,-0.083247,2,6,0,0.917455,0.511831,0.823276,-0.768649,-0.77998,213500,0,0,0,0,1,1,0,0,1,1,0,1,0,1,1,0,1,0,0,1,0,0,213500
4,8,0.770371,0.45048,-0.600891,-0.34154,0.889052,0.690267,-0.236411,2,5,0,0.565367,0.511831,0.413343,-0.768649,1.349245,191500,0,0,0,0,1,1,0,0,1,1,0,0,0,1,1,0,1,0,0,1,0,0,191500


In [92]:
df_with_imp_feats.shape

(2218, 40)

In [93]:
df_with_imp_feats.to_csv('datasets/most-imp-39-feats-df.csv')

In [94]:
X = df_with_imp_feats.drop('SalePrice', axis=1)
y = df_with_imp_feats['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

linear_model_s_imp = build_model(LinearRegression, 'Linear', X_train, y_train, X_test, y_test)
ridge_model_s_imp = build_model(Ridge, 'Ridge', X_train, y_train, X_test, y_test)
lasso_model_s_imp = build_model(Lasso, 'Lasso', X_train, y_train, X_test, y_test)

Model: Linear 
mse: 452871107.84936535, rmse: 21280.77, r2: 0.85
Model: Ridge 
mse: 452527401.6552155, rmse: 21272.69, r2: 0.85
Model: Lasso 
mse: 452850527.61047244, rmse: 21280.28, r2: 0.85


The results got a bit worse now.

In [74]:
X = df_with_imp_feats.drop('SalePrice', axis=1)
y = df_with_imp_feats['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

linear_model_s_imp = build_model(LinearRegression, 'Linear', X_train, y_train, X_test, y_test)
ridge_model_s_imp = build_model(Ridge, 'Ridge', X_train, y_train, X_test, y_test)
lasso_model_s_imp = build_model(Lasso, 'Lasso', X_train, y_train, X_test, y_test)

Model: Linear 
mse: 454326393.24453, rmse: 21314.93, r2: 0.86
Model: Ridge 
mse: 454020868.5478974, rmse: 21307.77, r2: 0.86
Model: Lasso 
mse: 454279221.84609187, rmse: 21313.83, r2: 0.86


In [76]:
# including more features by reducing the correlation threshold to 0.2 from 0.3

features_corr_copy = {}
for feature, correlation in features_corr.items():
    correlation = filter_correlation(correlation, 0.2)
    features_corr_copy[feature] = correlation

important_features = [feature for feature, correlation in features_corr_copy.items() if correlation is not None]
print(len(important_features))

important_features.append('SalePrice')
df_with_imp_feats = scaled_df[important_features]

X = df_with_imp_feats.drop('SalePrice', axis=1)
y = df_with_imp_feats['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

linear_model_s_imp = build_model(LinearRegression, 'Linear', X_train, y_train, X_test, y_test)
ridge_model_s_imp = build_model(Ridge, 'Ridge', X_train, y_train, X_test, y_test)
lasso_model_s_imp = build_model(Lasso, 'Lasso', X_train, y_train, X_test, y_test)

54
Model: Linear 
mse: 416653142.48910755, rmse: 20412.08, r2: 0.87
Model: Ridge 
mse: 416055244.5734701, rmse: 20397.43, r2: 0.87
Model: Lasso 
mse: 416569719.90747064, rmse: 20410.04, r2: 0.87


In [77]:
# including more features by reducing the correlation threshold to 0.1 from 0.1

features_corr_copy = {}
for feature, correlation in features_corr.items():
    correlation = filter_correlation(correlation, 0.1)
    features_corr_copy[feature] = correlation

important_features = [feature for feature, correlation in features_corr_copy.items() if correlation is not None]
print(len(important_features))

important_features.append('SalePrice')
df_with_imp_feats = scaled_df[important_features]

X = df_with_imp_feats.drop('SalePrice', axis=1)
y = df_with_imp_feats['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

linear_model_s_imp = build_model(LinearRegression, 'Linear', X_train, y_train, X_test, y_test)
ridge_model_s_imp = build_model(Ridge, 'Ridge', X_train, y_train, X_test, y_test)
lasso_model_s_imp = build_model(Lasso, 'Lasso', X_train, y_train, X_test, y_test)

100
Model: Linear 
mse: 396588380.15255046, rmse: 19914.53, r2: 0.88
Model: Ridge 
mse: 393791138.78307664, rmse: 19844.17, r2: 0.88
Model: Lasso 
mse: 396166650.7946852, rmse: 19903.94, r2: 0.88


Still the best model was the LassoRegression with rmse of around 18000 and r2 of .89

### Applying Polynomial regression

In [80]:
from sklearn.preprocessing import PolynomialFeatures

In [95]:
# using the 39 most important cols
X_train.head()

Unnamed: 0,Overall Qual,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,Total Bsmt SF,1st Flr SF,Gr Liv Area,Full Bath,TotRms AbvGrd,Fireplaces,Garage Yr Blt,Garage Cars,Garage Area,Wood Deck SF,Open Porch SF,NridgHt,VinylSd,VinylSd.1,Stone.2,Unknown,Gd,TA,CBlock.2,PConc,Gd.2,TA.2,GLQ,TA.4,Y,Gd.6,TA.5,Attchd,Detchd,Unf.2,Y.1,New,Partial
23,9,1.338828,1.301679,1.030399,-1.019235,1.767916,1.855487,2.546951,2,10,1,1.230423,1.930882,1.734838,0.676176,2.673519,1,1,1,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,1,1
1892,8,0.302229,-0.211563,-0.600891,0.660829,0.73596,0.508936,-0.379012,1,4,1,0.017673,0.511831,0.294678,0.565807,0.77799,0,0,0,0,1,1,0,1,0,1,0,1,1,1,1,0,1,0,0,1,0,0
427,7,0.168475,-0.400718,-0.600891,0.423765,-0.607851,0.707057,1.841868,2,7,2,-0.138811,0.511831,1.912836,2.211303,0.258666,0,1,1,0,1,0,1,1,0,0,1,1,1,1,1,0,1,0,0,1,0,0
552,6,-1.9716,0.923368,-0.600891,-1.019235,-0.639037,-0.988724,0.400012,1,8,0,-1.077714,1.930882,1.13612,-0.768649,-0.77998,0,0,0,0,1,0,1,0,1,0,1,0,0,1,1,0,0,1,1,1,0,0
869,5,-0.032157,-0.684451,-0.600891,1.274104,-0.216615,-0.619346,-1.266308,1,5,0,-0.295294,-0.90722,-0.654641,-0.768649,-0.77998,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,1,0,1,1,1,0,0


In [96]:
X_train.shape

(1774, 38)

In [97]:
poly = PolynomialFeatures(degree=2)
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)

linear_model_s_imp = build_model(LinearRegression, 'Linear', X_train, y_train, X_test, y_test)
ridge_model_s_imp = build_model(Ridge, 'Ridge', X_train, y_train, X_test, y_test)
lasso_model_s_imp = build_model(Lasso, 'Lasso', X_train, y_train, X_test, y_test)

Model: Linear 
mse: 1442036227.4813738, rmse: 37974.15, r2: 0.51
Model: Ridge 
mse: 521778313.7642623, rmse: 22842.47, r2: 0.82


  model = cd_fast.enet_coordinate_descent(


Model: Lasso 
mse: 581281811.8312207, rmse: 24109.79, r2: 0.8


In [98]:
X_train.shape

(1774, 780)

- Still the best model was the LassoRegression with rmse of around 18000 and r2 of .89
- We can go to around and r2 of .91 if we use test_size as 0.1.