# Project 2 - Ames Housing Data and Kaggle Challenge
## Matt Reed / DSI-124

In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn import metrics
from sklearn.model_selection import cross_val_score, train_test_split

In [65]:
df_train = pd.read_csv('../datasets/train.csv')
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 81 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               2051 non-null   int64  
 1   PID              2051 non-null   int64  
 2   MS SubClass      2051 non-null   int64  
 3   MS Zoning        2051 non-null   object 
 4   Lot Frontage     1721 non-null   float64
 5   Lot Area         2051 non-null   int64  
 6   Street           2051 non-null   object 
 7   Alley            140 non-null    object 
 8   Lot Shape        2051 non-null   object 
 9   Land Contour     2051 non-null   object 
 10  Utilities        2051 non-null   object 
 11  Lot Config       2051 non-null   object 
 12  Land Slope       2051 non-null   object 
 13  Neighborhood     2051 non-null   object 
 14  Condition 1      2051 non-null   object 
 15  Condition 2      2051 non-null   object 
 16  Bldg Type        2051 non-null   object 
 17  House Style   

In [66]:
df_train['Misc Feature'].unique()

array([nan, 'Shed', 'TenC', 'Gar2', 'Othr', 'Elev'], dtype=object)

In [67]:
df_train['Pool QC'].unique()

array([nan, 'Fa', 'Gd', 'Ex', 'TA'], dtype=object)

In [68]:
df_train['Alley'].unique()

array([nan, 'Pave', 'Grvl'], dtype=object)

In [69]:
df_train['Neighborhood'].unique()

array(['Sawyer', 'SawyerW', 'NAmes', 'Timber', 'Edwards', 'OldTown',
       'BrDale', 'CollgCr', 'Somerst', 'Mitchel', 'StoneBr', 'NridgHt',
       'Gilbert', 'Crawfor', 'IDOTRR', 'NWAmes', 'Veenker', 'MeadowV',
       'SWISU', 'NoRidge', 'ClearCr', 'Blmngtn', 'BrkSide', 'NPkVill',
       'Blueste', 'GrnHill', 'Greens', 'Landmrk'], dtype=object)

In [70]:
def dummies_add(column_name, dataframe):
    dummies = pd.get_dummies(dataframe[f'{column_name}'], prefix=f'{column_name}')
    dataframe[dummies.columns] = dummies
    dataframe.drop(columns=f'{column_name}', inplace=True)

In [71]:
df_train['Neighborhood']

0        Sawyer
1       SawyerW
2         NAmes
3        Timber
4       SawyerW
         ...   
2046     Timber
2047    Edwards
2048    Crawfor
2049      NAmes
2050    Gilbert
Name: Neighborhood, Length: 2051, dtype: object

In [72]:
dummies_add('Neighborhood', df_train)

In [73]:
corr = df_train.corr()
# Useful sorting bit from https://stackoverflow.com/questions/11350770/filter-pandas-dataframe-by-substring-criteria
corr['SalePrice'].loc[corr['SalePrice'].index.str.contains('Neighborhood')].sort_values(ascending=False)

Neighborhood_NridgHt    0.448647
Neighborhood_NoRidge    0.263395
Neighborhood_StoneBr    0.256977
Neighborhood_Somerst    0.150078
Neighborhood_Timber     0.116400
Neighborhood_Veenker    0.083186
Neighborhood_CollgCr    0.082309
Neighborhood_Crawfor    0.058386
Neighborhood_ClearCr    0.052503
Neighborhood_GrnHill    0.038848
Neighborhood_NWAmes     0.034926
Neighborhood_Blmngtn    0.024900
Neighborhood_Gilbert    0.023974
Neighborhood_SawyerW    0.016708
Neighborhood_Greens     0.003476
Neighborhood_Landmrk   -0.012395
Neighborhood_Blueste   -0.025226
Neighborhood_Mitchel   -0.035574
Neighborhood_NPkVill   -0.047296
Neighborhood_SWISU     -0.074214
Neighborhood_BrDale    -0.095305
Neighborhood_MeadowV   -0.111558
Neighborhood_Sawyer    -0.133692
Neighborhood_BrkSide   -0.134790
Neighborhood_Edwards   -0.176119
Neighborhood_IDOTRR    -0.189237
Neighborhood_NAmes     -0.189387
Neighborhood_OldTown   -0.208371
Name: SalePrice, dtype: float64

In [74]:
corr_top = corr['SalePrice'].sort_values(ascending=False).loc[corr['SalePrice'] > .5]

In [75]:
corr_top

SalePrice         1.000000
Overall Qual      0.800207
Gr Liv Area       0.697038
Garage Area       0.650270
Garage Cars       0.648220
Total Bsmt SF     0.628925
1st Flr SF        0.618486
Year Built        0.571849
Year Remod/Add    0.550370
Full Bath         0.537969
Garage Yr Blt     0.533922
Mas Vnr Area      0.512230
TotRms AbvGrd     0.504014
Name: SalePrice, dtype: float64

In [76]:
temp_features = corr_top.index.values[1:]

In [77]:
temp_features

array(['Overall Qual', 'Gr Liv Area', 'Garage Area', 'Garage Cars',
       'Total Bsmt SF', '1st Flr SF', 'Year Built', 'Year Remod/Add',
       'Full Bath', 'Garage Yr Blt', 'Mas Vnr Area', 'TotRms AbvGrd'],
      dtype=object)

In [78]:
temp_X = df_train[temp_features].copy()
temp_X.dropna(inplace=True)

In [79]:
vif = pd.DataFrame()
vif['VIF'] = [variance_inflation_factor(temp_X.values, i) for i in range(temp_X.shape[1])]
vif['variable'] = temp_X.columns

In [80]:
vif

Unnamed: 0,VIF,variable
0,46.373605,Overall Qual
1,48.879322,Gr Liv Area
2,34.659636,Garage Area
3,39.417879,Garage Cars
4,22.622198,Total Bsmt SF
5,33.894146,1st Flr SF
6,18268.089003,Year Built
7,13379.25186,Year Remod/Add
8,18.345397,Full Bath
9,25472.727264,Garage Yr Blt


Thinking that 'Garage Area' and 'Garage Cars' are redundant. 'Garage Yr Blt' and 'Year Remod/Add' are likely covered by 'Year Built'. 'TotRms AbvGrd' and '1st Flr SF' are probably covered by 'Gr Liv Area'. Will give it a go with stripping those out.

In [81]:
temp_X_2 = temp_X.copy()
temp_X_2.drop(columns=[
    'Garage Area', 
    'Garage Yr Blt', 
    'Year Remod/Add', 
    'TotRms AbvGrd', 
    '1st Flr SF',
    ],inplace=True)

In [82]:
temp_X_2

Unnamed: 0,Overall Qual,Gr Liv Area,Garage Cars,Total Bsmt SF,Year Built,Full Bath,Mas Vnr Area
0,6,1479,2.0,725.0,1976,2,289.0
1,7,2122,2.0,913.0,1996,2,132.0
2,5,1057,1.0,1057.0,1953,1,0.0
3,5,1444,2.0,384.0,2006,2,0.0
4,6,1445,2.0,676.0,1900,2,0.0
...,...,...,...,...,...,...,...
2046,8,1728,2.0,1884.0,2007,2,0.0
2047,4,861,2.0,861.0,1940,1,0.0
2048,6,1913,2.0,896.0,1928,1,0.0
2049,4,1200,1.0,1200.0,1956,1,0.0


In [83]:
vif_2 = pd.DataFrame()
vif_2['VIF'] = [variance_inflation_factor(temp_X_2.values, i) for i in range(temp_X_2.shape[1])]
vif_2['variable'] = temp_X_2.columns
vif_2

Unnamed: 0,VIF,variable
0,44.231587,Overall Qual
1,20.620981,Gr Liv Area
2,16.354324,Garage Cars
3,10.53922,Total Bsmt SF
4,25.965773,Year Built
5,17.657673,Full Bath
6,1.812801,Mas Vnr Area


Seems worth giving a try with that, I guess...

In [84]:
features = temp_X_2.columns.values
features

array(['Overall Qual', 'Gr Liv Area', 'Garage Cars', 'Total Bsmt SF',
       'Year Built', 'Full Bath', 'Mas Vnr Area'], dtype=object)

In [85]:
df_train[features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Overall Qual   2051 non-null   int64  
 1   Gr Liv Area    2051 non-null   int64  
 2   Garage Cars    2050 non-null   float64
 3   Total Bsmt SF  2050 non-null   float64
 4   Year Built     2051 non-null   int64  
 5   Full Bath      2051 non-null   int64  
 6   Mas Vnr Area   2029 non-null   float64
dtypes: float64(3), int64(4)
memory usage: 112.3 KB


In [86]:
# Looking to drop rows where I have null values in my feature set
drop_rows = df_train.loc[df_train['Mas Vnr Area'].isnull()].index
drop_rows = drop_rows.append(df_train.loc[df_train['Garage Cars'].isnull()].index)
drop_rows = drop_rows.append(df_train.loc[df_train['Total Bsmt SF'].isnull()].index)

In [87]:
drop_rows

Int64Index([  22,   41,   86,  212,  276,  338,  431,  451,  591,  844,  913,
             939, 1025, 1244, 1306, 1430, 1434, 1606, 1699, 1815, 1820, 1941,
            1712, 1327],
           dtype='int64')

In [88]:
df_train_clean = df_train.drop(drop_rows)
df_train_clean[features].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2027 entries, 0 to 2050
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Overall Qual   2027 non-null   int64  
 1   Gr Liv Area    2027 non-null   int64  
 2   Garage Cars    2027 non-null   float64
 3   Total Bsmt SF  2027 non-null   float64
 4   Year Built     2027 non-null   int64  
 5   Full Bath      2027 non-null   int64  
 6   Mas Vnr Area   2027 non-null   float64
dtypes: float64(3), int64(4)
memory usage: 126.7 KB


All variables have non-null and appropriate typed values

In [89]:
X = df_train_clean[features]
y = df_train_clean['SalePrice']

In [90]:
lr = LinearRegression()

In [91]:
lr.fit(X, y)

LinearRegression()

In [92]:
lr.coef_, lr.intercept_

(array([20259.1745533 ,    49.10520818, 13620.74541517,    27.1919158 ,
          324.12626488, -4825.2256406 ,    39.82511495]),
 -704441.393773311)

In [93]:
df_test = pd.read_csv('../datasets/test.csv')

In [94]:
df_test[features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Overall Qual   878 non-null    int64  
 1   Gr Liv Area    878 non-null    int64  
 2   Garage Cars    878 non-null    int64  
 3   Total Bsmt SF  878 non-null    int64  
 4   Year Built     878 non-null    int64  
 5   Full Bath      878 non-null    int64  
 6   Mas Vnr Area   877 non-null    float64
dtypes: float64(1), int64(6)
memory usage: 48.1 KB


Ends up that there's a null value in the Mas Vnr Area variable of the test data... Guess I'll just do a run whithout it and ask into addressing it tomorrow. 

In [95]:
features_2 = temp_X_2.columns.values[:-1]
features_2

array(['Overall Qual', 'Gr Liv Area', 'Garage Cars', 'Total Bsmt SF',
       'Year Built', 'Full Bath'], dtype=object)

In [96]:
df_train[features_2].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Overall Qual   2051 non-null   int64  
 1   Gr Liv Area    2051 non-null   int64  
 2   Garage Cars    2050 non-null   float64
 3   Total Bsmt SF  2050 non-null   float64
 4   Year Built     2051 non-null   int64  
 5   Full Bath      2051 non-null   int64  
dtypes: float64(2), int64(4)
memory usage: 96.3 KB


In [97]:
drop_rows_2 = df_train.loc[df_train['Garage Cars'].isnull()].index
drop_rows_2 = drop_rows_2.append(df_train.loc[df_train['Total Bsmt SF'].isnull()].index)

In [98]:
drop_rows_2

Int64Index([1712, 1327], dtype='int64')

In [99]:
df_train_clean_2 = df_train.drop(drop_rows_2)
df_train_clean_2[features_2].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2049 entries, 0 to 2050
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Overall Qual   2049 non-null   int64  
 1   Gr Liv Area    2049 non-null   int64  
 2   Garage Cars    2049 non-null   float64
 3   Total Bsmt SF  2049 non-null   float64
 4   Year Built     2049 non-null   int64  
 5   Full Bath      2049 non-null   int64  
dtypes: float64(2), int64(4)
memory usage: 112.1 KB


In [100]:
X_2 = df_train_clean_2[features_2]
y_2 = df_train_clean_2['SalePrice']

In [101]:
lr_2 = LinearRegression()

In [102]:
lr_2.fit(X_2, y_2)

LinearRegression()

In [103]:
lr_2.coef_, lr_2.intercept_

(array([21209.23041413,    52.34307525, 14445.85024627,    29.94093653,
          351.10672996, -6821.19664199]),
 -765502.498047933)

In [104]:
pred_2 = lr_2.predict(df_test[features_2])

In [105]:
df_test['SalePrice'] = pred_2
df_test.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,,,,0,4,2006,WD,164627.399975
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,,,,0,8,2006,WD,211783.617545
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,,,,0,9,2006,New,200418.13543
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,,,,0,7,2007,WD,117443.322994
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,185,0,,,,0,7,2009,WD,187749.811628


In [106]:
submission = df_test[['Id', 'SalePrice']]
submission.set_index('Id', inplace=True)

In [107]:
submission.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2658,164627.399975
2718,211783.617545
2414,200418.13543
1989,117443.322994
625,187749.811628


In [108]:
submission.to_csv('../outputs/mlr_2_submission.csv')

In [109]:
df_train[features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Overall Qual   2051 non-null   int64  
 1   Gr Liv Area    2051 non-null   int64  
 2   Garage Cars    2050 non-null   float64
 3   Total Bsmt SF  2050 non-null   float64
 4   Year Built     2051 non-null   int64  
 5   Full Bath      2051 non-null   int64  
 6   Mas Vnr Area   2029 non-null   float64
dtypes: float64(3), int64(4)
memory usage: 112.3 KB


Following guide below from https://www.kaggle.com/dansbecker/handling-missing-values

In [110]:
X_train = X.copy()

In [111]:
X_test = df_test[features].copy()

In [112]:
y

0       130500
1       220000
2       109000
3       174000
4       138500
         ...  
2046    298751
2047     82500
2048    177000
2049    144000
2050    189000
Name: SalePrice, Length: 2027, dtype: int64

In [113]:
imputer_1 = SimpleImputer()

In [114]:
imputed_X_train = imputer_1.fit_transform(X_train)
imputed_X_test = imputer_1.transform(X_test)

In [115]:
# Produce model (lr_3) using imputed values for null, using full feature set
lr_3 = LinearRegression()

In [116]:
cross_val_score(lr_3, imputed_X_train, y).mean()

0.7839850036873244

In [117]:
lr_3.fit(imputed_X_train, y)

LinearRegression()

In [118]:
lr_3.coef_, lr_3.intercept_

(array([20259.1745533 ,    49.10520818, 13620.74541517,    27.1919158 ,
          324.12626488, -4825.2256406 ,    39.82511495]),
 -704441.393773311)

In [119]:
pred_3 = lr_3.predict(imputed_X_test)

In [120]:
df_test['SalePrice'] = pred_3
df_test.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,,,,0,4,2006,WD,162575.709081
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,,,,0,8,2006,WD,205319.587074
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,,,,0,9,2006,New,196406.05936
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,,,,0,7,2007,WD,116421.167553
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,185,0,,,,0,7,2009,WD,191984.770911


In [121]:
submission = df_test[['Id', 'SalePrice']]
submission.set_index('Id', inplace=True)
submission

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2658,162575.709081
2718,205319.587074
2414,196406.059360
1989,116421.167553
625,191984.770911
...,...
1662,196176.452443
1234,215906.255878
1373,128883.598992
1672,103785.152820


In [122]:
submission.to_csv('../outputs/mlr_3_submission.csv')

In [123]:
self_test_2 = lr_2.predict(X_2)
metrics.mean_squared_error(y_2, self_test_2)

1358342813.9080396

In [124]:
self_test_3 = lr_3.predict(imputed_X_train)
metrics.mean_squared_error(y, self_test_3)

1321665298.6020255