# Project 2 - Ames Housing Data and Kaggle Challenge
## Matt Reed / DSI-124

In [115]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LinearRegression

In [116]:
df_train = pd.read_csv('../datasets/train.csv')
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 81 columns):
Id                 2051 non-null int64
PID                2051 non-null int64
MS SubClass        2051 non-null int64
MS Zoning          2051 non-null object
Lot Frontage       1721 non-null float64
Lot Area           2051 non-null int64
Street             2051 non-null object
Alley              140 non-null object
Lot Shape          2051 non-null object
Land Contour       2051 non-null object
Utilities          2051 non-null object
Lot Config         2051 non-null object
Land Slope         2051 non-null object
Neighborhood       2051 non-null object
Condition 1        2051 non-null object
Condition 2        2051 non-null object
Bldg Type          2051 non-null object
House Style        2051 non-null object
Overall Qual       2051 non-null int64
Overall Cond       2051 non-null int64
Year Built         2051 non-null int64
Year Remod/Add     2051 non-null int64
Roof Style         20

In [117]:
df_train['Misc Feature'].unique()

array([nan, 'Shed', 'TenC', 'Gar2', 'Othr', 'Elev'], dtype=object)

In [118]:
df_train['Pool QC'].unique()

array([nan, 'Fa', 'Gd', 'Ex', 'TA'], dtype=object)

In [119]:
df_train['Alley'].unique()

array([nan, 'Pave', 'Grvl'], dtype=object)

In [120]:
df_train['Neighborhood'].unique()

array(['Sawyer', 'SawyerW', 'NAmes', 'Timber', 'Edwards', 'OldTown',
       'BrDale', 'CollgCr', 'Somerst', 'Mitchel', 'StoneBr', 'NridgHt',
       'Gilbert', 'Crawfor', 'IDOTRR', 'NWAmes', 'Veenker', 'MeadowV',
       'SWISU', 'NoRidge', 'ClearCr', 'Blmngtn', 'BrkSide', 'NPkVill',
       'Blueste', 'GrnHill', 'Greens', 'Landmrk'], dtype=object)

In [121]:
def dummies_add(column_name, dataframe):
    dummies = pd.get_dummies(dataframe[f'{column_name}'], prefix=f'{column_name}')
    dataframe[dummies.columns] = dummies
    dataframe.drop(columns=f'{column_name}', inplace=True)

In [122]:
df_train['Neighborhood']

0        Sawyer
1       SawyerW
2         NAmes
3        Timber
4       SawyerW
         ...   
2046     Timber
2047    Edwards
2048    Crawfor
2049      NAmes
2050    Gilbert
Name: Neighborhood, Length: 2051, dtype: object

In [123]:
dummies_add('Neighborhood', df_train)

In [124]:
corr = df_train.corr()
# Useful sorting bit from https://stackoverflow.com/questions/11350770/filter-pandas-dataframe-by-substring-criteria
corr['SalePrice'].loc[corr['SalePrice'].index.str.contains('Neighborhood')].sort_values(ascending=False)

Neighborhood_NridgHt    0.448647
Neighborhood_NoRidge    0.263395
Neighborhood_StoneBr    0.256977
Neighborhood_Somerst    0.150078
Neighborhood_Timber     0.116400
Neighborhood_Veenker    0.083186
Neighborhood_CollgCr    0.082309
Neighborhood_Crawfor    0.058386
Neighborhood_ClearCr    0.052503
Neighborhood_GrnHill    0.038848
Neighborhood_NWAmes     0.034926
Neighborhood_Blmngtn    0.024900
Neighborhood_Gilbert    0.023974
Neighborhood_SawyerW    0.016708
Neighborhood_Greens     0.003476
Neighborhood_Landmrk   -0.012395
Neighborhood_Blueste   -0.025226
Neighborhood_Mitchel   -0.035574
Neighborhood_NPkVill   -0.047296
Neighborhood_SWISU     -0.074214
Neighborhood_BrDale    -0.095305
Neighborhood_MeadowV   -0.111558
Neighborhood_Sawyer    -0.133692
Neighborhood_BrkSide   -0.134790
Neighborhood_Edwards   -0.176119
Neighborhood_IDOTRR    -0.189237
Neighborhood_NAmes     -0.189387
Neighborhood_OldTown   -0.208371
Name: SalePrice, dtype: float64

In [125]:
corr_top = corr['SalePrice'].sort_values(ascending=False).loc[corr['SalePrice'] > .5]

In [126]:
temp_features = corr_top.index.values[1:]

In [127]:
temp_features

array(['Overall Qual', 'Gr Liv Area', 'Garage Area', 'Garage Cars',
       'Total Bsmt SF', '1st Flr SF', 'Year Built', 'Year Remod/Add',
       'Full Bath', 'Garage Yr Blt', 'Mas Vnr Area', 'TotRms AbvGrd'],
      dtype=object)

In [128]:
temp_X = df_train[temp_features].copy()
temp_X.dropna(inplace=True)

In [129]:
vif = pd.DataFrame()
vif['VIF'] = [variance_inflation_factor(temp_X.values, i) for i in range(temp_X.shape[1])]
vif['variable'] = temp_X.columns

In [130]:
vif

Unnamed: 0,VIF,variable
0,46.373605,Overall Qual
1,48.879322,Gr Liv Area
2,34.659636,Garage Area
3,39.417879,Garage Cars
4,22.622198,Total Bsmt SF
5,33.894146,1st Flr SF
6,18268.089003,Year Built
7,13379.25186,Year Remod/Add
8,18.345397,Full Bath
9,25472.727264,Garage Yr Blt


Thinking that 'Garage Area' and 'Garage Cars' are redundant. 'Garage Yr Blt' and 'Year Remod/Add' are likely covered by 'Year Built'. 'TotRms AbvGrd' and '1st Flr SF' are probably covered by 'Gr Liv Area'. Will give it a go with stripping those out.

In [131]:
temp_X_2 = temp_X.copy()
temp_X_2.drop(columns=[
    'Garage Area', 
    'Garage Yr Blt', 
    'Year Remod/Add', 
    'TotRms AbvGrd', 
    '1st Flr SF',
    ],inplace=True)

In [132]:
temp_X_2

Unnamed: 0,Overall Qual,Gr Liv Area,Garage Cars,Total Bsmt SF,Year Built,Full Bath,Mas Vnr Area
0,6,1479,2.0,725.0,1976,2,289.0
1,7,2122,2.0,913.0,1996,2,132.0
2,5,1057,1.0,1057.0,1953,1,0.0
3,5,1444,2.0,384.0,2006,2,0.0
4,6,1445,2.0,676.0,1900,2,0.0
...,...,...,...,...,...,...,...
2046,8,1728,2.0,1884.0,2007,2,0.0
2047,4,861,2.0,861.0,1940,1,0.0
2048,6,1913,2.0,896.0,1928,1,0.0
2049,4,1200,1.0,1200.0,1956,1,0.0


In [133]:
vif_2 = pd.DataFrame()
vif_2['VIF'] = [variance_inflation_factor(temp_X_2.values, i) for i in range(temp_X_2.shape[1])]
vif_2['variable'] = temp_X_2.columns
vif_2

Unnamed: 0,VIF,variable
0,44.231587,Overall Qual
1,20.620981,Gr Liv Area
2,16.354324,Garage Cars
3,10.53922,Total Bsmt SF
4,25.965773,Year Built
5,17.657673,Full Bath
6,1.812801,Mas Vnr Area


Seems worth giving a try with that, I guess...

In [134]:
features = temp_X_2.columns.values
features

array(['Overall Qual', 'Gr Liv Area', 'Garage Cars', 'Total Bsmt SF',
       'Year Built', 'Full Bath', 'Mas Vnr Area'], dtype=object)

In [135]:
df_train[features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 7 columns):
Overall Qual     2051 non-null int64
Gr Liv Area      2051 non-null int64
Garage Cars      2050 non-null float64
Total Bsmt SF    2050 non-null float64
Year Built       2051 non-null int64
Full Bath        2051 non-null int64
Mas Vnr Area     2029 non-null float64
dtypes: float64(3), int64(4)
memory usage: 112.3 KB


In [136]:
# Looking to drop rows where I have null values in my feature set
drop_rows = df_train.loc[df_train['Mas Vnr Area'].isnull()].index
drop_rows = drop_rows.append(df_train.loc[df_train['Garage Cars'].isnull()].index)
drop_rows = drop_rows.append(df_train.loc[df_train['Total Bsmt SF'].isnull()].index)

In [137]:
drop_rows

Int64Index([  22,   41,   86,  212,  276,  338,  431,  451,  591,  844,  913,
             939, 1025, 1244, 1306, 1430, 1434, 1606, 1699, 1815, 1820, 1941,
            1712, 1327],
           dtype='int64')

In [138]:
df_train_clean = df_train.drop(drop_rows)
df_train_clean[features].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2027 entries, 0 to 2050
Data columns (total 7 columns):
Overall Qual     2027 non-null int64
Gr Liv Area      2027 non-null int64
Garage Cars      2027 non-null float64
Total Bsmt SF    2027 non-null float64
Year Built       2027 non-null int64
Full Bath        2027 non-null int64
Mas Vnr Area     2027 non-null float64
dtypes: float64(3), int64(4)
memory usage: 126.7 KB


All variables have non-null and appropriate typed values

In [139]:
X = df_train_clean[features]
y = df_train_clean['SalePrice']

In [140]:
lr = LinearRegression()

In [141]:
lr.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [142]:
lr.coef_, lr.intercept_

(array([20259.1745533 ,    49.10520818, 13620.74541517,    27.1919158 ,
          324.12626488, -4825.2256406 ,    39.82511495]), -704441.393773311)

In [143]:
df_test = pd.read_csv('../datasets/test.csv')

In [144]:
df_test[features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 7 columns):
Overall Qual     878 non-null int64
Gr Liv Area      878 non-null int64
Garage Cars      878 non-null int64
Total Bsmt SF    878 non-null int64
Year Built       878 non-null int64
Full Bath        878 non-null int64
Mas Vnr Area     877 non-null float64
dtypes: float64(1), int64(6)
memory usage: 48.1 KB


Ends up that there's a null value in the Mas Vnr Area variable of the test data... Guess I'll just do a run whithout it and ask into addressing it tomorrow. 

In [145]:
features_2 = temp_X_2.columns.values[:-1]
features_2

array(['Overall Qual', 'Gr Liv Area', 'Garage Cars', 'Total Bsmt SF',
       'Year Built', 'Full Bath'], dtype=object)

In [146]:
df_train[features_2].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 6 columns):
Overall Qual     2051 non-null int64
Gr Liv Area      2051 non-null int64
Garage Cars      2050 non-null float64
Total Bsmt SF    2050 non-null float64
Year Built       2051 non-null int64
Full Bath        2051 non-null int64
dtypes: float64(2), int64(4)
memory usage: 96.3 KB


In [149]:
drop_rows_2 = df_train.loc[df_train['Garage Cars'].isnull()].index
drop_rows_2 = drop_rows_2.append(df_train.loc[df_train['Total Bsmt SF'].isnull()].index)

In [150]:
drop_rows_2

Int64Index([1712, 1327], dtype='int64')

In [152]:
df_train_clean_2 = df_train.drop(drop_rows_2)
df_train_clean_2[features_2].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2049 entries, 0 to 2050
Data columns (total 6 columns):
Overall Qual     2049 non-null int64
Gr Liv Area      2049 non-null int64
Garage Cars      2049 non-null float64
Total Bsmt SF    2049 non-null float64
Year Built       2049 non-null int64
Full Bath        2049 non-null int64
dtypes: float64(2), int64(4)
memory usage: 112.1 KB


In [153]:
X_2 = df_train_clean_2[features_2]
y_2 = df_train_clean_2['SalePrice']

In [154]:
lr_2 = LinearRegression()

In [155]:
lr_2.fit(X_2, y_2)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [156]:
lr_2.coef_, lr_2.intercept_

(array([21209.23041413,    52.34307525, 14445.85024627,    29.94093653,
          351.10672996, -6821.19664199]), -765502.498047933)

In [157]:
pred_2 = lr_2.predict(df_test[features_2])

In [158]:
df_test['SalePrice'] = pred_2
df_test.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,,,,0,4,2006,WD,164627.399975
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,,,,0,8,2006,WD,211783.617545
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,,,,0,9,2006,New,200418.13543
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,,,,0,7,2007,WD,117443.322994
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,185,0,,,,0,7,2009,WD,187749.811628


In [159]:
submission = df_test[['Id', 'SalePrice']]
submission.set_index('Id', inplace=True)

In [160]:
submission.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2658,164627.399975
2718,211783.617545
2414,200418.13543
1989,117443.322994
625,187749.811628


In [161]:
submission.to_csv('../outputs/mlr_2_submission.csv')