In [1]:
import numpy as np
import pandas as pd
%matplotlib inline 

# Load 'Ames_HousePrice.csv' into a DataFrame; the first CSV column is used
# as the row index rather than as a data column.
housing = pd.read_csv('Ames_HousePrice.csv', index_col=0)

In [2]:
# Show the names of all columns in the loaded frame.
housing.columns

Index(['PID', 'GrLivArea', 'SalePrice', 'MSSubClass', 'MSZoning',
       'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour',
       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond',
       'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond

In [3]:
# Sale price column, kept as its own Series.
price = housing['SalePrice']

# Size features: every column whose name ends in "SF" or "Area".
# NOTE(review): the pattern is anchored at the end of the name, so columns
# with a trailing digit such as 'BsmtFinSF1' / 'BsmtFinSF2' are not selected
# -- confirm that this is intended.
size_related = housing.filter(regex='SF$|Area$', axis='columns')

# Preview the first five rows of the selection.
size_related.head(n=5)

Unnamed: 0,GrLivArea,LotArea,MasVnrArea,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GarageArea,WoodDeckSF,OpenPorchSF,PoolArea
1,856,7890,0.0,618.0,856.0,856,0,0,399.0,0,0,0
2,1049,4235,149.0,104.0,1049.0,1049,0,0,266.0,0,105,0
3,1001,6060,0.0,100.0,837.0,1001,0,0,216.0,154,0,0
4,1039,8146,0.0,405.0,405.0,717,322,0,281.0,0,0,0
5,1665,8400,0.0,167.0,810.0,810,855,0,528.0,0,45,0


In [4]:
# Replace every NaN in the size-related columns with 0.0.
# NOTE(review): presumably a missing area means the house lacks that feature
# -- confirm before relying on the zeros.
size_related = size_related.fillna(0.0)

In [5]:
# Number of missing values in each column of the full frame.
print(housing.isna().sum(axis='index'))

PID              0
GrLivArea        0
SalePrice        0
MSSubClass       0
MSZoning         0
                ..
MiscVal          0
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
Length: 81, dtype: int64


In [6]:
# Show which columns ended up in the size-related subset.
size_related.columns

Index(['GrLivArea', 'LotArea', 'MasVnrArea', 'BsmtUnfSF', 'TotalBsmtSF',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'PoolArea'],
      dtype='object')

In [7]:
# For each size feature, print its mean over only the rows where the value
# is non-zero, rounded to the nearest integer.
for col in size_related.columns:
    nonzero_values = size_related.loc[size_related[col].ne(0), col]
    print(f'{col}: ', round(nonzero_values.mean()))

GrLivArea:  1486
LotArea:  10120
MasVnrArea:  252
BsmtUnfSF:  588
TotalBsmtSF:  1065
1stFlrSF:  1145
2ndFlrSF:  779
LowQualFinSF:  332
GarageArea:  491
WoodDeckSF:  199
OpenPorchSF:  84
PoolArea:  476


In [8]:
# Print the missing-value count for every column that has at least one NaN.
# The counts are computed once for the whole frame and then filtered.
for col, n_missing in housing.isnull().sum().items():
    if n_missing != 0:
        print(f'{col}: ', n_missing)

LotFrontage:  462
Alley:  2412
MasVnrType:  14
MasVnrArea:  14
BsmtQual:  69
BsmtCond:  69
BsmtExposure:  71
BsmtFinType1:  69
BsmtFinSF1:  1
BsmtFinType2:  70
BsmtFinSF2:  1
BsmtUnfSF:  1
TotalBsmtSF:  1
Electrical:  1
BsmtFullBath:  2
BsmtHalfBath:  2
FireplaceQu:  1241
GarageType:  127
GarageYrBlt:  129
GarageFinish:  129
GarageCars:  1
GarageArea:  1
GarageQual:  129
GarageCond:  129
PoolQC:  2571
Fence:  2055
MiscFeature:  2483


In [9]:
# Build hp_dict, a mapping of feature name -> description, from the
# 'housePrice_features' text file, which holds one "feature: description"
# entry per line.
#
# Each line is split on its FIRST colon only, so a description that itself
# contains ':' is kept whole instead of breaking the two-name unpacking.
# Blank lines are skipped. A non-blank line without any colon raises a
# ValueError that names the offending line.
#
# Descriptions are stored exactly as they appear after the colon, i.e. they
# keep the leading space and the trailing newline (e.g. ' Pool quality\n').
with open('housePrice_features') as hp_feat:
    hp_dict = {}
    # Iterate the file lazily instead of materialising it with readlines().
    for line_no, line in enumerate(hp_feat, start=1):
        if not line.strip():
            continue  # tolerate empty / whitespace-only lines
        feature, colon, description = line.partition(':')
        if not colon:
            raise ValueError(
                f"housePrice_features line {line_no}: "
                f"expected 'feature: description', got {line!r}"
            )
        hp_dict[feature] = description

In [10]:
# Look up the description stored for the 'PoolQC' feature.
hp_dict['PoolQC']

' Pool quality\n'

In [11]:
# Distinct values that occur in the 'MiscFeature' column (NaN is listed as
# one of the values when present).
housing['MiscFeature'].unique()

array([nan, 'Shed', 'Othr', 'Gar2', 'TenC'], dtype=object)

In [12]:
# Sale prices of the houses whose pool quality is recorded as 'TA',
# selected with a single label-based lookup.
housing.loc[housing['PoolQC'] == 'TA', 'SalePrice']

180    228500
192    130000
657    153000
Name: SalePrice, dtype: int64

In [13]:
# Mean and median sale price for each recorded pool-quality level.
# Rows whose 'PoolQC' is NaN are left out by groupby's default behaviour.
pools = housing.groupby('PoolQC')['SalePrice'].agg(['mean', 'median'])
pools

Unnamed: 0_level_0,mean,median
PoolQC,Unnamed: 1_level_1,Unnamed: 2_level_1
Ex,465000.0,465000.0
Fa,215500.0,215500.0
Gd,215500.0,215500.0
TA,170500.0,153000.0
