# Find the comparables: exterior.txt

The file `exterior.txt` contains important property information about the areas of the property sections. Let's load this file and grab a subset with the important columns to continue our study.

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules (e.g. src.data.utils) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import pickle

import pandas as pd

from src.definitions import ROOT_DIR
from src.data.utils import Table, save_pickle

In [3]:
# Path of the 2016 exterior file, resolved relative to the project root.
exterior_fn = ROOT_DIR / 'data/external/2016/Real_building_land/exterior.txt'
# Fail fast if the file is missing, before any loading is attempted.
assert exterior_fn.exists()

In [4]:
# Wrap the exterior file in the project's Table helper (src.data.utils).
# NOTE(review): the second argument is presumably the data year — confirm in Table.
exterior = Table(exterior_fn, '2016')

# Load accounts of interest
Let's remove the account numbers that don't meet the free-standing single-family home criteria that we established while processing the `building_res.txt` file.

In [5]:
# Ask the Table helper which rows to skip when reading the file.
# NOTE(review): presumably these are the rows of accounts that failed the
# single-family criteria — confirm in Table.get_skiprows.
skiprows = exterior.get_skiprows()

In [6]:
# Load the exterior table, passing along the rows to skip computed above.
exterior_df = exterior.get_df(skiprows=skiprows)

In [7]:
# Preview the first rows of the loaded exterior table.
exterior_df.head()

Unnamed: 0,acct,bld_num,sar_cd,sar_dscr,area
0,101660000018,1,BAU,BASE AREA UPR,592
1,101660000018,1,BAU,BASE AREA UPR,592
2,101660000018,1,FSU,ONE STORY FRAME UPR,96
3,101660000033,1,FSP,ONE STORY FRAME PRI,271
4,101660000033,1,OMP,OPEN MAS PORCH PRI,130


In [8]:
# Count rows per exterior feature description and show the 20 most frequent.
exterior_df.sar_dscr.value_counts().head(20)

BASE AREA PRI           958579
OPEN FRAME PORCH PRI    714587
MAS/BRK GARAGE PRI      560301
OPEN MAS PORCH PRI      429101
BASE AREA UPR           346749
ONE STORY MAS PRI       267074
ONE STORY FRAME UPR     129798
ONE STORY FRAME PRI     121991
FRAME GARAGE PRI        117535
MAS/CONC PATIO PRI      116876
CARPORT PRI              40416
CANOPY PRI               40183
WOOD DECK PRI            35478
ONE STORY MAS UPR        33511
FRAME OVERHANG UPR       29155
ATTIC FINISHED           26280
ENCL FRAME PORCH PRI     26140
OPEN FRAME PORCH UPR     26117
BASE AREA LWR            12945
FRAME UTIL BLDG PRI      11732
Name: sar_dscr, dtype: int64

# Grab slice of the exterior features of interest
The value counts of the exterior feature descriptions above show that the majority of the features fall in the top 10 categories. Let's keep only the rows that belong to those categories and filter out the rest.

In [9]:
# Labels of the 10 most frequent feature descriptions, most common first.
cols = exterior_df['sar_dscr'].value_counts().index[:10]

In [10]:
# Keep only the rows whose feature description is one of the selected labels.
cond0 = exterior_df.sar_dscr.isin(cols)
exterior_df = exterior_df[cond0]

# Build pivot table
Let's build a pivot table with the account number (`acct`) as index, the exterior feature description (`sar_dscr`) as columns, and `area` as values. Since the areas are already split by primary (PRI) and upper (UPR), I'll assume there should be only one entry per exterior feature category for each property, and thus aggregate multiple occurrences by taking the maximum value.

In [11]:
# One row per account and one column per retained feature description.
# Repeated (acct, sar_dscr) entries collapse to their largest area, and
# features an account does not have are filled with 0.
exterior_pivot = pd.pivot_table(
    exterior_df,
    values='area',
    index='acct',
    columns='sar_dscr',
    aggfunc='max',
    fill_value=0,
)

In [12]:
# Preview the first rows of the pivoted table (index: acct, columns: features).
exterior_pivot.head()

sar_dscr,BASE AREA PRI,BASE AREA UPR,FRAME GARAGE PRI,MAS/BRK GARAGE PRI,MAS/CONC PATIO PRI,ONE STORY FRAME PRI,ONE STORY FRAME UPR,ONE STORY MAS PRI,OPEN FRAME PORCH PRI,OPEN MAS PORCH PRI
acct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21440000001,1658,879,0,0,0,0,0,0,105,0
21470000008,1000,0,0,0,0,0,0,0,0,0
21480000002,1496,0,0,0,0,0,0,0,182,0
21650000007,1517,1870,0,529,0,0,0,0,138,36
21650000011,1508,0,420,0,0,0,0,0,0,0


Add `acct` back as a regular column to make the merging process ahead easier.

In [13]:
# Promote the `acct` index to a regular column. The guard keeps this cell
# idempotent: once `acct` is a column, re-running the cell no longer inserts
# a spurious `index` column. Reassignment is used instead of `inplace=True`.
if 'acct' not in exterior_pivot.columns:
    exterior_pivot = exterior_pivot.reset_index()

In [14]:
# Sanity check: every account number must appear in exactly one row.
assert exterior_pivot['acct'].is_unique

# Fix column names
We would like the column names to be all lower case, with spaces and other non-alphanumeric characters replaced by underscores.

In [15]:
from src.data.utils import fix_column_names

In [16]:
# Column labels before normalization.
exterior_pivot.columns

Index(['acct', 'BASE AREA PRI', 'BASE AREA UPR', 'FRAME GARAGE PRI',
       'MAS/BRK GARAGE PRI', 'MAS/CONC PATIO PRI', 'ONE STORY FRAME PRI',
       'ONE STORY FRAME UPR', 'ONE STORY MAS PRI', 'OPEN FRAME PORCH PRI',
       'OPEN MAS PORCH PRI'],
      dtype='object', name='sar_dscr')

In [17]:
# Normalize the column labels with the project helper; judging by the output
# of the next cell, labels come back lower-cased with spaces and '/' turned
# into underscores.
exterior_pivot = fix_column_names(exterior_pivot)

In [18]:
# Column labels after normalization.
exterior_pivot.columns

Index(['acct', 'base_area_pri', 'base_area_upr', 'frame_garage_pri',
       'mas_brk_garage_pri', 'mas_conc_patio_pri', 'one_story_frame_pri',
       'one_story_frame_upr', 'one_story_mas_pri', 'open_frame_porch_pri',
       'open_mas_porch_pri'],
      dtype='object')

### Find duplicated rows

In [19]:
# Flag rows that exactly repeat an earlier row, then display them (if any).
cond0 = exterior_pivot.duplicated()
exterior_pivot[cond0]

Unnamed: 0,acct,base_area_pri,base_area_upr,frame_garage_pri,mas_brk_garage_pri,mas_conc_patio_pri,one_story_frame_pri,one_story_frame_upr,one_story_mas_pri,open_frame_porch_pri,open_mas_porch_pri


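# Describe
A quick look at the schema and summary statistics of the final table. Note that the `describe()` output shows a negative minimum (-63) for `open_mas_porch_pri`; an area cannot be negative, so this value should be reviewed before the column is used.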

In [20]:
# Show the dtypes, non-null counts and memory footprint of the final table.
exterior_pivot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 957680 entries, 0 to 957679
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype
---  ------                --------------   -----
 0   acct                  957680 non-null  int64
 1   base_area_pri         957680 non-null  int64
 2   base_area_upr         957680 non-null  int64
 3   frame_garage_pri      957680 non-null  int64
 4   mas_brk_garage_pri    957680 non-null  int64
 5   mas_conc_patio_pri    957680 non-null  int64
 6   one_story_frame_pri   957680 non-null  int64
 7   one_story_frame_upr   957680 non-null  int64
 8   one_story_mas_pri     957680 non-null  int64
 9   open_frame_porch_pri  957680 non-null  int64
 10  open_mas_porch_pri    957680 non-null  int64
dtypes: int64(11)
memory usage: 80.4 MB


In [21]:
# Summary statistics per column; useful for spotting implausible values.
exterior_pivot.describe()

Unnamed: 0,acct,base_area_pri,base_area_upr,frame_garage_pri,mas_brk_garage_pri,mas_conc_patio_pri,one_story_frame_pri,one_story_frame_upr,one_story_mas_pri,open_frame_porch_pri,open_mas_porch_pri
count,957680.0,957680.0,957680.0,957680.0,957680.0,957680.0,957680.0,957680.0,957680.0,957680.0,957680.0
mean,1013456000000.0,1495.021086,403.354231,45.982009,237.578059,35.882803,34.753537,30.488229,78.742689,68.406385,33.197644
std,268959800000.0,610.475319,617.077021,137.220659,233.084651,117.541261,134.097456,112.267015,220.327554,104.230796,75.936034
min,21440000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-63.0
25%,851040000000.0,1064.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1112500000000.0,1418.0,0.0,0.0,311.0,0.0,0.0,0.0,0.0,33.0,0.0
75%,1203490000000.0,1820.0,888.0,0.0,440.0,0.0,0.0,0.0,0.0,96.0,38.0
max,1955020000000.0,13451.0,12950.0,3120.0,3250.0,4328.0,4950.0,3597.0,5693.0,3312.0,3500.0


# Export exterior_comps

In [22]:
# Destination of the pickled pivot table, relative to the project root.
save_fn = ROOT_DIR / 'data/raw/2016/exterior_comps.pickle'
# Write the pivoted exterior table to disk with the project's save_pickle helper.
save_pickle(exterior_pivot, save_fn)