# Find the comparables: exterior.txt

The file `exterior.txt` contains important property information about the areas of the property sections. Let's load this file and grab a subset with the important columns to continue our study.

In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to the local `src` package are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import pickle

import pandas as pd

from src.definitions import ROOT_DIR
from src.data.utils import Table, save_pickle

In [3]:
# Path to the 2016 exterior table, resolved relative to the project root.
exterior_fn = ROOT_DIR / 'data/external/2016/Real_building_land/exterior.txt'
# Fail fast if the input file is missing.
assert exterior_fn.exists()

In [4]:
# Wrap the exterior file in the project's Table helper.
# NOTE(review): '2016' is presumably the data year used to pick the column
# layout -- confirm against src.data.utils.Table.
exterior = Table(exterior_fn, '2016')

In [5]:
# Get the table contents as a DataFrame (it is filtered with `.isin`/`.loc`
# and pivoted in the cells that follow).
exterior_df = exterior.get_df()

# Load accounts of interest
Let's remove the account numbers that don't meet free-standing single-family home criteria that we found while processing the `building_res.txt` file.

In [6]:
# Pickle holding the account numbers kept while processing building_res.txt.
one_bld_in_acct_fn = ROOT_DIR / 'data/raw/2016/one_bld_in_acct.pickle'

In [7]:
# Load the accounts of interest. The pickle is produced by this project;
# never unpickle files from an untrusted source.
with one_bld_in_acct_fn.open('rb') as pickle_file:
    one_bld_in_acct = pickle.load(pickle_file)

In [8]:
# Keep only the exterior rows whose account is in the accounts of interest.
is_acct_of_interest = exterior_df['acct'].isin(one_bld_in_acct)
exterior_df = exterior_df[is_acct_of_interest]

In [9]:
# Preview the first five filtered rows.
exterior_df.head(n=5)

Unnamed: 0,acct,bld_num,sar_cd,sar_dscr,area
58,101660000018,1,BAU,BASE AREA UPR,592
59,101660000018,1,BAU,BASE AREA UPR,592
60,101660000018,1,FSU,ONE STORY FRAME UPR,96
61,101660000033,1,FSP,ONE STORY FRAME PRI,271
62,101660000033,1,OMP,OPEN MAS PORCH PRI,130


In [10]:
# Fifteen most frequent exterior feature descriptions, most common first.
exterior_df['sar_dscr'].value_counts().iloc[:15]

BASE AREA PRI           958579
OPEN FRAME PORCH PRI    714587
MAS/BRK GARAGE PRI      560301
OPEN MAS PORCH PRI      429101
BASE AREA UPR           346749
ONE STORY MAS PRI       267074
ONE STORY FRAME UPR     129798
ONE STORY FRAME PRI     121991
FRAME GARAGE PRI        117535
MAS/CONC PATIO PRI      116876
CARPORT PRI              40416
CANOPY PRI               40183
WOOD DECK PRI            35478
ONE STORY MAS UPR        33511
FRAME OVERHANG UPR       29155
Name: sar_dscr, dtype: int64

# Grab slice of the exterior features of interest
From the value counts of the exterior feature descriptions above, we can see that the majority of the features fall in the top 10 categories. Let's keep only the rows in those categories and filter out the rest.

In [11]:
# Labels of the ten most frequent `sar_dscr` categories. Despite the name,
# `cols` holds category values here; they become columns only after the pivot.
cols = exterior_df['sar_dscr'].value_counts().index[:10]

In [13]:
# Drop every row whose feature description is not one of the selected ones.
is_top_feature = exterior_df['sar_dscr'].isin(cols)
exterior_df = exterior_df[is_top_feature]

# Build pivot table
Let's build a pivot table with the account number (`acct`) as index, surface area `sar_dscr` as column, and `area` as values.

In [14]:
# One row per account, one column per exterior feature, cell value = area.
# Accounts lacking a feature get 0 via `fill_value`.
# `aggfunc='mean'` is pandas' default, written out so the behaviour is visible:
# when an account has several rows for the same feature, their areas are
# averaged into a single cell.
# NOTE(review): confirm that averaging (rather than summing) repeated features
# is what the comparables study needs.
exterior_pivot = pd.pivot_table(
    exterior_df,
    values='area',
    index='acct',
    columns='sar_dscr',
    aggfunc='mean',
    fill_value=0,
)

In [15]:
# Preview the first five accounts of the pivoted table.
exterior_pivot.head(n=5)

sar_dscr,BASE AREA PRI,BASE AREA UPR,FRAME GARAGE PRI,MAS/BRK GARAGE PRI,MAS/CONC PATIO PRI,ONE STORY FRAME PRI,ONE STORY FRAME UPR,ONE STORY MAS PRI,OPEN FRAME PORCH PRI,OPEN MAS PORCH PRI
acct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21440000001,1658.0,879.0,0.0,0.0,0.0,0.0,0.0,0.0,80.5,0.0
21470000008,1000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21480000002,1496.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,182.0,0.0
21650000007,1517.0,1870.0,0.0,529.0,0.0,0.0,0.0,0.0,138.0,36.0
21650000011,1508.0,0.0,420.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Move `acct` from the index into a regular column to make the merging process ahead easier.

In [16]:
# Turn the `acct` index into an ordinary column (reassignment instead of
# `inplace=True`; the resulting frame is the same).
exterior_pivot = exterior_pivot.reset_index()

In [17]:
# Sanity check: every account must appear exactly once before merging.
assert not exterior_pivot['acct'].duplicated().any()

# Export exterior_comps

In [18]:
# Persist the pivoted exterior areas with the project's pickle helper so the
# later merging step can load them.
save_fn = ROOT_DIR / 'data/raw/2016/exterior_comps.pickle'
save_pickle(exterior_pivot, save_fn)