# Find the comparables: extra_features.txt

The file `extra_features.txt` contains important property information like number and quality of pools, detached garages, outbuildings, canopies, and more. Let's load this file and grab a subset with the important columns to continue our study.

In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so that edits to imported project code are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import pickle

import pandas as pd

from src.definitions import ROOT_DIR
from src.data.utils import Table, save_pickle

In [3]:
# Path to extra_features.txt inside the 2016 external data folder, resolved
# against the project root.
extra_features_fn = Path(
    ROOT_DIR,
    'data/external/2016/Real_building_land/extra_features.txt',
)
# Stop right away if the input file is not on disk.
assert extra_features_fn.exists()

In [4]:
# Wrap the file in the project's Table helper. The second argument '2016' is
# presumably the data year -- confirm against src.data.utils.Table.
extra_features = Table(extra_features_fn, '2016')

In [5]:
# Pull the table contents out as a pandas DataFrame for the filtering and
# pivoting steps that follow.
extra_features_df = extra_features.get_df()

# Load accounts of interest
Let's remove the account numbers that don't meet free-standing single-family home criteria that we found while processing the `building_res.txt` file.

In [6]:
# Pickle holding the account numbers to keep, located in the 2016 raw data folder.
one_bld_in_acct_fn = Path(ROOT_DIR, 'data/raw/2016/one_bld_in_acct.pickle')

In [7]:
# Read the pickled collection of account numbers back into memory.
# NOTE(review): unpickling can execute arbitrary code; this is assumed to be a
# file produced by this project's own processing -- never point it at untrusted data.
with one_bld_in_acct_fn.open('rb') as pickle_file:
    one_bld_in_acct = pickle.load(pickle_file)

In [8]:
# Keep only the rows whose account number appears in the loaded accounts of interest.
in_accts_of_interest = extra_features_df['acct'].isin(one_bld_in_acct)
extra_features_df = extra_features_df[in_accts_of_interest]

In [9]:
# Peek at the first five rows that survived the account filter.
extra_features_df.head(n=5)

Unnamed: 0,acct,bld_num,count,grade,cd,s_dscr,l_dscr,cat,dscr,note,uts
1257,21440000001,0,2,4,RRS1,WDUtSh,Frame Utility Shed,OB,Outbuildings,,110.0
1258,21440000001,0,2,4,RRS1,WDUtSh,Frame Utility Shed,OB,Outbuildings,,130.0
1265,21480000002,1,1,4,ROGV,OtherRs,Residential Other Gross Value,MS,Miscellaneous,SALV GAR APMT.,0.5
1320,21650000007,0,1,4,RRP5,GnPool,Gunite Pool,PL,Pools,,368.0
1323,21700000013,0,1,5,RRG1,FrmGar,Frame Detached Garage,GR,Garage,,225.0


In [10]:
# List the available column labels before choosing which ones to pivot on.
extra_features_df.columns

Index(['acct', 'bld_num', 'count', 'grade', 'cd', 's_dscr', 'l_dscr', 'cat',
       'dscr', 'note', 'uts'],
      dtype='object')

In [11]:
# Number of rows per feature category (`dscr`), most frequent first.
extra_features_df['dscr'].value_counts()

Garage                     203945
Pools                      131083
Outbuildings               121614
Canopy                      85893
Carport                     76646
Foundation Defect           35431
Miscellaneous               21902
Boat Docks & Structures      2233
Greenhouse                    645
Solar                         439
Spaces for Mobile Homes       375
Skirting                      317
Tennis Courts                 287
Decking                        65
Paving                         49
Porches                        11
Additions                       3
Name: dscr, dtype: int64

# Grab slice of the extra features of interest
The value counts on the extra feature description above show that the majority of the features land in the top 6 categories. Let's keep only the rows that belong to those categories and filter out the rest.

In [12]:
# Despite its name, `cols` holds the six most frequent `dscr` category labels
# (value_counts sorts descending, so the first six index entries are the top six).
cols = extra_features_df['dscr'].value_counts().index[:6]

In [13]:
# Drop every row whose category is not among the labels selected in `cols`.
in_top_categories = extra_features_df['dscr'].isin(cols)
extra_features_df = extra_features_df[in_top_categories]

# Build pivot tables for count and grade
There appear to be two important values related to each extra feature: count and grade. Let's build an individual pivot table for each and merge the two before saving the result.

In [14]:
# One row per account, one column per feature category, cells taken from `count`.
# NOTE(review): the aggregation is pandas' default, the mean, so several rows
# for the same (acct, dscr) pair are averaged rather than summed -- confirm
# that this is what is wanted for a count.
extra_features_pivot_count = pd.pivot_table(
    extra_features_df,
    values='count',
    index='acct',
    columns='dscr',
    aggfunc='mean',
)

In [15]:
# First five accounts of the count pivot; NaN marks a category with no rows for that account.
extra_features_pivot_count.head(n=5)

dscr,Canopy,Carport,Foundation Defect,Garage,Outbuildings,Pools
acct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
21440000001,,,,,2.0,
21650000007,,,,,,1.0
21700000013,,,,1.0,,
21750000013,,,,2.0,,
22070000007,,,,1.0,,


In [16]:
# Same layout as the count pivot, but the cells come from `grade`.
# NOTE(review): the aggregation is pandas' default, the mean, so an account with
# several rows in one category gets the average of their grades -- confirm
# that averaging grades is intended.
extra_features_pivot_grade = pd.pivot_table(
    extra_features_df,
    values='grade',
    index='acct',
    columns='dscr',
    aggfunc='mean',
)

In [17]:
# First five accounts of the grade pivot.
extra_features_pivot_grade.head(n=5)

dscr,Canopy,Carport,Foundation Defect,Garage,Outbuildings,Pools
acct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
21440000001,,,,,4.0,
21650000007,,,,,,4.0
21700000013,,,,5.0,,
21750000013,,,,4.0,,
22070000007,,,,5.0,,


In [18]:
# Put the two pivots side by side, matching rows on the shared `acct` index.
# Both frames carry the same category column labels, so each label gets a
# '_count' or '_grade' suffix; validate='one_to_one' makes pandas raise if
# either index contains duplicate accounts.
extra_features_count_grade = pd.merge(
    extra_features_pivot_count,
    extra_features_pivot_grade,
    left_index=True,
    right_index=True,
    how='inner',
    suffixes=('_count', '_grade'),
    validate='one_to_one',
)

In [19]:
# First five accounts of the merged count/grade table.
extra_features_count_grade.head(n=5)

dscr,Canopy_count,Carport_count,Foundation Defect_count,Garage_count,Outbuildings_count,Pools_count,Canopy_grade,Carport_grade,Foundation Defect_grade,Garage_grade,Outbuildings_grade,Pools_grade
acct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
21440000001,,,,,2.0,,,,,,4.0,
21650000007,,,,,,1.0,,,,,,4.0
21700000013,,,,1.0,,,,,,5.0,,
21750000013,,,,2.0,,,,,,4.0,,
22070000007,,,,1.0,,,,,,5.0,,


In [20]:
# Sanity check before exporting: each account must appear exactly once in the index.
assert not extra_features_count_grade.index.has_duplicates

# Export extra_features_count_grade

In [21]:
# Write the merged count/grade table out through the project's save_pickle helper.
save_fn = Path(ROOT_DIR, 'data/raw/2016/extra_features_count_grade.pickle')
save_pickle(extra_features_count_grade, save_fn)