# Fulton Parcel Processing

In [1]:
import pandas as pd
import os

pd.set_option('display.max_columns', None)

In [2]:
# Original names of variables, include these on data format sheet
original_vars = [
    'Taxyr', 'Parid', 'Situs Adrno', 'Situs Adrdir',
    'Situs Adrstr', 'Situs Adrsuf', 'Situs Adrsuf2',
    'Cityname', 'Class', 'Luc', 'Livunit',
    'Taxdist', 'Own1', 'Own2', 'Owner Adrno',
    'Owner Adradd', 'Owner Adrdir', 'Owner Adrstr',
    'Owner Adrsuf', 'Owner Adrsuf2', 'Cityname.1',
    'Statecode', 'Country', 'Unitno', 'Zip1', 'Reascd',
    'Aprland', 'Aprbldg', 'Revcode', 'Revreas', 'Aprtot',
    'D Yrblt', 'D Effyr', 'D Yrremod', 'D Grade', 'Sfla'
]
# New names of variables, 'final variable name' on data format sheet
new_var_map = {
    'Taxyr': 'tax_year', 'Parid': 'parcel_id', 'site_addr': 'site_addr',
    'Class': 'site_class_parcel', 'Luc': 'site_luc_parcel',
    'Taxdist': 'tax_district', 'Own1': 'owner_name_1', 
    'Own2': 'owner_name_2', 'Zip1': 'owner_zip', 'owner_addr': 'owner_addr',
    'Reascd': 'reas_cd', 'Revcode': 'rev_code',
    'Revreas': 'rev_reas', 'Aprtot': 'appr_total',
    'Aprland': 'appr_land', 'Aprbldg': 'appr_build', 'D Yrblt': 'year_built',
    'D Effyr': 'year_eff', 'D Yrremod': 'year_remodel', 'D Grade': 'grade',
    'Livunit': 'num_units', 'Sfla': 'building_area'
}
# New variable names ('final variable name' on data format sheet)
# with their associated datatype mapping
dtypes = {
    'tax_year': 'int16', 'parcel_id': 'string', 'site_addr': 'string',
    'site_class_parcel': 'category', 'site_luc_parcel': 'category',
    'tax_district': 'category', 'owner_name_1': 'string',
    'owner_name_2': 'string', 'owner_zip': 'string', 'owner_addr': 'string',
    'reas_cd': 'string', 'rev_code': 'string', 'rev_reas': 'string',
    'appr_total': 'float32', 'appr_land': 'float32', 'appr_build': 'float32',
    'year_built': 'int16', 'year_eff': 'int16',
    'year_remodel': 'int16', 'grade': 'category',
    'num_units': 'int16', 'building_area': 'float32'
}

## Read in Files

Get all file names in directory.

In [3]:
prefix_path = '../data/fulton/parcels/'
file_paths = os.listdir(prefix_path)
file_paths[:2] # printing out the first 2 files

['14_2010-2011.xlsx', '14_2012-2013.xlsx']

Loop through files and read them in. Store them in a list of dataframes. appendeded on a single file first.

In [4]:
dfs = []

for file in file_paths[4:6]:
    dfs.append(
        pd.read_excel(prefix_path + file)
    )

## Append Files

Dropping extra vars.

Recommend dropping uneeded variables first if data size is large; the best way to do this is by only keeping variables we need because we don't need many.

We also want to remove the last row of each file (for Fulton, at least), because it simply contains a 'Count Distinct' value in the source data- this is not something we want.

In [5]:
# Calculating total num of rows to verify next step
total_rows = 0

for index, df in enumerate(dfs):
    init = len(df.columns)
    dfs[index] = df[original_vars][:-1]
    total_rows += len(df)
    print(f'Dropped {init - len(df.columns)} columns')

Dropped 0 columns
Dropped 0 columns


Now all dfs should have the same columns. We can append.

In [6]:
appended = pd.concat(dfs, ignore_index=True)

init_size = len(appended) # Take inital size of data for later calc

print(f'Appended has {len(appended)} rows; total rows calculated '
      + f'previously is {total_rows}')

Appended has 175077 rows; total rows calculated previously is 175079


Construct the derived variables (e.g. site_addr from the multiple site address variables in this data). However, before we can do this, we need to clean each of the columns we will join to create this variable.

More specifically, we need check how many None values there are in each column, then replace them with something that makes sense. This is because if you try to add a row with value None for one column (say address num) and "Main Street" for another, the result will be None.

In [7]:
check_vars = ['Situs Adrno', 'Situs Adrdir','Situs Adrstr',
              'Situs Adrsuf', 'Situs Adrsuf2', 'Cityname',
              'Owner Adrno', 'Owner Adrstr', 'Owner Adradd',
              'Owner Adrdir', 'Owner Adrsuf', 'Owner Adrsuf2',
              'Cityname.1', 'Statecode', 'Unitno']

print("Percent NA by Variable ---")
for v in check_vars:
    pct_na = len(appended[appended[v].isna()]) / len(appended)* 100
    print(f'{v}: {pct_na:.2f}')

Percent NA by Variable ---
Situs Adrno: 0.01
Situs Adrdir: 99.91
Situs Adrstr: 0.00
Situs Adrsuf: 1.39
Situs Adrsuf2: 25.59
Cityname: 0.08
Owner Adrno: 5.33
Owner Adrstr: 0.39
Owner Adradd: 99.53
Owner Adrdir: 96.72
Owner Adrsuf: 7.90
Owner Adrsuf2: 51.49
Cityname.1: 0.36
Statecode: 0.38
Unitno: 80.21


Let's investigate some examples of NA for Cityname because, while insignificant, it is an important column.

In [8]:
appended[appended['Cityname'].isna()].sample(2)

Unnamed: 0,Taxyr,Parid,Situs Adrno,Situs Adrdir,Situs Adrstr,Situs Adrsuf,Situs Adrsuf2,Cityname,Class,Luc,Livunit,Taxdist,Own1,Own2,Owner Adrno,Owner Adradd,Owner Adrdir,Owner Adrstr,Owner Adrsuf,Owner Adrsuf2,Cityname.1,Statecode,Country,Unitno,Zip1,Reascd,Aprland,Aprbldg,Revcode,Revreas,Aprtot,D Yrblt,D Effyr,D Yrremod,D Grade,Sfla
9737,2017.0,14 002000072170,791.0,,WYLIE,ST,SE,,R3,106,1.0,05T,SEMANICK KEVIN J,,791.0,,,WYLIE,ST,,ATLANTA,GA,,1006,30316,,22000.0,193700.0,,,215700.0,2006.0,,,A,1736.0
34060,2017.0,14 008400072538,425.0,,CHAPEL,ST,SW,,R3,106,1.0,05Z,CRUMP MARK & SHIRLEY,,425.0,,,CHAPEL,ST,,ATLANTA,GA,,3208,30313,,15100.0,66500.0,,,81600.0,2006.0,,,B+,1196.0


In [9]:
appended[appended['Situs Adrstr'].isna()].sample(1)

ValueError: a must be greater than 0 unless no samples are taken

Let's also investigate NA examples of 'Owner Adrno', 'Owner Adrstr', 'Cityname.1', and 'Statecode'

In [10]:
appended[appended['Owner Adrno'].isna()].sample(2)

Unnamed: 0,Taxyr,Parid,Situs Adrno,Situs Adrdir,Situs Adrstr,Situs Adrsuf,Situs Adrsuf2,Cityname,Class,Luc,Livunit,Taxdist,Own1,Own2,Owner Adrno,Owner Adradd,Owner Adrdir,Owner Adrstr,Owner Adrsuf,Owner Adrsuf2,Cityname.1,Statecode,Country,Unitno,Zip1,Reascd,Aprland,Aprbldg,Revcode,Revreas,Aprtot,D Yrblt,D Effyr,D Yrremod,D Grade,Sfla
166542,2018.0,14 021500030520,1140.0,,TUCKAWANNA,DR,SW,ATLANTA,R3,101,1.0,5,CHANEY FRANCES C MRS,,,,,P.O. BOX 347,,,UNION CITY,GA,,,30291,FN,13300.0,52600.0,3.0,,65900.0,1955.0,,,C,1662.0
29503,2017.0,14 007100020656,235.0,,HILLTOP,DR,SW,ATL,R3,101,1.0,5,235 HILLTOP DRIVE LAND TRUST THE,,,,,P.O. BOX 2327,,,ACWORTH,GA,,,30102,,5500.0,13000.0,3.0,,18500.0,1952.0,,1970.0,C-,1308.0


Looks like empty Adrno might be when the owner uses a P.O. Box; we definitely want to keep these.

In [11]:
appended[appended['Owner Adrstr'].isna()].sample(2)

Unnamed: 0,Taxyr,Parid,Situs Adrno,Situs Adrdir,Situs Adrstr,Situs Adrsuf,Situs Adrsuf2,Cityname,Class,Luc,Livunit,Taxdist,Own1,Own2,Owner Adrno,Owner Adradd,Owner Adrdir,Owner Adrstr,Owner Adrsuf,Owner Adrsuf2,Cityname.1,Statecode,Country,Unitno,Zip1,Reascd,Aprland,Aprbldg,Revcode,Revreas,Aprtot,D Yrblt,D Effyr,D Yrremod,D Grade,Sfla
14333,2017.0,14 004000050081,1761.0,,RICHMOND,AVE,SE,ATL,R3,101,1.0,5,RESI SFR SUB LLC,,,,,,,,CHRISTIANSTED,VI,VIRGIN ISLAND,,,,7100.0,37900.0,3.0,30.0,45000.0,1920.0,,2000.0,C,1035.0
43579,2017.0,14 011000100840,656.0,,MAGNOLIA,ST,NW,ATL,R3,100,0.0,05Z,BELTEM TRUST THE,,,,,,,,,,UNITED KINGDOM,,,,6500.0,0.0,,,6500.0,,,,,


Hard to determine anything. But we should just keep them for now.

In [12]:
appended[appended['Cityname.1'].isna()].sample(2)

Unnamed: 0,Taxyr,Parid,Situs Adrno,Situs Adrdir,Situs Adrstr,Situs Adrsuf,Situs Adrsuf2,Cityname,Class,Luc,Livunit,Taxdist,Own1,Own2,Owner Adrno,Owner Adradd,Owner Adrdir,Owner Adrstr,Owner Adrsuf,Owner Adrsuf2,Cityname.1,Statecode,Country,Unitno,Zip1,Reascd,Aprland,Aprbldg,Revcode,Revreas,Aprtot,D Yrblt,D Effyr,D Yrremod,D Grade,Sfla
142657,2018.0,14 014100061094,116.0,,CHICAMAUGA,AVE,SW,ATLANTA,R3,101,1.0,5,CX HOLDINGS LLC,,,,,,,,,,CHINA,,,FN,11300.0,30100.0,3.0,,41400.0,1940.0,,,C-,2037.0
136308,2018.0,14 012100100680,1772.0,,MELROSE,DR,SW,ATLANTA,R3,101,1.0,5,ARLP REO 400 LLC,,,,,,,,,,,,,E8,35400.0,142800.0,3.0,OVR,178200.0,1940.0,1985.0,,C+,2371.0


Hard to determine anything. But we should just keep them for now.

In [13]:
appended[appended['Statecode'].isna()].sample(2)

Unnamed: 0,Taxyr,Parid,Situs Adrno,Situs Adrdir,Situs Adrstr,Situs Adrsuf,Situs Adrsuf2,Cityname,Class,Luc,Livunit,Taxdist,Own1,Own2,Owner Adrno,Owner Adradd,Owner Adrdir,Owner Adrstr,Owner Adrsuf,Owner Adrsuf2,Cityname.1,Statecode,Country,Unitno,Zip1,Reascd,Aprland,Aprbldg,Revcode,Revreas,Aprtot,D Yrblt,D Effyr,D Yrremod,D Grade,Sfla
54722,2017.0,14 014100040312,126.0,,WELLINGTON,ST,SW,ATL,R3,102,2.0,5,ARLP REO 400 LLC,,,,,,,,,,,,,,10700.0,26400.0,,,37100.0,1940.0,,,C-,1472.0
69056,2017.0,14 017900040499,2274.0,,TIGER FLOWERS,DR,NW,ATL,R3,101,1.0,5,STAMPER FAMILY SUPER INVESTMENTS LLC,,,,,,,,,,,,,,4900.0,33400.0,,,38300.0,2008.0,,,C,2113.0


Many observations without Statecode appear to be in other countries.

There are not any major problems here, so let's just fill all NA with empty strings ''. We need to convert all these variables to strings first though.

In [14]:
print("Fill the above variables with empty strings when NA ---")

for v in check_vars:
    appended[v] = appended[v].astype('string').fillna('')

Fill the above variables with empty strings when NA ---


Construct the derived varibles

In [15]:
appended['site_addr'] = (
    appended['Situs Adrno'] + ' ' +
    appended['Situs Adrdir'] + ' ' +
    appended['Situs Adrstr'] + ' ' +
    appended['Situs Adrsuf'] + ' ' +
    appended['Situs Adrsuf2'] + ' ' +
    appended['Cityname']
)

In [16]:
appended['owner_addr'] = (
    appended['Owner Adrno'] + ' ' +
    appended['Owner Adrstr'] + ' ' +
    appended['Owner Adradd'] + ' ' +
    appended['Owner Adrdir'] + ' ' +
    appended['Owner Adrsuf'] + ' ' +
    appended['Owner Adrsuf2'] + ' ' +
    appended['Cityname.1'] + ' ' +
    appended['Statecode'] + ' ' +
    appended['Unitno']
)

Check the created variables look as expected

In [17]:
appended[['site_addr', 'owner_addr']].sample(5)

Unnamed: 0,site_addr,owner_addr
139686,1246.0 WESTMONT RD SW ATLANTA,1246.0 WESTMONT RD SW ATLANTA GA
23326,171.0 AUBURN AVE NE ATL,6301.0 OWENSMOUTH AVE WOODLAND HILLS CA 730
58099,1573.0 LAUREL PARK PL,1.0 ST PAUL ST ST. CATHARINES ON 100
156396,2210.0 BAKER TER NW ATLANTA,824.0 MEMORIAL DR SE ATLANTA GA
169126,3083.0 GREEN VALLEY DR EAST POINT,3083.0 GREEN VALLEY DR EAST POINT GA


Clean the decimal points from the numbers

In [18]:
appended['site_addr'] = appended['site_addr'].str.replace('.0','')

In [19]:
appended['owner_addr'] = appended['owner_addr'].str.replace('.0','')

In [20]:
appended[['site_addr', 'owner_addr']].sample(5)

Unnamed: 0,site_addr,owner_addr
109824,850 PIEDMONT AVE NE ATLANTA,850 PIEDMONT AVE NE ATLANTA GA 1402
153172,1328 POLLARD DR SW ATLANTA,1388 HAIGHT ST SAN FRANCISCO CA #213
133993,254 HOLDERNESS ST SW ATLANTA,2912 CLOVERHURST DR EAST POINT GA
47492,1151 ARLINGTON AVE SW ATL,P O BOX 161922 ATLANTA GA
129656,1034 ALLENE AVE SW ATLANTA,P O BOX 2071 LITHONIA GA


Rename the variables and cast all the columns to the correct types

In [21]:
appended.rename(columns=new_var_map, inplace=True)

In [22]:
appended = appended[list(new_var_map.values())]

In [23]:
appended.columns

Index(['tax_year', 'parcel_id', 'site_addr', 'site_class_parcel',
       'site_luc_parcel', 'tax_district', 'owner_name_1', 'owner_name_2',
       'owner_zip', 'owner_addr', 'reas_cd', 'rev_code', 'rev_reas',
       'appr_total', 'appr_land', 'appr_build', 'year_built', 'year_eff',
       'year_remodel', 'grade', 'num_units', 'building_area'],
      dtype='object')

When you try to cast, expect there to be problems. It is likely that some of the vars will have values that cannot be easily casted (e.g. 'A' cannot be casted to an Int). You will need to resolve these for each column.

In [35]:
appended = appended.astype(dtypes)

Clean nan values in 'owner_zip'

In [25]:
len(appended[appended['owner_zip'].isna()]) / len(appended) * 100

0.4283829400777943

In [26]:
appended[appended['owner_zip'].isna()].sample(3)

Unnamed: 0,tax_year,parcel_id,site_addr,site_class_parcel,site_luc_parcel,tax_district,owner_name_1,owner_name_2,owner_zip,owner_addr,reas_cd,rev_code,rev_reas,appr_total,appr_land,appr_build,year_built,year_eff,year_remodel,grade,num_units,building_area
63177,2017.0,14 016300120687,1895 DUNLAP AVE EP,R3,107,20,M&N FINE FUTURE INVESTMENTS GA LLC,,,,,,,41000.0,8000.0,33000.0,1968.0,,,C-,1.0,1024.0
138637,2018.0,14 013300010059,1457 WADLEY AVE EAST POINT,R3,101,20,ARLP REO 400 LLC,,,,E8,3.0,70.0,170500.0,27600.0,142900.0,1950.0,1988.0,,C+,1.0,1472.0
120370,2018.0,14 007900130788,400 WEST PEACHTREE ST NW ATLANTA,R3,106,05W,SECRETARY OF VETERANS AFFAIRS,,,3401 END W AVE NASHVILLE TN 760 W,FN,3.0,,317400.0,40900.0,276500.0,2007.0,,,X,1.0,1127.0


It is an insignficant portion of the data, and quite important, so lets drop rows with None values for owner_zip

In [27]:
appended = appended[~appended['owner_zip'].isna()]

Let's investigate year_built

In [28]:
len(appended[appended['year_built'].isna()]) / len(appended) * 100

11.94020432864674

Significant number of None values. Let's replace them with 0, and same for the other year columns.

In [29]:
appended['year_built'].fillna('0', inplace=True)
appended['year_eff'].fillna('0', inplace=True)
appended['year_remodel'].fillna('0', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  appended['year_built'].fillna('0', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  appended['year_eff'].fillna('0', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  appended['year_remodel'].fillna('0', inplace=True)


Investigate 'num_units'

In [32]:
len(appended[appended['num_units'].isna()]) / len(appended) * 100

0.21855478497306785

In [33]:
appended[appended['num_units'].isna()].sample(5)

Unnamed: 0,tax_year,parcel_id,site_addr,site_class_parcel,site_luc_parcel,tax_district,owner_name_1,owner_name_2,owner_zip,owner_addr,reas_cd,rev_code,rev_reas,appr_total,appr_land,appr_build,year_built,year_eff,year_remodel,grade,num_units,building_area
104004,2018.0,14 0045 LL1130,170 BOULEVARD SE ATLANTA,R3,106,05W,BOWER EVE W,,30312,170 BOULEVARD SE ATLANTA GA H-327,FN,3.0,,250400.0,26200.0,224200.0,1910.0,1985.0,0,B+,,995.0
50962,2017.0,14 013300020272,0 WADLEY AVE EP,E1,600,20,MARTA,,30308,401 PEACHTREE W ST NW ATLANTA GA,,1.0,20.0,13300.0,13300.0,0.0,0.0,0.0,0,,,
130332,2018.0,14 0108 LL2421,898 OAK ST SW ATLANTA,R3,106,05T,MUHAMMAD RABB O,,30310,898 OAK ST SW ATLANTA GA 3417,FN,3.0,,97900.0,13000.0,84900.0,2007.0,0.0,0,B+,,1019.0
43675,2017.0,14 011100010808,533 JOSEPH E LOWERY BLVD NW ATL,R3,100,05Z,CANOPY DEVELOPMENT GROUP LLC,,30325,P O BOX 19696 ATLANTA GA,0,,,3900.0,3900.0,0.0,0.0,0.0,0,,,
33547,2017.0,14 008300040411,534 SPENCER ST NW ATL,R3,100,05Z,NEF ENTERPRISES LLC,,30328,7095 NORTHGREEN DR SANDY SPRINGS GA,,,,6300.0,6300.0,0.0,0.0,0.0,0,,,


Let's fill these with 0.

In [34]:
appended['num_units'].fillna('0', inplace=True)

Check the datatypes

In [36]:
appended.info()

<class 'pandas.core.frame.DataFrame'>
Index: 174327 entries, 0 to 175076
Data columns (total 22 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   tax_year           174327 non-null  int16   
 1   parcel_id          174327 non-null  string  
 2   site_addr          174327 non-null  string  
 3   site_class_parcel  174327 non-null  category
 4   site_luc_parcel    174327 non-null  category
 5   tax_district       174327 non-null  category
 6   owner_name_1       174327 non-null  string  
 7   owner_name_2       19943 non-null   string  
 8   owner_zip          174327 non-null  string  
 9   owner_addr         174327 non-null  string  
 10  reas_cd            100349 non-null  string  
 11  rev_code           109046 non-null  string  
 12  rev_reas           26241 non-null   string  
 13  appr_total         174327 non-null  float32 
 14  appr_land          174327 non-null  float32 
 15  appr_build         174127 non-null  flo

Export the data

In [None]:
appended.to_csv('../output/all_parcels_fulton_2011_2022.csv')
appended.to_parquet('../output/all_parcels_fulton_2011_2022.parquet')