In [2]:
import pandas as pd
import fiona
import geopandas as gpd
from sqlalchemy import create_engine
import numpy as np
from dotenv import load_dotenv
import os
import pyArango
import usaddress
from usaddress import tag
from scourgify import normalize_address_record
import re
import json

In [4]:
# Pull key/value pairs from a local .env file into the process environment,
# then look up the Postgres connection string (None when the variable is unset).
load_dotenv()
PG_CONNECT = os.environ.get("PG_CONNECT")

In [7]:
# Read layer index 2 of the MassGIS parcel geodatabase (the L3_ASSESS table,
# per the original note on this cell).
l3_assess_geo = gpd.read_file(
    "../MassGIS_L3_Parcels_gdb/MassGIS_L3_Parcels.gdb",
    layer=2,
    driver='FileGDB',
)
# Re-wrap the result as a plain pandas DataFrame for the steps that follow.
l3_assess = pd.DataFrame(data=l3_assess_geo)
print(l3_assess)

            PROP_ID            LOC_ID  BLDG_VAL  LAND_VAL  OTHER_VAL  \
0            D9 4 3  F_335507_2850670    111000     98900          0   
1            D9 5 3  F_335433_2850987    140200     90500       6600   
2           D9 6 14  F_335682_2851486    344900    184100       9200   
3           E10 1 5  F_337304_2851580    375800    200900      18500   
4          E10 2 23  F_336769_2852989         0     29500          0   
...             ...               ...       ...       ...        ...   
2536875  0104966006  F_781897_2959209    551200         0          0   
2536876  0104972008  F_781998_2959152    536400         0          0   
2536877  1904214002  F_756936_2929234    437900         0          0   
2536878  1200335006  F_769944_2942985    436500         0          0   
2536879  0903674006  F_766578_2944741    383700         0          0   

         TOTAL_VAL    FY  LOT_SIZE   LS_DATE  LS_PRICE  ...          STYLE  \
0           209900  2021      0.39  19930802  106000.0  .

In [9]:
# Parcel data from MassGIS Data
# https://www.mass.gov/forms/massgis-request-statewide-parcel-data

# Residential Land Use Codes from MA Dept of Revenue
# https://www.mass.gov/files/documents/2016/08/wr/classificationcodebook.pdf
# Codes are 101*-109*, 031*, and 013*
# Often include suffixes (letters, zeroes or no character), thus regex
#
# Every alternative is anchored at the start of the code and requires the full
# three-digit prefix; whatever suffix follows is simply not inspected. (The
# earlier lazy `*?` quantifiers made the third character optional, so any code
# beginning with 10, 11, 01 or 03 was accepted.)
# NOTE(review): the second-digit class [0-1] also admits 111-119, which the
# list above does not mention -- confirm that this is intended.

USE_CODES = '^1[0-1][1-9]|^013|^031'

In [10]:
# Keep only parcels whose use code matches the residential pattern.
# na=False counts a missing USE_CODE as "no match"; without it the mask holds
# NaN and boolean indexing raises. The trailing .copy() makes the result an
# independent frame so later column assignments do not write to a slice.
l3_assess = l3_assess[
    l3_assess['USE_CODE'].str.contains(USE_CODES, regex=True, na=False)
].copy()

In [17]:
def _as_text(value):
    """Return str(value), or '' when the value is missing (None / NaN)."""
    return '' if pd.isna(value) else str(value)


def _concat_address(frame, part_cols, zip_col):
    """Build one 'part, part, ... zip' string per row of `frame`.

    The columns named in `part_cols` are joined with ', ', the `zip_col` value
    is appended after a single space (no comma), and surrounding whitespace is
    stripped. Missing values contribute an empty string rather than the
    literal text 'None' or 'nan'.
    """
    part_series = [frame[col] for col in part_cols]
    return [
        ' '.join((', '.join(_as_text(p) for p in parts), _as_text(zip_code))).strip()
        for *parts, zip_code in zip(*part_series, frame[zip_col])
    ]


# Concatenate the property and owner addresses. assign() returns a new frame,
# so nothing is written into a filtered slice of the original table.
l3_assess = l3_assess.assign(
    PropAddr=_concat_address(l3_assess, ['SITE_ADDR', 'CITY'], 'ZIP'),
    OwnAddr=_concat_address(l3_assess, ['OWN_ADDR', 'OWN_CITY', 'OWN_STATE'], 'OWN_ZIP'),
)

In [18]:
# two normalization methods to deal with failures of the normalize_address_record() method

def normalize_dict(x):
    """Normalize an address, falling back to raw corporate-record fields.

    Tries normalize_address_record(x) first. If that raises, the rows of
    corp_df_sliced whose 'EntityAddr' equals x are looked up and the matching
    'Addr1', 'Addr2', 'City', 'State' and 'PostalCode' values from corp_df are
    returned in a dict with the keys 'address_line_1', 'address_line_2',
    'city', 'state' and 'postal_code'.

    NOTE(review): corp_df and corp_df_sliced are not created in any cell
    visible here; they must already exist in the kernel namespace for the
    fallback path to work -- confirm where they come from.
    """
    try:
        return normalize_address_record(x)
    except Exception:
        # Catch ordinary failures only, so Ctrl-C / SystemExit still propagate.
        rows = corp_df_sliced.index[corp_df_sliced['EntityAddr'] == x]

        def _field(column):
            # index=False keeps the row label out of the rendered text;
            # strip() removes any padding to_string adds around the value.
            return corp_df.loc[rows, column].to_string(index=False).strip()

        return {
            'address_line_1': _field('Addr1'),
            'address_line_2': _field('Addr2'),
            'city': _field('City'),
            'state': _field('State'),
            'postal_code': _field('PostalCode'),
        }

def normalize_concat(x):
    """Normalize a one-line address string and return it as one line again.

    Args:
        x: address text such as 'street, city, state zip'.

    Returns:
        None when x is a string made up solely of commas and whitespace
        (for example ', , ,'), i.e. every component was blank.
        Otherwise 'line1[, line2], city, state zip' rebuilt from the result
        of normalize_address_record(x); the second address line is included
        only when it is present and non-empty.
        If normalization or re-assembly fails for any reason, x is returned
        unchanged.
    """
    # A composite with no real content: nothing to normalize.
    if isinstance(x, str) and not x.replace(',', '').strip():
        return None

    try:
        record = normalize_address_record(x)
        pieces = [record['address_line_1']]
        if record.get('address_line_2'):
            pieces.append(record['address_line_2'])
        pieces.extend((record['city'], record['state']))
        # A None among the required pieces makes join() raise, which sends us
        # to the best-effort fallback below.
        return ' '.join((', '.join(pieces), record['postal_code'])).strip()
    except Exception:
        # Best effort: keep the original text rather than lose the address.
        return x

In [19]:
# Run every composed address through normalize_concat, overwriting each
# address column with its normalized form.
for addr_col in ('PropAddr', 'OwnAddr'):
    l3_assess.loc[:, addr_col] = l3_assess.loc[:, addr_col].apply(normalize_concat)

In [23]:
# Keep only the columns that are exported, then parse LS_DATE (YYYYMMDD) into
# datetimes; values that do not parse become NaT (errors='coerce').
# assign() builds a new column on a new frame. Assigning through
# `.loc[:, 'LS_DATE'] = ...` instead writes into the existing column in place
# on recent pandas, which can leave it in its old dtype, and it also triggers
# a SettingWithCopyWarning when the frame is a filtered slice.
l3_assess = l3_assess[
    ['PROP_ID', 'LOC_ID', 'PropAddr', 'UNITS', 'OWNER1', 'OwnAddr',
     'TOTAL_VAL', 'FY', 'LS_DATE', 'LS_PRICE']
].assign(
    LS_DATE=lambda frame: pd.to_datetime(frame['LS_DATE'], format='%Y%m%d', errors='coerce')
)

In [27]:
# Serialize the table as a JSON array with one object per row, then re-encode
# it with 4-space indentation so the file is readable.
result = l3_assess.to_json(orient="records")
parsed = json.loads(result)
l3_json = json.dumps(parsed, indent=4)

# Mode 'w' creates ../l3.json, or truncates it if it already exists.
with open('../l3.json', mode='w') as outfile:
    outfile.write(l3_json)