In [2]:
import pandas as pd
import fiona
import geopandas as gpd
from sqlalchemy import create_engine
import numpy as np
#import dedupe
from dotenv import load_dotenv
import os
import pyArango
import usaddress
from usaddress import tag
from scourgify import normalize_address_record
import re
import json

In [3]:
from pyArango.connection import *
conn = Connection(username="root", password="0505")

In [4]:
load_dotenv()
PG_CONNECT = os.getenv("PG_CONNECT")

In [4]:
# create arangoDB database, or open it if it already exists
try:
    corp_db = conn.createDatabase(name="corp_data")
    # from command line:
    # arangoimport --file '''path to CorpData.csv''' --collection corp_data --create-collection true --type csv --server.database corp_data
    # arangoimport --file '''path to CorpIndividualExport.csv''' --collection corp_individual --create-collection true --type csv --server.database corp_data

except:
    corp_db = conn["corp_data"]

In [None]:
# arangoDB database structure:

# corp_db: arangoDB database
# corp_db["corp_data"]: arangoDB collection
# corp_db["corp_data"][17777999]: arangoDB document (one piece of data, or one corporation)

In [5]:
# extracting the values of the corp_data_processed collection
val_aql = "FOR x IN corp_data_processed RETURN x"
value_query_result = corp_db.AQLQuery(val_aql,rawResults=True, batchSize = 1000)
col_value = {}
ind_val = 0

for value in value_query_result:
    col_value[ind_val] = value
    ind_val += 1
    
# create dataframe from dictionary of dictionaries
corp = pd.DataFrame.from_dict(data = col_value, orient = 'index')

In [6]:
# extracting the values of the indiv_data_processed collection
indiv_aql = "FOR x IN indiv_data_processed RETURN x"
indiv_query_result = corp_db.AQLQuery(indiv_aql,rawResults=True, batchSize = 1000)
col_indiv = {}
ind_indiv = 0

for indiv in indiv_query_result:
    col_indiv[ind_indiv] = indiv
    ind_indiv += 1

indiv = pd.DataFrame.from_dict(data = col_indiv, orient = 'index')

In [7]:
print(corp)

             _key                           _id         _rev  DataID  \
0        17777999  corp_data_processed/17777999  _eZCy3XK---  00wepr   
1        17778000  corp_data_processed/17778000  _eZCy3XK--_  00v4bw   
2        17778001  corp_data_processed/17778001  _eZCy3XK--A  00n9yj   
3        17778002  corp_data_processed/17778002  _eZCy3XK--B  00l0iw   
4        17778003  corp_data_processed/17778003  _eZCy3XK--C  000002   
...           ...                           ...          ...     ...   
1283562  19129552  corp_data_processed/19129552  _eZCy6bW--I  00syho   
1283563  19129553  corp_data_processed/19129553  _eZCy6bW--J  00seli   
1283564  19129555  corp_data_processed/19129555  _eZCy6bW--K  00p1mp   
1283565  19129556  corp_data_processed/19129556  _eZCy6bW--L  00t7h3   
1283566  19129557  corp_data_processed/19129557  _eZCy6bW--M  00t4c9   

                             EntityTypeDescriptor  \
0        Domestic Limited Liability Company (LLC)   
1        Domestic Limited Lia

In [110]:
def corp2indiv(corp_df, indiv_df, batch_size):
    
    num_batch = int(corp_df.shape[0] / batch_size)
    
    for i in range(0,num_batch+1):
        print(i)
        print(i*batch_size+1,(i+1)*batch_size)
        batch_df = corp_df.loc[i*batch_size+1:(i+1)*batch_size,:]
        
        for index, row in batch_df.iterrows():
            cid = row['DataID']
            c_indiv = indiv_df[indiv_df.DataID == cid]
            corp_id = row['_id']
            for index2, row2 in c_indiv.iterrows():
                indiv_id = row2['_id']
                edge_attributes = {
                       "_from": corp_id,
                       "_to": indiv_id,
                       "type": 'corp2indiv',
                       "DataID":cid}
                edge = corp_db["corp2indiv"].createDocument(edge_attributes)
                edge.save()
                
            indiv_df = indiv_df[indiv_df.DataID != cid]
        print(len(indiv_df))

In [78]:
corp2 = corp.loc[0:0,:]

In [111]:
corp2indiv(corp,indiv,100)

0
1 100
4905267
1
101 200
4904867
2
201 300
4904460
3
301 400
4904105
4
401 500
4903753
5
501 600


KeyboardInterrupt: 

In [112]:
print(len(indiv))

4905639


In [5]:
#process geodatabase
l3_assess_geo = gpd.read_file("../MassGIS_L3_Parcels_gdb/MassGIS_L3_Parcels.gdb", driver='FileGDB', layer=2) #L3_ASSESS
l3_assess = pd.DataFrame(l3_assess_geo)
print(l3_assess)

            PROP_ID            LOC_ID  BLDG_VAL  LAND_VAL  OTHER_VAL  \
0            D9 4 3  F_335507_2850670    111000     98900          0   
1            D9 5 3  F_335433_2850987    140200     90500       6600   
2           D9 6 14  F_335682_2851486    344900    184100       9200   
3           E10 1 5  F_337304_2851580    375800    200900      18500   
4          E10 2 23  F_336769_2852989         0     29500          0   
...             ...               ...       ...       ...        ...   
2536875  0104966006  F_781897_2959209    551200         0          0   
2536876  0104972008  F_781998_2959152    536400         0          0   
2536877  1904214002  F_756936_2929234    437900         0          0   
2536878  1200335006  F_769944_2942985    436500         0          0   
2536879  0903674006  F_766578_2944741    383700         0          0   

         TOTAL_VAL    FY  LOT_SIZE   LS_DATE  LS_PRICE  ...          STYLE  \
0           209900  2021      0.39  19930802  106000.0  .

In [8]:
print(l3_assess.loc[0,:])

PROP_ID                   D9 4 3
LOC_ID          F_335507_2850670
BLDG_VAL                  111000
LAND_VAL                   98900
OTHER_VAL                      0
TOTAL_VAL                 209900
FY                          2021
LOT_SIZE                    0.39
LS_DATE                 19930802
LS_PRICE                106000.0
USE_CODE                     101
SITE_ADDR         27 OVERLOOK DR
ADDR_NUM                      27
FULL_STR             OVERLOOK DR
LOCATION                    None
CITY               FEEDING HILLS
ZIP                        01030
OWNER1          MORRIS RAYMOND T
OWN_ADDR          27 OVERLOOK DR
OWN_CITY           FEEDING HILLS
OWN_STATE                     MA
OWN_ZIP               01030-2007
OWN_CO                      None
LS_BOOK                     8512
LS_PAGE                      274
REG_ID                      None
ZONING                        AG
YEAR_BUILT                1978.0
BLD_AREA                  1092.0
UNITS                        1.0
RES_AREA  

In [None]:
# Parcel data from MassGIS Data
# https://www.mass.gov/forms/massgis-request-statewide-parcel-data

# Residential Land Use Codes from MA Dept of Revenue
# https://www.mass.gov/files/documents/2016/08/wr/classificationcodebook.pdf
# Codes are 101*-109*, 031*, and 013*
# Often include suffixes (letters, zeroes or no character), thus regex

USE_CODES = '^1[0-1][1-9]*?|^013*?|^031*?'