In [1]:
import pandas as pd
import fiona
from sqlalchemy import create_engine
import numpy as np
from dotenv import load_dotenv
import os
import pyArango
import usaddress
from usaddress import tag
from scourgify import normalize_address_record
import re
import json

In [2]:
from pyArango.connection import *
conn = Connection(username="root", password="0505")

In [3]:
load_dotenv()
PG_CONNECT = os.getenv("PG_CONNECT")

In [4]:
# create arangoDB database, or open it if it already exists
try:
    corp_db = conn.createDatabase(name="corp_data")
    # from command line:
    # arangoimport --file '''path to CorpData.csv''' --collection corp_data --create-collection true --type csv --server.database corp_data
    # arangoimport --file '''path to CorpIndividualExport.csv''' --collection corp_individual --create-collection true --type csv --server.database corp_data

except:
    corp_db = conn["corp_data"]

In [None]:
# ***** start of ONLY RUN ONCE for corp_data *****
# process the raw corp_data collection, including normalization of names, addresses, and slicing data.

In [5]:
# extracting the values of the corp_data collection
val_aql = "FOR x IN corp_data RETURN x" # here, corp_data is the name of the collection, not the database
value_query_result = corp_db.AQLQuery(val_aql,rawResults=True, batchSize = 1000)
col_value = {}
ind_val = 0

for value in value_query_result:
    col_value[ind_val] = value
    ind_val += 1
    
# create dataframe from dictionary of dictionaries
corp_df = pd.DataFrame.from_dict(data = col_value, orient = 'index')

In [6]:
# concatenate entity address
corp_df.loc[:,'EntityAddr'] = [', '.join((str(a),str(b),str(c),str(d))) for a,b,c,d in zip(corp_df['Addr1'],corp_df['Addr2'],corp_df['City'],corp_df['State'])]
# double comma problem
# append zip to address with no comma
corp_df.loc[:,'EntityAddr'] = [' '.join((str(a),str(b))) for a,b in zip(corp_df['EntityAddr'],corp_df['PostalCode'])]
corp_df.loc[:,'EntityAddr'] = corp_df.EntityAddr.str.strip()

# concatenate agent address
corp_df.loc[:,'AgentAddr'] = [', '.join((str(a),str(b),str(c),str(d))) for a,b,c,d in zip(corp_df['AgentAddr1'],corp_df['AgentAddr2'],corp_df['AgentCity'],corp_df['AgentState'])]
# append zip to agent address
corp_df.loc[:,'AgentAddr'] = [' '.join((str(a),str(b))) for a,b in zip(corp_df['AgentAddr'],corp_df['AgentPostalCode'])]
corp_df.loc[:,'AgentAddr'] = corp_df.AgentAddr.str.strip()

# slice
corp_df_sliced = corp_df[['_key','DataID','EntityTypeDescriptor','EntityName','EntityAddr','AgentName','AgentAddr']]

In [8]:
# back up
corp_df_sliced_dup = corp_df_sliced.copy()
corp_df_sliced = corp_df_sliced_dup.copy()

In [8]:
# two normalization methods to deal with failures of the normalize_address_record() method

def normalize_dict(x):
    try:
        x_1=normalize_address_record(x)
    except:
        y=corp_df_sliced.index[corp_df_sliced['EntityAddr'] == x]
        addr1=corp_df.loc[y,'Addr1'].to_string()
        addr2=corp_df.loc[y,'Addr2'].to_string()
        city=corp_df.loc[y,'City'].to_string()
        state=corp_df.loc[y,'State'].to_string()
        pc=corp_df.loc[y,'PostalCode'].to_string()
        x_1 = {'address_line_1': addr1, 'address_line_2': addr2, 'city': city, 'state': state, 'postal_code': pc}
        pass
    return x_1

def normalize_concat(x):
    if(x==', , ,'):
        x_1 = None
    else:
        try:
            y = normalize_address_record(x)
            try:
                y_0 = ', '.join([y['address_line_1'],y['address_line_2'],y['city'],y['state']])
            except:
                y_0 = ', '.join([y['address_line_1'],y['city'],y['state']])
            y_1 = ' '.join((y_0,y['postal_code']))
            x_1 = y_1.strip()
        except:
            x_1 = x
            pass
    return x_1

In [8]:
# filtering out non-profit organizations
corp_profit = corp_df_sliced[corp_df_sliced.EntityTypeDescriptor != 'Nonprofit Corporation']

#normalize addresses
corp_profit.loc[:,'EntityAddr']=corp_profit.loc[:,'EntityAddr'].apply(lambda x:normalize_concat(x))
corp_profit.loc[:,'AgentAddr']=corp_profit.loc[:,'AgentAddr'].apply(lambda x:normalize_concat(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[selected_item_labels] = value


In [9]:
# name standardization method

text2num = {"ONE":"1",'TWO':'2','THREE':'3','FOUR':'4','FIVE':'5','SIX':'6','SEVEN':'7','EIGHT':'8','NINE':'9',' TEN':' 10',
           'TEN ':'10 ',"ELEVEN":"11",'TWELVE':'12','THIRTEEN':'13','FOURTEEN':'14','FIFTEEN':'15','SIXTEEN':'16','SEVENTEEN':'17',
            'EIGHTEEN':'18','NINETEEN':'19','TWENTY':'20','THIRTY':'30','FORTY':'40','FIFTY':'50','SIXTY':'60',
            'SEVENTY':'70','EIGHTY':'80','NINETY':'90'}

def normalize_name(x):
    # remove special characters
    y = re.sub(r'[^\w\s]','', x)
    # substitute words for numbers with numbers
    for text, num in text2num.items():
        y = y.replace(text, num)
    y = y.strip()
    
    return y

In [12]:
# back up
corp_normalized = corp_profit.copy()

# standardize names 
corp_normalized.loc[:,'EntityName']=corp_normalized.loc[:,'EntityName'].apply(lambda x:normalize_name(x))
corp_normalized.loc[:,'AgentName']=corp_normalized.loc[:,'AgentName'].apply(lambda x:normalize_name(x))

# create json and write it to an existing empty json file
result = corp_normalized.to_json(orient="records")
parsed = json.loads(result)
corp_json = json.dumps(parsed, indent=4) 

with open('../corp.json', 'w') as outfile:
    outfile.write(corp_json)

In [None]:
# convert json to jsonl with jq because file is big, or else it throws error: https://stedolan.github.io/jq/
# from command line:
# jq -c ".[]" inputFile.json > outputFile.jsonl

# from command line:
# arangoimport --file '''path to corp.jsonl''' --collection corp_data_processed --create-collection true --type jsonl --server.database corp_data

In [None]:
# ***** end of ONLY RUN ONCE for corp_data *****

# ***** start of ONLY RUN ONCE for corp_individual *****
# process the raw corp_individual collection, including normalization of names, addresses, and slicing data

In [5]:
# extracting the values of the corp_individual collection
indiv_aql = "FOR x IN corp_individual RETURN x"
indiv_query_result = corp_db.AQLQuery(indiv_aql,rawResults=True, batchSize = 1000)
col_indiv = {}
ind_indiv = 0

for indiv in indiv_query_result:
    col_indiv[ind_indiv] = indiv
    ind_indiv += 1

indiv_df = pd.DataFrame.from_dict(data = col_indiv, orient = 'index')

In [6]:
# concatenate business address
indiv_df.loc[:,'BusAddr'] = [', '.join((str(a),str(b),str(c))) for a,b,c in zip(indiv_df['BusAddr1'],indiv_df['BusCity'],indiv_df['BusState'])]
# append zip to address with no comma
indiv_df.loc[:,'BusAddr'] = [' '.join((str(a),str(b))) for a,b in zip(indiv_df['BusAddr'],indiv_df['BusPostalCode'])]
indiv_df.loc[:,'BusAddr'] = indiv_df.BusAddr.str.strip()

# concatenate residential address (although currently unused)
indiv_df.loc[:,'ResAddr'] = [', '.join((str(a),str(b),str(c))) for a,b,c in zip(indiv_df['ResAddr1'],indiv_df['ResCity'],indiv_df['ResState'])]
# append zip to address with no comma
indiv_df.loc[:,'ResAddr'] = [' '.join((str(a),str(b))) for a,b in zip(indiv_df['ResAddr'],indiv_df['ResPostalCode'])]
indiv_df.loc[:,'ResAddr'] = indiv_df.ResAddr.str.strip()

# concatenate name
indiv_df.loc[:,'IndividualName'] = [' '.join((str(a),str(b),str(c))) for a,b,c in zip(indiv_df['LastName'],indiv_df['FirstName'],indiv_df['MiddleName'])]
indiv_df.loc[:,'IndividualName'] = indiv_df.IndividualName.str.strip()
indiv_df.loc[:,'IndividualNameShort'] = [' '.join((str(a),str(b))) for a,b in zip(indiv_df['LastName'],indiv_df['FirstName'])]

# slice
indiv_df_sliced = indiv_df[['_key','DataID','IndividualName','IndividualNameShort','BusAddr']]

In [9]:
# normalize business address
indiv_df_sliced.loc[:,'BusAddr']=indiv_df_sliced.loc[:,'BusAddr'].apply(lambda x:normalize_concat(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[selected_item_labels] = value


In [10]:
# create json and write it to an existing empty json file
result = indiv_df_sliced.to_json(orient="records")
parsed = json.loads(result)
indiv_json = json.dumps(parsed, indent=4) 

with open('../indiv_r.json', 'w') as outfile:
    outfile.write(indiv_json)

In [None]:
# convert json to jsonl with jq because file is big, or else it throws error: https://stedolan.github.io/jq/
# from command line:
# jq -c ".[]" inputFile.json > outputFile.jsonl

# from command line:
# arangoimport --file '''path to indiv.jsonl''' --collection indiv_data_processed --create-collection true --type jsonl --server.database corp_data

# ***** end of ONLY RUN ONCE for corp_individual *****