# S2_Get_Geo_locator_from_address.ipynb
## Descriptions
* Address as string
* Convert to geo locator using osm 

## Pool the address

In [1]:
import sqlite3 as lite
def select_addr(conn):
    """
    """
    cur = conn.cursor()
    cur.execute("select ci_text from corp_info where ci_name in ('REGISTERED OFFICE ADDRESS','DIRECTORS')") 
    rows = cur.fetchall()
    return([r[0] for r in rows])

ddb = "./data/corporate/corp.db"                    

con = lite.connect(ddb)
l_addr = list(set(select_addr(con)))
l_addr = l_addr[1:]
con.close()

In [2]:
len(l_addr)

1600298

## Pre-process Address 

In [3]:
# extract the pretext before the address

import re
def getPreText(txtaddr):
    preText = " : "
    m = re.search("([^0-9]*:[^0-9]*)*", txtaddr)     
    strRemain = txtaddr
    try:
        strRemain = str.replace(txtaddr,str(m.group(0)),"")
    except Exception:
        pass
    try:
        preText = str(m.group(0)).strip()
    except Exception:
        pass
    
    field = ''
    txt = ''
    try:
        l_buf = preText.split(":")
        field = l_buf[0]
        txt = l_buf[1]
    except:
        pass
    return ((txt.lower().strip(), field.lower().strip()), strRemain)
    
print(getPreText('460, RUE SAINTE-CATHERINE OUEST BUREAU 503 MONTREAL QC H3B 1A7 CANADA'))

(('', ''), '460, RUE SAINTE-CATHERINE OUEST BUREAU 503 MONTREAL QC H3B 1A7 CANADA')


## Level 1: Generate Address Key for Address Identifier

### libpostal: international street address NLP
#### Install libpostal 
https://github.com/openvenues/libpostal
#### Install python package postal
pip install postal

In [4]:
# Function to create standardize addr_key
# try to standardize address 

# https://github.com/openvenues/libpostal
# pip install postal

import re
def splitstr(a):
    b = a.translate ({ord(c): " " for c in "!@#$%^&*()[]{};:,./<>?\|`~-=_+"})
    return b.split()

def rplceToken(srch_pat, rplce_pat, txtaddr):        
    m = re.search(srch_pat, txtaddr) 
    strRplce = txtaddr
    try:
        strRplce = str.replace(txtaddr,str(m.group(0)),rplce_pat)
    except Exception:
        pass
    return (strRplce)

def rplcRoadToken(road):
    strR4= rplceToken("(drive[\s\,])", "dr ", road)
    strR5= rplceToken("(boulevard[\s\,])", "blvd ", strR4)
    strR6= rplceToken("(crescent[\s\,])", "cres ", strR5)
    strR7= rplceToken("(street[\s\,]|STR[\s\,])", "st ", strR6)
    strR8= rplceToken("(road[\s\,]|RAOD[\s\,])", "rd ", strR7)
    strR8A= rplceToken("(avenue[\s\,])", "ave ", strR8)
    cleanedRoad = rplceToken("(highway[\s\,])", "hwy ", strR8A)
    return cleanedRoad

def gen_addr_key(rec):
    dict_addr = dict(zip(([v[1] for v in rec]), ([v[0] for v in rec])))    
    try:
        buf = "_".join([e[:2] for e in splitstr(dict_addr['country'])]) + "|"  
    except:
        buf = "_" + "|"          
    try:
        buf = buf + dict_addr['postcode'].replace(" ","").replace(",","").replace("-","") + "|"  
    except:
        buf = buf + "_" + "|"  
    try:
        buf = buf + "_".join([e[:2] for e in splitstr(dict_addr['state'])]) + "|"   
    except:
        buf = buf + "_" + "|"  
    try:
        buf = buf + "_".join([e[:2] for e in splitstr(dict_addr['city'])]) + "|"    
    except:
        buf = buf + "_" + "|"  
    try:
        buf = buf + "_".join([e[:2] for e in splitstr(rplcRoadToken(dict_addr['road']))]) + "|"    
    except:
        buf = buf + "_" + "|"  
    try:
        buf = buf + "_".join([e[:2] for e in splitstr(dict_addr['house'])]) + "|"    
    except:
        buf = buf + "_" + "|"          
    try:
        buf = buf + re.sub("[^0-9]", "", dict_addr['house_number']) + "|" 
    except:
        buf = buf + "_" + "|" 
    try:
        buf = buf + re.sub("[^0-9]", "", dict_addr['unit']) + "|" 
    except:
        buf = buf + "_" + "|"         
    try:
        buf = buf + re.sub("[^0-9]", "", dict_addr['po_box']) + "|" 
    except:
        buf = buf + "_" + "|" 

    return buf

In [5]:
# Test Case 1
'''
_addr = '640 BOULEVARD ROBERVAL OUEST LONGUEUIL QC J4L 3B7 CANADA'
rec = parse_address(_addr)
dict_addr = dict(zip(([v[1] for v in rec]), ([v[0] for v in rec]))) 
print(dict_addr['road'])
rplcRoadToken(dict_addr['road'])
'''

"\n_addr = '640 BOULEVARD ROBERVAL OUEST LONGUEUIL QC J4L 3B7 CANADA'\nrec = parse_address(_addr)\ndict_addr = dict(zip(([v[1] for v in rec]), ([v[0] for v in rec]))) \nprint(dict_addr['road'])\nrplcRoadToken(dict_addr['road'])\n"

In [6]:
# Test Case 2
'''
_addr = "6 PORTNEUF C.P. 399 CANTLEY QC J0X 1L0 CANADA"
parse_address(_addr)
'''

'\n_addr = "6 PORTNEUF C.P. 399 CANTLEY QC J0X 1L0 CANADA"\nparse_address(_addr)\n'

In [7]:
# Test Case 3
'''
from postal.parser import parse_address
for addr in l_addr[0:10]:
    t, _addr = getPreText(addr)
    rec = parse_address(_addr)
    if len(t[0]) > 0:
        rec.append(t) 
    print("ADDR:   " + addr)
    print("KEY:    " + gen_addr_key(rec))
'''

'\nfrom postal.parser import parse_address\nfor addr in l_addr[0:10]:\n    t, _addr = getPreText(addr)\n    rec = parse_address(_addr)\n    if len(t[0]) > 0:\n        rec.append(t) \n    print("ADDR:   " + addr)\n    print("KEY:    " + gen_addr_key(rec))\n'

## Output files for Graph Generation
* Tables to input to neo4j   
#### address_nodes.csv  
* addr_key: primary key, varchar  
* addr_addr: full address, varchar  
* addr_long: longitude, float  
* addr_lat: latitude, float  

#### director_nodes.csv  
* dirr_id: primary key, int  
* dirr_name: full name, varchar  
* dirr_addr_key: foreign key, varchar  

#### corporation_nodes.csv  
* corp_id: primary key, int  
* corp_addr_key: foreign key, varchar  

#### director_corporation_edges.csv  
* dirr_corp_key: primary key, int
* dc_corp_id: foreign key, int  
* dc_dirr_id: foreign key, int  




In [8]:
# address_nodes.csv
addr_keys = []
from postal.parser import parse_address
i = 0
for addr in l_addr:
    i += 1
    if (i%500000==0):
        print (i)
    t, _addr = getPreText(addr)
    rec = parse_address(_addr)
    addr_keys.append(gen_addr_key(rec))   

500000
1000000
1500000


### address_nodes.csv  

In [9]:
import pandas as pd
df_address = pd.DataFrame({"addr_key":addr_keys, "addr": l_addr})
df_address_nodes = df_address.groupby('addr_key').first().reset_index()
len(df_address_nodes)

1371889

In [10]:
df_address_nodes.head()

Unnamed: 0,addr_key,addr
0,_|000000|_|al_za|_|li_ar_ja|_|_|_|,DOWNTOWN DISTRICT. AL ZAWIYAH 000000 LIBYAN AR...
1,_|000000|_|te|za_4|ir_is_re|109|_|_|,"PADISAN ZANBAGH 4 NO 109 TEHRAN 000000 IRAN, I..."
2,_|00000|_|ro_to_to|_|co_ce|_|_|4428|,"COLUMBUS CENTRE P.O. BOX 4428 ROAD TOWN, TORTO..."
3,_|00243|_|ki|av_lu_co_de_le|co_th_de_re_of_th|...,"26, AVENUE LUNEKO COMMUNE DE LEMBA KINSHASA 00..."
4,_|01|_|co|_|ma_gl|_|_||,MAISON GLOGLO COTONOU 01 BP BENIN


In [11]:
path = "./out/"
df_address_nodes.to_csv(path+"address_nodes.csv", index = False)

### director_nodes.csv  

In [12]:
import sqlite3 as lite
def select_dirr(conn):
    """
    """
    cur = conn.cursor()
    cur.execute("select ci_corp_id, ci_lab, ci_text from corp_info where ci_name in ('DIRECTORS')") 
    rows = cur.fetchall()
    return([r[0] for r in rows], [r[1] for r in rows], [r[2] for r in rows])

ddb = "./data/corporate/corp.db"                    

con = lite.connect(ddb)
l_dirr_corp_id, l_dirr_name, l_dirr_addr  = select_dirr(con)
con.close()

df_dirr = pd.DataFrame({"corp_id":l_dirr_corp_id, "name": l_dirr_name, "addr":l_dirr_addr})

In [14]:
df_dirr['addr_key'] = [gen_addr_key(parse_address(getPreText(addr)[1])) for addr in df_dirr.addr]

In [15]:
df_director_nodes = df_dirr.groupby(['name','addr_key']).first().reset_index()

In [16]:
print(len(df_dirr))
print(len(df_director_nodes))

1491366
1287973


In [17]:
df_director_nodes = df_director_nodes[['name', 'addr_key']].reset_index()

In [18]:
df_director_nodes.columns = ["dirr_id", 'name', 'addr_key']
df_director_nodes.head()

Unnamed: 0,dirr_id,name,addr_key
0,0,#DAVID HASSAN,ca|n5w4e3|on|lo|as_av|_|219|_|_|
1,1,#DAVID MCFADDEN,ca|m5n1m1|on|to|st_cl_av|_|454|_|_|
2,2,#IRVING TEITELBAUM,ca|h3y1k9|qc|we|le_av|_|789|_|_|
3,3,#ROBERT E LORD,ca|l4w1s4|on|to|cl_av|_|5|_|_|
4,4,( JERRY ) ZHENYU FANG,ca|m1p5h5|on|to|je_wa|_|59|_|_|


In [19]:
path = "./out/"
df_director_nodes.to_csv(path+"director_nodes.csv", index = False)

### corporation_nodes.csv  

In [21]:
import sqlite3 as lite
def select_corp(conn):
    """
    """
    cur = conn.cursor()
    cur.execute("select ci_corp_id, ci_text from corp_info where ci_name in ('REGISTERED OFFICE ADDRESS')") 
    rows = cur.fetchall()
    return([r[0] for r in rows], [r[1] for r in rows])

ddb = "./data/corporate/corp.db"                    

con = lite.connect(ddb)
corp_id, corp_addr  = select_corp(con)
con.close()

df_corp = pd.DataFrame({"corp_id":corp_id, "addr":corp_addr})
print(len(df_corp))
#hacking capture wrong address in the 2nd lines. Need to fix the capturing code...
df_corp = df_corp.groupby(['corp_id']).first().reset_index()
print(len(df_corp))
df_corp['addr_key'] = [gen_addr_key(parse_address(getPreText(addr)[1])) for addr in df_corp.addr]
df_corporation_nodes = df_corp.groupby(["corp_id",'addr_key']).first().reset_index()
print (len(df_corporation_nodes))


906016
810149
810149


In [22]:
df_corporation_nodes = df_corporation_nodes[['corp_id', 'addr_key']]
print(len(df_corporation_nodes))
df_corporation_nodes.head()

810149


Unnamed: 0,corp_id,addr_key
0,1007,ca|v2s5a1|bc|ab|32_so_fr_wa|_|207|_|_|
1,1015,ca|j0h1a0|qc|ac_va|st_an|_|1053|_|_|
2,1031,ca|l1t2z9|on|aj|ol_ki_rd|_|3144|_|_|
3,1040,ca|v9y8p2|bc|po_al|po_al_hw|_|2533|_|_|
4,1066,ca|t0e0a0|ab|al_be|_|_|280|_|_|


In [24]:
df_corporation_nodes[df_corporation_nodes.corp_id == 20931]

Unnamed: 0,corp_id,addr_key
1191,20931,ca|g5r1c6|qc|ri_du_lo|ru_fr|_|70|_|_|


In [25]:
path = "./out/"
df_corporation_nodes.to_csv(path+"corporation_nodes.csv", index = False)

### director_corporation_edges.csv    

In [26]:
df_dirr.head()

Unnamed: 0,corp_id,name,addr,addr_key
0,1007,JASON DYKSTRA,30854 OLUND ROAD ABBOTSFORD BC V4X 1Z9 CANADA,ca|v4x1z9|bc|ab|ol_ro|_|30854|_|_|
1,1007,LA VONNE BANDSMA,207-32900 SOUTH FRASER WAY ABBOTSFORD BC V2S 5...,ca|v2s5a1|bc|ab|so_fr_wa|_|20732900|_|_|
2,1007,KRISTIN VAN VLOTEN,34820 CASSIAR COURT ABBOTSFORD BC V2S 7G9 CANADA,ca|v2s7g9|bc|ab|ca_co|_|34820|_|_|
3,1007,DAVID MILLER,2001 ABBOTSFORD WAY ABBOTSFORD BC V2S 6Y5 CANADA,ca|v2s6y5|bc|ab|ab_wa|_|2001|_|_|
4,1007,CRAIG TOEWS,33844 KING ROAD ABBOTSFORD BC V2S 7M8 CANADA,ca|v2s7m8|bc|ab|ki_ro|_|33844|_|_|


In [27]:
df_director_nodes.head()

Unnamed: 0,dirr_id,name,addr_key
0,0,#DAVID HASSAN,ca|n5w4e3|on|lo|as_av|_|219|_|_|
1,1,#DAVID MCFADDEN,ca|m5n1m1|on|to|st_cl_av|_|454|_|_|
2,2,#IRVING TEITELBAUM,ca|h3y1k9|qc|we|le_av|_|789|_|_|
3,3,#ROBERT E LORD,ca|l4w1s4|on|to|cl_av|_|5|_|_|
4,4,( JERRY ) ZHENYU FANG,ca|m1p5h5|on|to|je_wa|_|59|_|_|


In [28]:
df_director_corporation_edges = pd.merge(df_dirr, df_director_nodes, on=['name','addr_key'])[['corp_id','dirr_id']].reset_index()
df_director_corporation_edges.columns = ["dirr_corp_key", 'corp_id', 'dirr_id']
print(len(df_director_corporation_edges))
df_director_corporation_edges.head()

1491366


Unnamed: 0,dirr_corp_key,corp_id,dirr_id
0,0,1007,542479
1,1,1007,682545
2,2,1007,674756
3,3,1007,259284
4,4,1007,216816


In [29]:
path = "./out/"
df_director_corporation_edges.to_csv(path+"director_corporation_edges.csv", index = False)

# (not cleaned, ignore below for now)
## LEVEL 2: Generate Geo-locator for Address Identifier 

### Postal Code
* Forward sortation areas (FSA)  
NL        -                 A  
NS        -                 B  
PE        -                 C  
NB        -                 E  
QC        -                 G, H, J  
ON        -                 K, L, M, N, P  
MB        -                 R  
SK        -                 S  
AB        -                 T  
BC        -                 V  
NU/NT     -                 X  
YT        -                 Y  
* Local delivery units (LDU)  
LDUs ending in zero correspond to postal facilities, from post offices and small franchised retail postal outlets all the way up to sortation plants. 

In [None]:
import re
addr = "j8p 1r4"
m = re.search('([A-Za-z]\d[A-Za-z]) ?(\d[A-Za-z]\d)', addr)
print (m.group(1))

In [None]:
import re
def getPostalCode(addr):
    m = re.search('([A-Za-z]\d[A-Za-z]) ?(\d[A-Za-z]\d)', addr)
    return m.group(0), m.group(1), m.group(2)

for addr in l_addr[:10]:
    try:
        PC, FSA, LDU = getPostalCode(addr)
    except:
        PC = "UNK"
        FSA = "UNK"
        LDU = "UNK"
    print(addr + ": " + PC + "," + FSA + "," + LDU)

In [None]:
def getStrInParentheses(Instr):
    m = re.findall("\((.*?)\)", Instr)
    if m == []:
        m = ""
        n = Instr
    else:
        m = m[len(m)-1]
        n = (Instr[:-1*(len(m)+3)])
    l = n.split(" ")
    return (l, m)

print(getStrInParentheses("STEVE ODENY (SECRETARY)"))

### 
#### CREATE TABLE corp_addr_ext(
####     ci_corp_id INT,
####     ci_name TEXT,
####     ci_lab TEXT,
####     ci_text TEXT, ci_postal TEXT, ci_firstname TEXT, ci_lastname TEXT);

In [None]:
import time
i = 0

import sqlite3 as lite

con = lite.connect('/home/wk/myProjects/FinancialModel/data_acquisition/corp.db')
cur = con.cursor()
with con:
    cmdstr = 'select * from corp_addr'
    cur.execute(cmdstr)
    all_record = cur.fetchall()
con.close()

#print(all_record[0])

dba = '/home/wk/myProjects/FinancialModel/data_acquisition/corp.db'
con = lite.connect(dba)
sqlstr = "delete from corp_addr_ext"
con.execute(sqlstr)
con.commit()

for record in all_record:
    i = i + 1
    try:
        ci_corp_id = int(record[0])
        ci_name = str(record[1])
        ci_lab = str(record[2])
        ci_text = str(record[3])
        postal = getPostalCode(ci_text)
        (n, r) = getStrInParentheses(ci_lab) 
    except Exception:
        pass
    
    if i == 1:
        sstr = "insert into corp_addr_ext values "
        
    sstr = sstr + "(" + str(ci_corp_id) + ",'"
    sstr = sstr + ci_name + "','"
    sstr = sstr + ci_lab+ "','"
    sstr = sstr + ci_text + "','"
    sstr = sstr + postal + "','"
    sstr = sstr + " ".join(n[:-1]) + "','"
    sstr = sstr + n[-1:][0] + "'), "
    
    if i > 1000:
        con.execute(sstr[:-2])
        con.commit()
        con.close()
        con = lite.connect(db)
        i = 0
        
con.close()

#### CREATE TABLE addr_cleaned(
#### addr_text TEXT,
#### addr_addr TEXT,
#### addr_box TEXT,
#### addr_unit TEXT,
#### addr_suite TEXT,
#### addr_province TEXT,
#### addr_postal TEXT,
#### addr_country TEXT
#### );

In [None]:
import sqlite3 as lite
dba = '/home/wk/myProjects/FinancialModel/data_acquisition/corp.db'

con = lite.connect(dba)
cur = con.cursor()
with con:
    cmdstr = 'select ci_text from address order by ci_text DESC'
    cur.execute(cmdstr)
    all_record = cur.fetchall()
con.close()

# Clear the repository
con = lite.connect(dba)
sqlstr = "delete from addr_cleaned"
con.execute(sqlstr)
con.commit()

i = 0
for record in all_record:
    i = i + 1
    new_text = ""
    try:
        ci_text = str(record[0])
        new_text = ci_text.replace("!", "1").replace("É", "E")  
    except Exception:
        pass
    
    (province,postalcode,country,strR) = getProvPSCountry(new_text)
    (unit,strR1) = getUnitNumber("UNIT|Unit|UNIT", strR)
    (suite,strR2A) = getUnitNumber("SUITE|Suite|suite", strR1) 
    (room,strR2B) = getUnitNumber("\sAPP|\sAPT|\sRM|ROOM", strR2A)  
    (box,strR3A) = getUnitNumber("\(?P\.?\s?O\.?\)?\s?BOX\.?|BOX\.?|POB\.X", strR2B)
    (preAddrTxt,strR3B) = getPreText(strR3A.strip())
    strR4= rplceToken("(DRIVE[\s\,])", "DR ", strR3B)
    strR5= rplceToken("(BOULEVARD[\s\,])", "BLVD ", strR4)
    strR6= rplceToken("(CRESCENT[\s\,])", "CRES ", strR5)
    strR7= rplceToken("(STREET[\s\,]|STR[\s\,])", "ST ", strR6)
    strR8= rplceToken("(ROAD[\s\,]|RAOD[\s\,])", "RD ", strR7)
    strR8A= rplceToken("(AVENUE[\s\,])", "AVE ", strR8)
    strR8B= re.sub('[^0-9a-zA-Z`’]+', ' ', strR8A)
    strR8C= rplceToken("(HIGHWAY[\s\,])", "HWY ", strR8B)
    strR8D= rplceToken("(UNIVERSITY[\s\,])", "UVTY ", strR8C)    
    strR9= str.replace(strR8D,".","")
    strR10= rplceToken("\,?(\s+)", " ", strR9).lstrip('-').rstrip()

#    print(strR10 + " | " + province + " | " + postalcode + " | " + country + " | " + unit + " | " + suite + " | " + box + " | ")    
    
    if len(ci_text) > 1:    
        sstr = "insert into addr_cleaned values "        
        sstr = sstr + '("' + ci_text + '","'
        sstr = sstr + strR10.strip() + '","'
        sstr = sstr + preAddrTxt.strip() + '","'
        sstr = sstr + box.strip() + '","'
        sstr = sstr + room.strip() + '","'
        sstr = sstr + unit.strip() + '","'
        sstr = sstr + suite.strip() + '","'
        sstr = sstr + province.strip() + '","'
        sstr = sstr + postalcode.strip() + '","'
        sstr = sstr + country.strip() + '")'
  
        try:
            con.execute(sstr)
            con.commit()
        except Exception:
            pass
    
#    time.sleep(1)
    
    if ((i % 3000) == 1):
        print(str(i) + ": |" + preAddrTxt.strip() + " | " + strR10.strip() + " | " + ci_text)  
    
        
con.close()

In [None]:
new_text = "SUITE 3500 2 BLOOR STREET WEST TORONTO ON M4W 1A8 CANADA"
(province,postalcode,country,strR) = getProvPSCountry(new_text)
(unit,strR1) = getUnitNumber("UNIT|Unit|UNIT", strR)
(suite,strR2A) = getUnitNumber("SUITE|Suite|suite", strR1) 
(room,strR2B) = getUnitNumber("\sAPT|\sRM|ROOM", strR2A)  
(box,strR3A) = getUnitNumber("\(?P\.?\s?O\.?\)?\s?BOX\.?|BOX\.?|POB\.X", strR2B)
(preAddrTxt,strR3B) = getPreText(strR3A.strip())
strR4= rplceToken("(DRIVE[\s\,])", "DR ", strR3B)
strR5= rplceToken("(BOULEVARD[\s\,])", "BLVD ", strR4)
strR6= rplceToken("(CRESCENT[\s\,])", "CRES ", strR5)
strR7= rplceToken("(STREET[\s\,]|STR[\s\,])", "ST ", strR6)
strR8= rplceToken("(ROAD[\s\,]|RAOD[\s\,])", "RD ", strR7)
strR8A= rplceToken("(AVENUE[\s\,])", "AVE ", strR8)
strR8B= re.sub('[^0-9a-zA-Z`’]+', ' ', strR8A)
strR8C= rplceToken("(HIGHWAY[\s\,])", "HWY ", strR8B)
strR8D= rplceToken("(UNIVERSITY[\s\,])", "UVTY ", strR8C)    
strR9= str.replace(strR8D,".","")
strR10= rplceToken("\,?(\s+)", " ", strR9).lstrip('-').rstrip()
print(strR1)
print(strR2A)
print(strR3A)
print(strR3B)
print(strR4)
print(strR5)

### This is to run address to geocode
### This part is to add more context on the addr: Mainly to get the longitude and latitude of the address

In [34]:
import geocoder
import time
url = 'http://localhost/nominatim/'
buf = "2 BLOOR STREET WEST TORONTO ON M4W 1A8 CANADA"
#g = geocoder.osm(buf, url=url)
g = geocoder.osm(buf)
print(g.json)

{'accuracy': 1.0777191505560488, 'address': 'Two Bloor West, 2, Bloor Street West, Yorkville, University—Rosedale, Old Toronto, Toronto, Golden Horseshoe, Ontario, M4W 3L8, Canada', 'bbox': {'northeast': [43.6707146, -79.3869411], 'southwest': [43.6702351, -79.3875565]}, 'city': 'Toronto', 'confidence': 10, 'country': 'Canada', 'country_code': 'ca', 'district': 'University—Rosedale', 'housenumber': '2', 'importance': 1.0777191505560488, 'lat': 43.6704731, 'lng': -79.38725203134258, 'neighborhood': 'Yorkville', 'ok': True, 'osm_id': 27991582, 'osm_type': 'way', 'place_id': 91859360, 'place_rank': 30, 'postal': 'M4W 3L8', 'quality': 'office', 'raw': {'place_id': 91859360, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'way', 'osm_id': 27991582, 'boundingbox': ['43.6702351', '43.6707146', '-79.3875565', '-79.3869411'], 'lat': '43.6704731', 'lon': '-79.38725203134258', 'display_name': 'Two Bloor West, 2, Bloor Street West, Yorkville, Univer

In [None]:
from geopy.geocoders import Nominatim

def get_coordinates(address, timeout=10):
    """
    Geolocate an address.

    Returns the latitude and longitude of the given address using
    OpenStreetMap's Nominatim service. If the coordinates of the
    address cannot be found then ``(None, None)`` is returned.

    As per Nominatim's terms of service this function is rate limited
    to at most one call per second.

    ``timeout`` gives the timeout in seconds.
    """
    geo_locator = Nominatim(domain='localhost/nominatim', scheme='http', timeout=30) 
    location = geo_locator.geocode(address, timeout=timeout)
    if not location:
        return None, None
    return location.latitude, location.longitude 

buf = "2 BLOOR STREET WEST TORONTO ON M4W 1A8 CANADA"
get_coordinates(buf)

In [None]:
from geopy.geocoders import Nominatim
def get_geo_info(place_name):
    """
    Gets coordinates and address for a given place name using geopy.

    :param place_name: Name of place to search for, eg "San Francisco"
    :return: Location object with latitude, longitude and name attributes
    """    

    # Create geo_locator object instance
    geo_locator = Nominatim()

    # Attempt to obtain geo data for given place name
    try:
        location = geo_locator.geocode(place_name, timeout=timeout)
    except Exception:
        raise Exception("Location error")

    if not location:
        raise Exception("Location error")

    return location 
place_name = "2 BLOOR STREET WEST TORONTO ON M4W 1A8 CANADA"
get_geo_info(place_name)

In [None]:
import sqlite3 as lite

con = lite.connect('/home/wk/myProjects/FinancialModel/data_acquisition/corp.db')
cur = con.cursor()
with con:
    cmdstr = 'select * from addr_cleaned order by addr_text'
    cur.execute(cmdstr)
    all_record = cur.fetchall()
con.close()

import geocoder
import time
url = 'http://localhost/nominatim/'
db = '/home/wk/myProjects/FinancialModel/data_acquisition/corp.db'

def runGeoCoder(buf, url, trial):
    g = None
    t = trial
    if t < 3:
        t = t + 1
        try:
            g = geocoder.osm(buf, url=url)
        except Exception:
            g = runGeoCoder(buf, url, t)
            pass
    return g

icnt = 1
ncnt = 0
for record in all_record:
    ncnt = ncnt + 1
    
    if icnt == 1:cmdstr = 'INSERT INTO addr_complete VALUES '

    buf = record[1]
    buf = buf.replace("UVTY", "UNIVERSITY")
    g = runGeoCoder(buf, url, 1)    

    lon = "0"
    lat = "0"
    raw = ""
    place_id = "0"
    osm_id = "0"
    
    i = 1
    while (g.json is None):
        try:
            buf = buf.split(' ', 1)[1]
            g = runGeoCoder(buf, url, 1)
            sleep(0.1)
        except Exception:
            pass
        i = i + 1
        if i > 2: break
        if len(buf) < 10:break
            
    if  not (g.json is None):
        icnt = icnt + 1
        try:
            lon = g.json["raw"]['lon']
            lat = g.json["raw"]['lat']
        except Exception:
            pass
        
        try:
            state = g.json['raw']['address']['state']
        except Exception:
            pass

        try:
            city = g.json['raw']['address']['city']
        except Exception:
            pass
        
        try:
            country = g.json['raw']['address']['country']
        except Exception:
            pass

        try:    
            raw = g.json['raw']['display_name']
            place_id = g.json["place_id"]
            osm_id = g.json["osm_id"]            
            con = lite.connect(db)
            cur = con.cursor()
            
            cmdstr = cmdstr + "('" + record[0] + "','" + record[1] + "','" + record[2] + "','" + record[3] + "','"
            cmdstr = cmdstr + record[4] + "','" + record[5] + "','" + record[6] + "','" + record[7] + "','"
            cmdstr = cmdstr + record[8] + "','" + record[9] + "'," 
            cmdstr = cmdstr + place_id + "," + osm_id + "," + lon + "," + lat + ',"' + state  + '","' + city + '","' + country + '","' 
            cmdstr = cmdstr + str(raw) + '"), '
            
            if icnt > 1000:
                print (str(ncnt) + ":" + buf + ":" + str(g))
                icnt = 0;
                con = lite.connect(db)
                con.execute(cmdstr[:-2])
                con.commit()
                con.close()            
        except Exception:
            pass
        

In [None]:
#print(cmdstr[:-2])
print(g.json)
print(icnt)
print(buf)
print(record[1])
print(str(record))
buf = "2 BLOOR STREET WEST TORONTO"
print(runGeoCoder(buf, url, 1).json['raw']['display_name'])
print(runGeoCoder(buf, url, 1).json['raw']['address']['state'])
print(runGeoCoder(buf, url, 1).json['raw']['address']['city'])
print(runGeoCoder(buf, url, 1).json['raw']['address']['country'])

#### This part is to add more context on the addr: Mainly to get the longitude and latitude of the address

In [None]:
import sqlite3 as lite

con = lite.connect('/home/wk/myProjects/FinancialModel/data_acquisition/corp.db')
cur = con.cursor()
with con:
    cmdstr = 'select ci_text from address where ci_text not in (select ci_text from address_ext where not address = "" ) order by ci_text'
    cur.execute(cmdstr)
    all_record = cur.fetchall()
con.close()


In [None]:
import geocoder
url = 'http://localhost/nominatim/'
buf = '32 Locheland crescent, ottawa'
geocoder.osm(buf, url=url)

In [None]:
import time
import geocoder


def getAddressInfo(addrIn):
    addrDictOut = {}
    if len(addrIn) > 0:
        try:
 #           g = geocoder.google(addrIn)
            m = re.search('([^0-9]*)\d+[\,-\]*\s[A-Z]\s+(.+)', addrIn)
            print(m.group(2))
    
            g = geocoder.osm(m.group(2))
            addrDictOut = g.json
            print(addrDictOut)
        except Exception:
            pass
    return (addrDictOut)

def getDictFieldText(d, f):
    retstr = ""
    try:
        retstr = d[f] 
    except Exception:
        pass
    return (retstr)

def getDictFieldValue(d, f):
    retv = 0
    try:
        retstr = d[f] 
    except Exception:
        pass
    return (retv)

In [None]:
import re
addr = "3 - 4023 26 AVE SW CALGARY AB T3E 0P1 CANADA"
m = re.search('([^0-9]*)\d+[\,-\]*\s[A-Z]\s+(.+)', addr)
print (m.group(2))

In [None]:
dba = '/home/wk/myProjects/FinancialModel/data_acquisition/corp.db'
con = lite.connect(dba)
sqlstr = "delete from address_ext"
con.execute(sqlstr)
con.commit()

i = 0
for record in all_record:
    i = i + 1
    new_text = ""
    try:
        ci_text = str(record[0])
        new_text = ci_text.replace("!", "1")  
    except Exception:
        pass
    
    d = getAddressInfo(new_text)
    
    e = d
    
    '''
    while len(str(e)) < 10:
        e = getAddressInfo('!0 BAISINGER DRIVE WINNIPEG MB R2N 3Y2 CANADA')
        if len(str(e)) < 10:
            time.sleep(60)
    
    if len(str(d)) < 10:
        d = getAddressInfo(new_text)
    '''
    
    if len(str(d)) < 10:    
        sstr = "insert into address_ext values "        
        sstr = sstr + '("' + ci_text + '","'
        sstr = sstr + getDictFieldText(d, 'address') + '","'
        sstr = sstr + getDictFieldText(d, 'city') + '","'
        sstr = sstr + getDictFieldText(d, 'county') + '","'    
        sstr = sstr + getDictFieldText(d, 'country') + '",'    
        sstr = sstr + str(getDictFieldValue(d, 'lat')) + ','    
        sstr = sstr + str(getDictFieldValue(d, 'lng')) + ',"'
        sstr = sstr + getDictFieldText(d, 'neighborhood') + '","'    
        sstr = sstr + getDictFieldText(d, 'osm_id') + '","'    
        sstr = sstr + getDictFieldText(d, 'postal').replace(" ","") + '","'  
        sstr = sstr + getDictFieldText(d, 'quality') + '",' 
        sstr = sstr + '"' + str(d).replace('"', "^") + '")'    

        try:
            con.execute(sstr)
            con.commit()
        except Exception:
            pass
    
#    time.sleep(1)
    
    if ((i % 1000) == 0):
        print(str(i) + ":" + str(d))
        
con.close()

## Geocoder to Standardize Data

In [None]:
import geocoder

i = 0
for addr in l_addr:    
    g = geocoder.osm(addr)
    with open("./data/corporate/address.csv","a") as file_w:
        if g.json is None:
            file_w.write(str(i) + "," + ' "' + addr + '",' + '"' + "" + '"' + "\n")
        else:
            file_w.write(str(i) + "," + ' "' + addr + '",' + '"' + str(g.json) + '"' + "\n")
    i+=1
