# S2_Get_Geo_locator_from_address.ipynb
## Description
* Input: addresses stored as free-text strings
* Output: geographic coordinates (latitude/longitude) looked up through OpenStreetMap (OSM)

In [1]:
import sqlite3 as lite
def select_addr(conn):
    """Return the ``ci_text`` of registered-office and director rows.

    Args:
        conn: Open database connection whose database has a ``corp_info``
            table with ``ci_name`` and ``ci_text`` columns.

    Returns:
        A list holding the ``ci_text`` value of every ``corp_info`` row whose
        ``ci_name`` is ``'REGISTERED OFFICE ADDRESS'`` or ``'DIRECTORS'``,
        in the order the query yields them.
    """
    query = "select ci_text from corp_info where ci_name in ('REGISTERED OFFICE ADDRESS','DIRECTORS')"
    cursor = conn.cursor()
    cursor.execute(query)
    # Each result row is a 1-tuple because a single column is selected.
    return [text for (text,) in cursor.fetchall()]

ddb = "./data/corporate/corp.db"

# Load the address strings and de-duplicate them.
con = lite.connect(ddb)
try:
    l_addr = list(set(select_addr(con)))
finally:
    # Release the database handle even when the query fails.
    con.close()

# NOTE(review): the list comes from a set, so its order is arbitrary and this
# drops an unspecified element. Confirm what the slice is meant to remove.
l_addr = l_addr[1:]

## Pre-process Address 

In [53]:
# extract the pretext before the address

import re
def getPreText(txtaddr):
    """Split a leading ``label: text`` prefix off an address string.

    The prefix is the digit-free text at the start of *txtaddr* that contains
    at least one colon, for example ``"CARE OF: JOHN SMITH "`` in front of a
    street number.

    Args:
        txtaddr: Raw address text.

    Returns:
        ``((text, label), remainder)``. *label* is the part of the prefix
        before its first colon and *text* the part between the first and the
        second colon, both lower-cased and stripped. *remainder* is
        *txtaddr* without the prefix. When there is no such prefix the
        result is ``(('', ''), txtaddr)``.
    """
    # Every repetition of the group contains a colon, so the match is either
    # empty or a prefix holding at least one colon. The pattern can match the
    # empty string, so a match object is always returned.
    m = re.match(r"([^0-9]*:[^0-9]*)*", txtaddr)
    prefix = m.group(0)
    if not prefix:
        return (("", ""), txtaddr)

    # Slice the prefix off by length: str.replace() would also delete any
    # later repetition of the same text further inside the address.
    strRemain = txtaddr[len(prefix):]

    pieces = prefix.strip().split(":")
    field, txt = pieces[0], pieces[1]
    return ((txt.lower().strip(), field.lower().strip()), strRemain)
    
# Sanity check: this address starts with a digit, so it has no "label:" prefix;
# the label pair comes back empty and the address text is returned unchanged.
print(getPreText('460, RUE SAINTE-CATHERINE OUEST BUREAU 503 MONTREAL QC H3B 1A7 CANADA'))

(('', ''), '460, RUE SAINTE-CATHERINE OUEST BUREAU 503 MONTREAL QC H3B 1A7 CANADA')


## libpostal: international street address NLP
### Install libpostal 
https://github.com/openvenues/libpostal
### Install python package postal
pip install postal

In [55]:
# https://github.com/openvenues/libpostal
# pip install postal
from postal.parser import parse_address
# Parse a ten-address sample with libpostal. When getPreText strips a
# non-empty "label: text" prefix, that pair is added to the parse result.
for addr in l_addr[:10]:
    t, _addr = getPreText(addr)
    rec = parse_address(_addr)
    if t[0]:
        rec += [t]
    print(addr, rec, sep="\n")
    

1276 COUTURE GATINEAU QC J8P 1R4 CANADA
[('1276', 'house_number'), ('couture', 'road'), ('gatineau', 'city'), ('qc', 'state'), ('j8p 1r4', 'postcode'), ('canada', 'country')]
24, LOCKPORT CRESCENT BRAMPTON ON L6P 3X9 CANADA
[('24', 'house_number'), ('lockport crescent', 'road'), ('brampton', 'city'), ('on', 'state'), ('l6p 3x9', 'postcode'), ('canada', 'country')]
1 E. CORDOVA STREET, 402 VANCOUVER BC V6A 4H3 CANADA
[('1', 'house_number'), ('e. cordova street 402', 'road'), ('vancouver', 'city'), ('bc', 'state'), ('v6a 4h3', 'postcode'), ('canada', 'country')]
55 QUEEN ST. E. SUITE 1400 TORONTO ON CANADA
[('55', 'house_number'), ('queen st. e.', 'road'), ('suite 1400', 'unit'), ('toronto', 'city'), ('on', 'state'), ('canada', 'country')]
110 PATTON COURT S.W. CALGARY AB T2V 5G3 CANADA
[('110', 'house_number'), ('patton court s.w.', 'road'), ('calgary', 'city'), ('ab', 'state'), ('t2v 5g3', 'postcode'), ('canada', 'country')]
43, OLD FOREST HILL ROAD TORONTO ON M5P 2P8 CANADA
[('43', 'h

### Postal Code
* Forward sortation areas (FSA)  
NL        -                 A  
NS        -                 B  
PE        -                 C  
NB        -                 E  
QC        -                 G, H, J  
ON        -                 K, L, M, N, P  
MB        -                 R  
SK        -                 S  
AB        -                 T  
BC        -                 V  
NU/NT     -                 X  
YT        -                 Y  
* Local delivery units (LDU)  
LDUs ending in zero correspond to postal facilities, from post offices and small franchised retail postal outlets all the way up to sortation plants. 

In [56]:
import re
# Demo: pull the forward sortation area (the first three characters) out of
# a Canadian postal code.
addr = "j8p 1r4"
# Raw string: "\d" in a normal string literal is an invalid escape sequence
# that newer Python versions warn about.
m = re.search(r'([A-Za-z]\d[A-Za-z]) ?(\d[A-Za-z]\d)', addr)
print(m.group(1))

j8p


In [57]:
import re
def getPostalCode(addr):
    """Extract the first Canadian postal code found in *addr*.

    Args:
        addr: Text to search. Upper and lower case are both accepted and the
            space between the two halves of the code is optional.

    Returns:
        A tuple ``(postal_code, fsa, ldu)``: the code exactly as written in
        *addr*, its forward sortation area (first three characters) and its
        local delivery unit (last three characters).

    Raises:
        ValueError: If *addr* contains no postal code.
    """
    # Raw string so that "\d" reaches the regex engine without relying on
    # Python keeping an invalid string escape intact.
    m = re.search(r'([A-Za-z]\d[A-Za-z]) ?(\d[A-Za-z]\d)', addr)
    if m is None:
        # Fail with a descriptive error instead of an AttributeError on None.
        raise ValueError("no postal code found in " + repr(addr))
    return m.group(0), m.group(1), m.group(2)

# Show the postal code, FSA and LDU found in each of the first ten addresses.
for addr in l_addr[:10]:
    try:
        PC, FSA, LDU = getPostalCode(addr)
    except (AttributeError, ValueError):
        # getPostalCode raises when the address holds no postal code; catch
        # only that case so that real errors (and Ctrl-C) are not hidden.
        PC = FSA = LDU = "UNK"
    print(addr + ": " + PC + "," + FSA + "," + LDU)

1276 COUTURE GATINEAU QC J8P 1R4 CANADA: J8P 1R4,J8P,1R4
24, LOCKPORT CRESCENT BRAMPTON ON L6P 3X9 CANADA: L6P 3X9,L6P,3X9
1 E. CORDOVA STREET, 402 VANCOUVER BC V6A 4H3 CANADA: V6A 4H3,V6A,4H3
55 QUEEN ST. E. SUITE 1400 TORONTO ON CANADA: UNK,UNK,UNK
110 PATTON COURT S.W. CALGARY AB T2V 5G3 CANADA: T2V 5G3,T2V,5G3
43, OLD FOREST HILL ROAD TORONTO ON M5P 2P8 CANADA: M5P 2P8,M5P,2P8
8 AVENUE BRITTANY MONT-ROYAL QC H3P 1A6 CANADA: H3P 1A6,H3P,1A6
519 DWYER STREET HUDSON QC J0P 1H0 CANADA: J0P 1H0,J0P,1H0
9 CARR CRT CHARLOTTETOWN PE C1E 1W6 CANADA: C1E 1W6,C1E,1W6
8 PEMBERTON COURT BRAMPTON ON L6W 4K5 CANADA: L6W 4K5,L6W,4K5


In [None]:
def getStrInParentheses(Instr):
    """Split the last parenthesised remark off a label.

    Args:
        Instr: Label text such as ``"STEVE ODENY (SECRETARY)"``.

    Returns:
        ``(words, remark)``. *remark* is the content of the last ``(...)``
        group in *Instr* (``""`` when there is none). *words* is the rest of
        the label split on single spaces.
    """
    groups = list(re.finditer(r"\((.*?)\)", Instr))
    if not groups:
        return (Instr.split(" "), "")

    last = groups[-1]
    # Cut the group out by its position. Counting characters back from the
    # end is only right when the group is the very last thing in the label
    # and is preceded by exactly one space.
    rest = (Instr[:last.start()].rstrip() + Instr[last.end():]).strip()
    return (rest.split(" "), last.group(1))

# Example call: prints the (words, parenthesised text) pair for a sample label.
print(getStrInParentheses("STEVE ODENY (SECRETARY)"))

### 
#### CREATE TABLE corp_addr_ext(
####     ci_corp_id INT,
####     ci_name TEXT,
####     ci_lab TEXT,
####     ci_text TEXT, ci_postal TEXT, ci_firstname TEXT, ci_lastname TEXT);

In [None]:
import time

import sqlite3 as lite

# Rebuild corp_addr_ext from corp_addr: every source row is stored together
# with its postal code and the label split into first name(s) / last name.
dba = '/home/wk/myProjects/FinancialModel/data_acquisition/corp.db'
batch_size = 1000
# One placeholder per value: ci_corp_id, ci_name, ci_lab, ci_text, ci_postal,
# ci_firstname, ci_lastname. Bound parameters keep quotes inside the text
# from breaking (or injecting into) the statement.
insert_sql = "insert into corp_addr_ext values (?, ?, ?, ?, ?, ?, ?)"

con = lite.connect(dba)
try:
    all_record = con.execute('select * from corp_addr').fetchall()

    # Empty the target table before refilling it.
    con.execute("delete from corp_addr_ext")
    con.commit()

    batch = []
    skipped = 0
    for record in all_record:
        try:
            ci_corp_id = int(record[0])
            ci_name = str(record[1])
            ci_lab = str(record[2])
            ci_text = str(record[3])
        except (TypeError, ValueError, IndexError):
            # Malformed source row: skip it instead of silently re-inserting
            # the values left over from the previous row.
            skipped += 1
            continue

        try:
            # getPostalCode returns (full code, FSA, LDU); store the full code.
            postal = getPostalCode(ci_text)[0]
        except (AttributeError, ValueError):
            # No postal code in the text.
            # NOTE(review): "" is used as the "not found" value here;
            # confirm the convention expected for ci_postal.
            postal = ""

        # n is the label split into words; the last word is taken as the
        # last name and everything before it as the first name(s).
        (n, r) = getStrInParentheses(ci_lab)
        batch.append((ci_corp_id, ci_name, ci_lab, ci_text, postal,
                      " ".join(n[:-1]), n[-1]))

        if len(batch) >= batch_size:
            con.executemany(insert_sql, batch)
            con.commit()
            batch = []

    # Write the final partial batch, which would otherwise be lost.
    if batch:
        con.executemany(insert_sql, batch)
        con.commit()

    if skipped:
        print("skipped " + str(skipped) + " malformed corp_addr rows")
finally:
    con.close()

#### CREATE TABLE addr_cleaned(
#### addr_text TEXT,
#### addr_addr TEXT,
#### addr_box TEXT,
#### addr_unit TEXT,
#### addr_suite TEXT,
#### addr_province TEXT,
#### addr_postal TEXT,
#### addr_country TEXT
#### );

In [None]:
import sqlite3 as lite
dba = '/home/wk/myProjects/FinancialModel/data_acquisition/corp.db'

# Ten values per row, in this order: original text, cleaned street text,
# pre-address label, box, room, unit, suite, province, postal code, country.
# Bound parameters replace the hand-quoted SQL, so text containing quotes is
# stored instead of making the insert fail.
insert_sql = "insert into addr_cleaned values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"

con = lite.connect(dba)
try:
    all_record = con.execute('select ci_text from address order by ci_text DESC').fetchall()

    # Clear the repository
    con.execute("delete from addr_cleaned")
    con.commit()

    i = 0
    for record in all_record:
        i = i + 1
        ci_text = str(record[0])
        # Normalise "!" to "1" and "É" to "E" before parsing.
        new_text = ci_text.replace("!", "1").replace("É", "E")

        # getProvPSCountry, getUnitNumber and rplceToken are defined outside
        # this cell. From the way their results are unpacked, each extractor
        # returns the extracted value(s) followed by the remaining text.
        (province,postalcode,country,strR) = getProvPSCountry(new_text)
        # NOTE(review): "UNIT" is listed twice; the third alternative was
        # presumably meant to be "unit" (compare the SUITE pattern below).
        (unit,strR1) = getUnitNumber("UNIT|Unit|UNIT", strR)
        (suite,strR2A) = getUnitNumber("SUITE|Suite|suite", strR1)
        (room,strR2B) = getUnitNumber(r"\sAPP|\sAPT|\sRM|ROOM", strR2A)
        (box,strR3A) = getUnitNumber(r"\(?P\.?\s?O\.?\)?\s?BOX\.?|BOX\.?|POB\.X", strR2B)
        (preAddrTxt,strR3B) = getPreText(strR3A.strip())
        if isinstance(preAddrTxt, tuple):
            # getPreText returns the prefix as a (text, label) pair, which
            # has no .strip(); flatten it to "label: text" for storage.
            # NOTE(review): confirm the format wanted in this column.
            preAddrTxt = ": ".join(part for part in reversed(preAddrTxt) if part)

        # Abbreviate street-type words, then drop punctuation and collapse
        # whitespace. The patterns are raw strings so the backslashes reach
        # the regex engine as written.
        strR4= rplceToken(r"(DRIVE[\s\,])", "DR ", strR3B)
        strR5= rplceToken(r"(BOULEVARD[\s\,])", "BLVD ", strR4)
        strR6= rplceToken(r"(CRESCENT[\s\,])", "CRES ", strR5)
        strR7= rplceToken(r"(STREET[\s\,]|STR[\s\,])", "ST ", strR6)
        strR8= rplceToken(r"(ROAD[\s\,]|RAOD[\s\,])", "RD ", strR7)
        strR8A= rplceToken(r"(AVENUE[\s\,])", "AVE ", strR8)
        strR8B= re.sub('[^0-9a-zA-Z`’]+', ' ', strR8A)
        strR8C= rplceToken(r"(HIGHWAY[\s\,])", "HWY ", strR8B)
        strR8D= rplceToken(r"(UNIVERSITY[\s\,])", "UVTY ", strR8C)
        strR9= str.replace(strR8D,".","")
        strR10= rplceToken(r"\,?(\s+)", " ", strR9).lstrip('-').rstrip()

        if len(ci_text) > 1:
            row = (
                ci_text,
                strR10.strip(),
                preAddrTxt.strip(),
                box.strip(),
                room.strip(),
                unit.strip(),
                suite.strip(),
                province.strip(),
                postalcode.strip(),
                country.strip(),
            )
            try:
                con.execute(insert_sql, row)
                con.commit()
            except lite.Error as err:
                # Keep going on a bad row, but report it instead of hiding it.
                print("insert failed for " + repr(ci_text) + ": " + str(err))

        # Progress line every 3000 records.
        if ((i % 3000) == 1):
            print(str(i) + ": |" + preAddrTxt.strip() + " | " + strR10.strip() + " | " + ci_text)
finally:
    con.close()

In [None]:
# Replay the cleaning pipeline on one sample address and print the
# intermediate strings for inspection.
new_text = "SUITE 3500 2 BLOOR STREET WEST TORONTO ON M4W 1A8 CANADA"
(province,postalcode,country,strR) = getProvPSCountry(new_text)
(unit,strR1) = getUnitNumber("UNIT|Unit|UNIT", strR)
(suite,strR2A) = getUnitNumber("SUITE|Suite|suite", strR1)
# Same room pattern as the batch loop, which also recognises "APP".
(room,strR2B) = getUnitNumber(r"\sAPP|\sAPT|\sRM|ROOM", strR2A)
(box,strR3A) = getUnitNumber(r"\(?P\.?\s?O\.?\)?\s?BOX\.?|BOX\.?|POB\.X", strR2B)
(preAddrTxt,strR3B) = getPreText(strR3A.strip())
# Raw strings keep the regex backslashes intact without relying on Python
# preserving invalid string escapes.
strR4= rplceToken(r"(DRIVE[\s\,])", "DR ", strR3B)
strR5= rplceToken(r"(BOULEVARD[\s\,])", "BLVD ", strR4)
strR6= rplceToken(r"(CRESCENT[\s\,])", "CRES ", strR5)
strR7= rplceToken(r"(STREET[\s\,]|STR[\s\,])", "ST ", strR6)
strR8= rplceToken(r"(ROAD[\s\,]|RAOD[\s\,])", "RD ", strR7)
strR8A= rplceToken(r"(AVENUE[\s\,])", "AVE ", strR8)
strR8B= re.sub('[^0-9a-zA-Z`’]+', ' ', strR8A)
strR8C= rplceToken(r"(HIGHWAY[\s\,])", "HWY ", strR8B)
strR8D= rplceToken(r"(UNIVERSITY[\s\,])", "UVTY ", strR8C)
strR9= str.replace(strR8D,".","")
strR10= rplceToken(r"\,?(\s+)", " ", strR9).lstrip('-').rstrip()
print(strR1)
print(strR2A)
print(strR3A)
print(strR3B)
print(strR4)
print(strR5)

### Geocode the cleaned addresses
### This step adds context to each address, mainly its longitude and latitude

In [None]:
import sqlite3 as lite

# Read every addr_cleaned row, ordered by addr_text, into memory.
cmdstr = 'select * from addr_cleaned order by addr_text'
con = lite.connect('/home/wk/myProjects/FinancialModel/data_acquisition/corp.db')
with con:
    cur = con.cursor()
    all_record = cur.execute(cmdstr).fetchall()
con.close()

import geocoder
import time
# Endpoint handed to geocoder.osm() through runGeoCoder's url argument.
url = 'http://localhost/nominatim/'
# SQLite database file the geocoding loop writes its results to.
db = '/home/wk/myProjects/FinancialModel/data_acquisition/corp.db'

def runGeoCoder(buf, url, trial):
    """Look *buf* up with ``geocoder.osm``, retrying when the call raises.

    Args:
        buf: Address text to look up.
        url: Endpoint forwarded to ``geocoder.osm`` as ``url``.
        trial: Attempt counter to start from. Attempts are made while the
            counter is below 3, so ``trial=1`` allows two attempts.

    Returns:
        Whatever ``geocoder.osm`` returns on the first attempt that does not
        raise, or ``None`` when every attempt raised or no attempt was
        allowed.
    """
    attempt = trial
    while attempt < 3:
        attempt += 1
        try:
            return geocoder.osm(buf, url=url)
        except Exception:
            # Any failure just moves on to the next attempt.
            pass
    return None

def _geo_field(payload, *keys, default=""):
    """Follow *keys* through nested dicts; return *default* if a step is missing."""
    node = payload
    for key in keys:
        try:
            node = node[key]
        except (KeyError, IndexError, TypeError):
            return default
    return node


def _as_number(value, default=0):
    """Return *value* as a number (parsing numeric strings), else *default*."""
    if isinstance(value, (int, float)):
        return value
    for cast in (int, float):
        try:
            return cast(value)
        except (TypeError, ValueError):
            pass
    return default


def _has_result(g):
    """True when the geocoder call returned an object that carries a result."""
    return g is not None and g.json is not None


# Eighteen values per row: the ten addr_cleaned columns followed by place_id,
# osm_id, lon, lat, state, city, country and the display name.
insert_sql = 'INSERT INTO addr_complete VALUES (' + ', '.join('?' * 18) + ')'
batch_size = 1000
batch = []
icnt = 0   # rows buffered since the last write
ncnt = 0   # records processed so far

con = lite.connect(db)
try:
    for record in all_record:
        ncnt = ncnt + 1

        # Expand the abbreviation again before the lookup.
        buf = record[1].replace("UVTY", "UNIVERSITY")
        g = runGeoCoder(buf, url, 1)

        # No hit: retry up to twice, each time dropping the leading token.
        for _retry in range(2):
            if _has_result(g):
                break
            _head, sep, rest = buf.partition(' ')
            if not sep:
                break   # nothing left to drop
            buf = rest
            g = runGeoCoder(buf, url, 1)
            time.sleep(0.1)
            if len(buf) < 10:
                break

        if not _has_result(g):
            continue

        # Fields are read fresh for every record, with a default when one is
        # missing, so nothing carries over from the previous address.
        info = g.json
        batch.append(tuple(record[:10]) + (
            _as_number(_geo_field(info, 'place_id', default=0)),
            _as_number(_geo_field(info, 'osm_id', default=0)),
            _as_number(_geo_field(info, 'raw', 'lon', default=0)),
            _as_number(_geo_field(info, 'raw', 'lat', default=0)),
            _geo_field(info, 'raw', 'address', 'state'),
            _geo_field(info, 'raw', 'address', 'city'),
            _geo_field(info, 'raw', 'address', 'country'),
            str(_geo_field(info, 'raw', 'display_name')),
        ))
        icnt = icnt + 1

        if icnt >= batch_size:
            print (str(ncnt) + ":" + buf + ":" + str(g))
            con.executemany(insert_sql, batch)
            con.commit()
            batch = []
            icnt = 0

    # Write the final partial batch, which would otherwise be lost.
    if batch:
        con.executemany(insert_sql, batch)
        con.commit()
finally:
    con.close()
        

In [None]:
# Inspect the state left behind by the geocoding loop.
#print(cmdstr[:-2])
print(g.json if g is not None else None)
print(icnt)
print(buf)
print(record[1])
print(str(record))

# Look up one known address and show the fields the loop stores. A single
# request is made and its result reused for all four prints.
buf = "2 BLOOR STREET WEST TORONTO"
sample = runGeoCoder(buf, url, 1)
if sample is None or sample.json is None:
    print("no geocoding result for " + buf)
else:
    sample_raw = sample.json['raw']
    print(sample_raw['display_name'])
    print(sample_raw['address']['state'])
    print(sample_raw['address']['city'])
    print(sample_raw['address']['country'])

#### This step adds more context to each address, mainly its longitude and latitude

In [None]:
import sqlite3 as lite

# Read the ci_text of every address row that has no address_ext row with a
# non-empty address, ordered by ci_text.
cmdstr = 'select ci_text from address where ci_text not in (select ci_text from address_ext where not address = "" ) order by ci_text'
con = lite.connect('/home/wk/myProjects/FinancialModel/data_acquisition/corp.db')
with con:
    cur = con.cursor()
    all_record = cur.execute(cmdstr).fetchall()
con.close()


In [None]:
import time
import geocoder


def getAddressInfo(addrIn):
    """Geocode the trailing part of *addrIn* with ``geocoder.osm``.

    A regular expression picks the part of the address that is sent to the
    geocoder (its second capture group). That text and the geocoder result
    are printed as a trace.

    Args:
        addrIn: Address text; may be empty or ``None``.

    Returns:
        The ``json`` dict of the geocoder result, or ``{}`` when *addrIn* is
        empty, the pattern does not match, the lookup raises, or the lookup
        yields no result.
    """
    if not addrIn:
        return {}

    # Raw string so the regex backslashes are not treated as (invalid)
    # string escapes.
    # NOTE(review): in "[\,-\]*\s[A-Z]" the "\]" is an escaped bracket and
    # does not close the class, so everything up to the last "]" is ONE
    # character class (the range "," to "]", "*", whitespace and "[").
    # That looks unintended; confirm the pattern that was meant.
    m = re.search(r'([^0-9]*)\d+[\,-\]*\s[A-Z]\s+(.+)', addrIn)
    if m is None:
        return {}
    print(m.group(2))

    try:
#       g = geocoder.google(addrIn)
        g = geocoder.osm(m.group(2))
        addrDictOut = g.json
    except Exception as err:
        # Best effort: a failed lookup must not stop a batch run, but it
        # should be visible.
        print("geocoding failed for " + repr(m.group(2)) + ": " + str(err))
        return {}

    print(addrDictOut)
    # The geocoder reports "no result" as None; always hand back a dict.
    return {} if addrDictOut is None else addrDictOut

def getDictFieldText(d, f):
    """Return field *f* of *d* as text.

    Args:
        d: Mapping to read from; may be ``None``.
        f: Key to look up.

    Returns:
        ``str(d[f])``, or ``""`` when *d* cannot be indexed, the key is
        missing, or the stored value is ``None``. The result is always a
        string, so it can be concatenated or passed to ``str`` methods.
    """
    try:
        value = d[f]
    except (KeyError, IndexError, TypeError):
        return ""
    return "" if value is None else str(value)

def getDictFieldValue(d, f):
    """Return the value stored under *f* in *d*.

    Args:
        d: Mapping to read from; may be ``None``.
        f: Key to look up.

    Returns:
        ``d[f]``, or ``0`` when *d* cannot be indexed, the key is missing,
        or the stored value is ``None``.
    """
    try:
        # The looked-up value is what gets returned; previously it was
        # stored in a different variable and 0 was always returned.
        retv = d[f]
    except (KeyError, IndexError, TypeError):
        return 0
    return 0 if retv is None else retv

In [None]:
import re
# Demo: show which part of a sample address the extraction pattern keeps
# (its second capture group).
addr = "3 - 4023 26 AVE SW CALGARY AB T3E 0P1 CANADA"
# Raw string so the regex backslashes are not treated as (invalid) string
# escapes; the pattern text itself is unchanged.
m = re.search(r'([^0-9]*)\d+[\,-\]*\s[A-Z]\s+(.+)', addr)
print (m.group(2))

In [None]:
dba = '/home/wk/myProjects/FinancialModel/data_acquisition/corp.db'
# Twelve values per row: ci_text, address, city, county, country, lat, lng,
# neighborhood, osm_id, postal, quality and the whole result dict as text.
# Bound parameters replace the hand-quoted SQL.
insert_sql = "insert into address_ext values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"


def _ext_text(d, key):
    """Return field *key* of *d* as a string ("" when missing or None)."""
    value = getDictFieldText(d, key)
    return "" if value is None else str(value)


con = lite.connect(dba)
try:
    con.execute("delete from address_ext")
    con.commit()

    i = 0
    for record in all_record:
        i = i + 1
        ci_text = str(record[0])
        new_text = ci_text.replace("!", "1")

        d = getAddressInfo(new_text)

        # NOTE(review): this condition is true only when the lookup result
        # is (nearly) empty, so only failed lookups are stored and
        # successful ones are dropped. It looks inverted; confirm the
        # intent before relying on the table contents.
        if len(str(d)) < 10:
            row = (
                ci_text,
                _ext_text(d, 'address'),
                _ext_text(d, 'city'),
                _ext_text(d, 'county'),
                _ext_text(d, 'country'),
                getDictFieldValue(d, 'lat'),
                getDictFieldValue(d, 'lng'),
                _ext_text(d, 'neighborhood'),
                _ext_text(d, 'osm_id'),
                _ext_text(d, 'postal').replace(" ", ""),
                _ext_text(d, 'quality'),
                # Stored in the same form as before: double quotes
                # replaced by "^".
                str(d).replace('"', "^"),
            )
            try:
                con.execute(insert_sql, row)
                con.commit()
            except lite.Error as err:
                # Keep going on a bad row, but report it instead of hiding it.
                print("insert failed for " + repr(ci_text) + ": " + str(err))

        # Progress line every 1000 records.
        if ((i % 1000) == 0):
            print(str(i) + ":" + str(d))
finally:
    con.close()

## Geocoder to Standardize Data

In [None]:
import geocoder

import csv

# Geocode every address and append one CSV row per address: running index,
# address text, and the geocoder's JSON dict as text ("" when there is no
# result). The file is opened once, and csv.writer quotes the text fields
# and doubles embedded quotes so every row can be parsed back.
with open("./data/corporate/address.csv", "a", newline="") as file_w:
    writer = csv.writer(file_w, quoting=csv.QUOTE_NONNUMERIC, lineterminator="\n")
    for i, addr in enumerate(l_addr):
        g = geocoder.osm(addr)
        writer.writerow([i, addr, "" if g.json is None else str(g.json)])
