# Program: Scrape Federal Corporation Information 
* Source: https://www.ic.gc.ca/app/scr/cc/CorporationsCanada/fdrlCrpDtls.html?corpId=1007  
* CorpId list: https://open.canada.ca/data/en/dataset/0032ce54-c5dd-4b66-99a0-320a7b5e99f2


In [1]:
from bs4 import BeautifulSoup as bsoup
import re
import math
import requests as rq
import sqlite3 as lite

In [2]:
def getList(t):
    """Flatten the text of an HTML element into a list of cleaned strings.

    Args:
        t: Object exposing ``get_text(separator=...)`` (a BeautifulSoup tag
            in this file).

    Returns:
        list[str]: One entry per text fragment, upper-cased, with tabs turned
        into spaces, newlines and the literal ``(MM-DD)`` hint removed,
        surrounding whitespace stripped and inner whitespace runs collapsed
        to a single space. Fragments that end up empty are dropped.
    """
    # "^" is the fragment separator, so a literal "^" in the page text also
    # splits a fragment.
    fragments = t.get_text(separator=u'^').split("^")
    cleaned = []
    for fragment in fragments:
        text = fragment.upper().replace("\t", " ").replace('\n', '')
        text = text.replace('(MM-DD)', "").strip()
        # Raw string: '\s' in a plain literal is an invalid escape sequence
        # and triggers a warning on recent Python versions.
        text = re.sub(r'\s+', ' ', text)
        if text != '':
            cleaned.append(text)
    return cleaned

In [3]:
def appendRecord(id, l_info, e):
    """Append one normalised record for ``id`` to ``l_info`` in place.

    Args:
        id: Value stored as the first element of the record.
        l_info: List that receives the new record.
        e: Iterable of strings; each is upper-cased, stripped, and has its
            single and double quotes replaced by a backtick.

    Returns:
        None. ``l_info`` is mutated.
    """
    cleaned = []
    for field in e:
        text = field.upper().strip()
        text = text.replace("'", "`").replace('"', "`")
        cleaned.append(text)
    l_info.append([id, *cleaned])
    return

In [4]:
def saveRecord(ddb,l_info):
    """Insert scraped records into the ``corp_info`` table of a SQLite file.

    Args:
        ddb: Path of the SQLite database file.
        l_info: List of records; the first four items of each record are
            written to ``ci_corp_id``, ``ci_name``, ``ci_lab`` and
            ``ci_text`` respectively. Nothing happens when it is empty.

    Returns:
        None.

    Raises:
        IndexError: If a record has fewer than four items.
        sqlite3.Error: If the insert fails; the transaction is rolled back.
    """
    if not l_info:
        return

    def _corp_id(value):
        # The id used to be embedded unquoted in the SQL text, i.e. as a
        # numeric literal; bind it as an int when possible so the stored
        # value keeps the same type.
        try:
            return int(value)
        except (TypeError, ValueError):
            return value

    rows = [(_corp_id(r[0]), r[1], r[2], r[3]) for r in l_info]

    conn = lite.connect(ddb)
    try:
        # "with conn" commits on success and rolls back on error.
        with conn:
            # Placeholders keep scraped text out of the SQL statement itself.
            conn.executemany(
                "insert into corp_info (ci_corp_id, ci_name, ci_lab, ci_text) "
                "VALUES (?, ?, ?, ?)",
                rows,
            )
    finally:
        conn.close()
    return

In [5]:
def scrapInfo(id, timeout=30):
    """Download one corporation page and turn it into a list of records.

    Each page section is parsed on a best-effort basis: a failure in one
    section does not prevent the others from being collected.

    Args:
        id: Corporation id as a string; it is appended to the page URL.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot block the caller forever.

    Returns:
        list[list[str]]: Records built by ``appendRecord``; every record
        produced here has the form ``[id, name, label, text]``. The list is
        empty when nothing could be extracted.

    Raises:
        requests.RequestException: If the page cannot be fetched.
    """
    url = "https://www.ic.gc.ca/app/scr/cc/CorporationsCanada/fdrlCrpDtls.html?corpId=" + id
    r = rq.get(url, timeout=timeout)
    # <cite> markup is replaced by spaces before parsing, so its text stays
    # part of the surrounding text.
    in_data = r.text.replace("</cite>", " ").replace("<cite>", " ")
    soup = bsoup(in_data, "html.parser")

    # Created before any section runs so the return below always has a list.
    l_info = []

    # misc information
    try:
        for t in soup.find_all("div",{'class': "row data-display-group"}):
            e = getList(t)
            if not e:
                continue
            if e[0] == 'STATUS OF ANNUAL FILINGS' and len(e) > 2:
                for entry in e[1:]:
                    # Split on the first "-" only, so the record always has
                    # exactly the label and text fields.
                    label, _, status = entry.replace(" ", "").partition('-')
                    appendRecord(id, l_info, [e[0], label, status])
            else:
                appendRecord(id, l_info, [e[0], "", ' '.join(e[1:])])
    except Exception:
        print (str(id) + ":error")

    # Registered Office Address
    try:
        # Only <div> elements whose class list is exactly ["well"].
        table = soup.find_all(lambda tag: tag.name == 'div' and
                                           tag.get('class') == ["well"])
        for t in table:
            appendRecord(id,l_info,["Registered Office Address", "", ' '.join(getList(t))])
    except Exception:
        # Deliberately best-effort: keep whatever was collected so far.
        pass

    # Num of Directors
    try:
        for t in soup.find_all("div",{'class': "col-sm-4 nowrap"}):
            l = getList(t)
            if len(l) < 2:
                continue
            appendRecord(id,l_info,["Number of Directors", l[0], l[1]])
    except Exception:
        pass

    # Name History
    try:
        for t in soup.find_all("div",{'class': "table-responsive"}):
            e = getList(t)
            # e[0] is the heading, followed by (label, text) pairs. Count
            # only complete pairs so an even-length list cannot index past
            # the end.
            for i in range((len(e) - 1) // 2):
                appendRecord(id, l_info, [e[0], e[i*2+1], e[i*2+2]])
    except Exception:
        pass

    # Directors
    try:
        for t in soup.find_all('li', { "class" : "full-width pad-bttm-md list-inline-block" }):
            l = getList(t)
            if not l:
                continue
            appendRecord(id,l_info,["Directors", l[0], ' '.join(l[1:])])
    except Exception:
        pass

    return (l_info)

In [6]:
def select_corp(conn):
    """Return all ``cm_corp_id`` values from ``corp_master``, ordered by id.

    Args:
        conn: Open database connection providing ``cursor()``.

    Returns:
        list: The first column of every row returned by the query.
    """
    query = "SELECT cm_corp_id from corp_master order by cm_corp_id"
    cursor = conn.cursor()
    cursor.execute(query)
    corp_ids = []
    for row in cursor.fetchall():
        corp_ids.append(row[0])
    return corp_ids

In [7]:
ddb = "./data/corporate/corp.db"

# Load the corporation ids to scrape; close the connection even if the query
# fails.
con = lite.connect(ddb)
try:
    l_corp = select_corp(con)
finally:
    con.close()

'''
# Testing: overwrite the database functions
def appendRecord(id, l_info, e):
    l_info.append([id] + [l.upper().strip().replace("'", "`").replace('"', "`") for l in e])
    print([id] + [l.upper().strip().replace("'", "`").replace('"', "`") for l in e])
    return

def saveRecord(ddb, l_info):
    print('_'*300)
    print('')
    print(l_info)
    print('')
    print('_'*300)
    return
'''

# Scrape every corporation and store its records, reporting progress every
# 100 ids. cnt_corp keeps its -1 start value when l_corp is empty.
cnt_corp = -1
#for corp in l_corp[0:3]:
for cnt_corp, corp in enumerate(l_corp):
    if (cnt_corp % 100) == 0:
        print(str(cnt_corp) + ":" + str(corp))
    l_info = []
    try:
        l_info = scrapInfo(str(corp))
    except Exception:
        # "except Exception" (not a bare except) so Ctrl-C can still stop a
        # long run.
        print (str(corp) + " has problem in scrap")

    if len(l_info) > 0:
        saveRecord(ddb,l_info)
    else:
        print (str(corp) + " has NO RECORD")

0:1007
['1007', 'CORPORATION NUMBER', '', '000100-7']
['1007', 'BUSINESS NUMBER (BN)', '', '106679285RC0001']
['1007', 'CORPORATE NAME', '', 'ABBOTSFORD CHAMBER OF COMMERCE']
['1007', 'STATUS', '', 'ACTIVE']
['1007', 'GOVERNING LEGISLATION', '', 'BOARDS OF TRADE ACT - PART II - 1947-01-10']
['1007', 'ANNIVERSARY DATE', '', '01-10']
['1007', 'DATE OF LAST ANNUAL MEETING', '', '2019-03-27']
['1007', 'ANNUAL FILING PERIOD', '', '04-01 TO 06-01']
['1007', 'TYPE OF CORPORATION', '', 'NOT AVAILABLE']
['1007', 'STATUS OF ANNUAL FILINGS', '2020', 'NOTDUE']
['1007', 'STATUS OF ANNUAL FILINGS', '2019', 'FILED']
['1007', 'STATUS OF ANNUAL FILINGS', '2018', 'FILED']
['1007', 'INCORPORATION', '', '1947-01-10']
['1007', 'BY-LAWS', '', 'RECEIVED ON 2005-12-20']
['1007', 'BY-LAWS', '', 'RECEIVED ON 2007-03-20']
['1007', 'BY-LAWS', '', 'RECEIVED ON 2010-04-28']
['1007', 'BY-LAWS', '', 'RECEIVED ON 2011-02-04']
['1007', 'BY-LAWS', '', 'RECEIVED ON 2012-04-17']
['1007', 'REGISTERED OFFICE ADDRESS', '', '

In [8]:
# Reload the list of corporation ids from the database.
ddb = "./data/corporate/corp.db"
con = lite.connect(ddb)
try:
    l_corp = select_corp(con)
finally:
    # Close the connection even if the query raises.
    con.close()


In [9]:
# Show the first three corporation ids that were loaded.
l_corp[:3]

[1007, 1015, 1031]