
# Federal Corporations

The database contains information about the following types of federal corporations:

Business corporations created under the Canada Business Corporations Act (CBCA); not-for-profit corporations created under the Canada Corporations Act, Part II (CCA II); not-for-profit corporations created under the Canada Not-for-profit Corporations Act (NFP); cooperatives created under the Canada Cooperatives Act (COOP); boards of trade created under the Boards of Trade Act (BOTA); and other corporations regulated by Corporations Canada (e.g., special act corporations).

The database does NOT include information about corporations created under financial legislation (such as financial institutions, insurance companies or loan and trust companies) or those created under provincial, territorial or other corporate legislation.


https://open.canada.ca/data/en/dataset/0032ce54-c5dd-4b66-99a0-320a7b5e99f2

/data/corporate/OPEN_DATA_SPLIT.zip

* Note: only the corp_master is used. The rest of the information is gathered by scraping web pages.

In [None]:
import requests as rq
import json
import os
import time
import random
from lxml import etree
from bs4 import BeautifulSoup as bsoup
import sqlite3 as lite

In [None]:
# Directory holding the extracted OPEN_DATA_*.xml files and the XSD schema.
dn = "C:/myProjects/GetCorpData/data/OPEN_DATA_SPLIT/"
# Directory in which the SQLite database file lives.
ddb = "C:/myProjects/FinancialModel/data_acquisition/"

# Build a schema object from corpcan-codes.xsd.
xsd_doc = etree.parse(f"{dn}corpcan-codes.xsd")
xsd = etree.XMLSchema(xsd_doc)

# Validate the first data file against that schema; the boolean result is
# not used, only the error log collected by the schema object is printed.
xml = etree.parse(f"{dn}OPEN_DATA_1.xml")
xsd.validate(xml)
print(xsd.error_log)

In [None]:
def cleanText(intext):
    """Return *intext* as a string with trailing whitespace removed.

    The value is converted with ``str()``; if that conversion raises
    ``TypeError`` an empty string is used instead.  A result equal to the
    text ``"None"`` (for example from ``str(None)``) is mapped to ``""``.
    """
    try:
        text = str(intext).rstrip()
    except TypeError:
        text = ""
    return "" if text == "None" else text

In [None]:
def formatDate(d):
    """Convert ``yyyy-MM-ddTHH:mm:ss`` into ``yyyy-MM-dd HH:mm:ss``.

    The string is split on ``'T'`` and the first two pieces are joined with a
    single space; any further pieces are ignored.  An ``IndexError`` is raised
    when *d* contains no ``'T'``.
    """
    pieces = d.split('T')
    date_part, time_part = pieces[0], pieces[1]
    return f"{date_part} {time_part}"

In [None]:
# CREATE TABLE CORP_MASTER
# (
#     CM_CORP_ID INT,
#     CM_ACT INT,
#     CM_STATUS INT,
#     CM_BN TEXT,
#     CM_DIRECTOR_MIN INT,
#     CM_DIRECTOR_MAX INT,
#     CM_CTRLDATE DATE,
#     PRIMARY KEY (CM_CORP_ID)
# );

In [None]:
def _int_or_none(value):
    """Return *value* converted to ``int``, or ``None`` if it is not integer-like."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return None


def AppendCorporateMaster(con, corp_id, act, status, bn, dir_min, dir_max):
    """Insert one row into CORP_MASTER and commit it.

    All values are passed to SQLite as bound parameters, so a business number
    (or any other value) containing quotes can neither break the statement
    nor inject SQL.

    Args:
        con: open ``sqlite3`` connection.
        corp_id: corporation id (CM_CORP_ID).
        act: act code (CM_ACT).
        status: status code (CM_STATUS).
        bn: business number (CM_BN); ``None`` is stored as NULL.
        dir_min: minimum number of directors (CM_DIRECTOR_MIN).
        dir_max: maximum number of directors (CM_DIRECTOR_MAX).
            Director limits that cannot be read as an integer (``None``,
            ``""``, the text ``"NULL"``) are stored as NULL.

    CM_CTRLDATE is stamped by SQLite with ``date('now')``.
    """
    sqlstr = (
        "INSERT INTO CORP_MASTER "
        "(CM_CORP_ID, CM_ACT, CM_STATUS, CM_BN, "
        "CM_DIRECTOR_MIN, CM_DIRECTOR_MAX, CM_CTRLDATE) "
        "VALUES (?, ?, ?, ?, ?, ?, date('now'))"
    )
    params = (
        corp_id,
        act,
        status,
        bn,
        _int_or_none(dir_min),
        _int_or_none(dir_max),
    )
    cur = con.cursor()
    # The connection context manager commits on success and rolls back on error.
    with con:
        cur.execute(sqlstr, params)

    return

In [None]:
# CREATE TABLE ANNUALRETURN
# (		
#     AR_CORP_ID INT,
#     AR_YEAROFFILING INT,
#     AR_ANNUALMEETINGDATE DATE
# );

In [None]:
def AppendAnnaulReturn(con,annualReturns):
    """Insert annual-return rows into ANNUALRETURN and commit them.

    Args:
        con: open ``sqlite3`` connection.
        annualReturns: iterable of ``[corp_id, year_of_filing, meeting_date]``
            records, where ``meeting_date`` is in the ``yyyy-MM-ddTHH:mm:ss``
            form accepted by ``formatDate``.

    Values are bound as SQL parameters rather than concatenated into the
    statement.  All rows of one call are written in a single transaction,
    so either every row is stored or none is.
    """
    rows = [(r[0], r[1], formatDate(r[2])) for r in annualReturns]
    if not rows:
        return

    sqlstr = (
        "INSERT INTO ANNUALRETURN "
        "(AR_CORP_ID, AR_YEAROFFILING, AR_ANNUALMEETINGDATE) "
        "VALUES (?, ?, ?)"
    )
    cur = con.cursor()
    # Commits on success, rolls back if any row fails.
    with con:
        cur.executemany(sqlstr, rows)
    return

In [None]:
# CREATE TABLE ACTIVITIES
# (		
#     ACTY_CORP_ID INT,
#     ACTY_CODE INT,
#     ACTY_DATE DATE
# );

In [None]:
def AppendActivities(con, activities):
    """Insert activity rows into ACTIVITIES and commit them.

    Args:
        con: open ``sqlite3`` connection.
        activities: iterable of ``[corp_id, activity_code, activity_date]``
            records, where ``activity_date`` is in the
            ``yyyy-MM-ddTHH:mm:ss`` form accepted by ``formatDate``.

    Values are bound as SQL parameters rather than concatenated into the
    statement.  All rows of one call are written in a single transaction,
    so either every row is stored or none is.
    """
    rows = [(r[0], r[1], formatDate(r[2])) for r in activities]
    if not rows:
        return

    sqlstr = (
        "INSERT INTO ACTIVITIES "
        "(ACTY_CORP_ID, ACTY_CODE, ACTY_DATE) "
        "VALUES (?, ?, ?)"
    )
    cur = con.cursor()
    # Commits on success, rolls back if any row fails.
    with con:
        cur.executemany(sqlstr, rows)
    return

In [None]:
# CREATE TABLE CORP_NAME
# (
#     CN_CORP_ID INT,
#     CN_CODE INT,
#     CN_EFF_DATE DATE,
#     CN_EXP_DATE DATE,
#     CN_NAME TEXT		
# );
# 

In [None]:
def AppendCorpName(con,corp_name):
    """Insert corporation-name rows into CORP_NAME and commit them.

    Args:
        con: open ``sqlite3`` connection.
        corp_name: iterable of
            ``[corp_id, name_code, effective_date, expiry_date, name]``
            records.  Dates use the ``yyyy-MM-ddTHH:mm:ss`` form accepted by
            ``formatDate``; an empty or missing ``expiry_date`` is stored as
            NULL.

    Values are bound as SQL parameters, so names are stored exactly as given:
    apostrophes are no longer rewritten to backticks to keep the SQL valid,
    and a missing name (``None``) is stored as NULL.  All rows of one call
    are written in a single transaction.
    """
    rows = []
    for r in corp_name:
        # Names that are still current carry no expiry date.
        expiry = formatDate(r[3]) if r[3] else None
        rows.append((r[0], r[1], formatDate(r[2]), expiry, r[4]))
    if not rows:
        return

    sqlstr = (
        "INSERT INTO CORP_NAME "
        "(CN_CORP_ID, CN_CODE, CN_EFF_DATE, CN_EXP_DATE, CN_NAME) "
        "VALUES (?, ?, ?, ?, ?)"
    )
    cur = con.cursor()
    # Commits on success, rolls back if any row fails.
    with con:
        cur.executemany(sqlstr, rows)
    return

In [None]:
# CREATE TABLE CORP_ADDR
# (
#     CN_CORP_ID INT,
# 	CN_CODE INT,
#     CN_ADDR_LN TEXT,
#     CN_CITY TEXT,
#     CN_PROV TEXT,
#     CN_CNTY TEXT,
#     CN_PC TEXT
# );
# 

In [None]:
def AppendCorpAddr(con,addresses):
    """Insert address rows into CORP_ADDR and commit them.

    Args:
        con: open ``sqlite3`` connection.
        addresses: iterable of
            ``[corp_id, address_code, address_line, city, province, country,
            postal_code]`` records.

    Values are bound as SQL parameters, so every text field (including
    province and country) may contain apostrophes and is stored exactly as
    given; address line and city are no longer rewritten with backticks.
    Spaces are removed from the postal code (``"J0H 1A0"`` -> ``"J0H1A0"``).
    ``None`` values are stored as NULL.  All rows of one call are written in
    a single transaction.
    """
    rows = []
    for r in addresses:
        postal_code = r[6]
        if postal_code is not None:
            postal_code = str(postal_code).replace(" ", "")
        rows.append((r[0], r[1], r[2], r[3], r[4], r[5], postal_code))
    if not rows:
        return

    sqlstr = (
        "INSERT INTO CORP_ADDR "
        "(CN_CORP_ID, CN_CODE, CN_ADDR_LN, CN_CITY, CN_PROV, CN_CNTY, CN_PC) "
        "VALUES (?, ?, ?, ?, ?, ?, ?)"
    )
    cur = con.cursor()
    # Commits on success, rolls back if any row fails.
    with con:
        cur.executemany(sqlstr, rows)
    return


### Scroll through files under the directory

In [None]:
# Load every OPEN_DATA_<n>.xml file (n = 1..38) into the SQLite database.
#
# For each <corporation> element the act/status codes, annual returns,
# activities, names, addresses, director limits and business number are read
# and handed to the Append* functions.  Corporations without an act or status
# code are reported and skipped.

# Placeholder passed for a missing director limit; AppendCorporateMaster
# stores it as SQL NULL.
NO_LIMIT = "NULL"

for i_f in range(1, 39):
    filn = dn + 'OPEN_DATA_' + str(i_f) + '.xml'
    xml = etree.parse(filn)
    root = xml.getroot()

    # One connection per file, opened before the first corporation and always
    # closed afterwards, so a skipped corporation can never leave the loop
    # working on a closed (or never opened) connection.
    con = lite.connect(ddb + 'test.db')
    try:
        # cnt_corp is the 0-based position of the corporation in the file
        # (skipped corporations are counted too); it is only used in messages.
        for cnt_corp, corporation in enumerate(root.iter('corporation')):

            #<corporation corporationId="1007">
            corporationId = corporation.attrib['corporationId']

            #<act code="8"/>
            #<status code="1"/>
            try:
                actcode = next(corporation.iter('act')).attrib['code']
                statuscode = next(corporation.iter('status')).attrib['code']
            except (StopIteration, KeyError):
                # No act/status element, or no code on it: nothing to store.
                print(str(cnt_corp) + ":" + corporationId + ":no Info")
                continue

            #<annualReturn annualMeetingDate="2017-03-29T00:00:00" yearOfFiling="2017"/>
            annualReturns = [
                [corporationId,
                 element.attrib['yearOfFiling'],
                 element.attrib['annualMeetingDate']]
                for element in corporation.iter('annualReturn')
            ]

            #<activity code="1" date="1947-01-10T00:00:00"/>
            activities = [
                [corporationId, element.attrib['code'], element.attrib['date']]
                for element in corporation.iter('activity')
            ]

            #<name code="1" effectiveDate="1991-12-27T00:00:00" expiryDate="1995-02-06T00:00:00">ABBOTSFORD-MATSQUI CHAMBER OF COMMERCE</name>
            # A name without an expiryDate attribute gets "" as expiry date.
            names = [
                [corporationId,
                 element.attrib['code'],
                 element.attrib['effectiveDate'],
                 element.attrib.get('expiryDate', ""),
                 element.text]
                for element in corporation.iter('name')
            ]

            #   <addresses>
            #      <address code="2">
            #         <addressLine>1053 ST-ANDRE</addressLine>
            #         <city>ACTON VALE</city>
            #         <province code="QC"/>
            #         <country code="CA"/>
            #         <postalCode>J0H 1A0</postalCode>
            #      </address>
            #    </addresses>
            addresses = []
            try:
                addr_collection = next(corporation.iter('addresses'))
                for e in addr_collection:
                    addr = next(e.iter('address'))
                    addr_code = addr.attrib['code']
                    addrline = next(addr.iter('addressLine')).text
                    cit = next(addr.iter('city')).text
                    province = next(addr.iter('province')).attrib['code']
                    country = next(addr.iter('country')).attrib['code']
                    postalCode = next(addr.iter('postalCode')).text
                    addresses.append([corporationId, addr_code, addrline, cit, province, country, postalCode])
            except (StopIteration, KeyError):
                # A missing element or code stops address collection for this
                # corporation; addresses gathered so far are kept.
                print(str(cnt_corp) + ":" + corporationId + ":no Address")

            #    <directorLimit>
            #         <minimum>3</minimum>
            #         <maximum>30</maximum>
            #    </directorLimit>
            # Reset for every corporation so that limits of the previous
            # corporation are never reused when the element is missing.
            dl_min = NO_LIMIT
            dl_max = NO_LIMIT
            try:
                directorlimit = next(corporation.iter('directorLimit'))
                dl_min = next(directorlimit.iter('minimum')).text or NO_LIMIT
                dl_max = next(directorlimit.iter('maximum')).text or NO_LIMIT
            except StopIteration:
                print(str(cnt_corp) + ":" + corporationId + ":no Directors")

            #    <businessNumber>106679285</businessNumber>
            # Same reset for the business number: a corporation without one
            # is stored with an empty business number.
            bn = ""
            try:
                bn = next(corporation.iter('businessNumber')).text or ""
            except StopIteration:
                pass

            AppendCorporateMaster(con, corporationId, actcode, statuscode, bn, dl_min, dl_max)
            AppendAnnaulReturn(con, annualReturns)
            AppendActivities(con, activities)
            AppendCorpName(con, names)
            AppendCorpAddr(con, addresses)
    finally:
        con.close()
