In [5]:
# Python3
import sqlite3
from urllib import request, error
from sqlite3 import Error
from bs4 import BeautifulSoup, Tag

database = "./ACCT/database2.db"

headTblFields = """accNum text NOT NULL,
headId text PRIMARY KEY NOT NULL,
rowNumber integer DEFAULT 1,
documentType text NOT NULL,
publicDocCount integer DEFAULT 0,
periodOfReport text,
filedDate text,
changedDate text,
schemaVersion text,
dateOfOriginalSubmission text,
notSubjectToSection16 integer DEFAULT 0,
issuerName text,
issuerCik text,
issuerIndustrialClassification text,
issuerIrs text,
issuerIncorpState text,
issuerFiscalYrEnd text,
issuerBusinessStreet1 text,
issuerBusinessCity text,
issuerBusinessState text,
issuerBusinessZip text,
issuerBusinessPhone text,
issuerMailStreet1 text,
issuerMailStreet2 text,
issuerMailCity text,
issuerMailState text,
issuerMailZip text,
issuerTradingSymbol text,
rptOwnerName text,
rptOwnerCik text DEFAULT 0,
rptOwnerFormType text,
rptOwnerSecAct text,
rptOwnerSecFileNum text,
rptOwnerFilmNum text,
rptOwnerBusinessPhone text,
rptOwnerStreet1 text,
rptOwnerStreet2 text,
rptOwnerCity text,
rptOwnerState text,
rptOwnerZipCode text,
rptOwnerStateDescription text,
rptOwnerMailStreet1 text,
rptOwnerMailStreet2 text,
rptOwnerMailCity text,
rptOwnerMailState text,
rptOwnerMailZip text,
rptOwnerisDirector integer DEFAULT 0,
rptOwnerisOfficer integer DEFAULT 0,
rptOwnerisTenPercentOwner integer DEFAULT 0,
rptOwnerisOther integer DEFAULT 0"""

dTTblFields = """accNum text NOT NULL,
dTId text PRIMARY KEY NOT NULL,
rowNumber integer DEFAULT 1,
securityTitle text,
conversionOrExercisePrice text,
transactionDate text,
transactionFormType text,
transactionCode text,
equitySwapInvolved text,
transactionShares text,
transactionPricePerShare text,
transactionAcquiredDisposedCode text,
exerciseDate text,
expirationDate text,
underlyingSecurityTitle text,
underlyingSecurityShares text,
sharesOwnedFollowingTransaction text,
directOrIndirectOwnership text,
type text,
documentType text,
footNoteId text,"""

nDTTblFields = """accNum text NOT NULL,
nDTId text PRIMARY KEY NOT NULL,
rowNumber integer DEFAULT 1,
securityTitle text,
transactionDate text,
transactionFormType text,
transactionCode text,
equitySwapInvolved text,
transactionTimelines text,
transactionShares text,
transactionPricePerShare text,
transactionAcquiredDisposedCode text,
sharesOwnedFollowingTransaction text,
directOrIndirectOwnership text,
natureOfOwnership text,
type text,
documentType text,
footNoteId text,"""

footNoteTblFields = """accNum text NOT NULL,
rowNumber integer DEFAULT 1,
footNoteId text PRIMARY KEY NOT NULL,
fId text,
originalTableType text,
documentType text,
footNoteField text,
footNote text"""

dTTblSql = dTTblFields + "FOREIGN KEY(footNoteId) REFERENCES form4footNote(footNoteId)"
nDTTblSql = nDTTblFields + "FOREIGN KEY(footNoteId) REFERENCES form4footNote(footNoteId)"

In [6]:
textHeadMap = {"ACCESSION NUMBER": "accNum",
              "CONFORMED SUBMISSION TYPE": "documentType",
              "PUBLIC DOCUMENT COUNT": "publicDocCount",
              "CONFORMED PERIOD OF REPORT": "periodOfReport",
              "FILED AS OF DATE": "filedDate",
              "DATE AS OF CHANGE": "changedDate"}
issuerMap = {"COMPANY CONFORMED NAME": "issuerName",
              "CENTRAL INDEX KEY": "issuerCik",
              "STANDARD INDUSTRIAL CLASSIFICATION": "issuerIndustrialClassification",
              "IRS NUMBER": "issuerIrs",
              "STATE OF INCORPORATION": "issuerIncorpState",
              "FISCAL YEAR END": "issuerFiscalYrEnd"}
issuerBizMap = {"STREET 1": "issuerBusinessStreet1",
              "CITY": "issuerBusinessCity",
              "STATE": "issuerBusinessState",
              "ZIP": "issuerBusinessZip",
              "BUSINESS PHONE": "issuerBusinessPhone"}
issuerMailMap = {"STREET 1": "issuerMailStreet1",
              "STREET 2": "issuerMailStreet2",
              "CITY": "issuerMailCity",
              "STATE": "issuerMailState",
              "ZIP": "issuerMailZip"}
ownerMap = {"COMPANY CONFORMED NAME": "rptOwnerName",
          "CENTRAL INDEX KEY": "rptOwnerCik",
          "FORM TYPE": "rptOwnerFormType",
          "SEC ACT": "rptOwnerSecAct",
          "SEC FILE NUMBER": "rptOwnerSecFileNum",
          "FILM NUMBER": "rptOwnerFilmNum",
          "STREET 1": "rptOwnerMailStreet1",
          "STREET 2": "rptOwnerMailStreet2",
          "CITY": "rptOwnerMailCity",
          "STATE": "rptOwnerMailState",
          "ZIP": "rptOwnerMailZip",
          "BUSINESS PHONE": "rptOwnerBusinessPhone"}
xml2SqlOwnFields = {"isDirector" : "rptOwnerisDirector",
                 "isOfficer": "rptOwnerisOfficer", 
                 "isTenPercentOwner": "rptOwnerisTenPercentOwner", 
                 "isOther": "rptOwnerisOther"}
xmlEtcHeaders = ["schemaVersion", "documentType", "periodOfReport", "dateOfOriginalSubmission", "notSubjectToSection16"]
xmlIssHeaders = ["issuerCik", "issuerName", "issuerTradingSymbol"]
xmlOwnHeaders = ["rptOwnerCik", "rptOwnerName", "rptOwnerStreet1", "rptOwnerStreet2", "rptOwnerCity", "rptOwnerState","rptOwnerZipCode", "rptOwnerStateDescription", "isDirector", "isOfficer", "isTenPercentOwner", "isOther"]

In [7]:
def parseSecHeader(lines, sqlMap, sqlDic):
    lines = lines.replace(":", "").split('\n')
    for line in lines:
        if "\t" in line:
            parts = line.split('\t')
            parts = [x for x in parts if x != '']
            if parts and parts[0] in sqlMap:
                sqlDic[sqlMap[parts[0]]] = parts[1]
    return sqlDic

def addXmlHeader(sqlDic, xmlFields, xml2SqlMap):
    for field in xmlFields:
        val = soup.find(field)
        if val:
            val = val.getText()
        key = field
        if xml2SqlMap and field in xml2SqlMap:
            key = xml2SqlMap[field]
        if key == 'dateOfOriginalSubmission' and val:
            val = val.replace("-", "")
        sqlDic[key] = val
        
def addXmlHeaderMultiple(sqlDics, xmlFields, xml2SqlMap, index):
    for field in xmlFields:
        key = field
        if xml2SqlMap and field in xml2SqlMap:
            key = xml2SqlMap[field]
        
        vals = soup.find_all(field)
        if vals:
            sqlDics[key] = vals[index].getText()
#         for i in range(len(sqlDics)):
#             sqlDics[i][key] = vals[i].getText()

def parseXml(xmlFieldList, transactionsList, transactionType, accNum):
    xmlLis = xmlFieldList.split('\n')
    xmlFields = []
    for field in xmlLis:
        fieldParts = field.split(' ')
        xmlFields.append(fieldParts[0])

    sqlDic = {}
    if transactionType == 'derivativeTransaction' or transactionType == 'derivativeHolding':
        footNoteTag = 'dt'
    else:
        footNoteTag = 'ndt'
    if transactionType == 'derivativeTransaction' or transactionType == 'nonDerivativeTransaction':
        entryType = 'transaction'
    else:
        entryType = 'holding'
    for row in range(len(transactionsList)):
        sqlDic[row+1] = {}
        sqlDic[row+1]['footnotes'] = []

        for field in xmlFields:
            if field == 'accNum':
                val = self.accNum
            elif field == 'rowNumber':
                val = row+1
            elif field == 'dTId' or field == 'nDTId':
                val = self.accNum+'-'+str(row+1)
                if entryType == 'holding':
                    val += '-h'
            elif field == 'footNoteId':
                continue
            elif field == 'type':
                val = entryType
            elif field == 'documentType':
                val = self.documentType
            elif field == 'transactionTimelines':
                field = 'transactionTimeliness'
            else:
                results = transactionsList[row].find(field)
                if results:
                    val = ''.join([str(x) for x in results.contents if x != '\n'])
                    for x in results.contents:
                        if x and isinstance(x, Tag):
                            fId = x.get('id')
                            if fId is not None:
                                footNoteId = self.accNum+'-'+ str(row+1)+'-'+footNoteTag + '-' + fId + '-' + field
                                if entryType == 'holding':
                                    footNoteId += '-h'
                                foot = {'accNum': self.accNum,
                                       'rowNumber': row+1,
                                       'footNoteId': footNoteId,
                                       'fId': fId,
                                       'originalTableType': transactionType,
                                       'footNoteField': field
                                       }
                                sqlDic[row+1]['footnotes'].append(foot)
                else:
                    val = None
            if field == 'transactionTimeliness':
                field = 'transactionTimelines'
            sqlDic[row+1][field] = val
    return sqlDic

def fillFootNoteText(footnotes):
    for note in footnotes:
        fId = note['fId']
        footnote = soup.find('footnote', {'id': fId})
        if footnote:
            note['footNote'] = footnote.contents[0].replace('\n', '')

In [8]:
### Database utility functions ###
def connectToDb(db):
    try:
        conn = sqlite3.connect(db)
        return conn
    except Error as e:
        print(e)

def createTable(conn, tableName, tableFields):
    try:
        c = conn.cursor()
        create_sql = "CREATE TABLE IF NOT EXISTS "+tableName+" (" + tableFields +");"
        c.execute(create_sql)
    except Error as e:
        print(e)
        
def insertToTable(table, dictionary, conn):
    columns = ', '.join(dictionary.keys())
    placeholders = ', '.join('?' * len(dictionary))
    sql = 'INSERT INTO ' + table + ' ({}) VALUES ({})'.format(columns, placeholders)
    try:
        conn.execute(sql, list(dictionary.values()))
    except Exception as e:
        print(e)
        conn.close()
        return
    conn.commit()
    
def insertToTransacTables(sql, tableName):
    for rowNum in sql.keys():
        row = sql[rowNum]
        footnotes = row.pop('footnotes')
        fillFootNoteText(footnotes)

        conn = connectToDb(database)
#         insertToTable(tableName, row, conn)
        print(footnotes)
    
        for note in footnotes:
            insertToTable('form4footNote', note, conn)


In [9]:
def parseHead(soup, accNum):
    sqlDic = {} 

    # SEC-HEADER
    sec_header = soup.find("ACCEPTANCE-DATETIME").getText()
    issuerBegin = sec_header.find("ISSUER")
    ownerBegin = sec_header.find("REPORTING-OWNER")

    # Process top of SEC-HEADER
    etcHeader = sec_header[:issuerBegin]
    sqlDic = parseSecHeader(etcHeader, textHeadMap, sqlDic)
    addXmlHeader(sqlDic, xmlEtcHeaders, None)

    # SEC-HEADER Issuer section
    issuerHead = sec_header[issuerBegin:ownerBegin]

    # Issuer company data
    sqlDic = parseSecHeader(issuerHead, issuerMap, sqlDic)
    addXmlHeader(sqlDic, xmlIssHeaders, None)

    bizBegin = issuerHead.find("BUSINESS ADDRESS")
    mailBegin = issuerHead.find("MAIL ADDRESS")

    # SEC-HEADER Issuer business address
    issuerHeadBiz = issuerHead[bizBegin:mailBegin]
    sqlDic = parseSecHeader(issuerHeadBiz, issuerBizMap, sqlDic)

    # SEC-HEADER Issuer mail address
    issuerHeadMail = issuerHead[mailBegin:]
    sqlDic = parseSecHeader(issuerHeadMail, issuerMailMap, sqlDic)

    # SEC-HEADER Reporting owners section & insert into table
    rOwners = sec_header.split("REPORTING-OWNER:")[1:]
    for index in range(len(rOwners)):
        sqlDic = parseSecHeader(rOwners[index], ownerMap, sqlDic)
        addXmlHeaderMultiple(sqlDic, xmlOwnHeaders, xml2SqlOwnFields, index)
        sqlDic['rowNumber'] = index+1 
        sqlDic['accNum'] = accNum
        sqlDic['headId'] = accNum + "-" + str(index+1)
        #print(sqlDic)
        #insertToTable('form4head', sqlDic, conn)

def parseTransacs(soup, accNum):
    # Process derivative transactions
#     derivativeTransactions = soup.find_all("derivativeTransaction")
#     sql = parseXml(dTTblFields, derivativeTransactions, 'derivativeTransaction', accNum)
# #     print(sql)
#     insertToTransacTables(sql, 'form4dT')

#     derivativeHoldings = soup.find_all("derivativeHolding")
#     sql = parseXml(dTTblFields, derivativeHoldings, 'derivativeHolding', accNum)
# #     print(sql)
#     insertToTransacTables(sql, 'form4dT')

#     # Process non-derivative transactions
#     nonDerivativeTransactions = soup.find_all("nonDerivativeTransaction")
#     sqlNT = parseXml(nDTTblFields, nonDerivativeTransactions, 'nonDerivativeTransaction', accNum)
# #     print(sqlNT)
#     insertToTransacTables(sqlNT, 'form4nDT')

    nonDerivativeHoldings = soup.find_all("nonDerivativeHolding")
#     print('ndh', len(nonDerivativeHoldings))
    sqlNT = parseXml(nDTTblFields, nonDerivativeHoldings, 'nonDerivativeHolding', accNum)
#     print(sqlNT)
    insertToTransacTables(sqlNT, 'form4nDT')

In [28]:
conn = connectToDb(database)

# fname = "2017Form4.csv"
fname = "2017Form4a" #400 4/a

with open(fname) as f:
    urls = f.read().splitlines()

thisTime = urls #400 forms 4as
# thisTime = urls[6050:8001] #5000 form 4s
# thisTime = urls[4001:4002]
accs = []

for link in thisTime:
    links = link.split(",")
    parts = links[len(links)-1].split('/')
    if len(parts) == 5: #.../data/1084869/000108486914000025/0001084869-14-000025.txt
        accNum = parts[len(parts)-3] + '/' +parts[len(parts)-2] + '/' + parts[len(parts)-1].split('.')[0]
    if len(parts) == 4: #.../data/1214101/0001104659-07-084171.txt
        accNum = parts[len(parts)-2] + '/' + parts[len(parts)-1].split('.')[0]
    accs.append(accNum)
accs = tuple(accs)

In [32]:
sql = "Update form4dt set documentType = '4/A' where accNum in " + str(accs) + ";"
print(sql)
# sql2 = "Update form4ndt set documentType = '4/A' where accNum in " + str(accs) + ";"
# print(sql2)
# sql3 = "Update form4footnote set documentType = '4/A' where accNum in " + str(accs) + ";"
# print(sql3)

Update form4dt set documentType = '4/A' where accNum in ('66740/000112760217004862/0001127602-17-004862', '1498132/000002547517000026/0000025475-17-000026', '927003/000120919117032230/0001209191-17-032230', '893739/000121102217000006/0001211022-17-000006', '66740/000112760217004878/0001127602-17-004878', '1305168/000089924317006824/0000899243-17-006824', '1708881/000110465917052970/0001104659-17-052970', '899051/000112760217034686/0001127602-17-034686', '881890/000114036117019447/0001140361-17-019447', '949858/000120919117052375/0001209191-17-052375', '6176/000120919117031283/0001209191-17-031283', '1220630/000117911017003098/0001179110-17-003098', '1314626/000131462617000006/0001314626-17-000006', '741516/000120839417000002/0001208394-17-000002', '1668028/000110465917026605/0001104659-17-026605', '1438731/000094787117000047/0000947871-17-000047', '1652044/000120919117007350/0001209191-17-007350', '1178879/000120919117040582/0001209191-17-040582', '887546/000095011717000076/0000950117-

In [33]:
conn.close()

In [35]:
conn = connectToDb(database)
createTable(conn, 'form4Head', headTblFields)
createTable(conn, 'form4dT', dTTblSql)
createTable(conn, 'form4nDT', nDTTblSql)
createTable(conn, 'form4footNote', footNoteTblFields)

#fname = "2007_4A_accNum" #fname = "flowers_com_inc2014.txt"
# fname = "2017Form4.csv"
fname = "2017Form4a" #400 4/a

with open(fname) as f:
    urls = f.read().splitlines()

thisTime = urls #400 forms 4as
# thisTime = urls[6050:8001] #5000 form 4s
# thisTime = urls[4001:4002]
for link in thisTime:
    links = link.split(",")
#     url = "https://www.sec.gov/Archives/" + links[len(links)-1] #for 4s
#     url = "https://www.sec.gov/Archives/edgar/data/1173479/000117347917000003/0001173479-17-000003.txt"
    url = "https://www.sec.gov/Archives/" + link + ".txt" #for 2014form4a    
    try:
        response = request.urlopen(url)
    except error.HTTPError as err:
        failed.append(url)
        continue
    
    the_page = response.read()
    content = the_page.decode(encoding='latin-1')
    file = open("test", "w")
    file.write(content) 

    #parts = link.split("/")
    parts = links[len(links)-1].split('/')
    if len(parts) == 5: #.../data/1084869/000108486914000025/0001084869-14-000025.txt
        accNum = parts[len(parts)-3] + '/' +parts[len(parts)-2] + '/' + parts[len(parts)-1].split('.')[0]
    if len(parts) == 4: #.../data/1214101/0001104659-07-084171.txt
        accNum = parts[len(parts)-2] + '/' + parts[len(parts)-1].split('.')[0]
    print(parts, links, accNum)
    # start parsing
    begin = content.find("<SEC-DOCUMENT>")
    end = content.find("-----END")
    xmlFile = content[begin:end]
    soup = BeautifulSoup(xmlFile, 'xml')
    #parseHead(soup, accNum)
    parseTransacs(soup, accNum)


[0, 'edgar/data/1173479/000117347917000003/0001173479-17-000003.txt']
['edgar', 'data', '1173479', '000117347917000003', '0001173479-17-000003.txt'] [0, 'edgar/data/1173479/000117347917000003/0001173479-17-000003.txt'] 1173479/000117347917000003/0001173479-17-000003
1173479/000117347917000003/0001173479-17-000003-1-h
1173479/000117347917000003/0001173479-17-000003-2-h
1173479/000117347917000003/0001173479-17-000003-3-h
1173479/000117347917000003/0001173479-17-000003-4-h
1173479/000117347917000003/0001173479-17-000003-5-h
[{'accNum': '1173479/000117347917000003/0001173479-17-000003', 'fId': 'F1', 'originalTableType': 'nonDerivativeHolding', 'footNoteField': 'directOrIndirectOwnership', 'footNote': 'Director is a trustee and beneficiary of various trusts.', 'footNoteId': '1173479/000117347917000003/0001173479-17-000003-1-ndt-F1-directOrIndirectOwnership-h', 'rowNumber': 1}]
UNIQUE constraint failed: form4footNote.footNoteId
[{'accNum': '1173479/000117347917000003/0001173479-17-000003', '

In [None]:
query = "select * from form4footNote;" 

    matches = cur.execute(query).fetchall() #grouped by [(aAcc, bAcc)]

In [9]:
conn = connectToDb(database)
createTable(conn, 'form4Head', headTblFields)
createTable(conn, 'form4dT', dTTblSql)
createTable(conn, 'form4nDT', nDTTblSql)
createTable(conn, 'form4footNote', footNoteTblFields)

#fname = "2007_4A_accNum" #fname = "flowers_com_inc2014.txt"
fname = "2017Form4.csv"
#fname = "2017Form4a" #400 4/a

with open(fname) as f:
    urls = f.read().splitlines()

#thisTime = urls #400 forms 4as
thisTime = urls[8001:9001] #5000 form 4s
#thisTime = urls[4001:4002]
for link in thisTime:
    links = link.split(",")
    url = "https://www.sec.gov/Archives/" + links[len(links)-1] #for 4s
    #url = "https://www.sec.gov/Archives/edgar/data/1305168/0000899243-17-004732.txt"
    #url = "https://www.sec.gov/Archives/" + link + ".txt" #for 2014form4a    
    try:
        response = request.urlopen(url)
    except error.HTTPError as err:
        failed.append(url)
        continue
    
    the_page = response.read()
    content = the_page.decode(encoding='latin-1')
    file = open("test", "w")
    file.write(content) 

    #parts = link.split("/")
    parts = links[len(links)-1].split('/')
    if len(parts) == 5: #.../data/1084869/000108486914000025/0001084869-14-000025.txt
        accNum = parts[len(parts)-3] + '/' +parts[len(parts)-2] + '/' + parts[len(parts)-1].split('.')[0]
    if len(parts) == 4: #.../data/1214101/0001104659-07-084171.txt
        accNum = parts[len(parts)-2] + '/' + parts[len(parts)-1].split('.')[0]
        
    # start parsing
    begin = content.find("<SEC-DOCUMENT>")
    end = content.find("-----END")
    xmlFile = content[begin:end]
    soup = BeautifulSoup(xmlFile, 'xml')
    parseHead(soup, accNum)
    parseTransacs(soup, accNum)


1495757/0001467373-17-000092-1-h
UNIQUE constraint failed: form4nDT.nDTId
1495757/0001467373-17-000117-1-h
UNIQUE constraint failed: form4nDT.nDTId


KeyboardInterrupt: 

In [12]:
dropHeadFields = ['accNum', 'rowNumber', 'documentType', 'filedDate', 'changedDate', 'periodOfReport', 'dateOfOriginalSubmission', "headId", 'rptOwnerFormType', 'rptOwnerSecAct', 'rptOwnerSecFileNum', 'rptOwnerFilmNum', 'rptOwnerBusinessPhone', 'rptOwnerStateDescription']

def filterFields(allFields, dropFields):
    fields = allFields.replace("\n", "").split(",")
    fields = [field.split(" ")[0] for field in fields]
    fields = [field for field in fields if field not in dropFields and field != '']
    return fields
headFields = filterFields(headTblFields, dropHeadFields)
headFields

['publicDocCount',
 'schemaVersion',
 'notSubjectToSection16',
 'issuerName',
 'issuerCik',
 'issuerIndustrialClassification',
 'issuerIrs',
 'issuerIncorpState',
 'issuerFiscalYrEnd',
 'issuerBusinessStreet1',
 'issuerBusinessCity',
 'issuerBusinessState',
 'issuerBusinessZip',
 'issuerBusinessPhone',
 'issuerMailStreet1',
 'issuerMailStreet2',
 'issuerMailCity',
 'issuerMailState',
 'issuerMailZip',
 'issuerTradingSymbol',
 'rptOwnerName',
 'rptOwnerCik',
 'rptOwnerStreet1',
 'rptOwnerStreet2',
 'rptOwnerCity',
 'rptOwnerState',
 'rptOwnerZipCode',
 'rptOwnerisDirector',
 'rptOwnerisOfficer',
 'rptOwnerisTenPercentOwner',
 'rptOwnerisOther']

In [7]:
query = "select A.accNum as aAcc, B.accNum as bAcc from form4head A, form4head B where A.documentType = '4/A' and B.documentType = '4' and A.dateOfOriginalSubmission = B.filedDate"
for field in headFields:
    query += " and A." + field + " = " "B." + field
query += ";"
print(query)

cur = conn.cursor()
cur.execute(query)
matches = cur.fetchall()
matches

select A.accNum as aAcc, B.accNum as bAcc from form4head A, form4head B where A.documentType = '4/A' and B.documentType = '4' and A.dateOfOriginalSubmission = B.filedDate and A.publicDocCount = B.publicDocCount and A.schemaVersion = B.schemaVersion and A.notSubjectToSection16 = B.notSubjectToSection16 and A.issuerName = B.issuerName and A.issuerCik = B.issuerCik and A.issuerIndustrialClassification = B.issuerIndustrialClassification and A.issuerIrs = B.issuerIrs and A.issuerIncorpState = B.issuerIncorpState and A.issuerFiscalYrEnd = B.issuerFiscalYrEnd and A.issuerBusinessStreet1 = B.issuerBusinessStreet1 and A.issuerBusinessCity = B.issuerBusinessCity and A.issuerBusinessState = B.issuerBusinessState and A.issuerBusinessZip = B.issuerBusinessZip and A.issuerBusinessPhone = B.issuerBusinessPhone and A.issuerMailStreet1 = B.issuerMailStreet1 and A.issuerMailStreet2 = B.issuerMailStreet2 and A.issuerMailCity = B.issuerMailCity and A.issuerMailState = B.issuerMailState and A.issuerMailZip

NameError: name 'conn' is not defined

In [13]:
dropNDTFields = ['accNum', 'nDTId', 'rowNumber', 'footNoteId']
ndtFields = filterFields(nDTTblFields, dropNDTFields)
dropDTFields = ['accNum', 'dTId', 'rowNumber', 'footNoteId']
dtFields = filterFields(dTTblFields, dropDTFields)

ndtFields

['securityTitle',
 'transactionDate',
 'transactionFormType',
 'transactionCode',
 'equitySwapInvolved',
 'transactionTimelines',
 'transactionShares',
 'transactionPricePerShare',
 'transactionAcquiredDisposedCode',
 'sharesOwnedFollowingTransaction',
 'directOrIndirectOwnership',
 'natureOfOwnership']

In [17]:
query = "select"
for field in ndtFields:
    query += " A." + field + " as a_" + field + ", B." + field + " as b_" + field +","
query = query[:len(query)-1] + " from form4ndT A, form4ndT B where A.accNum = '1084869/000108486914000025/0001084869-14-000025' and B.accNum = '1084869/000108486914000024/0001084869-14-000024';";
print(query)
#maybe dont do this, just compare fields from 2 transacs one by one....

select A.securityTitle as a_securityTitle, B.securityTitle as b_securityTitle, A.transactionDate as a_transactionDate, B.transactionDate as b_transactionDate, A.transactionFormType as a_transactionFormType, B.transactionFormType as b_transactionFormType, A.transactionCode as a_transactionCode, B.transactionCode as b_transactionCode, A.equitySwapInvolved as a_equitySwapInvolved, B.equitySwapInvolved as b_equitySwapInvolved, A.transactionTimelines as a_transactionTimelines, B.transactionTimelines as b_transactionTimelines, A.transactionShares as a_transactionShares, B.transactionShares as b_transactionShares, A.transactionPricePerShare as a_transactionPricePerShare, B.transactionPricePerShare as b_transactionPricePerShare, A.transactionAcquiredDisposedCode as a_transactionAcquiredDisposedCode, B.transactionAcquiredDisposedCode as b_transactionAcquiredDisposedCode, A.sharesOwnedFollowingTransaction as a_sharesOwnedFollowingTransaction, B.sharesOwnedFollowingTransaction as b_sharesOwnedFol

In [28]:
conn = connectToDb(database)
cur = conn.cursor()

In [39]:
#for match in matches:
# aDT = cur.execute("select * from form4dT where accNum = '" + match[0]+ + "';").fetchall()
# bDT = cur.execute("select * from form4dT where accNum = '" + match[0]+ + "';").fetchall()


match= ["1084869/000108486914000025/0001084869-14-000025", "1084869/000108486914000024/0001084869-14-000024"]


aNDT = cur.execute("select nDTId," + ','.join(ndtFields) +" from form4ndT where accNum = '" + match[0]+ "';").fetchall()
oNDT = cur.execute("select nDTId," + ','.join(ndtFields) +" from form4ndT where accNum = '" + match[1]+ "';").fetchall()

'''
# max matches; dont account for new records
matchingTransacs = {}
for each transaction in amendment:
    numMatches = 0
    originalTransac = None
    for each transaction in original:
        thisMatches = num(fields that match)
        if thisMatches > numMatches:
            numMatches = thisMatches
            originalTransac = transaction
'''
# matchingTransacs = {}
# for a in aNDT:
#     mostMatches = 0
#     originalTransac = None
#     for o in oNDT:
#         thisMatches = sum([1 for i in range(len(ndtFields)) if a[i] == o[i]])
#         if thisMatches > mostMatches:
#             originalTransac = o
#     matchingTransacs[a[0]] = originalTransac[0]
#     oNDT.remove(originalTransac)
# matchingTransacs



{'1084869/000108486914000025/0001084869-14-000025-1': '1084869/000108486914000024/0001084869-14-000024-1'}

In [36]:
a

('1084869/000108486914000025/0001084869-14-000025-1',
 '<value>Class A Common Stock</value>',
 '<value>2014-12-08</value>',
 '4',
 'A',
 '0',
 '0',
 '<value>2516</value>',
 '<value>0</value><footnoteId id="F1"/>',
 '<value>A</value>',
 '<value>30808</value>',
 '<value>D</value>',
 None)

In [13]:
def main():
    conn = connectToDb(database)
    createTable(conn, 'form4Head', headTblFields)
    createTable(conn, 'form4dT', dTTblSql)
    createTable(conn, 'form4nDT', nDTTblSql)
    createTable(conn, 'form4footNote', footNoteTblFields)
    
    fname = "2007_4A_accNum"
    with open(fname) as f:
        urls = f.read().splitlines()
        
    i = 0
    for link in urls:
        url = "https://www.sec.gov/Archives/" + link
        response = urllib.request.urlopen(url)
        the_page = response.read()
        content = the_page.decode(encoding='latin-1')
        file = open("test", "w")
        file.write(content) 

        parts = urls[1].split('/')
        accNum = parts[len(parts)-1].split('.')[0]
        #accNum = '0001104659-07-084171'

        # start parsing
        begin = content.find("<SEC-DOCUMENT>")
        end = content.find("</SEC-DOCUMENT>")#("-----END")
        xmlFile = content[begin:end]
        soup = BeautifulSoup(xmlFile, 'xml')
        
        parseHead(soup, accNum)
        parseTransacs(soup, accNum)
        i+=1
        if i ==50:
            break

In [14]:
main()

NameError: name 'soup' is not defined

In [106]:
fname = "test.txt"
with open(fname) as f:
    urls = f.read()
soup = BeautifulSoup(urls, 'html')
soup

<?xml version="1.0" encoding="ISO-8859-1" ?><html><body><feed xmlns="http://www.w3.org/2005/Atom">
<title>Archive of Historical EDGAR Documents</title>
<link href="http://www.sec.gov//cgi-bin/srch-edgar?text=TYPE%3D4%2FA&amp;start=321&amp;count=80&amp;first=2017&amp;last=2017&amp;output=atom" rel="alternate"/>
<link href="http://www.sec.gov//cgi-bin/srch-edgar?text=TYPE%3D4%2FA&amp;start=321&amp;count=80&amp;first=2017&amp;last=2017&amp;output=atom" rel="self"/>
<id>http://www.sec.gov//cgi-bin/srch-edgar?text=TYPE%3D4%2FA&amp;start=321&amp;count=80&amp;first=2017&amp;last=2017&amp;output=atom</id>
<author><name>Webmaster</name><email>webmaster@sec.gov</email></author>
<updated>2018-10-08T20:26:36-04:00</updated>
<entry>
<title>4/A - Andersons, Inc.</title>
<link href="/Archives/edgar/data/821026/000120919117019159/0001209191-17-019159-index.htm" rel="alternate" type="text/html"/>
<summary type="html">&lt;b&gt;Filed Date:&lt;/b&gt; 03/08/2017 &lt;b&gt;Accession Number:&lt;/b&gt; 0001209

In [107]:
import re
a = soup.find_all("link", href=re.compile("Archives"))
#links = [i.get("href").replace("-index.htm", "").replace("/Archives/", "") for i in a]

file = open("2017Form4a", "a+")
for i in a:
    link = i.get("href").replace("-index.htm", "").replace("/Archives/", "")
    file.write(link + "\n") 

In [136]:
with open("2017Form4a") as f:
    urls = f.read().splitlines()
print(len(urls))
urls = set(urls)
print(len(urls))
file = open("2017Form4a", "w")
for i in urls:
    file.write(i + "\n") 

477
401


0
6
10
