In [1]:
import databaseOps, fields
import numpy as np
import json
from bs4 import BeautifulSoup, Tag

In [2]:
# Module-level database handles shared by every cell below: `cur` is read as a
# global by getRows, attachFootNote, get4ATo4Matches and get4ATo4Changes.
# connectToDb() lives in the project-local databaseOps module (not shown here).
conn = databaseOps.connectToDb()
cur = conn.cursor()

In [3]:
def score(aRows, bRows):
    """Return the similarity of two rows averaged over the fields of ``aRows``.

    Parameters:
        aRows: indexable sequence of field values (e.g. one DB row).
        bRows: indexable sequence of field values to compare against.

    Returns:
        ``compare(aRows, bRows) / len(aRows)``; 0 when ``aRows`` is empty.
    """
    # Guard: an empty row has nothing to average over (and would divide by zero).
    if len(aRows) == 0:
        return 0
    # compare() hands back ONE number (matches minus both distance totals),
    # not a (matches, editDist, naiveDist) triple, so it must not be unpacked.
    return compare(aRows, bRows)/len(aRows)

# rows need to be ordered by row num
def compare(aRow, bRow):
    """Score how alike two positional rows are (more negative = more unlike).

    Only the first ``min(len(aRow), len(bRow))`` positions are compared. For
    each position an exact match adds 1, and the normalised ``editDistance``
    and ``naiveEdit`` of the ``str()`` forms are subtracted.

    The result is the plain total ``matches - editTotal - naiveTotal``. This is
    numerically what the earlier nested-loop version produced: it repeated the
    whole pass ``numFields`` times and then divided by ``numFields``, which
    cancelled out.

    Parameters:
        aRow: indexable sequence of field values.
        bRow: indexable sequence of field values.

    Returns:
        float score, or 0 when either row is empty.
    """
    numFields = min(len(aRow), len(bRow))
    if numFields == 0:
        return 0
    numMatches, editTotal, naiveTotal = 0, 0, 0
    for i in range(numFields):
        aVal, bVal = aRow[i], bRow[i]
        if aVal == bVal:
            numMatches += 1
        # distances work on text, so None becomes the string "None"
        aText, bText = str(aVal), str(bVal)
        editTotal += editDistance(aText, bText)
        naiveTotal += naiveEdit(aText, bText)
    return float(numMatches - editTotal - naiveTotal) #more negative = more unlike

# aRow & bRow are dictionaries
def compareByCols(aRow, bRow):
    """Score how alike two dict rows are over the columns of ``aRow``.

    For every key of ``aRow`` an exact match adds 1, and the normalised
    ``editDistance`` and ``naiveEdit`` of the ``str()`` forms are subtracted.
    The result is the plain total ``matches - editTotal - naiveTotal`` (more
    negative = more unlike). This is numerically what the earlier nested-loop
    version produced: its division by the column count only cancelled the
    column-count-fold repetition of the same pass.

    Parameters:
        aRow: dict mapping column name -> value; its keys drive the comparison.
        bRow: dict that must contain every key of ``aRow``.

    Returns:
        float score, or 0 when ``aRow`` has no columns.

    Raises:
        KeyError: if ``bRow`` lacks a column present in ``aRow``.
    """
    cols = list(aRow.keys())
    if not cols:
        # nothing to compare; also avoids dividing by zero columns
        return 0
    numMatches, editTotal, naiveTotal = 0, 0, 0
    for col in cols:
        aVal, bVal = aRow[col], bRow[col]
        if aVal == bVal:
            numMatches += 1
        # distances work on text, so None becomes the string "None"
        aText, bText = str(aVal), str(bVal)
        editTotal += editDistance(aText, bText)
        naiveTotal += naiveEdit(aText, bText)
    return float(numMatches - editTotal - naiveTotal) #more negative = more unlike

# Shortest edit distance (Levenshtein), normalised by the combined length
def editDistance(aField, bField):
    """Return the Levenshtein distance between two strings divided by len(a)+len(b).

    Special cases:
      * identical strings -> 0
      * exactly one side is the string "None" (i.e. ``str(None)``) -> 1.0,
        regardless of the other side's length (including the empty string).

    Parameters:
        aField: str
        bField: str

    Returns:
        0 for identical inputs, otherwise a float in (0, 1].
    """
    if aField == bField:
        return 0
    # A missing value against any real value counts as a full-distance change.
    # Returning the constant avoids the 0/0 that m/m or n/n hit for "".
    if aField == "None" or bField == "None":
        return 1.0
    n, m = len(aField), len(bField)
    # Classic DP, keeping only the previous row: prevRow[j] is the distance
    # between the first j chars of aField and the bField prefix handled so far.
    prevRow = list(range(n + 1))
    for i in range(1, m + 1):
        row = [i] + [0] * n
        for j in range(1, n + 1):
            if aField[j-1] == bField[i-1]:
                row[j] = prevRow[j-1]
            else:
                row[j] = 1 + min(row[j-1], prevRow[j], prevRow[j-1])
        prevRow = row
    # n + m > 0 here because the strings differ, so at least one is non-empty
    return prevRow[n]/(n + m) #normalized is better

# delete from end of bField until match beginning of aField, then add rest of aField
def naiveEdit(aField, bField):
    """Return a cheap, normalised edit count for turning ``bField`` into ``aField``.

    Characters are dropped from the end of ``bField`` until what remains is a
    prefix of ``aField`` (one edit each); the rest of ``aField`` is then
    appended (one edit per character). The count is divided by len(a)+len(b).

    Special cases:
      * identical strings -> 0
      * exactly one side is the string "None" (i.e. ``str(None)``) -> 1.0,
        regardless of the other side's length (including the empty string).

    Parameters:
        aField: str, the target text.
        bField: str, the starting text.

    Returns:
        0 for identical inputs, otherwise a float in (0, 1].
    """
    if aField == bField:
        return 0
    # Returning the constant avoids the 0/0 that m/m or n/n hit for "".
    if aField == "None" or bField == "None":
        return 1.0
    n, m = len(aField), len(bField)
    prefix = bField
    numEdits = 0
    # trim from the right until `prefix` is a (possibly empty) prefix of aField
    while prefix and not aField.startswith(prefix):
        prefix = prefix[:-1]
        numEdits += 1
    # characters of aField that still have to be appended
    numEdits += n - len(prefix)
    return numEdits/(n + m)

In [4]:
from fields import getHeadFields, getDTFields, getNDTFields, getFootFields, filterFields

# return all rows with accNum in table in dictionary form (keys = table field)
def getRows(accNum, table, transactionType=None):
    """Fetch every row of ``table`` for one accession number as a list of dicts.

    Parameters:
        accNum: accession number matched against the ``accNum`` column.
        table: table name; must be a plain (optionally dotted) identifier
            because table names cannot be bound as query parameters.
        transactionType: when given, rows are also filtered on ``type``.

    Returns:
        list of ``{column_name: value}`` dicts, one per row (empty if none).

    Raises:
        ValueError: if ``table`` is not a plain identifier.

    Uses the module-level cursor ``cur``.
    """
    # The table name is spliced into the SQL text, so only allow identifiers.
    if not all(part.isidentifier() for part in str(table).split('.')):
        raise ValueError("invalid table name: %r" % (table,))

    # Values are bound as parameters so quotes in accNum/type cannot break or
    # alter the statement.
    # NOTE(review): `?` is the qmark placeholder style (sqlite3); confirm that
    # databaseOps.connectToDb() returns a connection using that style.
    query = "select * from " + table + " where accNum = ?"
    params = [accNum]
    if transactionType is not None:
        query += " and type = ?"
        params.append(transactionType)
    cur.execute(query + ";", params)

    columns = [col[0] for col in cur.description]
    return [dict(zip(columns, row)) for row in cur.fetchall()]

# Per-table field lists used when diffing a 4/A against a form 4.
# Each *DropField list names the columns handed to filterFields() together with
# the table's full field list (presumably the columns to leave out of the
# diff -- filterFields is defined in the project-local `fields` module).

headDropField = ['filedDate', 'dateOfOriginalSubmission', 'accNum', 'headId', 'documentType', 'rptOwnerFormType', 'changedDate', 'rptOwnerFilmNum']
headFields = filterFields(getHeadFields(), headDropField)

ndtDropField = ['accNum', 'nDTId', 'footNoteId', 'rowNumber', 'type', 'documentType']
ndtFields = filterFields(getNDTFields(), ndtDropField)

dtDropField = ['accNum', 'dTId', 'footNoteId', 'rowNumber', 'type', 'documentType']
dtFields = filterFields(getDTFields(), dtDropField)

footDropField = ['accNum', 'fId', 'footNoteId', 'rowNumber', 'type', 'documentType']
footFields = filterFields(getFootFields(), footDropField)

In [5]:
# given aRows, return prob dist of bRows being match for each aRow
# dict[aRow][probs] = prob
def getMatchProbDist(aRows, bRows, identifier): #4/a, 4
    """Build, for every 4/A row, a probability distribution over form-4 rows.

    Each (aRow, bRow) pair is scored with ``compareByCols``. If the lowest
    score for an aRow is negative, all of its scores are shifted up by
    ``abs(min) + 1e-10`` so they are strictly positive; the scores are then
    divided by their sum.

    Parameters:
        aRows: list of dict rows from the 4/A.
        bRows: list of dict rows from the form 4.
        identifier: key whose value identifies a row in both lists.

    Returns:
        ``{aRow[identifier]: {bRow[identifier]: probability}}``.
        * one aRow and one bRow -> that pair gets probability 1 (no scoring);
        * no bRows -> every aRow maps to an empty dict;
        * all scores for an aRow sum to 0 -> uniform distribution.
    """
    probDist = {aRow[identifier]: dict() for aRow in aRows}

    # if only 1 transaction in each
    if len(aRows) == 1 and len(bRows) == 1:
        probDist[aRows[0][identifier]][bRows[0][identifier]] = 1
        return probDist

    # nothing to match against: leave every distribution empty
    if len(bRows) == 0:
        return probDist

    # score matching each 4a with each 4
    for aRow in aRows:
        scores = {bRow[identifier]: compareByCols(aRow, bRow) for bRow in bRows}

        # shift so every score is positive before normalising
        minScore = min(scores.values())
        if minScore < 0:
            shift = abs(minScore) + 0.0000000001 #prevent from nan
            scores = {key: value + shift for key, value in scores.items()}

        total = sum(scores.values())
        rowDist = probDist[aRow[identifier]]
        for key, value in scores.items():
            # total is 0 only when every (non-negative) score is 0: all
            # candidates are equally likely, so fall back to uniform
            rowDist[key] = value/total if total != 0 else 1/len(scores)

    return probDist

In [6]:
import pulp
    
def getOptMatches(probDist):
    """Choose the one-to-one pairing of 4/A rows and form-4 rows with the highest total probability.

    The pairing is solved as a 0/1 integer program with PuLP: one variable per
    (4/A row, form-4 row) pair, weighted by its probability, with
      * each 4/A row used in at most one pair, and
      * each form-4 row used in at most one pair (without this, several 4/A
        rows could all be "matched" to the same form-4 row).

    Parameters:
        probDist: ``{fourA_id: {four_id: probability}}`` as produced by
            getMatchProbDist. The form-4 ids are taken from the first entry,
            so every inner dict is expected to have the same keys.

    Returns:
        ``(allMatches, unmatched)`` where ``allMatches`` is a list of
        ``(fourA_id, four_id)`` tuples sorted by descending probability and
        ``unmatched`` lists the 4/A ids left without a partner.
        Empty ``probDist`` -> ``([], [])``; no form-4 ids -> every 4/A id is
        unmatched.
    """
    fourAIDs = list(probDist.keys())
    if not fourAIDs:
        return [], []
    fourIDs = list(probDist[fourAIDs[0]].keys())
    n, m = len(fourAIDs), len(fourIDs)
    if m == 0:
        return [], fourAIDs

    lp = pulp.LpProblem("max weighted matching", pulp.LpMaximize)

    # lstVars[i][j] == 1 means "4/A row i is paired with form-4 row j"
    lstVars = [
        [pulp.LpVariable(str(i) + '|' + str(j), lowBound = 0, upBound = 1, cat='Integer') for j in range(m)]
        for i in range(n)
    ]
    weights = [[probDist[fourAIDs[i]][fourIDs[j]] for j in range(m)] for i in range(n)]

    # objective: total probability of the selected pairs
    lp += pulp.LpAffineExpression(
        [(lstVars[i][j], weights[i][j]) for i in range(n) for j in range(m)])

    # constraints: at most one partner per 4/A row ...
    for i in range(n):
        lp += pulp.LpAffineExpression([(var, 1) for var in lstVars[i]]) <= 1
    # ... and at most one partner per form-4 row
    for j in range(m):
        lp += pulp.LpAffineExpression([(lstVars[i][j], 1) for i in range(n)]) <= 1
    lp.solve()

    # Read the solution straight from our own variables. Solvers may report
    # 0.999... for a selected 0/1 variable, so threshold instead of `== 1`.
    chosen = [
        (weights[i][j], i, j)
        for i in range(n) for j in range(m)
        if lstVars[i][j].varValue is not None and lstVars[i][j].varValue > 0.5
    ]
    chosen.sort(key=lambda tup: tup[0], reverse=True) #sort by prob of matching

    allMatches = [(fourAIDs[i], fourIDs[j]) for _, i, j in chosen]
    matchedRows = {i for _, i, _ in chosen}
    unmatched = [fourAIDs[i] for i in range(n) if i not in matchedRows]
    return allMatches, unmatched
    

In [7]:
def _isNullField(value):
    """True when a field holds no value: None or the literal string 'null'."""
    return value is None or value == "null"

def getChangedFields(aRow, bRow, tblFields, fourAID="", fourID=""):
    """Describe, field by field, how a 4/A row differs from a form-4 row.

    Parameters:
        aRow: dict row from the 4/A, or None/[] when there is no such row.
        bRow: dict row from the form 4, or None/[] when there is no such row.
        tblFields: names of the fields to compare.
        fourAID: accession number passed to attachFootNote for ``aRow`` values.
        fourID: accession number passed to attachFootNote for ``bRow`` values.

    Returns:
        ``{field: {'4A': after, '4': before}}`` for every field that differs.
        ``after`` / ``before`` hold ``'value'`` plus whatever attachFootNote
        adds; a side whose value is missing (None or 'null') is an empty dict.
        Fields that are equal, or missing on both sides, are omitted.
    """
    changedFields = dict()
    aMissing = aRow is None or aRow == []
    bMissing = bRow is None or bRow == []
    for field in tblFields:
        aNull = aMissing or _isNullField(aRow[field])
        bNull = bMissing or _isNullField(bRow[field])
        if aNull and bNull:
            continue

        after, before = dict(), dict()
        if bNull:
            # value only exists in the 4/A
            after['value'] = aRow[field]
            attachFootNote(after, field, aRow, fourAID)
        elif aNull:
            # value only exists in the form 4
            before['value'] = bRow[field]
            attachFootNote(before, field, bRow, fourID)
        elif aRow[field] != bRow[field]:
            after['value'] = aRow[field]
            attachFootNote(after, field, aRow, fourAID)
            before['value'] = bRow[field]
            attachFootNote(before, field, bRow, fourID)
        else:
            continue

        changedFields[field] = {'4A': after, '4': before}
    return changedFields

def attachFootNote(dic, field, row, accNum):
    """Add the text of every footnote referenced by ``row[field]`` to ``dic``.

    A field value such as ``<footnoteId id="F1"/><footnoteId id="F2"/>`` is
    parsed for footnote tags; each id is looked up in ``form4footNote`` for
    the given accession number and field, and stored as ``dic[id] = text``.
    Ids with no row in the table are skipped. Nothing happens when the value
    is not a string or contains no ``footnoteId``.

    Parameters:
        dic: dict to update in place.
        field: name of the field in ``row`` (also matched on footNoteField).
        row: dict row holding the raw field value.
        accNum: accession number of the filing the row belongs to.

    Uses the module-level cursor ``cur``.
    """
    value = row[field]
    if not (isinstance(value, str) and "footnoteId" in value): #<footnoteId id=\"F1\"/><footnoteId id=\"F2\"/>"
        return

    # Name the parser explicitly so results do not depend on which optional
    # parsers are installed; HTML parsers lowercase tag names, hence the
    # lowercase 'footnoteid' below.
    soup = BeautifulSoup(value, "html.parser")

    # Values are bound as parameters instead of being pasted into the SQL.
    # NOTE(review): `?` is the qmark placeholder style (sqlite3); confirm that
    # databaseOps.connectToDb() returns a connection using that style.
    query = "select footNote from form4footNote where accNum = ? and fId = ? and footNoteField = ?;"

    for footnote in soup.findAll('footnoteid'):
        fId = footnote.get('id')
        matches = cur.execute(query, (accNum, fId, field)).fetchall()
        # e.g. accNum == "" (the default in getChangedFields) finds nothing
        if matches:
            dic[fId] = matches[0][0]

In [13]:
### compare just head; using rownumber and dictionary fields

# Given list fourA, print possible form 4 for each form 4/A along with probability distribution
# See comment if only want to return top match
def get4ATo4Matches(fourA):
    """Rank candidate original form 4 filings for each 4/A using head rows only.

    Candidates are form 4 filings whose ``filedDate`` equals the 4/A's
    ``dateOfOriginalSubmission``. Each candidate is scored by summing
    ``compareByCols`` over head rows that share a ``rowNumber`` with a 4/A
    head row; scores are shifted to be positive when the minimum is negative
    and divided by their sum.

    Parameters:
        fourA: list of 4/A accession numbers.

    Returns:
        ``{fourA_accNum: {four_accNum: probability}}`` limited to candidates
        whose probability is among the four highest values. Filings with no
        candidate are reported with a printed message and left out. The result
        is also printed as JSON.

    Uses the module-level cursor ``cur``.
    """
    # NOTE(review): `?` is the qmark placeholder style (sqlite3); confirm that
    # databaseOps.connectToDb() returns a connection using that style.
    query = ("select B.accNum as bAcc from(select * from form4head group by accNum) A, "
             "(select * from form4head group by accNum) B "
             "where A.accNum = ? and B.documentType = '4' and A.dateOfOriginalSubmission = B.filedDate;")

    aTo4 = dict()
    for aAcc in fourA:
        aHead = getRows(aAcc, "form4Head")
        matches = cur.execute(query, (aAcc,)).fetchall()

        if len(matches) < 1:
            print("No clear match for ", aAcc)
            continue

        # total head score per candidate form 4
        totalScores = dict()
        for match in matches:
            bAcc = match[0]
            bHead = getRows(bAcc, "form4Head")

            # one slot per candidate head row; rows with no rowNumber partner stay 0
            mHeadScores = np.zeros(len(bHead))
            for aRow in aHead:
                for bIdx, bRow in enumerate(bHead):
                    if aRow['rowNumber'] == bRow['rowNumber']:
                        mHeadScores[bIdx] = compareByCols(aRow, bRow)

            totalScores[bAcc] = sum(mHeadScores)

        # shift scores to be positive, then normalise into probabilities
        minScore = min(totalScores.values())
        total = 0
        for key in totalScores.keys():
            if minScore < 0:
                totalScores[key] += abs(minScore) + 0.0000000000001 #prevent from nan
            total += totalScores[key]

        probDist = dict()
        for key in totalScores.keys():
            # total is 0 only when every candidate scored 0; dividing would
            # give NaN (or raise), so treat the candidates as equally likely
            probDist[key] = totalScores[key]/total if total != 0 else 1/len(totalScores)

        # Only keep top possible match
    #     aTo4[aAcc] = max(probDist, key=probDist.get)

        # Probability distribution of possible matches (based on head fields only)
        values = sorted(probDist.values(), reverse=True)
        aTo4[aAcc] = dict()
        for key in probDist.keys():
            if probDist[key] in values[:4]: #only show top 4
                aTo4[aAcc][key] = probDist[key]

    print(json.dumps(aTo4, indent=2))
    return aTo4

# Sample inputs for get4ATo4Matches. Only one `fourA` assignment is active at a
# time; the commented-out lists are alternative test sets.

# straight forward cases
# fourA = ['887546/000095011717000076/0000950117-17-000076',
# '893739/000121102217000006/0001211022-17-000006', 
# '1305168/000089924317006824/0000899243-17-006824', 
# '1653649/000120919117010174/0001209191-17-010174', 
# '893739/000129672017000004/0001296720-17-000004', 
# '1173479/000117347917000003/0001173479-17-000003']

# Active input: a single 4/A accession number.
fourA = ['1173479/000117347917000003/0001173479-17-000003']

# fourA w 10+ transactions
# NOTE(review): the last entry below ends in '-00735' while its folder part is
# '...17007350'; it looks like a digit was dropped -- verify before re-enabling.
# fourA = [
# '1559998/000149315217004386/0001493152-17-004386',
# '1559998/000149315217004384/0001493152-17-004384',
# '1559998/000149315217004383/0001493152-17-004383',
# '1652044/000120919117003983/0001209191-17-003983', #only one that had match
# '66740/000112760217018755/0001127602-17-018755',
# '1409539/000149322517000001/0001493225-17-000001',
# '1652044/000120919117007350/0001209191-17-00735'
# ]

# Call left disabled; uncomment to print and return the match distribution.
# res = get4ATo4Matches(fourA)

In [8]:
# Given list of fourAs, find most likely form 4 for each fourA, then describe changes between 4 and 4/A

def get4ATo4Changes(fourAs):
    """Find the best-matching form 4 for each 4/A and describe what changed.

    Candidates are form 4 filings whose ``filedDate`` equals the 4/A's
    ``dateOfOriginalSubmission`` and whose ``rptOwnerName`` matches. For each
    candidate the head rows and the four transaction/holding row sets
    (dT transaction, dT holding, ndT transaction, ndT holding) are diffed:

      * head rows are paired by ``rowNumber``;
      * a row set with equal counts on both sides is paired by ``rowNumber``;
      * a row set that is empty in the form 4 is reported entirely as new;
      * otherwise rows are paired with getMatchProbDist + getOptMatches and
        unpaired 4/A rows are reported as new;
      * a row set that is empty in the 4/A is not examined.

    The candidate with the fewest changed rows is kept (first one wins ties).

    Parameters:
        fourAs: list of 4/A accession numbers.

    Returns:
        ``{fourA_accNum: {'head': {...}, 'dt': {...}, 'ndt': {...},
        'accNum': matched_form4_accNum}}``. Filings with no candidate are
        reported with a printed message and left out. The result is also
        printed as JSON.

    Uses the module-level cursor ``cur`` and the field lists ``headFields``,
    ``dtFields`` and ``ndtFields``.
    """
    # result bucket and row-id column for each of the four row sets, in the
    # order the row sets are fetched below
    labels = ["dt", "dt", "ndt", "ndt"]
    identifiers = ["dTId", "dTId", "nDTId", "nDTId"]

    # NOTE(review): `?` is the qmark placeholder style (sqlite3); confirm that
    # databaseOps.connectToDb() returns a connection using that style.
    query = ("select B.accNum as bAcc from form4head A, form4head B "
             "where A.accNum = ? and B.documentType = '4' "
             "and A.dateOfOriginalSubmission = B.filedDate and A.rptOwnerName = B.rptOwnerName;")

    def rowKey(row):
        # key under which a transaction/holding row is reported, e.g. "3-holding"
        return str(row['rowNumber']) + '-' + row['type']

    diff = dict()
    for fourA in fourAs:
        aHead = getRows(fourA, "form4Head")
        aTransacs = [
            getRows(fourA, "form4dT", "transaction"),
            getRows(fourA, "form4dT", "holding"),
            getRows(fourA, "form4ndT", "transaction"),
            getRows(fourA, "form4ndT", "holding"),
        ]
        tblFields = [dtFields, dtFields, ndtFields, ndtFields]

        matches = cur.execute(query, (fourA,)).fetchall() #grouped by [(aAcc, bAcc)]

        if len(matches) < 1:
            print("No clear match for ", fourA)
            continue

        matchDiff, lenChanges = dict(), 0
        for match in matches:
            four = match[0]

            bHead = getRows(four, "form4Head")
            bTransacs = [
                getRows(four, "form4dT", "transaction"),
                getRows(four, "form4dT", "holding"),
                getRows(four, "form4ndT", "transaction"),
                getRows(four, "form4ndT", "holding"),
            ]

            #what field changed in head, dt, ndt
            thisMatchDiff = {"head": dict(), "dt": dict(), "ndt": dict()}

            # head rows: pair by rowNumber, first partner wins
            for aRow in aHead:
                for bRow in bHead:
                    if aRow['rowNumber'] == bRow['rowNumber']:
                        changedFields = getChangedFields(aRow, bRow, headFields)
                        if len(changedFields) != 0:
                            thisMatchDiff["head"][aRow['rowNumber']] = changedFields
                        break

            for aTransac, bTransac, label, fields, identifier in zip(aTransacs, bTransacs, labels, tblFields, identifiers):
                if len(aTransac) == 0:
                    continue
                tableDiff = thisMatchDiff[label]

                if len(aTransac) == len(bTransac):
                    # same number of rows: pair by rowNumber, first partner wins
                    for aRow in aTransac:
                        for bRow in bTransac:
                            if aRow['rowNumber'] == bRow['rowNumber']:
                                changedFields = getChangedFields(aRow, bRow, fields, aRow['accNum'], bRow['accNum'])
                                if len(changedFields) != 0:
                                    tableDiff[rowKey(aRow)] = changedFields
                                break
                elif len(bTransac) == 0:
                    # nothing in the form 4: every 4/A row is new
                    for aRow in aTransac:
                        tableDiff[rowKey(aRow)] = getChangedFields(aRow, None, fields, aRow['accNum'])
                else:
                    # differing counts: pair rows by similarity
                    probDist = getMatchProbDist(aTransac, bTransac, identifier) #4/a, 4
                    optMatches, unmatched = getOptMatches(probDist) #get matches

                    if len(optMatches) < 1:
                        print("No clear match for ", fourA)
                        continue

                    for aID, bID in optMatches:
                        # find the full rows behind the matched ids
                        aRow = next((row for row in aTransac if row[identifier] == aID), None)
                        bRow = next((row for row in bTransac if row[identifier] == bID), None)

                        changedFields = getChangedFields(aRow, bRow, fields, aRow['accNum'], bRow['accNum'])
                        if len(changedFields) != 0:
                            tableDiff[rowKey(aRow)] = changedFields

                    # 4/A rows without a partner are reported as new
                    for aRow in aTransac:
                        if aRow[identifier] in unmatched:
                            tableDiff[rowKey(aRow)] = getChangedFields(aRow, None, fields, aRow['accNum'])

            #only display best matched form 4 (fewest changed rows)
            thisLenChanges = len(thisMatchDiff["head"]) + len(thisMatchDiff["dt"]) + len(thisMatchDiff["ndt"])
            if not matchDiff or thisLenChanges < lenChanges:
                thisMatchDiff["accNum"] = four
                matchDiff = thisMatchDiff
                lenChanges = thisLenChanges

        diff[fourA] = matchDiff
    print(json.dumps(diff, indent=2))
    return diff

In [33]:
# Driver cell for get4ATo4Changes. The commented-out `fourAs` lines are
# alternative inputs, each labelled with the case it exercises
# (dt/dh = form4dT transaction/holding rows, ndt/ndh = form4ndT rows).

# 4/a 1dt & 10 dh, 4 2dt 9 dh
# fourAs = ['1652044/000120919117003983/0001209191-17-003983'] 

#4/a: 4 ndt, 5 ndh; 4: 0 ndt, 4 ndh **mix match case
#fourAs = ['1173479/000117347917000003/0001173479-17-000003'] 

# 4/a has 1 more dt **additional entry case
#fourAs = ['893739/000121102217000006/0001211022-17-000006', '893739/000129672017000004/0001296720-17-000004'] 

# '1689923/000089924317009598/0000899243-17-009598' multiple owners; mix & match owners

#same # of transactions/holdings; value updates only
fourAs = ['887546/000095011717000076/0000950117-17-000076', '1653649/000120919117010174/0001209191-17-010174',
'1305168/000089924317006824/0000899243-17-006824', '1584738/000114420417020939/0001144204-17-020939', '1405073/000120919117020624/0001209191-17-020624', '1310215/000005947817000106/0000059478-17-000106'] 

# Prints the diff as JSON and keeps the returned dict in `res`.
res = get4ATo4Changes(fourAs)

{
  "1305168/000089924317006824/0000899243-17-006824": {
    "dt": {},
    "accNum": "1305168/0000899243-17-004732",
    "head": {},
    "ndt": {
      "3-holding": {
        "natureOfOwnership": {
          "4A": {
            "value": "<value>Seiyonne Suriyakumar 2013 Irrevocable Trust</value><footnoteId id=\"F5\"/>",
            "F5": "Shares are held by the Seiyonne Suriyakumar 2013 Irrevocable Trust, Seiyonne Suriyakumar, as Trustee."
          },
          "4": {
            "F4": "Shares are held by the Seiyonne Suriyakumar 2013 Irrevocable Trust, Seiyonne Suriyakumar, as Trustee.",
            "value": "<value>Seiyonne Suriyakumar 2013 Irrevocable Trust</value><footnoteId id=\"F4\"/>"
          }
        }
      },
      "1-holding": {
        "sharesOwnedFollowingTransaction": {
          "4A": {
            "value": "<value>2520664</value>"
          },
          "4": {
            "value": "<value>595772</value>"
          }
        },
        "directOrIndirectOwnership": {


In [11]:
# Build `accs`, the list of accession numbers, from a file with one filing per
# line; the filing path is the last comma-separated field of each line.
fname = "2017Form4a" #400 4/a

with open(fname) as f:
    urls = f.read().splitlines()

thisTime = urls #400 forms 4as
accs = []

for link in thisTime:
    links = link.split(",")
    parts = links[-1].split('/')
    if len(parts) == 5: #.../data/1084869/000108486914000025/0001084869-14-000025.txt
        accNum = parts[-3] + '/' + parts[-2] + '/' + parts[-1].split('.')[0]
    elif len(parts) == 4: #.../data/1214101/0001104659-07-084171.txt
        accNum = parts[-2] + '/' + parts[-1].split('.')[0]
    else:
        # Unrecognised line shape (e.g. blank line): skip it rather than
        # appending the previous line's accNum again (or hitting a NameError
        # when it is the first line).
        continue
    accs.append(accNum)
    
accs[:5]

['66740/000112760217004862/0001127602-17-004862',
 '1498132/000002547517000026/0000025475-17-000026',
 '927003/000120919117032230/0001209191-17-032230',
 '893739/000121102217000006/0001211022-17-000006',
 '66740/000112760217004878/0001127602-17-004878']

In [48]:
# Single-filing run: this is the accession number labelled
# "4/a 1dt & 10 dh, 4 2dt 9 dh" in the driver cell above. The returned dict is
# the cell's displayed value, in addition to the JSON the function prints.
fourAs = ['1652044/000120919117003983/0001209191-17-003983'] 
get4ATo4Changes(fourAs)

{
  "1652044/000120919117003983/0001209191-17-003983": {
    "dt": {
      "10-holding": {
        "exerciseDate": {
          "4A": {
            "value": "<footnoteId id=\"F5\"/>",
            "F5": "Option is fully vested."
          },
          "4": {
            "F1": "Option is fully vested.",
            "value": "<footnoteId id=\"F1\"/>"
          }
        },
        "underlyingSecurityShares": {
          "4A": {
            "value": "<value>44955</value>"
          },
          "4": {
            "value": "<value>86446</value>"
          }
        },
        "sharesOwnedFollowingTransaction": {
          "4A": {
            "F1": "The original Form 4 filed on January 12, 2017 is amended by this Form 4/A to delete an erroneously reported option exercise of 1,000 shares of Class A Common Stock and option exercise of 1,000 shares of Class C Capital Stock from Table I, delete an erroneously reported option to purchase 1,000 shares of Class A Common Stock and option to purchase 

{'1652044/000120919117003983/0001209191-17-003983': {'accNum': '1652044/0001209191-17-003596',
  'dt': {'1-holding': {'conversionOrExercisePrice': {'4': {},
     '4A': {'value': '<value>318.2102</value>'}},
    'directOrIndirectOwnership': {'4': {},
     '4A': {'value': '<value>D</value>'}},
    'exerciseDate': {'4': {},
     '4A': {'F5': 'Option is fully vested.',
      'value': '<footnoteId id="F5"/>'}},
    'expirationDate': {'4': {}, '4A': {'value': '<value>2022-04-04</value>'}},
    'securityTitle': {'4': {},
     '4A': {'value': '<value>Option to Purchase Class A Common Stock</value>'}},
    'sharesOwnedFollowingTransaction': {'4': {},
     '4A': {'value': '<value>8646</value>'}},
    'underlyingSecurityShares': {'4': {},
     '4A': {'value': '<value>8646</value>'}},
    'underlyingSecurityTitle': {'4': {},
     '4A': {'value': '<value>Class A Common Stock</value>'}}},
   '1-transaction': {'conversionOrExercisePrice': {'4': {'value': '<value>316.9399</value>'},
     '4A': {'value

In [21]:
# Batch run on the first 100 accession numbers in `accs` (built by whichever
# parsing cell was executed last). Filings without a candidate form 4 are
# listed as "No clear match for ..." in the output.
test = accs[:100]
testres = get4ATo4Changes(test)

No clear match for  927003/000120919117032230/0001209191-17-032230
No clear match for  1708881/000110465917052970/0001104659-17-052970
No clear match for  899051/000112760217034686/0001127602-17-034686
No clear match for  881890/000114036117019447/0001140361-17-019447
No clear match for  949858/000120919117052375/0001209191-17-052375
No clear match for  6176/000120919117031283/0001209191-17-031283
No clear match for  1314626/000131462617000006/0001314626-17-000006
No clear match for  741516/000120839417000002/0001208394-17-000002
No clear match for  1668028/000110465917026605/0001104659-17-026605
No clear match for  1178879/000120919117040582/0001209191-17-040582
No clear match for  918160/000091816017000097/0000918160-17-000097
No clear match for  899051/000112760217034703/0001127602-17-034703
No clear match for  1035443/000103544317000152/0001035443-17-000152
No clear match for  1316175/000121465917001675/0001214659-17-001675
No clear match for  1702626/000119143417000002/0001191434-

In [12]:
### for idx
# Build `accs`, the list of accession numbers, from an .idx file: each line is
# expected to contain a filing path running from "edgar" to ".txt".

fname = "./idx/2016Q4_2017Q4form4a_3MCO.idx" #400 4/a

with open(fname) as f:
    urls = f.read().splitlines()

thisTime = urls #400 forms 4as
accs = []

for link in thisTime:
    start, end = link.find("edgar"), link.find("txt")
    if start == -1 or end == -1:
        # no filing path on this line (find() returned -1); slicing with -1
        # would silently produce a bogus fragment
        continue
    links = link[start:end+3]
    parts = links.split("/")
    if len(parts) == 5: 
        accNum = parts[-3] + '/' + parts[-2] + '/' + parts[-1].split('.')[0]
    elif len(parts) == 4: 
        accNum = parts[-2] + '/' + parts[-1].split('.')[0]
    else:
        # Unrecognised path shape: skip it rather than appending the previous
        # line's accNum again (or hitting a NameError on the first line).
        continue
    accs.append(accNum)
    
accs[:5]

['1457247/0001493152-17-000292',
 '1453356/0001493152-17-000292',
 '893739/0001211022-17-000006',
 '893739/0001296720-17-000004',
 '66740/0001127602-17-004847']

In [17]:
### Find changes for 4/a idx
# Runs the full diff over every accession number in `accs`; the returned dict
# is kept in `changeDic`, and the function also prints it as JSON.
changeDic = get4ATo4Changes(accs)

No clear match for  1457247/0001493152-17-000292
No clear match for  1453356/0001493152-17-000292
No clear match for  893739/0001211022-17-000006
No clear match for  893739/0001296720-17-000004
No clear match for  1023731/0001023731-17-000006
No clear match for  1750/0001127602-17-008978
No clear match for  1137692/0001137692-17-000010
No clear match for  887546/0000950117-17-000076
No clear match for  1244172/0001140361-17-007651
No clear match for  1280600/0000899243-17-006842
No clear match for  1280600/0000899243-17-006848
No clear match for  935036/0001269847-17-000010
No clear match for  1220630/0001179110-17-003098
No clear match for  2230/0000002230-17-000006
No clear match for  2230/0000002230-17-000022
No clear match for  1234366/0001104659-17-000963
No clear match for  1272471/0000905729-17-000024
No clear match for  1157377/0001209191-17-021838
No clear match for  785787/0001437749-17-000878
No clear match for  1372414/0001372414-17-000002
No clear match for  1122304/000112

In [7]:
# Scratch check of the footnote parsing used in attachFootNote: the tag is
# looked up by its lowercase name 'footnoteid', and the first id prints as F1.
# NOTE: this rebinds `test`, which an earlier cell uses for accs[:100].
test = BeautifulSoup("<value>Common Stock</value><footnoteId id=\"F1\"/>dsdsd<footnoteId id=\"F2\"/>dsds")
a=test.findAll('footnoteid')
print(a[0].get('id'))

F1


In [3]:
# Environment setup: installs pandas with the interpreter that runs this
# kernel (sys.executable), so the package lands in the kernel's environment.
# NOTE(review): the version is not pinned (the captured output shows 0.23.4),
# and pandas is not imported anywhere in the cells visible here.
import sys
!{sys.executable} -m pip install pandas

Collecting pandas
[?25l  Downloading https://files.pythonhosted.org/packages/e5/53/896de98b5798291aff041d3d1d3636ad2a6495f558aab9bdb064842394eb/pandas-0.23.4-cp35-cp35m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (14.4MB)
[K    100% |████████████████████████████████| 14.4MB 788kB/s  a 0:00:011
Installing collected packages: pandas
Successfully installed pandas-0.23.4
[33mYou are using pip version 18.0, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [None]:
#### oldddd
# Legacy cell, kept for reference. It runs the same candidate query as
# get4ATo4Matches but scores raw row tuples with compare(); it appears to be
# superseded by that function -- confirm before relying on it.

### compare head, dt, and ndt

fourA = [
'887546/000095011717000076/0000950117-17-000076', 
'893739/000121102217000006/0001211022-17-000006', 
'1305168/000089924317006824/0000899243-17-006824', 
'1653649/000120919117010174/0001209191-17-010174', 
'893739/000129672017000004/0001296720-17-000004', 
'1173479/000117347917000003/0001173479-17-000003'] #[list of accNum]
#fourA = ['1305168/000089924317006824/0000899243-17-006824'] #[list of accNum]
#fourA = ['1653649/000120919117010174/0001209191-17-010174'] #need to debug

# aTo4[fourA accNum] -> {candidate form 4 accNum: probability}
aTo4 = dict()
for idx in range(len(fourA)):
    # rows come back as plain tuples here (fetchall), not dicts as in getRows
    aHead = cur.execute("select * from form4Head where accNum = '" + fourA[idx]+ "';").fetchall()
    aDT = cur.execute("select * from form4dT where accNum = '" + fourA[idx]+ "';").fetchall()
    aNDT = cur.execute("select * from form4ndT where accNum = '" + fourA[idx]+ "';").fetchall()

    # candidates: form 4 filings filed on the 4/A's dateOfOriginalSubmission
    query = "select B.accNum as bAcc from(select * from form4head group by accNum) A, (select * from form4head group by accNum) B where A.accNum='"+ fourA[idx] + "' and B.documentType = '4' and A.dateOfOriginalSubmission = B.filedDate;" 

    matches = cur.execute(query).fetchall() #grouped by [(aAcc, bAcc)]
    probDist = dict()
    headScores, dtScores, ndtScores, totalScores = dict(), dict(), dict(), dict()
    for match in matches:
        bAcc = match[0]
        bHead = cur.execute("select * from form4Head where accNum = '" + bAcc + "';").fetchall()
        bDT = cur.execute("select * from form4dT where accNum = '" + bAcc + "';").fetchall()
        bNDT = cur.execute("select * from form4ndT where accNum = '" + bAcc + "';").fetchall()
        
        # skip candidates that lack a table the 4/A has rows in
        if (len(aDT) != 0 and len(bDT) == 0) or (len(aNDT) != 0 and len(bNDT) == 0):
            continue
        
        scoreLen = max(len(bHead), len(bDT), len(bNDT))
        mHeadScores, mDtScores, mNdtScores = np.zeros(scoreLen), np.zeros(scoreLen), np.zeros(scoreLen)
        
        # every 4/A head row is compared with every candidate head row; slot j
        # ends up holding the score from the LAST 4/A head row
        for a in aHead:
            for j in range(len(bHead)):
                b = bHead[j]
                mHeadScores[j] = compare(a, b)
        
        # dt / ndt scoring is disabled, so mDtScores and mNdtScores stay all-zero
#         for a in aDT:
#             for j in range(len(bDT)): #if dt is empty then score is 0 (maybe ndt update?)
#                 b = bDT[j]
#                 mDtScores[j] = compare(a, b)
#         for a in aNDT:
#             for j in range(len(bNDT)):
#                 b = bNDT[j]
#                 mNdtScores[j] = compare(a, b)

        totalScore = sum([mHeadScores[p] + mDtScores[p] + mNdtScores[p] for p in range(len(mHeadScores))])
        headScores[bAcc], dtScores[bAcc], ndtScores[bAcc], totalScores[bAcc] = mHeadScores, mDtScores, mNdtScores, totalScore
    # NOTE(review): if `matches` is empty, or every candidate was skipped above,
    # totalScores is empty and min() raises ValueError.
    scores = totalScores.values()
    minScore = min(scores)
    total = 0
    for key in totalScores.keys():
        if minScore < 0:
            totalScores[key] += abs(minScore) + 1 #prevent from nan
        total += totalScores[key]
    for key in totalScores.keys():
        probDist[key] = totalScores[key]/total
        
    # keep only candidates whose probability is among the four highest values
    values = sorted(probDist.values(), reverse=True)
    aTo4[fourA[idx]] = dict()
    for key in probDist.keys():
        if probDist[key] in values[:4]:
            aTo4[fourA[idx]][key] = probDist[key]