In [38]:
# Run on first instance to install required libraries
%pip install smart_open
%pip install minecart
%pip install textract-trp
%pip install python-Levenshtein
%pip install fuzzywuzzy

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [39]:
import time 
import re
import os
import trp
import boto3
import minecart
import json
import logging 

import numpy as np
import pandas as pd

from fuzzywuzzy import fuzz
from smart_open import open
from sagemaker.session import Session

## Table Shaving
**Removing blank/empty rows that are reported in the line items**

In [40]:
def column_purge(df:pd.DataFrame) -> pd.DataFrame:
    """
    Function designed to filter out rows whose first column is NaN (empty) and 
    reduce the dataframe size
    ------------------------------------------------------------------------------------
    Input
        :param df: (type pandas.DataFrame)
            A dataframe object that corresponds to the X-17A-5 filings
    
    Output
        :return: (type pandas.DataFrame)
            Returns a new dataframe of size less than or equal to the original input, 
            re-indexed with a consecutive 0..N-1 index. The input is not modified.
    """
    # a frame without any columns has nothing to filter on
    if df.columns.size == 0:
        return df.copy()
    
    # keep only the rows whose first column (the line item name) is populated;
    # positional access keeps this correct even when column labels are duplicated
    populated = df.iloc[:, 0].notna()
    new_df = df[populated]
    
    # drop the old index entirely to recoup a consecutive index count
    return new_df.reset_index(drop=True)

## Table column merging
**For tables with three columns, we merge the last two columns into one unique column**

In [41]:
def merge(df:pd.DataFrame) -> pd.DataFrame:
    """
    Function passes a special three-column dataframe, and collapses its two value 
    columns into one. Example releases include, but are not limited to, 1224385-2016 
    and 72267-2003
    ------------------------------------------------------------------------------------
    Input
        :param df: (type pandas.DataFrame)
            A dataframe object that corresponds to the X-17A-5 filings. Column 0 holds 
            the line item name, columns 1 and 2 hold the value(s)
    
    Output
        :return: (type pandas.DataFrame)
            Returns a dataframe of two columns (name, value) with at most N rows. 
            An empty dataframe is returned if no row yields a value.
    
    Rules applied per row
        * column 1 populated                      -> keep column 1 
        * column 1 empty, column 2 populated      -> keep column 2
        * both empty, and the row directly above 
          has both columns populated              -> keep column 2 of the row above
        * otherwise                               -> the row is dropped
    
    e.g.
    
    Converts a wide dataframe, balance sheet into a smaller rectangular form
                  0                                                 1                 2
            ====================================================================================
        0   Assets                                          | NaN            | NaN  
        1   Cash and cash equivalents                       | $ 606,278      | NaN
        2   Cash and securities segregated pursuant         | 273,083        | NaN
        3   Collateralized short-term financing agreements: | NaN            | $ 1,345
    
    
    Rectangular form of the the dataframe ->
                   0                                                 1          
            =====================================================================
        0   Cash and cash equivalents                       | $ 606,278        
        1   Cash and securities segregated pursuant         | 273,083        
        2   Collateralized short-term financing agreements: | $ 1,345            
    """
    trans = []

    for i in range(df.shape[0]):
        row = df.iloc[i]         # index into the row

        name = row.iloc[0]       # the line item name (e.g. Total Assets)
        col1 = row.iloc[1]       # the first value(s) column
        col2 = row.iloc[2]       # the second value(s) column 
        
        # pd.isna recognises every missing marker (np.nan, float('nan'), None, ...);
        # an identity test against np.nan only matches that single object
        has_col1 = not pd.isna(col1)
        has_col2 = not pd.isna(col2)
        
        # ----------------------------------------------
        # NOTE: We say nothing if both col 1 and 2 are 
        #       both populated with a value
        # ----------------------------------------------
        
        if has_col1:
            trans.append([name, col1])  # if column 1 has a value we take it by default
        elif has_col2:
            trans.append([name, col2])  # if column 1 has no value, but column 2 does, we take it
        elif i > 0:
            # both values missing - is it real or false flag? look up one row.
            # The i > 0 guard matters: iloc[-1] would silently wrap to the LAST row
            prior_row = df.iloc[i - 1]                   # previous dataframe row 
            prior_col1 = prior_row.iloc[1]               # first column from previous row
            prior_col2 = prior_row.iloc[2]               # second column from previous row
            
            # if both values present then we simply use the right hand side value above  
            if not pd.isna(prior_col1) and not pd.isna(prior_col2):
                trans.append([name, prior_col2])
    
    return pd.DataFrame(trans)

In [42]:
def column_merge(df:pd.DataFrame) -> pd.DataFrame:
    """
    Function determines whether a Balance Sheet should be merged or simply filtered. 
    Our two cases are determined as follows:
        * If the second value column present in the balance sheet is mostly empty we 
          assume that it is an aggregated column, and we can merge it
        * However, if the second value column is mostly filled with values, we assume 
          that this represents accounting figures from the previous year 
    ------------------------------------------------------------------------------------
    Input
        :param df: (type pandas.DataFrame)
            A dataframe object that corresponds to the X-17A-5 filings, with at least 
            three columns (line item name, value column, second value column)
    
    Output
        :return: (type pandas.DataFrame)
            Returns a dataframe of of size (Nx3) -> (Nx2); the merged case may hold 
            fewer than N rows (see merge)
    """
    # two events could occur at this point (either the column represents totals, or values from a prior-year)
    second_values = df.iloc[:, 2]

    # check the scope of the second value column; isna() counts missing entries by value,
    # so it also works for float64 columns where NaNs are not the np.nan singleton
    n = len(second_values)
    k = int(second_values.isna().sum())

    # k-check: if at least half the column is missing we assume this is a "fake column"
    # we merge these columns since there are many blank rows, otherwise we assume year split.
    # An empty frame (n == 0) has nothing to merge and takes the year-split branch.
    if n > 0 and k/n >= 0.50:
        new_df = merge(df)                 # merge rows by merge function
    else:
        new_df = df.iloc[:, :2].copy()     # return the most recent year (copy, not a view of df)
    
    return new_df

## Table Row Split
**Since many of the existing tables run the risk of overlapping (conjoined) rows, we split these rows into their appropriate line items and values**

In [43]:
def dollar_check(num):
    """
    Filter predicate used by the row-split logic to discard currency markers.
    
    Returns False when the token is exactly '$' or 'S' (the latter being a 
    mis-read dollar sign), and True for every other token, so that 
    filter(dollar_check, tokens) keeps only the non-marker tokens.
    """
    is_currency_marker = num in ['$', 'S']
    return not is_currency_marker

In [44]:
def row_split(df:pd.DataFrame, text_file:dict) -> pd.DataFrame:
    """
    Function designed to split conjoined rows from Balance sheet dataframes into individual rows.
    Example releases include, but are not limited to, 42352-2015, 58056-2009, 58056-2013, 58056-2019
    ------------------------------------------------------------------------------------
    Input:
        :param df: (type pandas.DataFrame)
            References the balance sheet dataframe read in from AWS Textract. Column 0 holds 
            the line item name and column 1 the value text
        :param text_file: (type dictionary)
            Stores text values with corresponding confidence level for balance sheet pages.
            Only the keys are used here, as candidate line item names
    
    Output:
        :param return: (type pandas.DataFrame) 
            A processed dataframe. A conjoined row is replaced by one row per value when 
            line item names can be matched to the values; it is removed when they cannot.
            Rows created by a split carry the index label 0, so the returned index is 
            not guaranteed to be unique.
    """
    
    # ##############################################################
    # NESTED HELPER FUNCTIONS
    # ##############################################################
    
    def value_tokens(val) -> list:
        """
        Break a value cell into its individual figures, omitting the dollar sign $ and S 
        which may be read in. Non-string cells (e.g. NaN) have no figures.
        """
        if not isinstance(val, str):
            return []
        
        # split on any run of whitespace: splitting on a single ' ' turns "$  606,278"
        # into ['$', '', '606,278'] and the empty token fakes a second figure
        return [token for token in val.split() if dollar_check(token)]
    
    def find_row_splits(val) -> bool:
        """
        Compute a boolean measure to assess whether a row is conjoined or not. We make
        the assumption that a row is conjoined or merged if more than one figure is 
        present in the first value column
        """
        return len(value_tokens(val)) > 1
    
    def extract_lineitems(line:str, value:list, dictionary:dict):
        """
        Extract the appropriate line items from each line value. Returns a list with 
        exactly one name per value, or None when fewer names than values were found.
        """
        # collect every real key (avoiding single character keys) contained in the line text,
        # in dictionary order
        splits = [key for key in dictionary.keys() if len(key) > 1 and line.find(key) >= 0]
        
        # check whether we have a one-to-one mapping between line items and line values, 
        # e.g. ['Assets', 'Cash', 'Recievables'] -> ['1,233', '4,819'] (3x2 mapping)
        n = len(splits) - len(value)
        
        if n < 0:
            return None             # no specific rule paradigm (more values than items)
        
        # perfect match (n == 0) keeps everything; on a mismatch (more line item terms)
        # the leading surplus names are skipped
        return splits[n:]
        
    # ##############################################################
    # ##############################################################    
    
    # flag all the rows that match our description, where several figures exist = row merge;
    # astype(bool) keeps the mask boolean even when the dataframe is empty
    conjoined = df.iloc[:, 1].map(find_row_splits).astype(bool)
    idxs = df.index[conjoined.to_numpy()]
    
    # iterate through each row that is determined to be conjoined
    for i in idxs:
        
        # find the (first) position of the merged row in the current dataframe 
        row_idx = int(np.argmax(df.index == i))
        
        # divide the identified term from the selection and filter out the $ sign
        # e.g. "$ 9,112,943 13,151,663" -> ["9,112,943", "13,151,663"]
        values = value_tokens(df.iloc[row_idx, 1])
        
        # extract line names according to Text parsed list (requires parsed TEXT JSON)
        # e.g. ['Securities Held Total Assets'] -> ['Securities Held', 'Total Assets']
        lineName = extract_lineitems(df.iloc[row_idx, 0], values, text_file)
        
        # everything above the conjoined row is always kept
        pieces = [df.iloc[:row_idx]]
        
        # if we return line items we insert one (name, value) row per figure; otherwise 
        # the conjoined row is left out altogether
        if lineName is not None:
            mid = pd.DataFrame(list(zip(lineName, values)), 
                               columns=df.columns, 
                               index=[0] * len(values))
            pieces.append(mid)
        
        # everything below the conjoined row is always kept
        pieces.append(df.iloc[row_idx + 1:])
        
        # re-assign the value of df to update across each iteration
        df = pd.concat(pieces)
        
    return df

## Numeric Conversion
**Convert all string and poorly formatted quantities to a numerical type**

In [45]:
def num_scale(text_dict:dict) -> float:
    """
    Function used for scaling accounting figures by reported units.
    
    Walks the keys of text_dict in order and, for each one, tests the unit names 
    below with a fuzzy partial match (e.g. "Dollar in Millions" matches 'millions'). 
    The multiplier of the first pairing scoring 90 or more is returned; if nothing 
    matches the multiplier defaults to 1.
    """
    multipliers = {'thousands': 1e3, 'hundreds':1e2, 'millions':1e6, 'billions': 1e9}
    match_threshold = 90      # we assume a score of 90 or greater signals a match
    
    for text_value in text_dict:
        lowered_text = text_value.lower()
        
        for unit_name, multiplier in multipliers.items():
            score = fuzz.partial_ratio(unit_name, lowered_text)
            
            if score >= match_threshold:
                return multiplier
    
    # default to no multiplier (1)
    return 1

In [46]:
def num_strip(number):
    """
    This function converts a string to a numeric quantity, handles weird string format. 
    ------------------------------------------------------------------------------------
    Input:
        :param number:
            A string with a hidden numeric quantity (e.g. "$ 19,225"), or a value that 
            is already numeric (int, float or a numpy integer/floating scalar)
    
    Output:
        :param return:
            * float for a string that can be parsed ("-" or "." alone count as 0.0)
            * the value itself, unchanged, when it is already numeric
            * numpy.nan for empty/unparsable strings and any other type (bool included)
    """

    # if provided a non-empty string, perform regex operation 
    if isinstance(number, str) and len(number) > 0:

        # check for accounting formats that use parenthesis to signal losses; the opening
        # parenthesis may follow a currency marker, e.g. "(2,636)" or "$ (2,636)"
        if re.match(r"\s*[$S]?\s*\(", number): number = '-' + number

        # case replacing to handle poor textract reading of numbers
        number = number.replace('I', '1').replace('l', '1')

        # --------------------------------------------------------------
        # Explanation of the Regex Expression:
        #      [^0-9.\-]      = match all elements that are not numeric 0-9, periods "." or hyphens "-"
        #                       (no "|" inside the class: there it is a literal pipe, not an "or")
        #      (?<!^)-        = match all elements that are hyphens "-" not in the first index position
        #      \.(?=[^.]*\.)  = match all elements that are periods "." except the last instance
        # --------------------------------------------------------------

        check1 = re.sub(r"[^0-9.\-]", "", number)         # remove all the non-numeric, periods "." or hyphens "-"
        check2 = re.sub(r"(?<!^)-", "", check1)           # removes all "-" that aren't in the first index 
        check3 = re.sub(r"\.(?=[^.]*\.)", "", check2)     # removes all periods except the last instance of "." 

        # --------------------------------------------------------------

        # we consider weird decimal values that exceed 2 spaces to the right (e.g. 432.2884)
        period_check = check3.find('.')                   # location of the period, -1 if absent

        # if more than 2 trailing digits to decimal point we assume incorrect placement
        if period_check >= 0 and (len(check3) - period_check - 1) > 2:
            check3 = check3.replace('.', '')

        # last check against poor lagging formats e.g. "." or "-" which we read as zero
        if check3 in ('-', '.'):
            return 0.0
        
        # try to cast to floating point value, else flat NaN
        try: 
            return float(check3)
        except ValueError: 
            return np.nan

    # if the value is already numeric then simply return it; numpy scalars are included 
    # and booleans (a subclass of int) are deliberately excluded
    if isinstance(number, (int, float, np.integer, np.floating)) and not isinstance(number, bool):
        return number

    return np.nan

In [47]:
def cleanNumeric(value):
    """
    This function is a wrapper for calling the numerical extraction function 
    ------------------------------------------------------------------------------------
    Input:
        :param value:
            String with hidden numeric quantity (e.g. $ 19,225 = 19225), a numeric 
            scalar, or a numpy.ndarray of such elements
            
    Output:
        :param return:
            * string        -> processed numeric quantity, or numpy.nan depending on string issues
            * int / float   -> the value itself, no need to modify (numpy scalars included)
            * numpy.ndarray -> float array with the extraction applied per element
            * anything else -> numpy.nan (empty strings included)
    """
    
    # if provided a numpy array then we perform an extraction per element in array;
    # otypes is fixed so that an empty array is handled as well
    if isinstance(value, np.ndarray):
        vFunc = np.vectorize(num_strip, otypes=[float])
        return vFunc(value)
    
    # if provided a string, perform regex operation (an empty string holds no number)
    if isinstance(value, str):
        return num_strip(value) if len(value) > 0 else np.nan
    
    # if provided a number then simply return the value, no need to modify;
    # booleans (a subclass of int) are not treated as numbers
    if isinstance(value, (int, float, np.integer, np.floating)) and not isinstance(value, bool):
        return value 
    
    # unsupported input: report a missing value explicitly rather than falling through to None
    return np.nan


### Final Main Execution

In [48]:
# Driver: for every raw balance-sheet csv found under data_folder in the s3 bucket, 
# run the cleaning steps defined above (column_purge -> column_merge -> row_split -> 
# cleanNumeric / num_scale) and upload the cleaned csv to output_folder.
# NOTE(review): `open` here is the smart_open version imported at the top of the 
# notebook, not the builtin; it is only used on local files in this cell.
if __name__ == "__main__":
    
    # initiate s3 bucket and corresponding data folder
    bucket = 'ran-s3-systemic-risk'
    data_folder = 'Output/X-17A-5-BS-RAW/'
    output_folder = 'Output/X-17A-5-CLEAN/'
    temp_folder = 'Temp/'      # NOTE(review): not referenced again in this cell

    # Amazon Textract client and Sagemaker session
    # NOTE(review): the textract client is not used in this cell
    textract = boto3.client('textract')
    s3 = boto3.client('s3')
    session = Session()
    
    # retrieving text JSON file from s3 bucket and store it as a local temporary file 
    s3.download_file(bucket, 'Temp/X17A5-TEXT.json', 'temp2.json')

    # read data on TEXT-Confidence dictionary; it is indexed below by the csv base name
    with open('temp2.json', 'r') as f: text_dictionary = json.loads(f.read())  

    # remove local files for JSON
    os.remove('temp2.json')
    
    # csv directory with all X-17A-5 balance sheet information; the first listed entry is skipped
    # (presumably the folder placeholder key itself -- TODO confirm)
    paths = np.array(session.list_s3_files(bucket, data_folder))[1:]
    
    # iterate through each csv path e.g. ['Output/X-17A-5-BS/58056-2014-03-04.csv']
    for csv in paths:
        
        fileName = csv.split('/')[-1]         # strip filename from each csv
        base_file = fileName.split('.')[0]    # CIK-YYYY-MM-DD base name
        print('\nCleaning the {} file'.format(fileName))
        
        # download X-17A-5 csv file as a temporary csv file  
        s3.download_file(bucket, csv, 'temp.csv')
        df = pd.read_csv('temp.csv')
        
        # re-assign dataframe of balance sheet after cleanse (rows with an empty first column removed)
        df = column_purge(df)
        
        # --------------------------------------------------------------------------------------------------
        # COLUMN MERGING (IF NECESSARY)
        # --------------------------------------------------------------------------------------------------
        
        # if columns greater than 2, we have a weird data table that needs to be "merged"
        # NOTE: By construction we never have more than 3 columns present, thanks to our Textract check 
        if df.columns.size > 2:
            df = column_merge(df)
            print('\tWe merged the columns of {}'.format(fileName))
            
        # --------------------------------------------------------------------------------------------------
        # ROW SPLIT FOR MERGED ROWS (IF NECESSARY)
        # --------------------------------------------------------------------------------------------------
        
        # check for presence of row splits and correct any if found 
        # NOTE(review): raises KeyError when base_file has no entry in the text dictionary
        tempDF = row_split(df, text_dictionary[base_file])
        
        # if difference is found in shape, then a transformation was done 
        if tempDF.shape != df.shape:
            print("\tFixed the merged rows for {}".format(fileName))
            
        # --------------------------------------------------------------------------------------------------
        # NUMERIC CONVERSION
        # --------------------------------------------------------------------------------------------------
        
        # pass numeric converter to the value column (second column) to convert string to numerics
        tempDF[tempDF.columns[1]] = tempDF[tempDF.columns[1]].apply(cleanNumeric)
        
        # remove any NaN rows post numeric-conversion
        postDF = tempDF.dropna().copy()
        
        # check for potential scaler multiplier on reported figures (adjust multiplier if possible)
        scale = num_scale(text_dictionary[base_file])
        postDF[postDF.columns[1]] = postDF[postDF.columns[1]].apply(lambda x: x * scale)
        
        print('\tWe converted to numeric figures for {}'.format(fileName))
        
        # --------------------------------------------------------------------------------------------------
        # BALANCE SHEET STORAGE
        # --------------------------------------------------------------------------------------------------

        print(postDF)
        
        # writing data frame to a local .csv file (index labels are not written)
        postDF.to_csv(fileName, index=False)

        # save contents to AWS S3 bucket under the output folder, keeping the same file name
        with open(fileName, 'rb') as data:
            s3.put_object(Bucket=bucket, Key=output_folder + fileName, Body=data)

        # remove local files after they have been used
        # NOTE(review): only reached when every step above succeeds; an exception leaves 
        # temp.csv (and possibly the cleaned csv) behind on local disk
        os.remove(fileName)
        os.remove('temp.csv')

    print('\nAll .csv files are cleaned and primed')


Cleaning the 1224385-2004-03-01.csv file
	We converted to numeric figures for 1224385-2004-03-01.csv
                                                    0             1
0                                                Cash  1.760000e+05
1     Cash segregated pursuant to federal regulations  7.500000e+07
2     Securities purchased under agreements to resell  7.944113e+09
3   Securities owned, at market value ($8,769,300 ...  9.075170e+09
4   Receivable from broker-dealers and clearing or...  4.049708e+09
5                           Receivable from customers  5.124600e+07
6                         Accrued interest receivable  4.445700e+07
7   Property, equipment, and leasehold improvement...  1.106700e+07
8                                       Goodwill, net  6.112000e+06
9                                        Other assets  2.527380e+08
10                                       Total assets  2.150979e+10
13     Securities sold under agreements to repurchase  1.360247e+10
14  Securities

	We converted to numeric figures for 1224385-2011-03-01.csv
                                                    0             1
0                                                Cash  1.342900e+07
1   Financial instruments owned, at fair value ($1...  2.063521e+10
2   Receivable from broker-dealers and clearing or...  9.224518e+09
3     Securities purchased under agreements to resell  6.792762e+09
4                           Receivable from customers  7.310440e+08
5                                            Goodwill  7.968700e+07
6   Property, equipment, and leasehold improvement...  2.660000e+05
7                                        Other assets  1.330140e+08
8                                        Total assets  3.760993e+10
11     Securities sold under agreements to repurchase  2.204047e+10
12  Financial instruments sold, not yet purchased,...  6.849177e+09
13  Payable to broker-dealers and clearing organiz...  2.925217e+09
14                               Payable to customers  5

	We converted to numeric figures for 1224385-2017-03-01.csv
                                                    0             1
0                                                Cash  1.705750e+08
1     Cash segregated pursuant to federal regulations  5.699890e+08
2   Financial instruments owned, at fair value ($2...  4.529348e+10
3                                 Securities borrowed  1.853297e+10
4     Securities purchased under agreements to resell  2.230535e+10
5   Receivable from broker-dealers and clearing or...  2.163268e+10
6                           Receivable from customers  4.093015e+09
7                                            Goodwill  7.968700e+07
8   Property, equipment, and leasehold improvement...  5.850000e+05
9                                        Other assets  2.621580e+08
10                                       Total assets  1.129405e+11
13     Securities sold under agreements to repurchase  6.113130e+10
14  Financial instruments sold, not yet purchased,...  1


Cleaning the 42352-2002-01-30.csv file
	We converted to numeric figures for 42352-2002-01-30.csv
                                                    0             1
0                           Cash and cash equivalents  5.666498e+09
1   Cash and securities segregated in compliance w...  1.859211e+10
2   Receivables from brokers, dealers and clearing...  4.408164e+09
3       Receivables from customers and counterparties  1.316551e+10
4                                 Securities borrowed  9.985780e+10
5     Securities purchased under agreements to resell  1.471373e+10
6          Financial instruments owned, at fair value  2.906175e+10
7   Financial instruments owned and pledged as col...  2.568717e+09
8                                        Other assets  1.941781e+09
10                              Short-term borrowings  3.223388e+10
11  Payables to brokers, dealers and clearing orga...  3.419440e+09
12           Payables to customers and counterparties  5.369711e+10
13                

['9,112,943', '13,151,663']
	Fixed the merged rows for 42352-2007-01-23.csv
	We converted to numeric figures for 42352-2007-01-23.csv
                                                    0             1
0                           Cash and cash equivalents  2.008168e+09
1   Cash and securities segregated for regulatory ...  5.323661e+10
2   Receivables from brokers, dealers and clearing...  9.551828e+09
3       Receivables from customers and counterparties  1.613905e+10
5                                 Securities borrowed  2.587133e+11
6   Financial instruments purchased under agreemen...  5.642668e+10
7          Financial instruments owned, at fair value  8.191529e+10
8   Financial instruments owned and pledged as col...  2.398528e+10
9    Total financial instruments owned, at fair value  1.059006e+11
10                                       Other assets  7.274550e+09
11                                       Total assets  5.092507e+11
0   Unsecured short-term borrowings, including the

	We converted to numeric figures for 42352-2014-07-31.csv
                                                    0             1
1                           Cash and cash equivalents  3.118000e+09
2   Cash and securities segregated for regulatory ...  3.147000e+10
3   Collateralized agreements: Securities borrowed...  2.102660e+11
4   Securities purchased under agreements to resel...  1.001070e+11
5   Receivables from brokers, dealers and clearing...  7.359000e+09
6   Receivables from customers and counterparties ...  2.467500e+10
7   Financial instruments owned, at fair value (in...  1.446380e+11
8                                        Other assets  2.883000e+09
9                                        Total assets  5.245260e+11
11  Unsecured short-term borrowings (includes $198...  3.393900e+10
13  Securities loaned (includes $33,912 at fair va...  9.137800e+10
14  Securities sold under agreements to repurchase...  1.347840e+11
15  Other secured financings (includes $9,739 at f...  2.9


Cleaning the 42352-2019-03-01.csv file
['As', 'of', 'December', '2018']
	Fixed the merged rows for 42352-2019-03-01.csv
	We converted to numeric figures for 42352-2019-03-01.csv
                                                    0             1
2                                                Cash  1.422800e+10
4   Securities purchased under agreements to resel...  1.097720e+11
5   Securities borrowed (includes $24,101 at fair ...  1.476440e+11
7         Brokers, dealers and clearing organizations  6.769000e+09
8   Customers and counterparties (includes $178 at...  1.870400e+10
9   Financial instruments owned (at fair value and...  1.172800e+11
10                                       Other assets  2.268000e+09
11                                       Total assets  4.166650e+11
14  Securities sold under agreements to repurchase...  1.057090e+11
15  Securities loaned (includes $20,827 at fair va...  6.555600e+10
16  Other secured financings (includes $7,598 at f...  2.762500e+10
18   


Cleaning the 58056-2004-03-26.csv file
	We converted to numeric figures for 58056-2004-03-26.csv
                                                    0             1
0   Cash, cash equivalents, and cash segregated fo...  3.928850e+08
1                           Receivable from customers  5.564333e+09
2                                 Securities borrowed  7.867755e+10
3   Receivables from brokers, dealers, and clearin...  1.027592e+10
4     Securities purchased under agreements to resell  4.945774e+10
6              U.S. Government and agency obligations  2.498625e+10
7   U.S. Government obligations pledged as collateral  1.702972e+10
8                State and municipal bond obligations  4.156900e+07
9                               Corporate obligations  6.236653e+09
10        Corporate obligations pledged as collateral  8.239400e+08
11                      Stocks, warrants, and options  4.392380e+09
12  Stocks, warrants, and options pledged as colla...  3.989122e+09
13            Comm

	We converted to numeric figures for 58056-2008-02-29.csv
                                                    0             1
1                           Cash and cash equivalents  8.010820e+08
2   Cash and securities segregated for benefit of ...  8.250562e+09
4     Securities purchased under agreements to resell  1.304113e+11
5                                 Securities borrowed  1.678936e+11
6   Financial instruments owned, at market value (...  1.856126e+11
8                                           Customers  2.366990e+09
9        Brokers, dealers, and clearing organizations  9.435937e+09
10                                              Other  1.000000e+10
11  Property, plant, and equipment (net of accumul...  7.097760e+08
12                                       Other assets  4.298786e+09
13                                       Total assets  5.197806e+11
16     Securities sold under agreements to repurchase  3.680559e+11
17                                  Securities loaned  3.6

['70,582', '33,496']
['-', '14,528', '(2,636)']
	Fixed the merged rows for 58056-2015-03-02.csv
	We converted to numeric figures for 58056-2015-03-02.csv
                                                    0             1
0   Cash and cash equivalents (includes cash equiv...  8.160000e+08
1   Cash and securities segregated for benefit of ...  7.931000e+09
3   Securities purchased under agreements to resel...  3.964900e+10
4   Securities borrowed (includes $24,353 at fair ...  5.173000e+10
5   Financial instruments owned, at fair value (in...  4.372100e+10
7                                           Customers  1.558000e+09
8                                        Noncustomers  1.867400e+10
9        Brokers, dealers, and clearing organizations  4.730000e+09
10  Property, plant, and equipment (net of accumul...  5.620000e+08
11  Other assets (includes $2,625 of securities re...  4.267000e+09
12                                       Total assets  1.736380e+11
0   Securities sold under agre

['-', '15,288', '(4,449)']
	Fixed the merged rows for 58056-2019-09-30.csv
	We converted to numeric figures for 58056-2019-09-30.csv
                                                    0             1
1   Cash and cash equivalents (includes cash equiv...  9.460000e+08
2   Cash segregated under federal and other regula...  2.500000e+08
4   Securities purchased under agreements to resel...  3.211800e+10
5   Securities borrowed (includes fair value of $1...  2.928900e+10
6   Financial instruments owned, at fair value (in...  2.243000e+10
8                                           Customers  1.013000e+09
9                                        Noncustomers  3.000000e+06
10       Brokers, dealers, and clearing organizations  2.897000e+09
11  Premises and equipment (net of accumulated dep...  4.780000e+08
12                                       Other assets  1.522000e+09
13                                       Total assets  9.094600e+10
15  Collateralized agreements and financings: Secu.

['90,043', '204,544']
	Fixed the merged rows for 68136-2004-01-30.csv
	We converted to numeric figures for 68136-2004-01-30.csv
                                                    0             1
0                                                Cash  1.602300e+08
1   Cash and securities deposited with clearing or...  2.222508e+10
3                  U.S. government and federal agency  1.310378e+10
4                            Corporate and other debt  2.009774e+10
5                                  Corporate equities  9.533411e+09
6                                Derivative contracts  3.546632e+09
7     Securities purchased under agreements to resell  5.577719e+10
8                   Securities received as collateral  2.715550e+10
9                                 Securities borrowed  1.357946e+11
11                                          Customers  1.642868e+10
12        Brokers, dealers and clearing organizations  5.444361e+09
13                             Interest and dividends  3


Cleaning the 68136-2009-01-29.csv file
	We converted to numeric figures for 68136-2009-01-29.csv
                                                    0             1
0                                         ASSETS Cash  1.690063e+09
1   Cash and securities deposited with clearing or...  4.418126e+10
3               U.S. government and agency securities  1.685994e+10
4              Other sovereign government obligations  5.768050e+08
5                            Corporate and other debt  2.869846e+10
6                                  Corporate equities  1.562403e+10
7                                Derivative contracts  8.579713e+09
8                                         Investments  7.989130e+08
9    Securities received as collateral, at fair value  5.168112e+09
11    Securities purchased under agreements to resell  5.221735e+10
12                                Securities borrowed  9.030198e+10
14                                          Customers  2.194694e+10
15        Brokers,


Cleaning the 68136-2012-02-29.csv file
	We converted to numeric figures for 68136-2012-02-29.csv
                                                    0             1
1                           Cash and cash equivalents  1.700180e+09
2   Cash deposited with clearing organizations or ...  9.501725e+09
4               U.S. government and agency securities  6.070366e+10
5              Other sovereign government obligations  3.539237e+09
6   Corporate and other debt ($90,713 related to c...  1.433535e+10
7                                  Corporate equities  1.187409e+10
8                      Derivative and other contracts  4.093870e+09
9                                         Investments  2.206330e+08
10   Total financial instruments owned, at fair value  9.476684e+10
11   Securities received as collateral, at fair value  1.222839e+10
12  Securities purchased under agreements to resel...  8.190605e+10
13                                Securities borrowed  1.216298e+11
15                


Cleaning the 68136-2016-02-29.csv file
	We converted to numeric figures for 68136-2016-02-29.csv
                                                    0             1
0                                                Cash  1.213000e+09
1   Cash deposited with clearing organizations or ...  1.253600e+10
2   Financial instruments owned, at fair value (ap...  6.349400e+10
3    Securities received as collateral, at fair value  1.469200e+10
4   Securities purchased under agreements to resel...  5.755700e+10
5                                 Securities borrowed  1.259340e+11
7   Customers (net of allowance for doubtful accou...  8.660000e+09
8         Brokers, dealers and clearing organizations  3.989000e+09
9                              Interest and dividends  3.900000e+08
10                                     Fees and other  1.069100e+10
11                                         Affiliates  1.190000e+08
12  Premises, equipment and software (net of accum...  1.363000e+09
13                


Cleaning the 72267-2003-05-30.csv file
	We merged the columns of 72267-2003-05-30.csv
	We converted to numeric figures for 72267-2003-05-30.csv
                                                    0             1
0                           Cash and cash equivalents  6.062780e+08
1   Cash and securities segregated pursuant to fed...  2.730830e+08
2     Securities purchased under agreements to resell  2.660844e+10
3                                 Securities borrowed  1.324224e+10
4             U.: S. government and agency securities  7.960231e+09
5                                   Equity securities  6.820932e+09
6   Corporate debt and collateralized mortgage obl...  1.137946e+09
7                                             Options  7.425700e+07
8                                        Receivables:  1.599337e+10
9         Brokers, dealers and clearing organizations  4.992730e+08
10                                          Customers  1.820420e+08
11                             Interest


Cleaning the 72267-2008-05-30.csv file
	We merged the columns of 72267-2008-05-30.csv
['15,155,780', '12,032']
	Fixed the merged rows for 72267-2008-05-30.csv
	We converted to numeric figures for 72267-2008-05-30.csv
                                                    0             1
0                           Cash and cash equivalents  5.022480e+08
1   Cash and securities segregated pursuant to fed...  2.030000e+08
2                                 Securities borrowed  1.404742e+10
3     Securities purchased under agreements to resell  1.108355e+09
5         Brokers, dealers and clearing organizations  1.245960e+08
6                              Interest and dividends  5.025400e+07
7                                           Customers  1.300600e+07
8                   Securities received as collateral  1.472600e+08
9   Furniture, equipment and leasehold improvement...  8.385000e+06
10                                       Other assets  5.007400e+07
11                                

	We merged the columns of 72267-2012-05-30.csv
['65,183,498', '43,233,634']
['74,550', '150,078']
	Fixed the merged rows for 72267-2012-05-30.csv
	We converted to numeric figures for 72267-2012-05-30.csv
                                                    0             1
0                           Cash and cash equivalents  2.048302e+09
1   Cash and securities segregated pursuant to fed...  7.704480e+08
2   Collateralized short-term financing agreements...  4.833960e+10
3                                 Securities borrowed  1.684390e+10
0                           available to the Company)  6.518350e+10
0                                            Company)  4.323363e+10
5         Brokers, dealers and clearing organizations  1.779379e+09
6                                           Customers  4.054800e+08
7                              Interest and dividends  3.899850e+08
8                   Securities received as collateral  2.414100e+07
0                                        Other a

	We merged the columns of 72267-2016-05-31.csv
['94,489,304', '2,973,873']
['2,259,555', '3,150,000']
	Fixed the merged rows for 72267-2016-05-31.csv
	We converted to numeric figures for 72267-2016-05-31.csv
                                                    0             1
0                           Cash and cash equivalents  6.170970e+08
1         Deposits with exchanges and segregated cash  9.474890e+08
2   Securities purchased under agreements to resel...  4.637082e+10
3                                 Securities borrowed  1.453758e+10
4   Trading assets ($14,976,358 were pledged to va...  6.090840e+10
5   to consolidated variable interest entities, no...  5.277732e+10
6         Brokers, dealers and clearing organizations  1.965074e+09
7                                           Customers  3.015550e+08
8                              Interest and dividends  2.819270e+08
9   Furniture, equipment, leasehold improvements a...  2.548556e+09
10  accumulated depreciation and amortizatio


Cleaning the 782124-2002-01-29.csv file
['1', '554,673']
	Fixed the merged rows for 782124-2002-01-29.csv
	We converted to numeric figures for 782124-2002-01-29.csv
                                                    0             1
1                           Cash and cash equivalents  2.223360e+08
3              in compliance with federal regulations  9.071138e+09
4     Securities purchased under agreements to resell  3.476479e+10
5                                 Securities borrowed  5.109478e+10
6                   Securities received as collateral  3.037956e+09
8                                           Customers  1.237373e+10
9                         Brokers, dealers and others  2.751879e+09
10                             Interest and dividends  1.372600e+08
11  Financial instruments owned, at fair value ($2...  2.688416e+10
12                                       Other assets  3.175880e+08
13                                       TOTAL ASSETS  1.406556e+11
15                


Cleaning the 782124-2007-01-29.csv file
	We converted to numeric figures for 782124-2007-01-29.csv
                                                    0             1
1                           Cash and cash equivalents  8.348380e+08
3              in compliance with federal regulations  8.001601e+09
4     Securities purchased under agreements to resell  3.203967e+10
5                                 Securities borrowed  8.619240e+10
6                   Securities received as collateral  1.935823e+10
8                                           Customers  2.365936e+10
9                         Brokers, dealers and others  6.492994e+09
10                             Interest and dividends  3.476470e+08
11  Financial instruments owned and pledged as col...  2.771411e+10
12         Financial instruments owned, at fair value  2.962645e+10
13  Assets of variable interest entities and mortg...  1.461557e+09
14                                       Other assets  4.618630e+08
15              


Cleaning the 782124-2012-02-29.csv file
	We converted to numeric figures for 782124-2012-02-29.csv
                                                    0             1
0                                                Cash  1.032000e+09
1   Cash and securities segregated under federal a...  2.956500e+10
2        Securities purchased under resale agreements  1.266660e+11
4                                 Securities borrowed  8.791200e+10
5                   Securities received as collateral  4.764000e+09
6   Receivable from brokers, dealers and clearing ...  5.525000e+09
7                           Receivable from customers  1.759300e+10
8                         Financial instruments owned  8.780800e+10
9   Financial instruments owned, pledged to counte...  7.058000e+09
11  Fixed assets (net of accumulated depreciation ...  7.600000e+07
12                                           Goodwill  1.328000e+09
13  Other assets (included $62 at fair value at De...  1.967000e+09
14              

	We converted to numeric figures for 782124-2016-02-29.csv
                                                    0             1
1                                                Cash  7.320000e+08
2   Cash and securities segregated under federal a...  2.706200e+10
3   Securities purchased under resale agreements (...  1.291590e+11
4                                 Securities borrowed  8.007700e+10
5    Securities received as collateral, at fair value  3.996000e+09
6                          Receivables from customers  1.428300e+10
7   Receivables from brokers, dealers, clearing or...  1.644600e+10
8   Financial instruments owned, at fair value (in...  9.547000e+10
9                                            Goodwill  1.356000e+09
10          Other assets (included $12 at fair value)  1.099000e+09
11                                     Total assets(i  3.696800e+11
13                                   Commercial paper  1.556200e+10
14  Short-term borrowings (included $116 at fair v...  1.

	We converted to numeric figures for 782124-2021-03-01.csv
                                                    0             1
1                                                Cash  1.113000e+10
2   Securities purchased under resale agreements (...  2.008240e+11
3   Securities borrowed (included $36,021 at fair ...  1.228080e+11
4   Securities received as collateral, at fair val...  3.013000e+09
5                          Receivables from customers  3.655700e+10
6   Receivables from brokers, dealers, clearing or...  2.802600e+10
7   Financial instruments owned, at fair value (in...  2.029610e+11
8                                            Goodwill  1.356000e+09
9          Other assets (included $188 at fair value)  3.107000e+09
10                                   Total assets (a)  6.097820e+11
12  Short-term borrowings (included $9,214 at fair...  4.382400e+10
13  Securities sold under repurchase agreements (i...  3.351810e+11
14     Securities loaned (included $38 at fair value)  4.

	We converted to numeric figures for 851376-2006-03-01.csv
                                                    0             1
0                           Cash and cash equivalents  9.614400e+07
1   Cash and securities segregated in compliance w...  3.401803e+09
2   Receivables from brokers, dealers and clearing...  5.427999e+09
3                          Receivables from customers  1.063306e+09
4     Securities purchased under agreements to resell  5.988444e+10
5                                 Securities borrowed  7.765738e+10
6                     Securities owned, at fair value  3.099096e+10
7         Securities owned, pledged to counterparties  1.526498e+10
8           Accrued interest and dividend receivables  5.634290e+08
9   Exchange memberships - at cost (market value $...  5.484000e+06
10                                       Other assets  8.087300e+07
11                                       Total assets  1.944368e+11
13     Securities sold under agreements to repurchase  1.


Cleaning the 851376-2012-02-29.csv file
	We converted to numeric figures for 851376-2012-02-29.csv
                                                    0             1
1                           Cash and cash equivalents  9.670000e+08
2   Cash and cash equivalents segregated for regul...  2.861000e+09
4     Securities purchased under agreements to resell  1.567590e+11
5                                 Securities borrowed  4.131100e+10
6   Securities received as collateral (includes $1...  1.906200e+10
7   Financial instruments owned, at fair value (in...  6.585400e+10
8   Receivables from brokers, dealers and clearing...  1.008900e+10
9                          Receivables from customers  8.890000e+09
10          Accrued interest and dividend receivables  2.930000e+08
11                                       Other assets  3.900000e+08
12                                       Total assets  3.064760e+11
15     Securities sold under agreements to repurchase  2.089560e+11
16              

['32,564', '35,720']
	Fixed the merged rows for 851376-2017-03-01.csv
	We converted to numeric figures for 851376-2017-03-01.csv
                                                    0             1
0                           Cash and cash equivalents  1.510000e+08
1   Cash and cash equivalents segregated for regul...  4.641000e+09
0   Securities purchased under agreements to resel...  3.256400e+10
0   Securities borrowed (includes $15,141 at fair ...  3.572000e+10
4   Securities received as collateral, at fair val...  3.362300e+10
5   Financial instruments owned, at fair value (in...  2.102700e+10
6   Receivables from brokers, dealers and clearing...  1.045800e+10
7                          Receivables from customers  1.397600e+10
8           Accrued interest and dividend receivables  1.340000e+08
9                                        Other assets  2.170000e+08
10                                       Total assets  1.525110e+11
13  Securities sold under agreements to repurchase...  

	We converted to numeric figures for 851376-2021-03-01.csv
                                                    0         1
0                           Cash and cash equivalents     255.0
1   Cash and cash equivalents segregated for regul...    1976.0
3   Securities purchased under agreements to resel...   33574.0
4   Securities borrowed (includes $29,218 at fair ...   29218.0
6                                         collateral)      38.0
7   Financial instruments owned, at fair value (in...   25408.0
8   Receivables from brokers, dealers and clearing...   10391.0
9   Receivables from customers (includes $9,681 at...   10994.0
10          Accrued interest and dividend receivables       4.0
11                                       Other assets     733.0
12                                       Total assets  112591.0
15  Securities sold under agreements to repurchase...   37133.0
16  Securities loaned (includes $25,381 at fair va...   25381.0
17  Obligation to return securities received 


Cleaning the 853784-2006-03-01.csv file
	We converted to numeric figures for 853784-2006-03-01.csv
                                                    0             1
0         Trading inventory, at market value, pledged  1.874530e+10
1     Securities purchased under agreements to resell  9.632636e+09
2   Receivable from brokers, dealers, clearing org...  2.029265e+09
3                                   Intangible assets  1.441900e+07
4                                        Other assets  3.185810e+08
5                                        Total assets  5.002569e+10
8                               Short term borrowings  9.100000e+08
9      Securities sold under agreements to repurchase  3.597392e+10
10  Securities sold, not yet purchased, at market ...  1.891516e+09
11  Securities sold, not yet purchased, at market ...  7.379536e+09
12  Obligation to return securities received as co...  1.499090e+08
13        Payable under securities lending agreements  3.589820e+08
14  Payable to b

	We converted to numeric figures for 853784-2011-03-01.csv
                                                    0             1
1                                                Cash  1.928800e+07
2   Cash and securities segregated under federal a...  4.768280e+08
3              Financial instruments, at market value  2.175384e+09
4     Financial instruments, at market value, pledged  1.321567e+10
5                   Securities received as collateral  1.073549e+09
6     Securities purchased under agreements to resell  3.300532e+10
7   Receivable under securities borrowing arrangem...  3.263643e+10
8   Receivable from brokers, dealers, clearing org...  9.665240e+08
9                                            Goodwill  1.441900e+07
10                                       Other assets  2.675640e+08
11                                       Total assets  8.385098e+10
14                              Short-term borrowings  8.500750e+08
15  Financial instruments sold, not yet purchased,...  7.


Cleaning the 853784-2016-03-01.csv file
	We converted to numeric figures for 853784-2016-03-01.csv
                                                    0             1
1                                                Cash  8.675900e+07
2   Cash and securities segregated under federal a...  6.194510e+08
3   Financial instruments owned, at fair value (in...  6.517540e+09
4   Securities purchased under agreements to resel...  3.817085e+10
5   Receivable under securities borrowing arrangem...  2.351485e+10
6                           Receivable from customers  1.906650e+08
7   Receivable from brokers, dealers, clearing org...  2.286667e+09
8                                            Goodwill  1.086500e+07
9                                        Other assets  1.828670e+08
10                                       Total assets  7.158051e+10
13                              Short-term borrowings  6.254620e+08
14  Financial instruments sold, not yet purchased,...  4.783424e+09
15     Securitie


Cleaning the 853784-2021-03-01.csv file
	We converted to numeric figures for 853784-2021-03-01.csv
                                                    0             1
1                                                Cash  2.850000e+08
2   Cash segregated under federal and other regula...  6.090000e+08
3   Financial instruments owned, at fair value (in...  9.153000e+09
4   Securities purchased under agreements to resel...  2.064300e+10
5   Receivable under securities borrowing arrangem...  1.175800e+10
6   Receivable from brokers, dealers, and clearing...  4.154000e+09
7                           Receivable from customers  2.690000e+08
8            Other assets (include $13 at fair value)  2.950000e+08
9                                        Total assets  4.716600e+10
11                              Short-term borrowings  1.110000e+08
12  Financial instruments sold, not yet purchased,...  4.070000e+09
13     Securities sold under agreements to repurchase  3.720800e+10
14              


Cleaning the 91154-2005-03-01.csv file
	We merged the columns of 91154-2005-03-01.csv
['71,812', '413']
	Fixed the merged rows for 91154-2005-03-01.csv
	We converted to numeric figures for 91154-2005-03-01.csv
                                                    0             1
0                           Cash and cash equivalents  9.080000e+08
1   Cash and securities segregated and on deposit ...  2.663000e+09
2     Securities purchased under agreements to resell  8.328500e+10
3               Deposits paid for securities borrowed  6.282000e+10
4    U.S. government and government agency securities  3.495300e+10
5                           Corporate debt securities  2.013200e+10
6                                   Equity securities  1.592400e+10
7                      State and municipal securities  1.098400e+10
8   Mortgage loans and collateralized mortgage sec...  4.398000e+09
9                            Money market instruments  2.854000e+09
10                            Contractual

	We merged the columns of 91154-2009-03-02.csv
	We converted to numeric figures for 91154-2009-03-02.csv
                                                    0             1
0                   Assets: Cash and cash equivalents  1.073000e+09
1   Cash segregated and on deposit for Federal and...  8.763000e+09
2   Securities purchased under agreements to resel...  8.340900e+10
3               Deposits paid for securities borrowed  8.008200e+10
4   Financial instruments owned and contractual co...  1.634910e+11
5    U.S. government and government agency securities  3.290800e+10
6                                   Equity securities  1.561800e+10
7                           Corporate debt securities  1.135800e+10
8                      State and municipal securities  6.853000e+09
9                             Contractual commitments  2.261000e+09
10  Mortgage loans and collateralized mortgage obl...  1.923000e+09
11                           Money market instruments  8.030000e+08
12         


Cleaning the 91154-2012-02-29.csv file
	We merged the columns of 91154-2012-02-29.csv
['23,953', '624']
	Fixed the merged rows for 91154-2012-02-29.csv
	We converted to numeric figures for 91154-2012-02-29.csv
                                                    0             1
0                           Cash and cash equivalents  8.090000e+08
1            or deposited with clearing organizations  1.145100e+10
2   Securities purchased under agreements to resel...  6.517700e+10
3   Deposits paid for securities borrowed (includi...  9.217100e+10
4   Trading account assets (approximately $18 bill...  1.573480e+11
5                          Mortgage-backed securities  2.688000e+10
6         U.S. Treasury and federal agency securities  1.769800e+10
7                                   Equity securities  1.039600e+10
8                           Corporate debt securities  6.453000e+09
9                             Asset-backed securities  4.609000e+09
10                     State and municipa


Cleaning the 91154-2016-03-01.csv file
	We converted to numeric figures for 91154-2016-03-01.csv
                                                    0             1
1                           Cash and cash equivalents  6.050000e+08
2   Cash segregated under federal and other regula...  3.036000e+09
3   Securities borrowed or purchased under agreeme...  1.606710e+11
5                          Mortgage-backed securities  2.759000e+10
6         U.S. Treasury and federal agency securities  1.074800e+10
7                                   Equity securities  9.265000e+09
8                           Corporate debt securities  4.878000e+09
9                             Asset-backed securities  3.301000e+09
10                                        Derivatives  1.339000e+09
11                      Foreign government securities  9.250000e+08
12                     State and municipal securities  5.190000e+08
13  Securities received as collateral, at fair val...  8.716000e+09
15                

	We converted to numeric figures for 91154-2020-03-02.csv
                                                    0             1
1                                                Cash  7.980000e+08
2   Cash segregated under federal and other regula...  6.055000e+09
3   Securities borrowed and purchased under agreem...  1.711310e+11
5                          Mortgage-backed securities  2.956000e+10
6         U.S. Treasury and federal agency securities  1.817500e+10
7                                   Equity securities  1.238900e+10
8                           Corporate debt securities  6.806000e+09
9                             Asset-backed securities  2.435000e+09
10                     State and municipal securities  1.979000e+09
11                                        Derivatives  1.294000e+09
12                      Foreign government securities  3.150000e+08
13  Securities received as collateral, at fair val...  6.260000e+09
15                                          Customers  1.0