In [5]:
import boto3

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sagemaker.session import Session
from joblib import dump, load

from sklearn.feature_extraction.text import HashingVectorizer

In [13]:
# Restore the two persisted model objects (their .predict is used further down).
# NOTE(review): joblib.load unpickles arbitrary Python objects -- only load
# artifacts from a trusted source.
assetMDL, liableMDL = map(
    load,
    ('Outs/asset_svc_mdl_v1.joblib', 'Outs/liability_svc_mdl_v1.joblib'),
)

In [8]:
# Read the unstructured asset and liability tables from their CSV exports.
assetDF, liableDF = (
    pd.read_csv(csv_path)
    for csv_path in ('unstructAsset.csv', 'unstructLiable.csv')
)

In [10]:
# Display the liability frame read from 'unstructLiable.csv'.
liableDF

Unnamed: 0,CIK,Year,Accrued employee compensation and benefits,"Adjustable Rate Cumulative Preferred Stock, Series A, held in treasury, at cost (151 shares)","Brokers, dealers and others","Common stock, $1.00 par value; 1,000 shares authorized and outstanding","Common stock, $1.00 par value; 1,000 shares authorized and outstanding Paid-in capital",Customers,"Financial instruments sold, but not yet purchased, at fair value",Interest and dividends,...,Other liabilities and accrued expenses,Paid-in capital,"Preferred stock, $1.00 par value (Adjustable Rate Cumulative Preferred Stock, Series A, $500,000 liquidation preference); 1,000 shares authorized; 300 shares issued",Retained earnings,Securities loaned,Securities sold under agreements to repurchase,Short-term borrowings,Subordinated liabilities,TOTAL LIABILITIES AND STOCKHOLDER'S EQUITY,Total Stockholder's Equity
0,782124,2002,1284391.0,-61442.0,6344629.0,,1554673.0,66487473.0,15903027.0,349104.0,...,494375.0,,150000.0,2819256.0,5032121.0,44489202.0,5683726.0,3350000.0,140655621.0,3462488.0
1,782124,2003,1053071.0,-61442.0,2903018.0,1.0,,72356465.0,15246427.0,293368.0,...,466901.0,554673.0,150000.0,3061210.0,6354888.0,44699330.0,10349388.0,3260000.0,150559829.0,3704442.0


In [9]:
# Display the asset frame read from 'unstructAsset.csv'.
assetDF

Unnamed: 0,CIK,Year,"Brokers, dealers and others",Cash and cash equivalents,Customers,"Financial instruments owned, at fair value ($22,226,481 pledged as collateral)",Interest and dividends,Not pledged as collateral,Other assets,Pledged as collateral,Securities borrowed,Securities purchased under agreements to resell,Securities received as collateral,TOTAL ASSETS,in compliance with federal regulations
0,782124,2002,6344629.0,222336.0,66487473.0,26884157.0,349104.0,,317588.0,,51094781.0,34764794.0,3037956.0,140655621.0,9071138.0
1,782124,2003,2903018.0,4896551.0,72356465.0,,293368.0,7249032.0,291780.0,20185616.0,52855722.0,36738953.0,5669811.0,150559829.0,6875084.0


In [20]:
# journal of physics A, Journal Stat. Physics
def structured_data(unstructured_df:pd.DataFrame, cluster_df:pd.DataFrame) -> pd.DataFrame:
    """
    Constructs a structured dataset from an unstructured column set

    Every line-item column of 'unstructured_df' that 'cluster_df' assigns to
    the same label is summed row-wise into a single output column named after
    that label.

    :param: unstructured_df (type pandas.DataFrame)
        unstructured pandas dataframe with loose column construction; must
        contain the columns 'CIK' and 'Year' as well as every column named in
        cluster_df['LineItems']
    :param: cluster_df (type pandas.DataFrame)
        a pandas dataframe with a 'LineItems' column (column names found in
        'unstructured_df') and a 'Labels' column (the cluster label assigned
        to each line item)

    :return: (type pandas DataFrame)
        a new dataframe holding 'CIK', 'Year' and one summed column per unique
        label, in sorted label order; 'unstructured_df' is not modified
    :raises KeyError:
        if 'CIK', 'Year' or any listed line item is missing from
        'unstructured_df'
    """

    # assume that there exist columns 'CIK' and 'Year' for unstructured data;
    # take an explicit copy so the column assignments below write to an
    # independent frame (avoids SettingWithCopyWarning and never touches the input)
    structured_df = unstructured_df[['CIK', 'Year']].copy()

    # np.unique returns the distinct labels in sorted order
    for label in np.unique(cluster_df['Labels'].values):
        # line items that were assigned to the current cluster label
        line_items = cluster_df.loc[cluster_df['Labels'] == label, 'LineItems']

        # sum all matching columns across each row and map to the structured
        # dataframe; sum() skips NaN, so a row in which every grouped line
        # item is missing yields 0.0 rather than NaN
        structured_df[label] = unstructured_df[line_items.values].sum(axis=1)

    return structured_df

In [24]:
# Width of the hashed feature space fed to the models.
# NOTE(review): presumably this has to equal the n_features the saved models
# were trained with -- confirm against the training notebook.
N_HASH_FEATURES = 1000


def predict_line_item_labels(model, line_items) -> pd.DataFrame:
    """
    Predicts a cluster label for every line-item name

    :param: model
        object exposing 'predict' on the hashed document-term matrix
    :param: line_items
        iterable of line-item names (strings)

    :return: (type pandas DataFrame)
        dataframe with the columns 'LineItems' and 'Labels', one row per
        line item, in the order the line items were given
    """
    line_items = list(line_items)

    # HashingVectorizer is stateless, so transform can be called without fitting
    features = HashingVectorizer(n_features=N_HASH_FEATURES).transform(line_items)

    return pd.DataFrame({'LineItems': line_items, 'Labels': model.predict(features)})


# columns[2:] skips the two leading columns of each frame, so only the
# remaining column names are classified as line items
asset_predictions = predict_line_item_labels(assetMDL, assetDF.columns[2:])
liable_predictions = predict_line_item_labels(liableMDL, liableDF.columns[2:])

In [23]:
# Display each asset line item next to its predicted label.
asset_predictions

Unnamed: 0,LineItems,Labels
0,"Brokers, dealers and others",Receivable from broker-dealers
1,Cash and cash equivalents,Cash and cash equivalents
2,Customers,Receivables from customers and counterparties
3,"Financial instruments owned, at fair value ($2...","Financial instruments owned, at fair value"
4,Interest and dividends,Other assets
5,Not pledged as collateral,Securities received as collateral
6,Other assets,Other assets
7,Pledged as collateral,Securities received as collateral
8,Securities borrowed,Reverse Repurchase Agreements (reverse-repo)
9,Securities purchased under agreements to resell,Reverse Repurchase Agreements (reverse-repo)


In [25]:
# Display each liability line item next to its predicted label.
liable_predictions

Unnamed: 0,LineItems,Labels
0,Accrued employee compensation and benefits,Other liabilities
1,"Adjustable Rate Cumulative Preferred Stock, Se...",Treasury stock
2,"Brokers, dealers and others",Payable to Broker/Dealers
3,"Common stock, $1.00 par value; 1,000 shares au...","Common stock, par value"
4,"Common stock, $1.00 par value; 1,000 shares au...","Common stock, par value"
5,Customers,Payable to customers and counterparties
6,"Financial instruments sold, but not yet purcha...",Securities sold short
7,Interest and dividends,Accounts payable
8,Obligation to return securities received as co...,Other liabilities
9,Other liabilities and accrued expenses,Accrued liabilities


In [22]:
# Sum the asset line items into one column per predicted label and display the result.
structured_data(assetDF, asset_predictions)

Unnamed: 0,CIK,Year,Cash and cash equivalents,"Financial instruments owned, at fair value",Other assets,Receivable from broker-dealers,Receivables from customers and counterparties,Reverse Repurchase Agreements (reverse-repo),Securities received as collateral,Total assets
0,782124,2002,9293474.0,26884157.0,666692.0,6344629.0,66487473.0,85859575.0,3037956.0,140655621.0
1,782124,2003,11771635.0,0.0,585148.0,2903018.0,72356465.0,89594675.0,33104459.0,150559829.0


In [26]:
# Sum the liability line items into one column per predicted label and display the result.
structured_data(liableDF, liable_predictions)

Unnamed: 0,CIK,Year,Accounts payable,Accrued liabilities,Additional Paid-in capital,"Common stock, par value",Other liabilities,Payable to Broker/Dealers,Payable to customers and counterparties,Repurchase Agreements (repo),Retained (Accumulated) earnings,Securities sold short,Short-term borrowing,Subordinated liabilities,Total liabilities and shareholder's equity,Total shareholder's equity,Treasury stock
0,782124,2002,349104.0,494375.0,0.0,1704673.0,4322347.0,6344629.0,66487473.0,49521323.0,2819256.0,15903027.0,5683726.0,3350000.0,140655621.0,3462488.0,-61442.0
1,782124,2003,293368.0,466901.0,554673.0,150001.0,6722882.0,2903018.0,72356465.0,51054218.0,3061210.0,15246427.0,10349388.0,3260000.0,150559829.0,3704442.0,-61442.0
