In [45]:
import boto3

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sagemaker.session import Session
from joblib import dump, load

from sklearn.feature_extraction.text import HashingVectorizer

In [46]:
# Restore the persisted asset and liability line-item models.
# NOTE(review): joblib.load unpickles its input, so these files must come from a trusted source.
assetMDL, liableMDL = (
    load(model_path)
    for model_path in ('Outs/asset_svc_mdl_v1.joblib', 'Outs/liability_svc_mdl_v1.joblib')
)

In [47]:
# read the unstructured asset and liability tables from their CSV files
assetDF, liableDF = map(pd.read_csv, ('unstructAsset.csv', 'unstructLiable.csv'))

In [48]:
# journal of physics A, Journal Stat. Physics
def structured_data(unstructured_df:pd.DataFrame, cluster_df:pd.DataFrame) -> pd.DataFrame:
    """
    Constructs a structured dataset from an unstructured column set

    Every column of 'unstructured_df' named in cluster_df['LineItems'] belongs to the
    label given in cluster_df['Labels']; all columns sharing a label are summed
    row-wise into one output column named after that label.

    :param: unstructured_df (type pandas.DataFrame)
        unstructured pandas dataframe with loose column construction; must contain the
        identifier columns 'CIK', 'Name' and 'Year' plus every column listed in
        cluster_df['LineItems']
    :param: cluster_df (type pandas.DataFrame)
        a pandas dataframe of clustered labels (column 'Labels') and corresponding
        line items (column 'LineItems')

    :return: (type pandas DataFrame)
        a new dataframe holding 'CIK', 'Name', 'Year' followed by one column per unique
        label, in sorted label order; a cell is NaN when every line item of that label
        is missing for the row, otherwise the sum of the numeric line items
    """

    # identifier columns are carried over unchanged; copy so the caller's frame is never modified
    structured_df = unstructured_df[['CIK', 'Name', 'Year']].copy()

    for label in np.unique(cluster_df['Labels'].values):
        # names of the unstructured columns that were assigned to this cluster label
        line_items = cluster_df.loc[cluster_df['Labels'] == label, 'LineItems']
        selection = unstructured_df[line_items.values]

        # sum across the selected columns, numeric figures only
        totals = selection.sum(axis=1, numeric_only=True)

        # a row whose selected items are all missing stays missing (NaN) instead of
        # collapsing to 0.0, so "not reported" remains distinguishable from a zero total
        totals = totals.mask(selection.isnull().all(axis=1))

        # direct column assignment accepts any hashable label, whereas
        # DataFrame.assign(**kwargs) only works when every label is a string
        structured_df[label] = totals

    return structured_df

In [49]:
def company_pdf(df:pd.Series, mdl, n_features:int=1000) -> pd.DataFrame:
    """
    Return a dataframe for a company showcasing its column names, the predicted class and the original values

    :param: df (type pandas.Series)
        a single company filing: the index holds the reported line-item names and the
        values hold the reported figures. Despite the parameter name, a one-dimensional
        Series is required, because the index and values become columns of the result.
    :param: mdl
        classifier exposing 'predict'; it receives the hashed line-item names
    :param: n_features (type int)
        number of features passed to HashingVectorizer (default 1000)
        NOTE(review): presumably this has to equal the value 'mdl' was trained with - confirm

    :return: (type pandas DataFrame)
        one row per line item with the columns 'Original Lineitems',
        'Predicted Lineitems' and 'Line values'
    """

    # the Series index carries the line-item names, the values carry the reported figures
    line_items = df.index

    # hash the line-item text into the feature space handed to the classifier
    hashed_items = HashingVectorizer(n_features=n_features).fit_transform(line_items)

    return pd.DataFrame({
        'Original Lineitems': line_items,
        'Predicted Lineitems': mdl.predict(hashed_items),
        'Line values': df.values,
    })

## Use Classification model to predict label names for each line item

In [50]:
def _predict_line_item_labels(unstructured_df: pd.DataFrame, mdl, n_id_columns: int = 3, n_features: int = 1000) -> pd.DataFrame:
    """
    Predict a label for every line-item column of an unstructured dataframe

    :param: unstructured_df (type pandas.DataFrame)
        dataframe whose column names after the first 'n_id_columns' are the line items
    :param: mdl
        classifier exposing 'predict'; it receives the hashed line-item names
    :param: n_id_columns (type int)
        number of leading columns to skip
        NOTE(review): presumably these are the identifiers 'CIK', 'Name', 'Year' - confirm
    :param: n_features (type int)
        number of features passed to HashingVectorizer
        NOTE(review): presumably this has to equal the value 'mdl' was trained with - confirm

    :return: (type pandas DataFrame)
        one row per line item with the columns 'LineItems' and 'Labels'
    """
    line_items = unstructured_df.columns[n_id_columns:]
    labels = mdl.predict(HashingVectorizer(n_features=n_features).fit_transform(line_items))

    # two rows (names, labels) transposed into one row per line item
    return pd.DataFrame([line_items, labels], index=['LineItems', 'Labels']).T


# same prediction step for both statement sides, one call each
asset_predictions = _predict_line_item_labels(assetDF, assetMDL)
liable_predictions = _predict_line_item_labels(liableDF, liableMDL)

### Structured Asset Terms

In [51]:
# build the structured asset data set from the predicted labels and write it to CSV without the index
tempdf = structured_data(unstructured_df=assetDF, cluster_df=asset_predictions)
tempdf.to_csv(path_or_buf='structAsset.csv', index=False)

In [56]:
# keep only the rows whose 'Total assets' value is not NaN
total_assets_known = ~np.isnan(tempdf['Total assets'])
tempdf.loc[total_assets_known]

Unnamed: 0,CIK,Name,Year,Accumulated depreciation and amortization,Cash and cash equivalents,Cash and securities segregated for benefit of customers,Deposits with clearing organizations,Due from employees,Exchange memberships,"Financial instruments owned, at fair value",...,"Goodwill, net amortization",Other assets,Other receivables,"Property, plant and equipment",Receivable from broker-dealers,Receivables from customers and counterparties,Reverse Repurchase Agreements (reverse-repo),Securities received as collateral,Total assets,U.S. government and government agency
52,68136,MORGAN STANLEY & CO. LLC,2004,204544.0,22385307.0,,,,,,...,,34504705.0,,90043.0,5444361.0,16428683.0,191571756.0,27155496.0,310.8887,13103784.0
70,72267,"NOMURA SECURITIES INTERNATIONAL, INC.",2006,,534993.0,,,,,21313541.0,...,,198922.0,,20039.0,1329568.0,9538.0,50754781.0,21553.0,174182900.0,
152,91154,CITIGROUP GLOBAL MARKETS INC.,2017,,5386.0,,,,,,...,145.0,53702.0,,71.0,8848.0,7476.0,160143.0,9307.0,261644.0,16565.0
153,91154,CITIGROUP GLOBAL MARKETS INC.,2018,,5146.0,,,,,,...,145.0,51553.0,,,9051.0,13244.0,162683.0,15443.0,272544.0,15278.0


In [52]:
# first row matching CIK 1224385 and Year 2010; skip the first three columns and drop missing entries
filing_mask = (assetDF['CIK'] == 1224385) & (assetDF['Year'] == 2010)
reported_items = assetDF.loc[filing_mask].iloc[0].iloc[3:].dropna()
company_pdf(reported_items, assetMDL)

Unnamed: 0,Original Lineitems,Predicted Lineitems,Line values
0,Cash,Cash and cash equivalents,30831.0
1,Cash segregated pursuant to federal regulations,Cash and cash equivalents,25000.0
2,"Financial instruments owned, at fair value ($7...","Financial instruments owned, at fair value",9048850.0
3,Goodwill,"Goodwill, net amortization",79687.0
4,Other assets,Other assets,123560.0
5,"Property, equipment, and leasehold improvement...","Property, plant and equipment",1119.0
6,Receivable from broker-dealers and clearing or...,Receivable from broker-dealers,4674420.0
7,Receivable from customers,Receivables from customers and counterparties,50349.0
8,Securities purchased under agreements to resell,Reverse Repurchase Agreements (reverse-repo),3465840.0


### Structured Liability Terms

In [53]:
# # construct the structured data set 
# tempdf = structured_data(liableDF, liable_predictions)
# tempdf.to_csv('structLiable.csv', index=False)

In [54]:
# tempdf