# Generating ORM Facts & Fact Types

In [None]:
import markovify
import pandas as pd
import numpy as np

### Sample data

In [None]:
# Sample verbalised ORM facts; each string is one elementary fact.
facts = [
    'The Academic with empNr 715 has EmpName ‘Adams A’',
    'The Academic with empNr 715 works for the Dept named ‘Computer Science’',
    'The Academic with empNr 715 occupies the Room with roomNr ‘69-301’',
    'The Academic with empNr 715 uses the Extension with extNr ‘2345’',
    'The Extension with extNr ‘2345’ provides the AccessLevel with code ‘LOC’',
    'The Academic with empNr 715 is contracted till the Date with mdy-code ‘01/31/95’',
]

In [None]:
# Fact types used as the training corpus for the Markov chain.
# Multi-word predicates are joined with underscores so each one is a single token.
fact_types = [
    'Academic has EmpName',
    'Academic works_for Dept',
    'Academic occupies Room',
    'Academic uses Extension',
    'Extension provides AccessLevel',
    'Academic is_contracted_till Date',
    'Academic is tenured',
    'Room contains Academic',
    'EmpName belongs_to Academic',
    'Dept uses AccessLevel',
]

In [None]:
# Show each sample fact next to a fact type.
# Fix: the original referenced the undefined name `facts_types` (NameError).
# NOTE: `facts` has 6 entries and `fact_types` has 10, so zip() keeps only the
# first 6 pairs; the remaining fact types are not shown in this table.
df = pd.DataFrame(list(zip(facts, fact_types)), columns=['facts', 'fact_types'])
df

### Fitting a Markov chain & generating fact types

In [None]:
# Fit a Markov chain with state_size=1 on the fact types.
# NOTE(review): NewlineText is given a list here rather than one newline-separated
# string; confirm the installed markovify version accepts an iterable of lines.
text_model = markovify.NewlineText(fact_types, state_size=1)
text_model

In [None]:
text_model.to_dict()

In [None]:
# Literal model dictionary with keys 'state_size', 'chain' (a JSON string of
# [state, {next_token: count}] pairs) and 'parsed_sentences'.
# NOTE(review): this looks like a pasted snapshot of `text_model.to_dict()`;
# `d` is not referenced again in the visible notebook.
d = {'state_size': 1,
 'chain': '[[["___BEGIN__"], {"Academic": 6, "Extension": 1, "Room": 1, "EmpName": 1, "Dept": 1}], [["Academic"], {"has": 1, "works_for": 1, "occupies": 1, "uses": 1, "is_contracted_till": 1, "is": 1, "___END__": 2}], [["has"], {"EmpName": 1}], [["EmpName"], {"___END__": 1, "belongs_to": 1}], [["works_for"], {"Dept": 1}], [["Dept"], {"___END__": 1, "uses": 1}], [["occupies"], {"Room": 1}], [["Room"], {"___END__": 1, "contains": 1}], [["uses"], {"Extension": 1, "AccessLevel": 1}], [["Extension"], {"___END__": 1, "provides": 1}], [["provides"], {"AccessLevel": 1}], [["AccessLevel"], {"___END__": 2}], [["is_contracted_till"], {"Date": 1}], [["Date"], {"___END__": 1}], [["is"], {"tenured": 1}], [["tenured"], {"___END__": 1}], [["contains"], {"Academic": 1}], [["belongs_to"], {"Academic": 1}]]',
 'parsed_sentences': [['Academic', 'has', 'EmpName'],
  ['Academic', 'works_for', 'Dept'],
  ['Academic', 'occupies', 'Room'],
  ['Academic', 'uses', 'Extension'],
  ['Extension', 'provides', 'AccessLevel'],
  ['Academic', 'is_contracted_till', 'Date'],
  ['Academic', 'is', 'tenured'],
  ['Room', 'contains', 'Academic'],
  ['EmpName', 'belongs_to', 'Academic'],
  ['Dept', 'uses', 'AccessLevel']]}

In [None]:
# Second model built from the same fact types; it is the one whose chain is
# replaced by hand further down. The positional 1 is presumably state_size — TODO confirm.
tm = markovify.NewlineText(fact_types, 1)

In [None]:
tm.to_dict()

In [None]:
# Hand-edited chain JSON. Compared with the fitted chain it appears to differ only
# in the "___BEGIN__" -> "Extension" count (5 instead of 1), which makes
# generated fact types start with "Extension" more often.
c = '[[["___BEGIN__"], {"Academic": 6, "Extension": 5, "Room": 1, "EmpName": 1, "Dept": 1}], [["Academic"], {"has": 1, "works_for": 1, "occupies": 1, "uses": 1, "is_contracted_till": 1, "is": 1, "___END__": 2}], [["has"], {"EmpName": 1}], [["EmpName"], {"___END__": 1, "belongs_to": 1}], [["works_for"], {"Dept": 1}], [["Dept"], {"___END__": 1, "uses": 1}], [["occupies"], {"Room": 1}], [["Room"], {"___END__": 1, "contains": 1}], [["uses"], {"Extension": 1, "AccessLevel": 1}], [["Extension"], {"___END__": 1, "provides": 1}], [["provides"], {"AccessLevel": 1}], [["AccessLevel"], {"___END__": 2}], [["is_contracted_till"], {"Date": 1}], [["Date"], {"___END__": 1}], [["is"], {"tenured": 1}], [["tenured"], {"___END__": 1}], [["contains"], {"Academic": 1}], [["belongs_to"], {"Academic": 1}]]'

# Fixes two problems in the original pair of cells:
#  1. `c` was used one cell before it was defined, so a fresh Restart & Run All
#     raised NameError.
#  2. The raw JSON string was assigned to `tm.chain`. The model expects a
#     markovify.Chain object there (to_dict() calls chain.to_json()), so the
#     chain is rebuilt from the JSON instead.
tm.chain = markovify.Chain.from_json(c)

In [None]:
tm.to_dict()

In [None]:
# Draw ten generated fact types from `text_model` and print each one.
for _ in range(10):
    generated = text_model.make_sentence()
    print(generated)

### Generating facts from generated fact type

In [None]:
# Hard-coded example of a chained fact type, used as the template that the
# token-substitution cells fill in. It is split on whitespace later, so
# "works" and "for" are treated as two separate words here.
generated_fact_type = "Academic works for Dept uses Extension provides AccessLevel"
generated_fact_type

In [None]:
# Example values per entity type; one value is sampled for each entity word.
tokens = {
    'Academic': ['empNr 715', 'empNr 281', 'empNr 372'],
    'Dept': ['Sales', 'Marketing', 'Analytics'],
    'Extension': ['1100', '5502', '3463'],
    'AccessLevel': ['Read Only', 'Full Access', 'Limited'],
}

In [None]:
def replacetoken(string, vocab=None):
    """Append a randomly chosen example value to an entity word.

    Parameters
    ----------
    string : str
        A single word from a fact type.
    vocab : dict, optional
        Mapping of entity name -> list of example values. Defaults to the
        notebook-level `tokens` dictionary, which keeps existing
        `replacetoken(word)` calls working unchanged.

    Returns
    -------
    str
        ``"<word> with <value>"`` when the word is a known entity with at
        least one example value, otherwise the word unchanged.
    """
    if vocab is None:
        vocab = tokens  # notebook-level example vocabulary

    toks = vocab.get(string)
    if not toks:
        # Not an entity, or an entity without example values: leave as-is.
        return string

    # Sample over the real list length. The original hard-coded randint(0, 3),
    # which raised IndexError for shorter lists and ignored items past the third.
    idx = np.random.randint(0, len(toks))
    return string + ' with ' + toks[idx]

In [None]:
# Print 20 randomly instantiated facts for the generated fact type.
for _ in range(20):
    filled_words = [replacetoken(word) for word in generated_fact_type.split()]
    print(' '.join(filled_words))

# Proposed Architecture

<img src='markovORM.jpg' alt='Proposed architecture diagram'>

1. The engine first extracts facts from a database or ORM diagram(s)
2. Next these facts are turned into fact types and fact tokens are stored separately
3. The fact types are used to train a sequential encoder into Thought vectors
4. The Thought vectors are then decoded back to generated fact types
5. A randomizer then picks random tokens (some of them sampled without replacement) and inserts them into the generated fact types to produce facts
6. These facts are then used to impute a Relational database

### Further Study

https://aclweb.org/anthology/P18-1151

# -----------------------------------------------------------------------

# Demo - Generating ORM facts using input fact types & entity vocabulary

In [6]:
import numpy as np
import markovify
import collections
import pandas as pd

### 1. Generating fact instances using manual input

In [1]:
#Take input fact_types
def get_input_fact_types(n):
    """Prompt for `n` fact types on standard input.

    Prints a numbered prompt before each read and returns the entered
    lines as a list, in the order they were typed.
    """
    collected = []
    for position in range(1, n + 1):
        print("-- Fact #(", position, '/', n, '):')
        answer = input()
        collected.append(answer)
    return collected


#Extracting unique entities based on uppercase
def get_unique_enities(fact_types):
    """Return the sorted, de-duplicated entity names found in `fact_types`.

    A word counts as an entity when `str.isupper()` is true for it, i.e. it
    has at least one cased character and no lowercase ones (e.g. ``ACCOUNT_ID``).

    Parameters
    ----------
    fact_types : iterable of str
        Fact-type sentences; each is split on whitespace.

    Returns
    -------
    list of str
        Plain Python strings in ascending order. The original went through
        `np.unique`, which returned NumPy string scalars; those ended up as
        dictionary keys and show up as ``np.str_('...')`` under NumPy 2.
    """
    unique_entities = {
        word
        for fact_type in fact_types
        for word in fact_type.split()
        if word.isupper()
    }
    return sorted(unique_entities)

#Get vocab for unique entities
def get_entity_tokens(input_fact_types):
    """Ask the user for example values for every entity in the fact types.

    For each entity returned by `get_unique_enities`, a prompt is printed and
    one line is read from standard input. The line is split on commas and
    each piece is stripped of surrounding whitespace.

    Returns a dict mapping entity name -> list of example strings.
    """
    vocab = {}
    for entity in get_unique_enities(input_fact_types):
        print('-- Entity:', entity, '| input examples:')
        raw_line = input()
        vocab[entity] = [piece.strip() for piece in raw_line.split(',')]
    return vocab


#Random sampling with replacement for now
def create_instance(unique_vocab):
    """Pick one example value per entity, sampling with replacement.

    Parameters
    ----------
    unique_vocab : dict
        Mapping of entity name -> non-empty sequence of example values.

    Returns
    -------
    dict
        Mapping of entity name -> one element of that entity's sequence.

    Raises
    ------
    ValueError
        If any entity has an empty sequence of examples.

    Notes
    -----
    The element is chosen by random index, so the original Python object is
    returned. The previous `np.random.choice(v)` first converted the list to
    an array and handed back NumPy scalars (e.g. ``np.str_``, ``np.int64``).
    """
    return {
        entity: examples[np.random.randint(len(examples))]
        for entity, examples in unique_vocab.items()
    }


#Generate fact instances using input_fact_types and vocab
def generate_fact_instance(fact_types, unique_vocab):
    """Instantiate every fact type once, using a single sampled vocabulary.

    One value per entity is drawn via `create_instance` and reused across all
    fact types, so the same entity carries the same value in every fact of
    the instance.

    Parameters
    ----------
    fact_types : iterable of str
        Fact-type sentences whose entity words are upper-case.
    unique_vocab : dict
        Mapping of entity name -> list of example values.

    Returns
    -------
    tuple
        ``(vocab_instance, generated_instance, dict_instance)`` where
        `vocab_instance` is the sampled entity -> value mapping,
        `generated_instance` is a list with one verbalised fact per fact type,
        and `dict_instance` is a list with one ``{entity: value}`` dict per
        fact type. In the dict, upper-case words missing from the vocabulary
        map to None.
    """
    vocab_instance = create_instance(unique_vocab)
    generated_instance = []
    dict_instance = []

    for fact_type in fact_types:
        # Natural-language form: follow each known entity word with its quoted value.
        # str() keeps this working when a sampled value is not a string, and the
        # membership test goes straight to the dict instead of rebuilding a key list.
        words = [
            word + " '" + str(vocab_instance[word]) + "'" if word in vocab_instance else word
            for word in fact_type.split(' ')
        ]
        generated_instance.append(' '.join(words))

        # Dict form: every upper-case word is treated as an entity column.
        entities = [word for word in fact_type.split() if word.isupper()]
        dict_instance.append({entity: vocab_instance.get(entity) for entity in entities})

    return vocab_instance, generated_instance, dict_instance


#Get dictionary instance for each generated fact
def generate_n_instances(input_fact_types, vocab, n):
    """Generate `n` independent instances of the fact-type set.

    Returns a tuple ``(full_vi, full_fi, full_di)``:
    - `full_vi`: list of `n` sampled entity -> value dicts,
    - `full_fi`: list of `n` lists of verbalised facts,
    - `full_di`: one dict per fact type, mapping each entity to the list of
      its `n` sampled values (column-oriented, ready for `pd.DataFrame`).
    """
    samples = [generate_fact_instance(input_fact_types, vocab) for _ in range(n)]

    full_vi = [sample[0] for sample in samples]
    full_fi = [sample[1] for sample in samples]
    per_instance_dicts = [sample[2] for sample in samples]

    # Regroup from "per instance, per fact type" to "per fact type, per instance".
    per_fact_type = [list(group) for group in zip(*per_instance_dicts)]

    full_di = []
    for records in per_fact_type:
        table = pd.DataFrame.from_dict(records)
        full_di.append(table.to_dict(orient='list'))

    return full_vi, full_fi, full_di

In [None]:
# Sample fact types to type in, one per prompt, when running STEP 1.
# The original cell held these sentences as bare text, which is a SyntaxError
# when the notebook is run top to bottom.
sample_fact_types = [
    'The ACCOUNT_ID is associated with ACCOUNT_TYPE',
    'The ACCOUNT_ID is associated with NAME',
    'The ACCOUNT_ID is associated with AGE',
    'The AGE belongs to CATEGORY',
    'The ACCOUNT_TYPE belongs to COUNTRY',
]


In [48]:
# STEP 1: Read 5 fact types interactively (one per prompt) into input_fact_types
input_fact_types = get_input_fact_types(5)

-- Fact #( 1 / 5 ):
The ACCOUNT_ID is associated with ACCOUNT_TYPE
-- Fact #( 2 / 5 ):
The ACCOUNT_ID is associated with NAME
-- Fact #( 3 / 5 ):
The ACCOUNT_ID is associated with AGE
-- Fact #( 4 / 5 ):
The AGE belongs to CATEGORY
-- Fact #( 5 / 5 ):
The ACCOUNT_TYPE belongs to COUNTRY


In [49]:
input_fact_types

['The ACCOUNT_ID is associated with ACCOUNT_TYPE',
 'The ACCOUNT_ID is associated with NAME',
 'The ACCOUNT_ID is associated with AGE',
 'The AGE belongs to CATEGORY',
 'The ACCOUNT_TYPE belongs to COUNTRY']

In [50]:
# STEP 2: For every upper-case entity found in the fact types, read a
# comma-separated line of example tokens; vocab maps entity -> list of tokens
vocab = get_entity_tokens(input_fact_types)

-- Entity: ACCOUNT_ID | input examples:
1111,2222,3333,4444,5555,6666,7777,8888
-- Entity: ACCOUNT_TYPE | input examples:
Primary, Secondary
-- Entity: AGE | input examples:
20,21,22,23,24,25,26,27,28,29,30
-- Entity: CATEGORY | input examples:
Junior,Senior, Super Senior
-- Entity: COUNTRY | input examples:
India, US, UK, Germany
-- Entity: NAME | input examples:
Bill, Akshay, Sumit, Paritosh


In [53]:
from pprint import pprint

In [54]:
pprint(vocab)

{'ACCOUNT_ID': ['1111', '2222', '3333', '4444', '5555', '6666', '7777', '8888'],
 'ACCOUNT_TYPE': ['Primary', 'Secondary'],
 'AGE': ['20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30'],
 'CATEGORY': ['Junior', 'Senior', 'Super Senior'],
 'COUNTRY': ['India', 'US', 'UK', 'Germany'],
 'NAME': ['Bill', 'Akshay', 'Sumit', 'Paritosh']}


In [55]:
# STEP 3: Generate 5 fact instances from the fact types and vocab.
#   vi - sampled entity -> value dict for each instance
#   fi - verbalised facts for each instance
#   di - one column-oriented dict per fact type (entity -> values across instances)
vi, fi, di = generate_n_instances(input_fact_types, vocab, 5)

In [57]:
vi[0]

{'ACCOUNT_ID': '1111',
 'ACCOUNT_TYPE': 'Secondary',
 'AGE': '22',
 'CATEGORY': 'Senior',
 'COUNTRY': 'US',
 'NAME': 'Sumit'}

In [58]:
fi

[["The ACCOUNT_ID '1111' is associated with ACCOUNT_TYPE 'Secondary'",
  "The ACCOUNT_ID '1111' is associated with NAME 'Sumit'",
  "The ACCOUNT_ID '1111' is associated with AGE '22'",
  "The AGE '22' belongs to CATEGORY 'Senior'",
  "The ACCOUNT_TYPE 'Secondary' belongs to COUNTRY 'US'"],
 ["The ACCOUNT_ID '2222' is associated with ACCOUNT_TYPE 'Secondary'",
  "The ACCOUNT_ID '2222' is associated with NAME 'Paritosh'",
  "The ACCOUNT_ID '2222' is associated with AGE '21'",
  "The AGE '21' belongs to CATEGORY 'Senior'",
  "The ACCOUNT_TYPE 'Secondary' belongs to COUNTRY 'UK'"],
 ["The ACCOUNT_ID '5555' is associated with ACCOUNT_TYPE 'Primary'",
  "The ACCOUNT_ID '5555' is associated with NAME 'Akshay'",
  "The ACCOUNT_ID '5555' is associated with AGE '30'",
  "The AGE '30' belongs to CATEGORY 'Junior'",
  "The ACCOUNT_TYPE 'Primary' belongs to COUNTRY 'India'"],
 ["The ACCOUNT_ID '8888' is associated with ACCOUNT_TYPE 'Primary'",
  "The ACCOUNT_ID '8888' is associated with NAME 'Parit

In [59]:
di

[{'ACCOUNT_ID': ['1111', '2222', '5555', '8888', '6666'],
  'ACCOUNT_TYPE': ['Secondary', 'Secondary', 'Primary', 'Primary', 'Primary']},
 {'ACCOUNT_ID': ['1111', '2222', '5555', '8888', '6666'],
  'NAME': ['Sumit', 'Paritosh', 'Akshay', 'Paritosh', 'Bill']},
 {'ACCOUNT_ID': ['1111', '2222', '5555', '8888', '6666'],
  'AGE': ['22', '21', '30', '22', '24']},
 {'AGE': ['22', '21', '30', '22', '24'],
  'CATEGORY': ['Senior', 'Senior', 'Junior', 'Super Senior', 'Junior']},
 {'ACCOUNT_TYPE': ['Secondary', 'Secondary', 'Primary', 'Primary', 'Primary'],
  'COUNTRY': ['US', 'UK', 'India', 'Germany', 'UK']}]

In [61]:
pd.DataFrame(di[1])

Unnamed: 0,ACCOUNT_ID,NAME
0,1111,Sumit
1,2222,Paritosh
2,5555,Akshay
3,8888,Paritosh
4,6666,Bill


### 2. Example using the first fact model that was shared

In [45]:
# Example fact model, reproduced as it was shared

bill_facts_1 = ["The PARTY with PARTY_ID 'P0001' exists .",
"The ACCOUNT with ACCOUNT_ID 'AC001' exists .",
"The TRADER with USER_ID 'TR-001' exists .",
"The ORDER with ORDER_ID 'O0001' exists .",
"The ACCOUNT with ACCOUNT_NO 'A9-001' exists .",
"The INSTRUCTION with INSTRUCTION_ID 'I0001' exists .",
"The USER with USER_ID 'U0001' exists .",
"The ALLOCATION with ALLOC_ID 'A0001' exists .",
"The EXECUTION with EXEC_ID 'X0001' exists .",
"The QUOTE with QUOTE_ID 'Q0001' exists .",
"The BUSINESS_GROUP with GROUP_ID'BG1' exists.",
"The TRADE with TRADE_ID 'TX-0001' exists.",
"The BUSINESS_GROUP 'BG1' has NAME 'House Traders' .",
"The BUSINESS_GROUP 'BG2' has NAME 'Floor Traders' .",
"The BUSINESS_GROUP 'BG1' has member USER 'U0001' .",
"The BUSINESS_GROUP 'BG2' has member TRADER 'TR-001' .",
"The TRADER 'TR-001' issued ORDER 'O0001' .",
"The PARTY 'P0001' requested QUOTE 'Q0001' .",
"The ORDER 'O0001' was issued for QUOTE 'Q0001' .",
"The ORDER 'O0001' was executed on TRADE-DATE '06-27-2018' .",
"The ORDER 'O0001' has BASE_CURR_CD 'USD' .",
"The ORDER 'O0001' has QUOTE_CURR_CD 'USD' .",
"The ORDER 'O0001' has TRADE_SIDE_CD '1' .",
"The ORDER 'O0001' has ORDER_QTY '9' .",
"The ORDER 'O0001' has SPOT_RATE '89.08' .",
"The ORDER 'O0001' has FORWARD_RATE '87.12' .",
"The ORDER 'O0001' has FORWARD_DATE '06-27-2018' .",
"The ORDER 'O0001' has VALUE_DATE '06-27-2018' .",
"The ORDER 'O0001' has FORWARD_TYPE_CD '7' .",
"The ORDER 'O0001' has LOWER_BND_LIM '80.00' .",
"The ORDER 'O0001' has UPPER_BND_LIM '100.00' .",
"The ORDER 'O0001' issued by TRADER 'TR-001' .",
"The ORDER 'O0001' has GOOD_TYPE '7' .",
"The ORDER 'O0001' has GOOD_DATE '06-30-2018' .",
"The ORDER 'O0001' has ACCEPTED_TS '2019-06-27 09:12:15' .",
"The ORDER 'O0001' has ACK_TS '2019-06-27 09:13:00' .",
"The ORDER 'O0001' has COMPLIANCE_GRP_ID '1' .",
"The ORDER 'O0001' has HIST_FLAG '1' .",
"The ORDER 'O0001' has ADDED_TS '2019-06-27 10:15:18' .",
"The ORDER 'O0001' has ADDED_BY 'U0001' .",
"The ORDER 'O0001' has UPDT_TS '' .",
"The ORDER 'O0001' has UPDT_BY '' .",
"The ORDER 'O0001' has VERSION '1' .",
"The QUOTE 'Q0001' has QUOTE_TYPE_CD '1' .",
"The QUOTE_TYPE_CD '1' has QUOTE_TYPE_DESC 'INDICATIVE'.",
"The QUOTE_TYPE_CD '2' has QUOTE_TYPE_DESC 'TRADEABLE'.",
"The QUOTE_TYPE_CD '3' has QUOTE_TYPE_DESC 'RESTRICTED_TRADEABLE'.",
"The QUOTE_TYPE_CD '4' has QUOTE_TYPE_DESC 'COUNTER_TRADEABLE'.",
"The EXECUTION_ALLOCATION with EXEC_ALLOC_ID 'EA0001' exists .",
"The EXECUTION_ALLOCATION 'EA0001' was executed on TRADE-DATE '06-27-2018' .",
"The EXECUTION_ALLOCATION 'EA0001' has EXEC_ID 'X0001' .",
"The EXECUTION_ALLOCATION 'EA0001' has ALLOC_ID 'A0001' .",
"The EXECUTION_ALLOCATION 'EA0001' has OUTSTANDING_QTY '0' .",
"The EXECUTION_ALLOCATION 'EA0001' has ALLOCATED_QTY '100' .",
"The EXECUTION_ALLOCATION 'EA0001' has ALLOC_TS '2018-01-19 03:14:07' .",
"The EXECUTION_ALLOCATION 'EA0001' has HIST_FLAG '1' .",
"The EXECUTION_ALLOCATION 'EA0001' has ADDED_TS '2018-01-19 03:15:18' .",
"The EXECUTION_ALLOCATION 'EA0001' has ADDED_BY 'U0001' .",
"The EXECUTION_ALLOCATION 'EA0001' has UPDT_TS '' .",
"The EXECUTION_ALLOCATION 'EA0001' has UPDT_BY '' .",
"The EXECUTION_ALLOCATION 'EA0001' has VERSION '1' .",
"The ALLOCATION 'A0001' has SETTLEMENT_CD '1' .",
"The SETTLEMENT_CD '1' has SETTLEMENT_TYPE_DESC 'REGULAR' .",
"The SETTLEMENT_CD '2' has SETTLEMENT_TYPE_DESC 'CASH' .",
"The SETTLEMENT_CD '3' has SETTLEMENT_TYPE_DESC 'NEXT_DAY' .",
"The SETTLEMENT_CD '4' has SETTLEMENT_TYPE_DESC 'T+2' .",
"The SETTLEMENT_CD '5' has SETTLEMENT_TYPE_DESC 'T+3' .",
"The SETTLEMENT_CD '6' has SETTLEMENT_TYPE_DESC 'T+4' .",
"The SETTLEMENT_CD '7' has SETTLEMENT_TYPE_DESC 'T+5' .",
"The SETTLEMENT_CD '8' has SETTLEMENT_TYPE_DESC 'FUTURE' .",
"The SETTLEMENT_CD '9' has SETTLEMENT_TYPE_DESC 'WHEN_AND_IF_ISSUED' .",
"The SETTLEMENT_CD '10' has SETTLEMENT_TYPE_DESC 'SELLERS_OPTION' .",
"The ACCOUNT 'AC001' has ACCOUNT_TYPE_CD '1' .",
"The ACCOUNT_TYPE_CD '1' has ACCOUNT_TYPE_DESC 'Account is carried on customer side of Books' .",
"The ACCOUNT_TYPE_CD '2' has ACCOUNT_TYPE_DESC 'Account is carried on non-customer side of Books' .",
"The ACCOUNT_TYPE_CD '3' has ACCOUNT_TYPE_DESC 'House Trader' .",
"The ACCOUNT_TYPE_CD '4' has ACCOUNT_TYPE_DESC 'Floor Trader' .",
"The ACCOUNT_TYPE_CD '5' has ACCOUNT_TYPE_DESC 'Account is carried on non-customer side of books and is cross margined' .",
"The ACCOUNT_TYPE_CD '6' has ACCOUNT_TYPE_DESC 'Account is house trader and is cross margined' .",
"The ACCOUNT_TYPE_CD '7' has ACCOUNT_TYPE_DESC 'Joint Backoffice Account (JBO)' .",
"The TRADE 'TX-0001' has COMMISSION_TYPE_CD '1' .",
"The COMMISSION_TYPE_CD '1' has COMMISSION_TYPE_DESC 'per unit' .",
"The COMMISSION_TYPE_CD '2' has COMMISSION_TYPE_DESC 'percentage' .",
"The COMMISSION_TYPE_CD '3' has COMMISSION_TYPE_DESC 'absolute (total monetary amount)' .",
"The COMMISSION_TYPE_CD '4' has COMMISSION_TYPE_DESC 'percentage waived - cash discount' .",
"The COMMISSION_TYPE_CD '5' has COMMISSION_TYPE_DESC 'percentage waived - enhanced units' .",
"The COMMISSION_TYPE_CD '6' has COMMISSION_TYPE_DESC 'points per bond or or contract' .",
"The TRADE 'TX-0001' has TRADE_TYPE_CD '0' .",
"The TRADE_TYPE_CD '0' has TRADE_TYPE_DESC 'Regular Trade' .",
"The TRADE_TYPE_CD '1' has TRADE_TYPE_DESC 'Block Trade' .",
"The TRADE_TYPE_CD '2' has TRADE_TYPE_DESC 'Exchange for Physical (EFP)' .",
"The TRADE_TYPE_CD '3' has TRADE_TYPE_DESC 'Transfer' .",
"The TRADE_TYPE_CD '4' has TRADE_TYPE_DESC 'Late Trade' .",
"The TRADE_TYPE_CD '5' has TRADE_TYPE_DESC 'T Trade' .",
"The TRADE_TYPE_CD '6' has TRADE_TYPE_DESC 'Weighted Average Price Trade' .",
"The TRADE_TYPE_CD '7' has TRADE_TYPE_DESC 'Bunched Trade' .",
"The TRADE_TYPE_CD '8' has TRADE_TYPE_DESC 'Late Bunched Trade' .",
"The TRADE_TYPE_CD '9' has TRADE_TYPE_DESC 'Prior Reference Price Trade' .",
"The TRADE_TYPE_CD '10' has TRADE_TYPE_DESC 'After Hours Trade' .",
"The TRADE 'TX-0001' has INSTRUCTION 'I0001' .",
"The INSTRUCTION 'I0001' has INSTRUCTION_TYPE_CD '1' .",
"The INSTRUCTION_TYPE_CD '1' has INSTRUCTION_TYPE_DESC 'Calculated (includes MiscFees and NetMoney)'.",
"The INSTRUCTION_TYPE_CD '2' has INSTRUCTION_TYPE_DESC 'Preliminary (without MiscFees and NetMoney)'.",
"The INSTRUCTION_TYPE_CD '3' has INSTRUCTION_TYPE_DESC 'Sellside Calculated Using Preliminary'.",
"The INSTRUCTION_TYPE_CD '4' has INSTRUCTION_TYPE_DESC 'Sellside Calculated Without Preliminary'.",
"The INSTRUCTION_TYPE_CD '5' has INSTRUCTION_TYPE_DESC 'Ready-To-Book - Single Order'.",
"The INSTRUCTION_TYPE_CD '6' has INSTRUCTION_TYPE_DESC 'Buyside Ready-To-Book - Combined Set of Orders (Replaced)'.",
"The INSTRUCTION_TYPE_CD '7' has INSTRUCTION_TYPE_DESC 'Warehouse instruction'.",
"The INSTRUCTION_TYPE_CD '8' has INSTRUCTION_TYPE_DESC 'Request to Intermediary'.",
"The TRADE 'TX-0001' has EVENT_TYPE_CD '1' .",
"The EVENT_TYPE_CD '1' has EVENT_TYPE_DESC 'PUT'.",
"The EVENT_TYPE_CD '2' has EVENT_TYPE_DESC 'CALL'.",
"The EVENT_TYPE_CD '3' has EVENT_TYPE_DESC 'TENDER'.",
"The EVENT_TYPE_CD '4' has EVENT_TYPE_DESC 'SINKING FUND CALL'.",
"The EVENT_TYPE_CD '99' has EVENT_TYPE_DESC 'OTHER'.",
"The TRADE 'TX-0001' involves COUNTER_PARTY 'P0001' having PARTY_ROLE 'BUYER' .",
"The TRADE 'TX-0001' involves COUNTER_PARTY 'P0002' having PARTY_ROLE 'SELLER' .",
"The TRADE 'TX-0001' was executed by TRADER 'TR-001' .",
"The PARTY 'P0001' has PARTY_TYPE 'ORGANIZATION' .",
"The PARTY 'P0001' has PARTY_NAME 'Acme Paper Group' .",
"The PARTY 'P0001' has PARTY_CHARACTERISTIC_TYPE 'HEADQUARTERS_COUNTRY' with PARTY_CHARACTERISTIC_VALUE 'US' .",
"The PARTY 'P0001' has PARTY_CHARACTERISTIC_TYPE 'HEADQUARTERS_STREET_NAME' with PARTY_CHARACTERISTIC_VALUE 'Turnpike Drive' .",
"The PARTY 'P0001' has PARTY_CHARACTERISTIC_TYPE 'HEADQUARTERS_STREET_NUMBER' with PARTY_CHARACTERISTIC_VALUE '2700' .",
"The PARTY 'P0001' has PARTY_CHARACTERISTIC_TYPE 'HEADQUARTERS_CITY' with PARTY_CHARACTERISTIC_VALUE 'Hatboro' .",
"The PARTY 'P0001' has PARTY_CHARACTERISTIC_TYPE 'HEADQUARTERS_POSTAL_CODE' with PARTY_CHARACTERISTIC_VALUE '19040' .",
"The PARTY 'P0001' has PARTY_CHARACTERISTIC_TYPE 'LEGAL_ENTITY_ID' with PARTY_CHARACTERISTIC_VALUE '549300JPOPI49S3LIQ04' .",
"The PARTY 'P0001' has PARTY_CHARACTERISTIC_TYPE 'LEGAL_NAME' with PARTY_CHARACTERISTIC_VALUE 'Acme Corrugated Box Co., Inc.' .",
"The PARTY 'P0001' has PARTY_CHARACTERISTIC_TYPE 'LEGAL_FORM_CODE' with PARTY_CHARACTERISTIC_VALUE '9999 - CORPORATION' ."]

In [19]:
#Since the input needed is fact_types and vocab for entities, 
#the following functions reverse engineer the required inputs from given fact examples

def extract_fact_type(s):
    """Reduce a verbalised fact to its fact type.

    The fact is split on single quotes; the pieces outside the quotes are
    stripped and joined with single spaces. Every '.' is then removed and the
    result is stripped again.
    """
    outside_quotes = s.split("'")[::2]
    joined = ' '.join(piece.strip() for piece in outside_quotes)
    return joined.replace('.', '').strip()

def get_facts(facts_model):
    """Return the sorted, de-duplicated fact types behind a list of facts.

    Each fact is reduced with `extract_fact_type`. The result is a list of
    plain Python strings in ascending order; the original `np.unique` call
    produced the same ordering but returned NumPy string scalars.
    """
    fact_types = sorted({extract_fact_type(fact) for fact in facts_model})
    return fact_types

def extract_entity_token_tuple(s):
    """Pair each quoted value in a fact with the word that precedes it.

    The fact is split on single quotes. Pieces inside quotes are the values,
    each wrapped in a one-element list. Pieces outside quotes are stripped and
    re-joined with a ' || ' marker, and the word immediately before each
    marker is taken as the entity name for the value at that position.

    Returns a list of ``(entity, [value])`` tuples in order of appearance.
    """
    segments = s.split("'")
    quoted_values = [[segment.strip()] for segment in segments[1::2]]

    # Mark where each quoted value used to be so the preceding word can be found.
    marked = ' || '.join(segment.strip() for segment in segments[::2])
    words = marked.split(' ')
    entity_names = [words[pos - 1] for pos, word in enumerate(words) if word == '||']

    return list(zip(entity_names, quoted_values))

def extract_entity_token_vocab(facts_model):
    """Build an entity -> example-values vocabulary from a list of facts.

    Every fact is parsed with `extract_entity_token_tuple`, and the values
    seen for each entity are collected.

    Returns
    -------
    dict
        Keys are entity names in order of first appearance. Each value is the
        sorted list of distinct tokens seen for that entity, as plain Python
        strings. The original de-duplicated with `np.unique`, which gave the
        same ordering but returned NumPy string scalars.
    """
    collected = collections.defaultdict(set)
    for fact in facts_model:
        for entity, token_list in extract_entity_token_tuple(fact):
            collected[entity].update(token_list)

    unique_vocab = {entity: sorted(values) for entity, values in collected.items()}
    return unique_vocab

In [20]:
# Derive the two required inputs (fact types and entity vocabulary) from the example facts.
input_fact_types = get_facts(bill_facts_1)
vocab = extract_entity_token_vocab(bill_facts_1)

In [23]:
# Generate 20 instances of the derived fact types using the derived vocabulary.
vi, fi, di = generate_n_instances(input_fact_types, vocab, 20)

In [38]:
vi[0]

{'PARTY_ID': 'P0001',
 'ACCOUNT_ID': 'AC001',
 'USER_ID': 'TR-001',
 'ORDER_ID': 'O0001',
 'ACCOUNT_NO': 'A9-001',
 'INSTRUCTION_ID': 'I0001',
 'ALLOC_ID': 'A0001',
 'EXEC_ID': 'X0001',
 'QUOTE_ID': 'Q0001',
 'GROUP_ID': 'BG1',
 'TRADE_ID': 'TX-0001',
 'BUSINESS_GROUP': 'BG1',
 'NAME': 'Floor Traders',
 'USER': 'U0001',
 'TRADER': 'TR-001',
 'ORDER': 'O0001',
 'PARTY': 'P0001',
 'QUOTE': 'Q0001',
 'TRADE-DATE': '06-27-2018',
 'BASE_CURR_CD': 'USD',
 'QUOTE_CURR_CD': 'USD',
 'TRADE_SIDE_CD': '1',
 'ORDER_QTY': '9',
 'SPOT_RATE': '89.08',
 'FORWARD_RATE': '87.12',
 'FORWARD_DATE': '06-27-2018',
 'VALUE_DATE': '06-27-2018',
 'FORWARD_TYPE_CD': '7',
 'LOWER_BND_LIM': '80.00',
 'UPPER_BND_LIM': '100.00',
 'GOOD_TYPE': '7',
 'GOOD_DATE': '06-30-2018',
 'ACCEPTED_TS': '2019-06-27 09:12:15',
 'ACK_TS': '2019-06-27 09:13:00',
 'COMPLIANCE_GRP_ID': '1',
 'HIST_FLAG': '1',
 'ADDED_TS': '2019-06-27 10:15:18',
 'ADDED_BY': 'U0001',
 'UPDT_TS': '',
 'UPDT_BY': '',
 'VERSION': '1',
 'QUOTE_TYPE_CD': 

In [36]:
pd.DataFrame(di[3])

Unnamed: 0,ACCOUNT_TYPE_CD,ACCOUNT_TYPE_DESC
0,6,Account is carried on customer side of Books
1,4,Floor Trader
2,2,House Trader
3,3,Joint Backoffice Account (JBO)
4,4,House Trader
5,4,Account is carried on non-customer side of boo...
6,4,Floor Trader
7,7,Account is carried on non-customer side of Books
8,4,Floor Trader
9,1,Account is carried on non-customer side of Books


# Summary - 

Please refer to - 
- the input_fact_types and vocab variables for inputs
- the vi, fi, di variables for the expected outputs

Implemented - 
1. A series of fact types can be seen as a graph of connected entities.
2. Given a graph of fact types and a backend entity vocabulary, it's possible to generate instances of that graph by sampling from the entity vocabulary.
3. These instances can then be reorganised as tables to be loaded into a database.

Work needed on - 
1. A method for generating example values for an entity needs to be created, using a library such as Faker.
2. Currently, random sampling with replacement is used to create instances of the graph. This strategy needs to change so that certain entities act as unique identifiers for the graph (they occur only once and must be popped from the queue once used to generate an instance), while the others remain random choices that can repeat.