# Generating ORM Facts & Fact Types

In [245]:
import markovify
import pandas as pd
import numpy as np

### Sample data

In [257]:
facts = ['The Academic with empNr 715 has EmpName ‘Adams A’',
        'The Academic with empNr 715 works for the Dept named ‘Computer Science’',
        'The Academic with empNr 715 occupies the Room with roomNr ‘69-301’',
        'The Academic with empNr 715 uses the Extension with extNr ‘2345’',
        'The Extension with extNr ‘2345’ provides the AccessLevel with code ‘LOC’',
        'The Academic with empNr 715 is contracted till the Date with mdy-code ‘01/31/95’']

In [270]:
fact_types =  ['Academic has EmpName',
               'Academic works_for Dept',
               'Academic occupies Room',
               'Academic uses Extension',
               'Extension provides AccessLevel',
               'Academic is_contracted_till Date',
               'Academic is tenured',
               'Room contains Academic',
               'EmpName belongs_to Academic',
               'Dept uses AccessLevel']

In [145]:
df = pd.DataFrame(zip(facts, facts_types), columns=['facts', 'fact_types'])
df

Unnamed: 0,facts,fact_types
0,The Academic with empNr 715 has EmpName ‘Adams A’,Academic has EmpName
1,The Academic with empNr 715 works_for_the Dept...,Academic works for Dept
2,The Academic with empNr 715 occupies_the Room ...,Academic occupies Room
3,The Academic with empNr 715 uses_the Extension...,Academic uses Extension
4,The Extension with extNr ‘2345’ provides_the A...,Extension provides AccessLevel
5,The Academic with empNr 715 is_contracted_till...,Academic is contracted till Date


### Fitting a markov chain & generating facts types

In [272]:
text_model = markovify.NewlineText(fact_types, state_size=1)
text_model

<markovify.text.NewlineText at 0x11f931a20>

In [273]:
text_model.to_dict()

{'state_size': 1,
 'chain': '[[["___BEGIN__"], {"Academic": 6, "Extension": 1, "Room": 1, "EmpName": 1, "Dept": 1}], [["Academic"], {"has": 1, "works_for": 1, "occupies": 1, "uses": 1, "is_contracted_till": 1, "is": 1, "___END__": 2}], [["has"], {"EmpName": 1}], [["EmpName"], {"___END__": 1, "belongs_to": 1}], [["works_for"], {"Dept": 1}], [["Dept"], {"___END__": 1, "uses": 1}], [["occupies"], {"Room": 1}], [["Room"], {"___END__": 1, "contains": 1}], [["uses"], {"Extension": 1, "AccessLevel": 1}], [["Extension"], {"___END__": 1, "provides": 1}], [["provides"], {"AccessLevel": 1}], [["AccessLevel"], {"___END__": 2}], [["is_contracted_till"], {"Date": 1}], [["Date"], {"___END__": 1}], [["is"], {"tenured": 1}], [["tenured"], {"___END__": 1}], [["contains"], {"Academic": 1}], [["belongs_to"], {"Academic": 1}]]',
 'parsed_sentences': [['Academic', 'has', 'EmpName'],
  ['Academic', 'works_for', 'Dept'],
  ['Academic', 'occupies', 'Room'],
  ['Academic', 'uses', 'Extension'],
  ['Extension', 

In [274]:
d = {'state_size': 1,
 'chain': '[[["___BEGIN__"], {"Academic": 6, "Extension": 1, "Room": 1, "EmpName": 1, "Dept": 1}], [["Academic"], {"has": 1, "works_for": 1, "occupies": 1, "uses": 1, "is_contracted_till": 1, "is": 1, "___END__": 2}], [["has"], {"EmpName": 1}], [["EmpName"], {"___END__": 1, "belongs_to": 1}], [["works_for"], {"Dept": 1}], [["Dept"], {"___END__": 1, "uses": 1}], [["occupies"], {"Room": 1}], [["Room"], {"___END__": 1, "contains": 1}], [["uses"], {"Extension": 1, "AccessLevel": 1}], [["Extension"], {"___END__": 1, "provides": 1}], [["provides"], {"AccessLevel": 1}], [["AccessLevel"], {"___END__": 2}], [["is_contracted_till"], {"Date": 1}], [["Date"], {"___END__": 1}], [["is"], {"tenured": 1}], [["tenured"], {"___END__": 1}], [["contains"], {"Academic": 1}], [["belongs_to"], {"Academic": 1}]]',
 'parsed_sentences': [['Academic', 'has', 'EmpName'],
  ['Academic', 'works_for', 'Dept'],
  ['Academic', 'occupies', 'Room'],
  ['Academic', 'uses', 'Extension'],
  ['Extension', 'provides', 'AccessLevel'],
  ['Academic', 'is_contracted_till', 'Date'],
  ['Academic', 'is', 'tenured'],
  ['Room', 'contains', 'Academic'],
  ['EmpName', 'belongs_to', 'Academic'],
  ['Dept', 'uses', 'AccessLevel']]}

In [278]:
tm = markovify.NewlineText(fact_types, 1)

In [280]:
tm.to_dict()

{'state_size': 1,
 'chain': '[[["___BEGIN__"], {"Academic": 6, "Extension": 1, "Room": 1, "EmpName": 1, "Dept": 1}], [["Academic"], {"has": 1, "works_for": 1, "occupies": 1, "uses": 1, "is_contracted_till": 1, "is": 1, "___END__": 2}], [["has"], {"EmpName": 1}], [["EmpName"], {"___END__": 1, "belongs_to": 1}], [["works_for"], {"Dept": 1}], [["Dept"], {"___END__": 1, "uses": 1}], [["occupies"], {"Room": 1}], [["Room"], {"___END__": 1, "contains": 1}], [["uses"], {"Extension": 1, "AccessLevel": 1}], [["Extension"], {"___END__": 1, "provides": 1}], [["provides"], {"AccessLevel": 1}], [["AccessLevel"], {"___END__": 2}], [["is_contracted_till"], {"Date": 1}], [["Date"], {"___END__": 1}], [["is"], {"tenured": 1}], [["tenured"], {"___END__": 1}], [["contains"], {"Academic": 1}], [["belongs_to"], {"Academic": 1}]]',
 'parsed_sentences': [['Academic', 'has', 'EmpName'],
  ['Academic', 'works_for', 'Dept'],
  ['Academic', 'occupies', 'Room'],
  ['Academic', 'uses', 'Extension'],
  ['Extension', 

In [282]:
tm.chain = c

In [281]:
c = '[[["___BEGIN__"], {"Academic": 6, "Extension": 5, "Room": 1, "EmpName": 1, "Dept": 1}], [["Academic"], {"has": 1, "works_for": 1, "occupies": 1, "uses": 1, "is_contracted_till": 1, "is": 1, "___END__": 2}], [["has"], {"EmpName": 1}], [["EmpName"], {"___END__": 1, "belongs_to": 1}], [["works_for"], {"Dept": 1}], [["Dept"], {"___END__": 1, "uses": 1}], [["occupies"], {"Room": 1}], [["Room"], {"___END__": 1, "contains": 1}], [["uses"], {"Extension": 1, "AccessLevel": 1}], [["Extension"], {"___END__": 1, "provides": 1}], [["provides"], {"AccessLevel": 1}], [["AccessLevel"], {"___END__": 2}], [["is_contracted_till"], {"Date": 1}], [["Date"], {"___END__": 1}], [["is"], {"tenured": 1}], [["tenured"], {"___END__": 1}], [["contains"], {"Academic": 1}], [["belongs_to"], {"Academic": 1}]]'

In [285]:
tm.to_dict()

AttributeError: 'str' object has no attribute 'to_json'

<markovify.text.NewlineText at 0x111f55198>

In [182]:
for i in range(10):
    print(text_model.make_sentence())

Dept uses Extension provides AccessLevel
Academic has EmpName belongs to Academic is tenured
EmpName belongs to Academic is contracted till Date
Academic works for Dept uses Extension provides AccessLevel
Dept uses Extension
Room contains Academic is tenured
Room contains Academic uses AccessLevel
Dept uses Extension provides AccessLevel
Room contains Academic is contracted till Date
Academic works for Dept uses AccessLevel


### Generating facts from generated fact type

In [183]:
generated_fact_type = "Academic works for Dept uses Extension provides AccessLevel"
generated_fact_type

'Academic works for Dept uses Extension provides AccessLevel'

In [220]:
tokens = {'Academic':['empNr 715', 'empNr 281', 'empNr 372'],
         'Dept':['Sales', 'Marketing', 'Analytics'],
         'Extension':['1100', '5502', '3463'],
         'AccessLevel':['Read Only', 'Full Access', 'Limited']}

In [241]:
def replacetoken(string):
    if string in tokens.keys():
        toks = tokens.get(string)
        idx = np.random.randint(0,3)
        return str(string + ' with ' + toks[idx])
    else:
        return string

In [244]:
for i in range(20):
    print(' '.join([replacetoken(i) for i in generated_fact_type.split()]))

Academic with empNr 372 works for Dept with Analytics uses Extension with 1100 provides AccessLevel with Read Only
Academic with empNr 281 works for Dept with Marketing uses Extension with 3463 provides AccessLevel with Limited
Academic with empNr 372 works for Dept with Analytics uses Extension with 1100 provides AccessLevel with Read Only
Academic with empNr 715 works for Dept with Analytics uses Extension with 1100 provides AccessLevel with Full Access
Academic with empNr 372 works for Dept with Marketing uses Extension with 5502 provides AccessLevel with Limited
Academic with empNr 281 works for Dept with Sales uses Extension with 1100 provides AccessLevel with Limited
Academic with empNr 715 works for Dept with Marketing uses Extension with 3463 provides AccessLevel with Read Only
Academic with empNr 281 works for Dept with Sales uses Extension with 1100 provides AccessLevel with Limited
Academic with empNr 715 works for Dept with Sales uses Extension with 3463 provides AccessLeve

# Proposed Architecture

<img src='markovORM.jpg'></img>

1. The engine first extracts facts from a database or ORM diagram(s)
2. Next these facts are turned into fact types and fact tokens are stored separately
3. The fact types are used to train a sequential encoder into Thought vectors
4. The Thought vectors are then decoded back to generated fact types
5. A randomizer then picks up random tokens (some of which are without replacement) and adds them to the generated fact types to generate facts
6. These facts are then used to impute a Relational database

### Further Study

https://aclweb.org/anthology/P18-1151

In [314]:
import random
import operator
import bisect
import json

# Python3 compatibility
try: # pragma: no cover
    basestring
except NameError: # pragma: no cover
    basestring = str

BEGIN = "___BEGIN__"
END = "___END__"

def accumulate(iterable, func=operator.add):
    """
    Cumulative calculations. (Summation, by default.)
    Via: https://docs.python.org/3/library/itertools.html#itertools.accumulate
    """
    it = iter(iterable)
    total = next(it)
    yield total
    for element in it:
        total = func(total, element)
        yield total

class Chain(object):
    """
    A Markov chain representing processes that have both beginnings and ends.
    For example: Sentences.
    """
    def __init__(self, corpus, state_size, model=None):
        """
        `corpus`: A list of lists, where each outer list is a "run"
        of the process (e.g., a single sentence), and each inner list
        contains the steps (e.g., words) in the run. If you want to simulate
        an infinite process, you can come very close by passing just one, very
        long run.
        `state_size`: An integer indicating the number of items the model
        uses to represent its state. For text generation, 2 or 3 are typical.
        """
        self.state_size = state_size
        self.model = model or self.build(corpus, self.state_size)
        self.precompute_begin_state()

    def build(self, corpus, state_size):
        """
        Build a Python representation of the Markov model. Returns a dict
        of dicts where the keys of the outer dict represent all possible states,
        and point to the inner dicts. The inner dicts represent all possibilities
        for the "next" item in the chain, along with the count of times it
        appears.
        """

        # Using a DefaultDict here would be a lot more convenient, however the memory
        # usage is far higher.
        model = {}

        for run in corpus:
            items = ([ BEGIN ] * state_size) + run + [ END ]
            for i in range(len(run) + 1):
                state = tuple(items[i:i+state_size])
                follow = items[i+state_size]
                if state not in model:
                    model[state] = {}

                if follow not in model[state]:
                    model[state][follow] = 0

                model[state][follow] += 1
        return model

    def precompute_begin_state(self):
        """
        Caches the summation calculation and available choices for BEGIN * state_size.
        Significantly speeds up chain generation on large corpuses. Thanks, @schollz!
        """
        begin_state = tuple([ BEGIN ] * self.state_size)
        choices, weights = zip(*self.model[begin_state].items())
        cumdist = list(accumulate(weights))
        self.begin_cumdist = cumdist
        self.begin_choices = choices

#     def move(self, state):
#         """
#         Given a state, choose the next item at random.
#         """
#         if state == tuple([ BEGIN ] * self.state_size):
#             choices = self.begin_choices
#             cumdist = self.begin_cumdist
#         else:
#             choices, weights = zip(*self.model[state].items())
#             cumdist = list(accumulate(weights))
#         r = random.random() * cumdist[-1]
#         selection = choices[bisect.bisect(cumdist, r)]
#         return selection

#     def gen(self, init_state=None):
#         """
#         Starting either with a naive BEGIN state, or the provided `init_state`
#         (as a tuple), return a generator that will yield successive items
#         until the chain reaches the END state.
#         """
#         state = init_state or (BEGIN,) * self.state_size
#         while True:
#             next_word = self.move(state)
#             if next_word == END: break
#             yield next_word
#             state = tuple(state[1:]) + (next_word,)

#     def walk(self, init_state=None):
#         """
#         Return a list representing a single run of the Markov model, either
#         starting with a naive BEGIN state, or the provided `init_state`
#         (as a tuple).
#         """
#         return list(self.gen(init_state))

    def to_json(self):
        """
        Dump the model as a JSON object, for loading later.
        """
        return json.dumps(list(self.model.items()))

#     @classmethod
#     def from_json(cls, json_thing):
#         """
#         Given a JSON object or JSON string that was created by `self.to_json`,
#         return the corresponding markovify.Chain.
#         """

#         if isinstance(json_thing, basestring):
#             obj = json.loads(json_thing)
#         else:
#             obj = json_thing

#         if isinstance(obj, list):
#             rehydrated = dict((tuple(item[0]), item[1]) for item in obj)
#         elif isinstance(obj, dict):
#             rehydrated = obj
#         else:
#             raise ValueError("Object should be dict or list")

#         state_size = len(list(rehydrated.keys())[0])

#         inst = cls(None, state_size, rehydrated)
#         return inst

In [315]:
chains = Chain(corpus, 1)

In [292]:
corpus = [i.split(' ') for i in fact_types]
corpus

[['Academic', 'has', 'EmpName'],
 ['Academic', 'works_for', 'Dept'],
 ['Academic', 'occupies', 'Room'],
 ['Academic', 'uses', 'Extension'],
 ['Extension', 'provides', 'AccessLevel'],
 ['Academic', 'is_contracted_till', 'Date'],
 ['Academic', 'is', 'tenured'],
 ['Room', 'contains', 'Academic'],
 ['EmpName', 'belongs_to', 'Academic'],
 ['Dept', 'uses', 'AccessLevel']]

In [316]:
chains.to_json()

'[[["___BEGIN__"], {"Academic": 6, "Extension": 1, "Room": 1, "EmpName": 1, "Dept": 1}], [["Academic"], {"has": 1, "works_for": 1, "occupies": 1, "uses": 1, "is_contracted_till": 1, "is": 1, "___END__": 2}], [["has"], {"EmpName": 1}], [["EmpName"], {"___END__": 1, "belongs_to": 1}], [["works_for"], {"Dept": 1}], [["Dept"], {"___END__": 1, "uses": 1}], [["occupies"], {"Room": 1}], [["Room"], {"___END__": 1, "contains": 1}], [["uses"], {"Extension": 1, "AccessLevel": 1}], [["Extension"], {"___END__": 1, "provides": 1}], [["provides"], {"AccessLevel": 1}], [["AccessLevel"], {"___END__": 2}], [["is_contracted_till"], {"Date": 1}], [["Date"], {"___END__": 1}], [["is"], {"tenured": 1}], [["tenured"], {"___END__": 1}], [["contains"], {"Academic": 1}], [["belongs_to"], {"Academic": 1}]]'

In [None]:
entity_tokens = []