In [1]:
import json
import os
import random
import shutil
import string
from pprint import pprint

from snips_nlu import SnipsNLUEngine
from snips_nlu.default_configs import CONFIG_EN

In [2]:
# Show what is available in the current working directory.
os.listdir(".")

['.ipynb_checkpoints',
 'engine.eng',
 'flights_dataset.json',
 'gitbit',
 'Snips NLU EDA-Changed to Region, Product, Company.ipynb',
 'Snips NLU EDA.ipynb',
 'snips_together.py']

In [3]:
# Load the sample Snips dataset and list its top-level sections.
# The context manager makes sure the file handle is closed again.
with open("flights_dataset.json") as dataset_file:
    example = json.load(dataset_file)
print("Possible:")
for i in example.keys():
    print(" ",i)

Possible:
  entities
  intents
  language


# Entities example:

In [4]:
# Keep a handle on the entity definitions, then display them.
entities = example["entities"]
pprint(entities)

{'locality': {'automatically_extensible': True,
              'data': [{'synonyms': [], 'value': 'djibouti'},
                       {'synonyms': [], 'value': 'san francisco'},
                       {'synonyms': ['new york', 'big apple'],
                        'value': 'new york city'}],
              'matching_strictness': 1.0,
              'use_synonyms': True},
 'snips/datetime': {}}


In [5]:
# One line per item: a heading, the entity names, and a short remark.
print("Keys", entities.keys(), "Keys essentially are the tag names...", sep="\n")

Keys
dict_keys(['locality', 'snips/datetime'])
Keys essentially are the tag names...


# BUILT IN
#### As seen here, `snips/datetime` is a built-in entity: Snips provides it out of the box, so it needs no example data. The built-in entities include:
* snips/amountOfMoney
* snips/datetime
* snips/duration
* snips/musicAlbum
* snips/musicArtist
* snips/musicTrack
* snips/number
* snips/ordinal
* snips/percentage
* snips/temperature

# CUSTOM ENTITIES
#### `locality`, on the other hand, is a custom entity: it carries example values in `data` plus further settings such as `matching_strictness`. A custom entity defines the following:
* automatically_extensible: bool – whether the entity may also match values that are not listed in `data`. If false, only the listed values (and their synonyms) are accepted.
* data: a list of dictionaries, each holding an example value under `value` and a list of alternative names for it under `synonyms`.
* matching_strictness : float – controls the matching strictness of the entity (only for custom entities). Must be between 0.0 and 1.0.
* use_synonyms: bool – whether or not to map entity values using synonyms (only for custom entities)



# Intents Examples

In [6]:
# Keep a handle on the intent definitions, then display them.
intents = example["intents"]
pprint(intents)

{'bookFlight': {'utterances': [{'data': [{'text': 'Book me a flight from '},
                                         {'entity': 'locality',
                                          'slot_name': 'departure',
                                          'text': 'Paris'},
                                         {'text': ' to '},
                                         {'entity': 'locality',
                                          'slot_name': 'destination',
                                          'text': 'London'},
                                         {'text': ' '},
                                         {'entity': 'snips/datetime',
                                          'slot_name': 'flight_time',
                                          'text': 'this weekend'}]},
                               {'data': [{'text': 'Find me an airplane for '},
                                         {'entity': 'snips/datetime',
                                          'slot_name': 'flight_

### Key format
* Outerkey = Intent name
* Mid Key = "utterances"
###### Inner Key format
* List of dictionaries
* A plain-text part of the utterance is a dictionary that only has "text" as a key
* An entity part has three keys: {"entity": the type of entity, "slot_name": the role this entity plays in the utterance, "text": "Banana"}
* The slot name therefore distinguishes different uses of the same entity: for example, `locality` is the entity, while the slot names `departure` and `destination` say which role each locality plays.

##### Begin.
We first define generators for the various fields we MIGHT want to train on.<br>
The candidate fields are:
* Location - (Current location in the warehouse it is stored) (Example: M-N--D)
* *  Only the last letter and numbers are known to vary. The last letter probably designates the zone in the warehouse it is stored in.  
* BAU Number - (Example: -------- 8 numbers where - represents any numbers really.) Essentially the birth certificate of the products sent to the current station. Contains travel history and production area. 
* Product Number - (Example: MA--------- 9 numbers after the first two letters.)
* * Different packing numbers apparently use different boxes (unconfirmed).
* Product Line - (Example: ---------- Always 10 numbers.) The line of products the item belongs to, e.g. the newest line of semiconductors.
* Description - 11 digits/letters, a mix of both. Essentially the product details itself.
* Lot No. - (Example: M--------A Two letters, one at each end, 8 numbers)  Unknown purpose. Probably another internal warehouse location.
* Date Code - (4 numbers - likely last 2 digits of year, then month ) What date are we handling this current order?
* Case Size - How many of the Products are in each of the current packaging?
* Each - How many of the products are in each of the boxes in the new packaging we are doing?
* Boxes - How many of boxes will be output as a result?
<br><br><br>
Since we are attempting to do toy examples, we will focus on a few important details. The first being Case size, and Each and Boxes, which are relevant for determining how to pack the boxes. <br>For simplicity we will begin by composing them into 2 items. i.e we keep only the case size and each, since boxes should be implicitly determinable as a result of both items.<br>
We will also focus on Description of the product which might aid the integrated model in determining which 3d model to show.

We hence begin by looking at another sample dataset for a few ways to ask for data, the Maluuba Frames dataset.<br>
For simplicity, we look at their initial requests as a reference for examples.


In [7]:
# maluuba = json.load(open("Maluuba/frames.json"))

In [8]:
# initial_inquiries =[]
# for i in range(len(maluuba)):
#     initial_inquiries.append(maluuba[i]["turns"][0]["text"])

In [9]:
# len(initial_inquiries) # as expected, 1369 dialogue openers.
# #printing.
# for i in initial_inquiries:
#     print(i)

A quick visual inspection suggests the following possibilities for references:


### DETAIL DETAIL, DETAIL
I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.
### Detail, Detail, Detail.
I am one person voyaging to Tampa. I can depart from Indianapolis. Must be between 20th and 31 August.

### Detail.
So we are departing from Pittsburg<br>
kyoto to boston<br>
Looking to go to Guadalajara<br>



### Detail, Detail, Question.
I'm looking to book a trip from Hamburg starting August 15th. I have a 14 day vacation. What packages can you recommend?
### No detail
cool? <br>
do your thang<br>
Biebs here. I wanna book a vacation asap.<br>
Hi.<br>
HEEYYYYY<br>
Hello :slightly_smiling_face: Can you believe it? My psychotic boss finally gave me a few days off! I feel like a thousand pounds have been lifted off my shoulders, and I can't wait to book the perfect trip!<br>
I never been much good at maths. We way overcalculated what we would be spending my wedding on September 7, and only have 1600 left over :disappointed: Wifey is so mad at me<br>
My place just burnt down and I need to get home immediately!<br>

##### No details are provided, so we should either try to elicit details or at least report that none were given.
##### For now we only aim to detect that no details were provided, i.e. these inputs should map to the null (None) intent in Snips NLU.

### Greet, Low detail, Question
Hello there i am looking to go on a vacation with my family to Gotham City, can you help me?<br>

##### Few details are provided, so we should either try to elicit more or at least report that there are too few details.


### Greet, Detail, Detail
Hello, I am looking to book a vacation from Gotham City to Mos Eisley for \$2100 <br>

Hi I'd like to go to Caprica from Busan, between Sunday August 21, 2016 and Wednesday August 31, 2016 <br>

Hello, I am looking to book a trip for 2 adults and 6 children for \$21,300 or less. We are departing from Kochi for Denver. <br>

We have a lot of examples of no detail, and simply sending in spam will already allow us to simulate that.

# BREAKDOWN - Fabrication of data /Adaptation
#####  These should always return None if our training is successful.
cool?<br>
do your thang<br>
Biebs here. I wanna book a vacation asap.<br>
Hi.<br>
HEEYYYYY<br>
Hello :slightly_smiling_face: Can you believe it? My psychotic boss finally gave me a few days off! I feel like a thousand pounds have been lifted off my shoulders, and I can't wait to book the perfect trip!<br>
I never been much good at maths. We way overcalculated what we would be spending my wedding on September 7, and only have 1600 left over :disappointed: Wifey is so mad at me<br>
My place just burnt down and I need to get home immediately!<br>
##### These are positive examples...
I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.<br>
I am one person voyaging to Tampa. I can depart from Indianapolis. Must be between 20th and 31 August.<br>
So we are departing from Pittsburg<br>
kyoto to boston<br>
Looking to go to Guadalajara<br>
I'm looking to book a trip from Hamburg starting August 15th. I have a 14 day vacation. What packages can you recommend?<br>
Hello there i am looking to go on a vacation with my family to Gotham City, can you help me?<br>
Hello, I am looking to book a vacation from Gotham City to Mos Eisley for 2100 <br>
Hi I'd like to go to Caprica from Busan, between Sunday August 21, 2016 and Wednesday August 31, 2016<br>
Hello, I am looking to book a trip for 2 adults and 6 children for \$21,300 or less. We are departing from Kochi for Denver.

# Tokens:
3ach<br>
pr0d<br>
cas3<br>
# Examples
I have pr0d. Case size is cas3, need 3ach in output.<br>
I have Case size of cas3 and need 3ach in output, product number pr0d.<br>
pr0d, 3ach each, now is cas3.<br>
i got cas3, need pack into 3ach, product pr0d<br>
Hello there i am looking to pack pr0d, can you help me?<br>
Hello, I am packing pr0d coming in cas3 and out as 3ach<br>
Hello, I am packing pr0d, with out as 3ach<br>
Hello, I am packing pr0d coming in cas3<br>
I am trying to pack pr0d. I currently have them in cases of cas3, and i need to place them into 3ach.<br>
cas3 to 3ach<br>
3ach from cas3<br>
Hello, I received pr0d. From 3ach to cas3.<br>
pr0d.<br>
cas3.<br>
3ach.<br>
pr0d. cas3. 3ach.<br>
cas3. pr0d. 3ach.<br>
3ach. pr0d. cas3.<br>
3ach. cas3. pr0d. <br>
cas3. 3ach. pr0d. <br>
pr0d. 3ach. cas3. <br>

In [10]:
def _spoken_digit(digit):
    """Return a space-padded spoken form of one digit character.

    Some digits randomly come out as a sound-alike word instead of the
    number word (e.g. " tree " for 3, " ate " for 8). Digits 1, 2, 3, 4 and 8
    consume one extra random roll; the others do not.
    """
    if digit == "0":
        return " zero "
    if digit == "1":
        return " one " if random.randint(0,10) > 3 else " want "
    if digit == "2":
        roll = random.randint(0,10)
        if 7 <= roll < 9:
            return " to "
        if 3 < roll <= 5:
            return " too "
        return " two "
    if digit == "3":
        return " three " if random.randint(0,10) > 5 else " tree "
    if digit == "4":
        roll = random.randint(0,10)
        if roll > 5:
            return " four "
        if roll > 3:
            return " for "
        return " fore "
    if digit == "5":
        return " five "
    if digit == "6":
        return " six "
    if digit == "7":
        return " seven "
    if digit == "8":
        return " ate " if random.randint(0,10) > 8 else " eight "
    return " nine "


def eleven_genny(p=2,deterministic = False,seed=None):
    """Generate a noisy 11-position product code.

    A random number below 10**11 is drawn and left-padded with zeros to 11
    digits. Every drawn digit (the padding zeros are left alone) is then
    either kept, replaced by a random uppercase letter followed by a space,
    or replaced by a spoken / sound-alike word for that digit.

    Args:
        p: Noise level. A digit becomes a letter when a 0..11 roll is <= p;
            otherwise it becomes a word when a second 0..11 roll is
            <= int(p/2). Values above 11 are rejected.
        deterministic: When True and a seed is given, the global `random`
            generator is re-seeded first so the output is reproducible.
        seed: Seed used when `deterministic` is True. 0 is a valid seed.

    Returns:
        The generated string with surrounding whitespace stripped, or None
        (after printing a message) when p > 11.
    """
    if deterministic and seed is not None:
        # The original called the non-existent random.setseed() here.
        random.seed(seed)
    if p>11:
        print("p was >11. Make it <= to 11.")
        return
    number_text = str(random.randint(0,99999999999)) # generate a random bunch of numbers
    # Left padding is emitted as plain "0" and never takes part in the rolls.
    pieces = ["0"] * (11 - len(number_text))
    uppercase_len = len(string.ascii_uppercase)
    for digit in number_text:
        if random.randint(0,11) <= p:
            # overwrite that number with a letter
            pieces.append(string.ascii_uppercase[random.randint(0,uppercase_len-1)] + " ")
        elif random.randint(0,11) <= int(p/2): # nerfed chance
            pieces.append(_spoken_digit(digit))
        else:
            pieces.append(digit)
    return "".join(pieces).strip()


def cas3_n_each():
    """Return two related random quantities as a tuple.

    A total is drawn from 300..5000 and a divisor from 2..5. The result is
    (total divided by the divisor, truncated to int; the total itself).
    """
    total = random.randint(300,5000)
    divisor = random.randint(2,5)
    return int(total/divisor), int(total)
# Call both generators once as a quick smoke test; the return values are
# not assigned or displayed.
eleven_genny()
cas3_n_each()

def region_company():
    """Pick one random region code and one random company name.

    Returns:
        A (region, company) tuple of strings.
    """
    regions = ("NA","SEA","EUR","ASIA","CHINA")
    firms = ("Amazon", "Intel","Asus")
    # The region is drawn first, then the company.
    picked_region = regions[random.randint(0,len(regions)-1)]
    picked_company = firms[random.randint(0,len(firms)-1)]
    return picked_region, picked_company

In [11]:
# Placeholder tokens recognised inside template sentences, in matching priority.
_PLACEHOLDER_TOKENS = ("cas3", "pr0d", "3ach")


def _find_placeholder(word):
    """Return the first placeholder token contained in `word`, or None."""
    for token in _PLACEHOLDER_TOKENS:
        if token in word:
            return token
    return None


def parse_for_dicty(fabricated):
    """Turn template sentences into Snips-style utterance chunk lists.

    Each sentence is split on whitespace. Words containing a placeholder
    become entity chunks: "cas3" is filled with a random region, "3ach" with
    a random company (one region/company pair is drawn per sentence) and
    "pr0d" with a fresh eleven_genny() draw per occurrence. Everything else
    is collected into plain {'text': ...} chunks.

    Compared with the first version of this parser:
      * characters glued in front of a placeholder (e.g. "(cas3") are kept as
        text instead of raising on `dict += str`;
      * text after the last placeholder is no longer dropped;
      * two consecutive placeholder words are separated by a space.

    Args:
        fabricated: Iterable of template sentences (strings).

    Returns:
        (all_sentences, dictated): one chunk list per input sentence, and the
        list of every product code drawn, in order of appearance.
    """
    all_sentences = []
    dictated = [] # for keeping track of all the drawn eleven_generateds.
    for sentence in fabricated:
        region, company = region_company()
        chunks = []
        pending_text = "" # plain text collected since the last entity
        last_was_a_flag = False
        for word in sentence.split():
            token = _find_placeholder(word)
            if token is None:
                # Ordinary word: keep collecting text, separated from a
                # directly preceding entity (or its suffix) by a space.
                if last_was_a_flag:
                    pending_text += " "
                    last_was_a_flag = False
                pending_text += word + " "
                continue

            # <EXTRA><TOKEN><EXTRA>: either side may be empty.
            prefix, _, suffix = word.partition(token)
            if last_was_a_flag:
                pending_text += " "
            pending_text += prefix
            if pending_text != "":
                chunks.append({'text': pending_text})

            # NOTE(review): Region and Company slots are tagged with the
            # 'Number' entity, as before - confirm this is intended.
            if token == "cas3":
                chunks.append({'entity': 'Number','slot_name': 'Region','text': str(region)})
            elif token == "pr0d":
                latest_eleven_draw = str(eleven_genny())
                dictated.append(latest_eleven_draw)
                chunks.append({'entity': 'Mixed_Number','slot_name': 'Product Number','text': latest_eleven_draw})
            else:
                chunks.append({'entity': 'Number','slot_name': 'Company','text': str(company)})

            pending_text = suffix # e.g. the "." of "pr0d."
            last_was_a_flag = True

        # Flush whatever follows the last entity (or a sentence without any).
        trailing_text = pending_text.rstrip()
        if trailing_text != "":
            chunks.append({'text': trailing_text})
        all_sentences.append(chunks)
    return all_sentences, dictated

In [12]:
"going from cas3 to 3ach"
"going to 3ach from cas3"
"Need pr0d, cas3 to 3ach"
# Placeholder-only inputs without any surrounding context.
# Not sure if these are even useful though...
bad_examples = [
    "pr0d.",
    "cas3.",
    "3ach.",
    "pr0d. cas3. 3ach.",
    "cas3. pr0d. 3ach.",
    "3ach. pr0d. cas3.",
    "3ach. cas3. pr0d.",
    "cas3. 3ach. pr0d.",
    "pr0d. 3ach. cas3. ",
]

In [13]:
# Hand-written template sentences for the "Ask" intent.
# Placeholder legend: pr0d = product number, cas3 = Region, 3ach = company.
fabricated = [
    "I have pr0d. Country is cas3, from 3ach.",
    "This is from cas3 and going to 3ach, product number pr0d.",
    "I want pr0d, from cas3 to 3ach. heading towards other locations.",
    "Product pr0d please",
    "Product number pr0d",
    "pr0d going to 3ach and cas3, how do i do this",
    "Product number of pr0d",
    "Going to cas3.",
    # badly parsed versions?
    "reduct number pr0d, each cas3 door 3ach",
    "pr0d, came in from region cas3, and to 3ach ",
    "you unpacking pr0d from cas3 to 3ach",
    "search all products from region cas3, for 3ach",
    "currently have pr0d, coming in from cas3",
    "ey i got pr0d and i need 3ach from cas3",
    "i got them from cas3 meant for 3ach.",
    "i got cas3, need pack into 3ach, product pr0d",
    "Hello there i am looking to pack pr0d, can you help me?",
    "Hello, I am packing pr0d coming in cas3 and out as 3ach",
    "Salutations, I am packing product pr0d , with out as 3ach",
    "am working with pr0d. Need to pack from cas3 to 3ach",
    "greetings, can i have pr0d, with cas3 to 3ach",
    "Packing pr0d, need to go from cas3 to 3ach",
    "Hello, I am packing pr0d coming in cas3",
    "yo, need pr0d, it's like in cas3 for now and i need to pack as boxes of 3ach",
    "Hi, pr0d, is as cas3, needed in 3ach",
    "Hey, I have productive pr0d. I need to fit it into cas3 from boxes of 3ach",
    "Hey, I don't like without number pr0d each from cas3 to 3ach.",
    "I am trying to pack pr0d. I currently have them in cases of cas3, and i need to place them into 3ach.",
    "I have product pr0d. Currently is in cas3. It needs to be 3ach",
    "can i have pr0d please",
    "can i have pr0d which is packed in boxes of cas3. I need to know how to put it as 3ach",
    "I have pr0d, need from cas3 to 3ach",
    "pr0d is, move from cas3 to 3ach",
    "Hey i'm packing this pr0d my dudes, and i and moving it from cas3 to 3ach ",
    "How to pack pr0d to new pack of 3ach",
    "Hello, I received pr0d. From 3ach to cas3.",
]

# NOTE: the loop that used to sit here (`i = i.replace("3ach", "")` etc.) only
# rebound its loop variable and never modified `fabricated`, so it was dropped.
# The placeholders have to stay in the templates anyway: parse_for_dicty()
# uses them to decide where the entities go.
all_sentences,dictated = parse_for_dicty(fabricated)
# print(dictated)

# For reference, a well-formed Snips utterance for
# "Book me a flight from Paris To London This weekend" looks like this:
# {'data': [{'text': 'Book me a flight from '},
#           {'entity': 'locality','slot_name': 'departure','text': 'Paris'},
#           {'text': ' to '},
#           {'entity': 'locality','slot_name': 'destination','text': 'London'},
#           {'text': ' '},
#           {'entity': 'snips/datetime','slot_name': 'flight_time','text': 'this weekend'}]}


def _custom_entity(values):
    """Wrap a list of entity values in the settings shared by all custom entities here."""
    return {
        "automatically_extensible": True,
        "data": values,
        "matching_strictness": 1.0,
        "use_synonyms": True,
    }


# Every parsed sentence becomes one utterance of the single "Ask" intent.
training_data = {'intents': {'Ask': {'utterances': [{"data": chunks} for chunks in all_sentences]}}}
training_data["language"] = "en"

# Entities used by the utterances: Mixed_Number (slot "Product Number") and
# Number (slots "Region" and "Company").
# NOTE(review): "Company" and "Region" are declared as entities below, but the
# utterances built by parse_for_dicty() tag those slots with the "Number"
# entity - confirm which of the two is intended.
# The "Number" entry used to be written three times in this literal; only the
# last one took effect, so it is now declared once.
training_data["entities"] = {
    "Mixed_Number": _custom_entity([
        {"value": "ZB059197X5C","synonyms": []},
        {"value": "2S859U2KY07","synonyms": []},
        {"value": "062E9N06PX2","synonyms": []},
        {"value": "M37E4C5521X","synonyms": []},
        {"value": "7RC88968B3K","synonyms": []},
        {"value": eleven_genny(),"synonyms": []},
        {"value": "10290E75093","synonyms": []},
        {"value": eleven_genny(),"synonyms": []},
        {"value": eleven_genny(),"synonyms": []},
    ]),
    "Number": _custom_entity(
        [{"value": str(cas3_n_each()[0]),"synonyms":[]} for _ in range(60)]
    ),
    "Company": _custom_entity([
        {"value": "Asus","synonyms": []},
        {"value": "Intel","synonyms": []},
        {"value": "Amazon","synonyms": []},
    ]),
    "Region": _custom_entity([
        {"value": "SEA","synonyms": ["South East Asia","South Asia"]},
        {"value": "NA","synonyms": ["North America","America"]},
        {"value": "CHINA","synonyms": []},
        {"value": "ASIA","synonyms": []},
        {"value": "EUR","synonyms": ["Europe"]},
    ]),
}
#pprint(training_data)
#pprint(training_data)
# Train the engine on the fabricated data.
engine = SnipsNLUEngine(config = CONFIG_EN)
print("Fitting")
engine.fit(training_data)
print("Fit complete")

# Demo queries: each template is filled with a freshly drawn product code,
# region and company, then parsed. (These were six copy-pasted statements.)
# The last template now has a space before the product code; the original
# "Need product number" + code glued the two together.
# NOTE(review): " from .{region}" is kept exactly as written - confirm whether
# the dot in front of the region is intentional.
TEST_QUERY_TEMPLATES = [
    "hello can i have {product}. i got like {region}. and i gotta give it out as {company}",
    "product: {product}",
    "I have product {product}. Currently is in {region}. It needs to be  {company}",
    "Hello, I am packing {product} coming in {region} and out as {company}",
    "Need without number {product} {company} from .{region}",
    "Need product number {product} from {company} to {region}",
]

for query_template in TEST_QUERY_TEMPLATES:
    # Same draw order as before: region/company first, then the product code.
    region, company = region_company()
    product = str(eleven_genny())
    pprint(engine.parse(query_template.format(product=product, region=region, company=company)))
    print("\n\n")

Fitting
Fit complete
{'input': 'hello can i have 850 three  tree 04846 seven. i got like CHINA. and '
          'i gotta give it out as Intel',
 'intent': {'intentName': 'Ask', 'probability': 0.7350175834786234},
 'slots': [{'entity': 'Mixed_Number',
            'range': {'end': 44, 'start': 17},
            'rawValue': '850 three  tree 04846 seven',
            'slotName': 'Product Number',
            'value': {'kind': 'Custom',
                      'value': '850 three  tree 04846 seven'}},
           {'entity': 'Number',
            'range': {'end': 62, 'start': 57},
            'rawValue': 'CHINA',
            'slotName': 'Region',
            'value': {'kind': 'Custom', 'value': 'CHINA'}},
           {'entity': 'Number',
            'range': {'end': 96, 'start': 91},
            'rawValue': 'Intel',
            'slotName': 'Company',
            'value': {'kind': 'Custom', 'value': 'Intel'}}]}



{'input': 'product: 0379J 941829',
 'intent': {'intentName': 'Ask', 'probability': 0

#### Unfortunate Conclusion
The Snips NLU model relies strongly on the context around a word and does not look at characteristics within the word itself. However, training on entities that span multiple words at least makes Snips NLU recognise that the entities we define can consist of multiple words. <br>
It is important to note that this is done both in the entity example declarations and in the actual training data. A quick parser was used to substitute the relevant entities into the roughly fabricated training data.

In [14]:
#loaded_engine = SnipsNLUEngine.from_path("engine.eng")
# NOTE(review): persist() is expected to raise if the target path already
# exists - confirm against the Snips NLU docs. Remove any engine left over
# from a previous run so the notebook survives Restart & Run All.
ENGINE_PATH = "engine.eng"
if os.path.isdir(ENGINE_PATH):
    shutil.rmtree(ENGINE_PATH)
elif os.path.exists(ENGINE_PATH):
    os.remove(ENGINE_PATH)
engine.persist(ENGINE_PATH)