In [9]:
from collections import defaultdict
import numpy as np
import pandas as pd
from abc import ABCMeta, abstractmethod

# Simulated World dataset generation

In [10]:
def retrieve_first_names(input_file):
    with open(input_file,'r') as inp:
        a =  inp.readlines()
    a = map(lambda x : x.split('\t')[0],a)
    a = list(np.random.choice(a,len(a),replace=False))
    return a

def retrieve_file(input_file):
    with open(input_file,'r') as inp:
        a =  inp.readlines()
        aa = list()
        for i in a:
            aa.append(i[:-1])
        aa = list(np.random.choice(aa,len(aa),replace=False)) 
        return aa

In [11]:
DATA_DIR = "../preprocessing/"
nam = retrieve_first_names(DATA_DIR+"first_name.txt")
pla = retrieve_file(DATA_DIR + "house_places.txt")
obj = retrieve_file(DATA_DIR + 'objects.txt')

In [12]:
class Scenario(object):
    """Bordes and al. used a rather simple world : 4 characters, 3 objects and 5 rooms"""
    def __init__(self,training_sample,number_of_characters = 4, number_of_objects = 3, number_of_locations = 5):
        self._generate_variables(number_of_characters, number_of_objects, number_of_locations)
        self.training_samples = list()
        if isinstance(training_sample,list):
            for ts in training_sample:
                self.add_training_sample(ts)
        else : 
            self.add_training_sample(training_sample)
        
    def _generate_variables(self,number_of_characters = 4, number_of_objects = 3, number_of_locations = 5):
        self.number_of_characters = number_of_characters
        self.number_of_objects = number_of_objects
        self.number_of_locations = number_of_locations
        self.names = retrieve_first_names("../preprocessing/first_name.txt")[:self.number_of_characters]
        self.objects = retrieve_file('../preprocessing/objects.txt')[:self.number_of_characters]
        self.locations = retrieve_file("../preprocessing/house_places.txt")[:self.number_of_characters]
        
    def add_training_sample(self,training_sample):
        assert hasattr(training_sample,'__call__'), "Training sample should be callable"
        assert hasattr(training_sample,'_generate_questions'), "the traning sample should have _generate_questions "
        assert hasattr(training_sample,'_generate_context'), "the traning sample should have _generate_context attr "
        t = training_sample(self.objects,self.names,self.locations)
        self.training_samples.append(t)
        
    def create_corpus(self,number_of_sentences = 70):
        self._generate_variables()
        self.dataset = defaultdict(dict)
        for i in range(number_of_sentences) :
            self._generate_variables()
            t = np.random.choice(self.training_samples)
            self.dataset[i] = t.generate_training_example()
        return self.dataset

In [13]:
class TrainingItems(object):
    __metaclass__ = ABCMeta
    
    def __init__(self,list_of_objects,list_of_characters,list_of_places):
        self.list_of_objects = list_of_objects
        self.list_of_characters = list_of_characters
        self.list_of_places = list_of_places
        self._context = self._generate_context()
        self._question = self._generate_questions()    
        
    @abstractmethod
    def _generate_questions(self):
        pass

    @abstractmethod
    def _generate_context(self):
        pass

    @abstractmethod        
    def _generate_answer(self,indice):
        pass

    def generate_training_example(self):
        d = {}
        q = self._generate_questions()
        c = self._generate_context()
        c.append(q["question"])
        d["X"] = '\n'.join(c)
        d["Y"] = q["answer"]
        return d

In [14]:
class Locations(TrainingItems):
    def _generate_context(self):
        self._context = list()
        self.c0, self.c1, self.c2 = np.random.choice(self.list_of_characters, 3, replace=False)
        self.p0, self.p1, self.p2 = list(np.random.choice(self.list_of_places, 3, replace=False))
        self.o0 = np.random.choice(self.list_of_objects)

        self._context.append(self.c0 + " went to " + self.p0)
        self._context.append(self.c1 + " took " + self.o0 + " in " + self.p1)
        self._context.append(self.c1 + " left " + self.o0 + " in " + self.p2)
        self._context.append(self.c1 + " joined " + self.c0 + " in " + self.p0)
        self._context.append(
          self.c2 + " went from " + self.p0 + " to the " + self.p2 + " because he can't stand " + self.c1)
        return self._context

    def _generate_questions(self):
        self._questions = defaultdict(dict)
        self._questions[0]["question"] = "Why did " + self.c2 + " left ?"
        self._questions[0]["answer"] = " he doesn't like " + self.c1

        self._questions[1]["question"] = "Where is the " + self.o0 + " ?"
        self._questions[1]["answer"] = self.p2

        l = np.random.choice(len(self._questions))
        return {"question": self._questions[l]["question"],
                "answer": self._generate_answer(l) + self._questions[l]["answer"]}

    def _generate_answer(self, indice):
        self._formulation = defaultdict(list)
        self._formulation[0].append("I guess because ")
        self._formulation[0].append("I think it's because ")
        self._formulation[1].append("I think it's in ")
        self._formulation[1].append("Probably in ")
        return self._formulation[indice][np.random.choice(len(self._formulation[indice]))]


In [16]:
def write_corpus(dataset):
    assert isinstance(dataset,dict), "the given dataset is not of the right type. {} found, dict required ".format(type(dataset))
    f = []
    for _,item in dataset.iteritems():
        sol = " | ".join([item["X"],item["Y"]])
        sol.replace("\n",' ')
        f.append(sol)
    return f

## Creating the corpus we will be using during our research

In [15]:
s = Scenario(Locations)
ss = s.create_corpus()
a = write_corpus(ss)

In [18]:
a[0]

"Duncan went to the terraced house\nDarryl took the radiator in the porch\nDarryl left the radiator in the basement\nDarryl joined Duncan in the terraced house\nAlex went from the terraced house to the the basement because he can't stand Darryl\nWhy did Carmen left ? |\xc2\xa0I think it's because  he doesn't like Darryl"

## Using Wordnet to generalize the world used in the sentences

In [17]:
from nltk import wordnet
w = wordnet.wordnet
## dir(w)
## Test 
t = w.synsets("take")
for i in t :
    if i.name() == "bring.v.01":
        print(i.lemma_names())
# Test 1
print("##################")
t = w.synsets("leave")
for i in t :
    print(i.name())
    print(i.lemma_names())

[u'bring', u'convey', u'take']
##################
leave.n.01
[u'leave', u'leave_of_absence']
leave.n.02
[u'leave']
farewell.n.02
[u'farewell', u'leave', u'leave-taking', u'parting']
leave.v.01
[u'leave', u'go_forth', u'go_away']
leave.v.02
[u'leave']
leave.v.03
[u'leave']
leave.v.04
[u'leave', u'leave_alone', u'leave_behind']
exit.v.01
[u'exit', u'go_out', u'get_out', u'leave']
leave.v.06
[u'leave', u'allow_for', u'allow', u'provide']
leave.v.07
[u'leave', u'result', u'lead']
leave.v.08
[u'leave', u'depart', u'pull_up_stakes']
entrust.v.02
[u'entrust', u'leave']
bequeath.v.01
[u'bequeath', u'will', u'leave']
leave.v.11
[u'leave']
leave.v.12
[u'leave', u'leave_behind']
impart.v.01
[u'impart', u'leave', u'give', u'pass_on']
forget.v.04
[u'forget', u'leave']
