In [1]:
import gensim
# Path of the serialized Doc2Vec model: written by model.save() in the training cell
# and read by Doc2Vec.load() in the loading cell.
# NOTE(review): absolute, user-specific path -- adjust before running on another machine.
model_file = '/home/ramin/projects/ECO/src/python/modelbuilder/parsed_v3_valid.doc2vec'


In [2]:
# 2 Build sentence list (each sentence needs at least 1 tag)
# Every line of the corpus file becomes one LabeledSentence whose single tag is
# 'SENT_<line number>' (0-based). The helper functions further down rely on
# sentences[i] being the sentence tagged 'SENT_i', so the order of this list
# must not be changed after it has been built.
filename = '/home/marcel/drive/data/eco/NAIL_DATAFIELD_txt/parsed_v3/parsed_v3_valid.txt'

sentences = []
from random import shuffle

# Context manager: the corpus file handle is closed as soon as reading is done
# (the bare open() used before was never closed).
with open(filename) as corpus:
    for uid, line in enumerate(corpus):
        ls = gensim.models.doc2vec.LabeledSentence(words=line.split(), tags=['SENT_%s' % uid])
        sentences.append(ls)
print(len(sentences),'sentences')

(4266193, 'sentences')


In [3]:
# 3 Training the doc2vec model
# Alternative: skip this cell's work and just load the saved model with the next cell.
# Training is slow, so it is guarded by the build_model flag as a safety measure.
#
# Tutorial: https://rare-technologies.com/doc2vec-tutorial/
# It proposes shuffling or manual learning-rate adjustment; we do both.
# 2 rounds x 10 epochs = 20 epochs in total.
# Took ca. 6.30 hours.

build_model = False

if build_model:
    model = gensim.models.Doc2Vec(alpha=0.025, min_alpha=0.025)  # use fixed learning rate
    print('building vocab')
    model.build_vocab(sentences)

    # Remember the starting rates so that each round restarts from them.
    base_alpha = model.alpha
    base_min_alpha = model.min_alpha

    # Shuffle a *copy*: the helper cells look sentences up by position
    # (sentences[i] <-> tag 'SENT_i'), so shuffling `sentences` itself in place
    # would silently break every lookup made after training.
    training_sentences = list(sentences)

    for mepoch in range(2):
        model.alpha = base_alpha
        model.min_alpha = base_min_alpha
        for epoch in range(10):
            print('epoch',mepoch * 10 + epoch)
            model.train(training_sentences)
            model.alpha -= 0.002  # decrease the learning rate
            model.min_alpha = model.alpha  # fix the learning rate, no decay
        # New presentation order for the next round.
        shuffle(training_sentences)

    # saving the model
    model.save(model_file)

In [4]:
# 4 Loading the model
# Restores the Doc2Vec model stored at `model_file` (the path the training cell saves to).
# The later cells query similarities through `model_loaded`.
# NOTE(review): gensim's load presumably unpickles the file -- only load model files you trust.

model_loaded = gensim.models.Doc2Vec.load(model_file)

In [None]:
# 5 TEST: print the sentence at test_sentence_index and its 30 most similar sentences.
# Needs `sentences` (cell 2), `model_loaded` (cell 4) and `nice_print`, which is
# defined in the helper cell (cell 6) -- run that cell before this one.
test_sentence_index = 2639533

print(' '.join(sentences[test_sentence_index][0]))
sims = model_loaded.docvecs.most_similar('SENT_%s' % test_sentence_index, topn=30)
print('similar sentence %s' % len(sims))
print('\nSIMILAR SENTENCES\n')
for sim in sims:
    # Readable sentence first, then the raw (tag, similarity) pair.
    print('%s %s' % (nice_print(sim), sim))

In [7]:
# 6 Tiny helper
import re

def print_word_list(wl):
    """Join a list of tokens into one display string.

    The tokens are joined with single spaces. Then every token that consists of
    exactly one character and is surrounded by whitespace (typically
    punctuation) is glued to the token before it, and a single-character final
    token is glued on as well, e.g. ['Hello', ',', 'world', '.'] becomes
    'Hello, world.'.

    Args:
        wl: iterable of string tokens; may be empty.

    Returns:
        The joined string ('' for an empty token list).
    """
    text = ' '.join(wl)
    # Drop the whitespace in front of every "<ws><one char><ws>" occurrence.
    # Matches are non-overlapping and scanned left to right, exactly like the
    # former finditer loop with its manual shift bookkeeping.
    text = re.sub(r'\s(.\s)', r'\1', text)
    # Glue a trailing one-character token ("... that ." -> "... that.").
    # The length guard keeps empty / one-character strings from raising
    # IndexError on text[-2].
    if len(text) >= 2 and text[-2] == ' ':
        text = text[:-2] + text[-1:]
    return text

def nice_print(tagged_doc):
    """Return the display text of the sentence a similarity result refers to.

    `tagged_doc[0]` is a tag of the form 'SENT_<i>'; the number after the
    five-character prefix is used as a position in the global `sentences` list.
    """
    tag = tagged_doc[0]
    words = sentences[int(tag[5:])][0]
    return print_word_list(words)

def print_similar(similar):
    """Alias of nice_print: return the display text for a similarity result."""
    text = nice_print(similar)
    return text

def nice_print_labSen(labeledSentence):
    """Return the display text for a labeled sentence (its element 0 is the word list)."""
    words = labeledSentence[0]
    return print_word_list(words)

def print_index(index):
    """Return the words of sentences[index] joined by single spaces (no punctuation clean-up)."""
    return ' '.join(sentences[index][0])
    
def get_similar_index(similar):
    """Return the integer sentence index encoded in a similarity result.

    `similar[0]` is expected to be a tag such as 'SENT_42'; everything after
    the five-character prefix is parsed as an int.
    """
    tag = similar[0]
    return int(tag[len('SENT_'):])
       
def get_index_tag(labeledSentence):
    """Return the first entry of element 1 of a labeled sentence (presumably its tag list)."""
    tags = labeledSentence[1]
    return tags[0]

def get_index(labeledSentence):
    """Return the integer index encoded in a labeled sentence's first tag ('SENT_<i>' -> i)."""
    tag = get_index_tag(labeledSentence)
    return int(tag[5:])
    
def equal_word_lists(index1, index2):
    """Tell whether two sentences of the global `sentences` list have the same words.

    Returns True only when both word lists have the same length and agree
    position by position.
    """
    first = sentences[index1][0]
    second = sentences[index2][0]
    if len(first) != len(second):
        return False
    # Same length: equal unless some aligned pair of words differs.
    return not any(a != b for a, b in zip(first, second))

def get_lab_sent_by_similar(similar):
    """Print the sentence index of a similarity result and return that labeled sentence."""
    sentence_index = get_similar_index(similar)
    print(sentence_index)
    return sentences[sentence_index]

In [None]:
# 7 TEST
# Random walk over similar sentences: start at `index`, repeatedly jump to a
# randomly chosen, not yet visited neighbour and print it.
# Needs the sentences (cell 2), the loaded model (cell 4) and the helpers (cell 6).
import random

index = 1983
sentence = ' '.join(sentences[index][0])
print(sentence)
selected_indices = [index]

for step in range(100):
    sims = model_loaded.docvecs.most_similar('SENT_'+str(index))
    # Only neighbours that were not visited yet are eligible. Picking uniformly
    # among them is what the former retry loop did, but that loop never
    # terminated once every returned neighbour had already been visited.
    candidates = [sim for sim in sims if int(sim[0][5:]) not in selected_indices]
    if not candidates:
        break
    selected = random.choice(candidates)
    index = int(selected[0][5:])
    selected_indices.append(index)
    print(nice_print(selected))
    

In [8]:
# 8 Story TreeNode class

import random

class LabSentTreeNode:
    """One node of the interactively explored story tree.

    Attributes:
        sentence: the labeled sentence shown at this node.
        similars: (tag, similarity) pairs of the 10 most similar sentences.
        randoms: the random sentences offered by the latest print_options() call.
        children: maps the option number that was chosen (0..9 for a similar
            sentence, 100 + k for random sentence k) to the child node created.
        selected_child: option number chosen last, or -1 if none was chosen yet.
        parent: the node this one was reached from (None for the root).
    """

    def __init__(self, labeledSentence, parent = None):
        self.sentence = labeledSentence
        self.similars = self.get_similars()
        self.randoms = []
        self.children = {} # option number: LabSentTreeNode
        self.selected_child = -1 # None
        self.parent = parent

    def get_similars(self):
        """Query the loaded model for the 10 sentences most similar to this node's sentence."""
        return model_loaded.docvecs.most_similar(get_index_tag(self.sentence),topn = 10)

    def print_options(self):
        """Print every choice available at this node.

        Lists the similar sentences (marked '(*)' when that option was already
        taken), the parent sentence if there is one, and 10 freshly drawn
        random sentences.
        """
        for index, sentence in enumerate(self.similars):
            # `children` is keyed by option number, so test the option number
            # (testing the sentence index could never match a key).
            marker = '(*)' if index in self.children else ''
            print('%s %s %s %s' % (index, marker, nice_print(sentence), sentence[1]))
        if self.parent:
            print('p %s' % nice_print_labSen(self.parent.sentence))
        print(':::Randoms:::')
        # Start a new list on every call so that 'r<k>' always refers to the
        # k-th sentence currently displayed, not to one from an earlier call.
        # random.choice cannot step past the end of the list, unlike
        # randint(0, len(sentences)), whose upper bound is inclusive.
        self.randoms = [random.choice(sentences) for _ in range(10)]
        for index, rnd_sen in enumerate(self.randoms):
            print('r%s %s' % (index, nice_print_labSen(rnd_sen)))

    def select_child(self):
        """Ask the user for the next step and return the node to continue with.

        Accepted input: a number (similar sentence), 'r<number>' (random
        sentence), 'p' (back to the parent) or 'q' (quit).

        Returns:
            The newly created child node, the parent node, None for 'q', or
            this node itself when the input cannot be used (not a number, out
            of range, or 'p' on the root), so a typo does not end the session.
        """
        u_input = raw_input('Next child: ')
        if u_input == 'q':
            return None
        if u_input == 'p':
            return self.parent if self.parent else self
        try:
            if u_input.startswith('r'):
                # Random options live at 100 + k in the option-number space.
                selected_index = 100 + int(u_input[1:])
            else:
                selected_index = int(u_input)
        except ValueError:
            return self
        if 0 <= selected_index < len(self.similars):
            lab_sent = get_lab_sent_by_similar(self.similars[selected_index])
        elif 100 <= selected_index < len(self.randoms) + 100:
            print('random sen')
            lab_sent = self.randoms[selected_index - 100]
        else:
            return self
        child = LabSentTreeNode(lab_sent, self)
        self.children[selected_index] = child
        self.selected_child = selected_index
        return child

    def toJSON(self):
        """Return this subtree as nested dicts with keys 'sentence', 'index' and 'children'."""
        return {'sentence':nice_print_labSen(self.sentence),
                'index':get_index(self.sentence),
               'children':[self.children[child_index].toJSON() for child_index in self.children]
               }
     

In [9]:
# 9 Story creator log/helper functions

from IPython.display import clear_output
import json

def get_story(root_node):
    """Collect the display text along the currently selected path.

    Starts at `root_node` and follows each node's `selected_child` until a
    node without a selection (selected_child < 0) is reached.

    Returns:
        List of sentence strings, root first.
    """
    story = []
    node = root_node
    while node:
        story.append(nice_print_labSen(node.sentence))
        if node.selected_child < 0:
            break
        node = node.children[node.selected_child]
    return story

def log_json(root_node):
    """Overwrite 'log_json.txt' with the whole tree below `root_node` as indented JSON."""
    with open('log_json.txt','w') as output:
        tree = root_node.toJSON()
        output.write(json.dumps(tree, indent=2))
    
def log_story(root_node):
    """Overwrite 'log_story.txt' with the currently selected story, one sentence per line."""
    story_lines = get_story(root_node)
    with open('log_story.txt','w') as output:
        output.writelines(line + '\n' for line in story_lines)
    
def print_story(root_node):
    """Print the currently selected story, one sentence per line."""
    for line in get_story(root_node):
        print(line)

#print root_node.toJSON()

In [13]:
# 10 Story creator (interactive loop)
# Starts from a random sentence and lets the user walk the tree until
# select_child() returns None. After every step the tree and the current
# story are written to log_json.txt / log_story.txt.
import time

# random.choice always yields a valid element; randint(0, len(sentences)) has an
# inclusive upper bound and could index one past the end of the list.
sentence = random.choice(sentences)
root_node = LabSentTreeNode(sentence)
actual_node = root_node

while actual_node:
    clear_output()
    log_json(root_node)
    log_story(root_node)
    print('//////')
    print(nice_print_labSen(actual_node.sentence))
    print('//////')
    actual_node.print_options()
    # NOTE(review): presumably gives the frontend time to render the options
    # before the input prompt appears -- confirm before removing.
    time.sleep(1)
    actual_node = actual_node.select_child()

//////
Because the GNU GPL won ’t let them do that.
//////
0  We don ’t want to let them do that. 0.887139320374
1  And let them do what they wanted to do. 0.861190795898
2  Well, I don ’t really do that anymore. 0.846694588661
3  I don ’t want to do that. 0.844094991684
4  We ’ ll explain how to do that in Chapter2. 0.833934009075
5  He didn ’t want to do that. 0.823796033859
6  You ’ re not going to do that. 0.812620222569
7  And to do that, you ’ ll need to learn the basics. 0.806607484818
8  Do you think that breaksfrom that. .. 0.801155567169
9  I do n't know whatI 'm going to do. 0.797776579857
p The design was based on the metastable aggregation of architecture and information.
:::RRR:::
r0 Among the lettersI have received, about 10 percent are stories of attacks people suffered during their European vacations.
r1 Equally inspiring was the attitude of many poor working-men.
r2 There area few additional caveats to consider in this analysis.
r3 In Proceedings of the IEEE Internati

In [15]:
# Show the currently selected story, then the whole explored tree as a plain dict.
print_story(root_node)
print('>>>')
print(root_node.toJSON())
# Pretty-printed alternative:
#import json
#print json.dumps(root_node.toJSON(),indent=2)

The fundamental feature of this network design was neutrality among packets.
The design was based on the metastable aggregation of architecture and information.
Because the GNU GPL won ’t let them do that.
>>>
{'index': 4213497, 'children': [{'index': 3851251, 'children': [{'index': 3564378, 'children': [], 'sentence': 'Because the GNU GPL won \xe2\x80\x99t let them do that.'}], 'sentence': 'The design was based on the metastable aggregation of architecture and information.'}], 'sentence': 'The fundamental feature of this network design was neutrality among packets.'}
