In [1]:
import random
from collections import OrderedDict

class NGramm:
    def __init__(self):
        self.predicted = {}
        self.total_count = 0
        
    def add_word(self, word):
        if word in self.predicted:
            self.predicted[word]['count'] += 1
        else:
            self.predicted[word] = {'count': 1, 'proba': 0}
        self.total_count += 1
        self.compute_proba()
        self.predicted = OrderedDict(sorted(self.predicted.items(), key=lambda i: i[1]['proba'], reverse=True))
    
    def compute_proba(self):
        for key, word in self.predicted.items():
            proba = word['count'] / self.total_count
            self.predicted[key]['proba'] = proba
    
    def generate_word(self):
        treshold = random.random()
        for key, word in self.predicted.items():
            if treshold <= word['proba']:
                return key
            else:
                # egg: if th = 0.8, 1st proba = 0.5 and 2nd = 0.3, we want th to be = to 0.3 when we compare to 2nd prob
                treshold -= word['proba']

In [7]:
import random
from collections import deque

class Model:
    def __init__(self, n, corpus=None, tokenizer=None, sentence_delimiter="."):
        self.n = n - 1
        self.ngramms = {}
        self.default_punkt = sentence_delimiter
        if corpus:
            self.parse_corpus(corpus, tokenizer)
            
    def parse_corpus(self, corpus, tokenizer=None):
        import re
        # tokenize into sentences
        # if no tokenizer we assume corpus should be an array of sentences (array of words)
        if tokenizer is None:
            sentences = corpus
        elif tokenizer == "punkt":
            import nltk
            sentences = nltk.sent_tokenize(corpus)
            # now array of sentences
            # lets transform sentences into words
            sentences = [re.sub(r"[^\w\d'\s]+",'', s.lower()).split() for s in sentences]
            
        # custom method passed? 
        else:
            sentences = tokenizer(corpus)
            
        # add start and stop words
        sentences = [["<s>"] + s + ["</s>"] for s in sentences]
                
        # train for every sentence
        for s in sentences:
            for i in range(self.n, len(s)):
                predicted = s[i]
                words = s[i - self.n:i]
                # select a delimiter that has low chances of appearing in the text
                ngramm_id = "@ç@".join(words)
                if ngramm_id not in self.ngramms:
                    self.ngramms[ngramm_id] = NGramm()
                self.ngramms[ngramm_id].add_word(predicted)
    
    def generate_sentence(self, start=None):
        if start:
            # tokenize start
            start = [ w.lower() for w in start.split()]
        # if not start generate a random start
        if not start or "@ç@".join(start) not in self.ngramms:
            start = random.choice(self.find_ngramm("<s>")).split('@ç@')
        res = deque(start)
        ngramm = deque(start)
        word = ""
        # generate words until reaching a stop word
        while word != "</s>":
            ngramm_id = '@ç@'.join(ngramm)
            if ngramm_id not in self.ngramms:
                return (" ".join(res)).strip()
            word = self.ngramms[ngramm_id].generate_word()
            res.append(word)
            ngramm.append(word)
            ngramm.popleft()
        
        punctuation = self.default_punkt
        # remove start and stop words
        res.pop()
        if res[0] == "<s>":
            res.popleft()
        if res[0] in ["quoi", 'comment', 'quel', 'lequel', 'pourquoi', 'quand', 'est', 'qui', 'où']:
            punctuation = "?"
        sentence = (" ".join(res)).strip()
        sentence = sentence[0].upper() + sentence[1:]
        return sentence + punctuation + " "
        
    def generate(self, nb_sentences=10, start=None, linked=False):
        """
        linked: should we use the end of previous sentence as beginning for the next one?
        """
        res = ""
        for n in range(nb_sentences):
            sentence = self.generate_sentence(start)
            res += sentence
            if linked:
                start = " ".join(sentence[len(sentence) - self.n:])
            else:
                start = None
        return res.strip()
    
    def find_ngramm(self, searched):
        res = []
        for key in self.ngramms.keys():
            if searched in key:
                res.append(key)
        return res
            
            

In [8]:
corpus = open('./data/lotr.txt', 'r').read().strip().replace('\n', ' ').replace('#', '').replace('-', '')

In [21]:
len(corpus.split(' '))

797276

In [11]:
m = Markov(4, corpus, "punkt")

In [16]:
m.generate(10, "the ring as it", True)

"The ring as it was set out in the chronicles of the red book of the periannath and was brought to him by the thain peregrin when he retired to gondor in iv 64. Before the crossing of the river. Cruel and cold. Later as the sun was setting and the company was stirring and getting ready to come in search of thorin's company. This will hide some of the telltales'. There is if you care to go two hundred miles or so out of your way to bring tidings to jowyn and to speak with her in her exile'. At night plumes of vapour steamed from the vents lit from beneath with red light or blue or venomous green. Trusting that you will be looked for on the road and nobody goes far and folk lock up early. Why we had a real setto and there were some folk killed killed dead. You can do as you like about my reward take me as a guide or not."

In [14]:
m.find_ngramm("ring")

['history@ç@of@ç@the@ç@ring',
 'of@ç@the@ç@ring@ç@as',
 'the@ç@ring@ç@as@ç@it',
 'ring@ç@as@ç@it@ç@was',
 'lord@ç@of@ç@the@ç@rings',
 'and@ç@me@ç@come@ç@blundering',
 'me@ç@come@ç@blundering@ç@along',
 'come@ç@blundering@ç@along@ç@making',
 'blundering@ç@along@ç@making@ç@a',
 'a@ç@beautiful@ç@grey@ç@ring',
 'beautiful@ç@grey@ç@ring@ç@of',
 'grey@ç@ring@ç@of@ç@smoke',
 'ring@ç@of@ç@smoke@ç@that',
 'time@ç@to@ç@blow@ç@smokerings',
 'to@ç@blow@ç@smokerings@ç@this',
 'blow@ç@smokerings@ç@this@ç@morning',
 'another@ç@even@ç@bigger@ç@smokering',
 '<s>@ç@not@ç@the@ç@wandering',
 'not@ç@the@ç@wandering@ç@wizard',
 'the@ç@wandering@ç@wizard@ç@that',
 'wandering@ç@wizard@ç@that@ç@gave',
 'came@ç@a@ç@tremendous@ç@ring',
 'a@ç@tremendous@ç@ring@ç@on',
 'tremendous@ç@ring@ç@on@ç@the',
 'ring@ç@on@ç@the@ç@frontdoor',
 'another@ç@even@ç@louder@ç@ring',
 'even@ç@louder@ç@ring@ç@at',
 'louder@ç@ring@ç@at@ç@the',
 'ring@ç@at@ç@the@ç@bell',
 'bilbo@ç@found@ç@himself@ç@answering',
 'found@ç@himself@ç@answ

In [144]:
import json, os
m = Markov(3, sentence_delimiter="")
corpus = []
for tab in os.listdir('./scripts/parsed_tabs_folk'):
    with open(os.path.join('./scripts/parsed_tabs_folk', tab)) as f:
        t = json.load(f)
        t = [x for x in t if x != '------']
        corpus.append(t)

print(len(corpus))
m.parse_corpus(corpus)

436


In [164]:
# res = m.generate(1, start="---3-- ---2-- ---0--")
res = m.generate(1)
# [{'id': c.chain_words, 'pred': c.predicted} for c in m.chains.values()]
tab = ""
strings = {0:"|", 1:"|", 2:"|", 3:"|", 4:"|", 5:"|"}
words = res.split(' ')
for index, w in enumerate(words):
    if len(w) == 6:
        for i in range(6):
            try:
                strings[i] += w[i] + "-" if index < len(words) - 1 and words[index + 1][i] not in ["p", "h", "b"] and w[i] not in ['p', 'h', 'b'] else w[i]
            except IndexError:
                print(w, i, index)
    else:
        for i in range(6):
            try:
                strings[i] += w[i * 2: i * 2 + 2] + "-" if index < len(words) - 1 and words[index + 1][i] not in ["p", "h", "b"] and w[i] not in ['p', 'h', 'b'] else w[i * 2: i * 2 + 2]
            except IndexError:
                print(w, i, index)
print("\n".join(strings.values()))

|--0-------------------------------0---------------------------------------------2--------------------
|--1---0-1-4-------0---------2h3-------3p1-0-0---0-1p0-------3---3-------3---3------------------
|--2-2---------0-2---------------------0-----------3-----------2---------2---2-------2---2------------
|--2---------4-----2p1h2-4-----------2---------0---------0-------0-----0-----------0-----0--------
|0-0-------2-------------------------3-----------------2---------0-----0-------------------4---2-0----
|----------------------------------------------3---------------------------------------------------3-3
