In [165]:
import random
from collections import OrderedDict

class Chain:
    """Successor-word statistics for one fixed context of a Markov model.

    Attributes:
        chain_words: the context (sequence of words) this chain belongs to.
        predicted: mapping ``word -> {'count': int, 'proba': float}``. After
            every ``add_word`` call it is an ``OrderedDict`` sorted by
            decreasing probability (ties keep their insertion order).
        total_count: total number of observations recorded so far.
    """

    def __init__(self, chain_words):
        self.chain_words = chain_words
        self.predicted = {}
        self.total_count = 0

    def add_word(self, word):
        """Record one more observation of ``word`` following this context.

        Probabilities are recomputed and ``predicted`` is re-sorted on every
        call so the attribute can be inspected at any time.
        """
        stats = self.predicted.get(word)
        if stats is None:
            self.predicted[word] = {'count': 1, 'proba': 0}
        else:
            stats['count'] += 1
        self.total_count += 1
        self.compute_proba()
        # sorted() is stable, so equally likely words keep their relative order.
        self.predicted = OrderedDict(
            sorted(self.predicted.items(), key=lambda item: item[1]['proba'], reverse=True)
        )

    def compute_proba(self):
        """Refresh every entry's ``'proba'`` as ``count / total_count``."""
        for stats in self.predicted.values():
            stats['proba'] = stats['count'] / self.total_count

    def generate_word(self):
        """Draw one successor word at random, proportionally to its count.

        Returns:
            A key of ``predicted``, or ``None`` when nothing has been added yet.
        """
        if not self.predicted:
            return None
        # Work on integer counts rather than on the float probabilities: the
        # running total is then exact, whereas repeatedly subtracting floats
        # from the threshold can drift past the last word and select nothing.
        # e.g. counts 2, 1, 1 and a draw of 0.6 -> target 2.4 -> second word.
        target = random.random() * self.total_count
        running = 0
        word = None
        for word, stats in self.predicted.items():
            running += stats['count']
            if target < running:
                return word
        # Safety net (e.g. counts edited by hand): always return a real word.
        return word

In [166]:
c = Chain(['Je', 'suis', 'très'])

In [167]:
c.add_word('test')
c.add_word('heureux')
c.add_word('triste')
c.add_word('heureux')

In [168]:
c.predicted

OrderedDict([('heureux', {'count': 2, 'proba': 0.5}),
             ('test', {'count': 1, 'proba': 0.25}),
             ('triste', {'count': 1, 'proba': 0.25})])

In [169]:
def test_chain(chain, samples=1000):
    from collections import Counter
    res = {}
    for i in range(samples):
        w = chain.generate_word()
        res[w] = res[w] + 1 if w in res else 1
    # lets compute the error
    print({k:x/samples - chain.predicted[k]['proba'] for (k,x) in Counter(res).items()})

In [170]:
test_chain(c, samples=1000000)

{'heureux': -4.300000000001525e-05, 'triste': -0.0003120000000000067, 'test': 0.0003549999999999942}


In [171]:
import random
from collections import deque

class Markov:
    """Word-level Markov text generator with a fixed-size context.

    ``chains`` maps a context (``chain_length`` consecutive words joined with
    ``SEPARATOR``) to a ``Chain`` object, which must provide ``add_word`` and
    ``generate_word``. Every training sentence is wrapped in ``<s>`` / ``</s>``
    markers so the model learns how sentences begin and end.
    """

    # Glue used to build the string keys of ``self.chains``.
    SEPARATOR = "@ç@"
    START_TOKEN = "<s>"
    STOP_TOKEN = "</s>"
    # A sentence whose first word is one of these is closed with "?".
    QUESTION_WORDS = ("quoi", 'comment', 'quel', 'lequel', 'pourquoi', 'quand', 'est', 'qui', 'où')

    def __init__(self, chain_length, corpus=None, tokenizer=None, sentence_delimiter="."):
        """
        Args:
            chain_length: number of words in each context.
            corpus: optional training data, forwarded to ``parse_corpus``.
            tokenizer: forwarded to ``parse_corpus``.
            sentence_delimiter: text appended to generated sentences that do
                not start with a question word.
        """
        self.chain_length = chain_length
        self.chains = {}
        self.default_punkt = sentence_delimiter
        if corpus:
            self.parse_corpus(corpus, tokenizer)

    def parse_corpus(self, corpus, tokenizer=None):
        """Add the transitions found in ``corpus`` to the model.

        Args:
            corpus: the training data; its expected type depends on
                ``tokenizer``.
            tokenizer: ``None`` when ``corpus`` is already an iterable of
                sentences (each an iterable of words); ``"punkt"`` to split a
                raw string with ``nltk.sent_tokenize`` and then lowercase it,
                drop punctuation and split on whitespace; or a callable
                mapping ``corpus`` to an iterable of word sequences.
        """
        if tokenizer is None:
            sentences = corpus
        elif tokenizer == "punkt":
            # Imported lazily so nltk is only needed for this tokenizer.
            import re
            import nltk
            sentences = [
                re.sub(r"[^\w\d'\s]+", '', s.lower()).split()
                for s in nltk.sent_tokenize(corpus)
            ]
        else:
            sentences = tokenizer(corpus)

        for sentence in sentences:
            # list() lets sentences be tuples or other iterables, not only lists.
            tokens = [self.START_TOKEN] + list(sentence) + [self.STOP_TOKEN]
            for i in range(self.chain_length, len(tokens)):
                context = tokens[i - self.chain_length:i]
                chain_id = self.SEPARATOR.join(context)
                chain = self.chains.get(chain_id)
                if chain is None:
                    chain = self.chains[chain_id] = Chain(context)
                chain.add_word(tokens[i])

    def generate_sentence(self, start=None):
        """Generate one sentence.

        Args:
            start: optional whitespace-separated words to begin with. They are
                lowercased; if they do not form a known context, a random
                sentence-opening context is used instead.

        Returns:
            The sentence with its first letter capitalised, followed by the
            punctuation and one trailing space, or ``""`` when nothing can be
            generated (untrained model, or no words produced).
        """
        start_words = [w.lower() for w in start.split()] if start else None
        return self._format_sentence(self._generate_words(start_words))

    def generate(self, nb_sentences=10, start=None, link_chains=False):
        """Generate ``nb_sentences`` sentences and return them as one string.

        Args:
            nb_sentences: number of sentences to produce.
            start: optional opening words for the first sentence (see
                ``generate_sentence``).
            link_chains: when true, each following sentence starts from the
                last ``chain_length`` words of the previous one. If that
                context leads straight to the end of a sentence, a random
                opening is used instead, so the output does not keep
                repeating the same few words.
        """
        start_words = [w.lower() for w in start.split()] if start else None
        linked = False
        parts = []
        for _ in range(nb_sentences):
            words = self._generate_words(start_words)
            if linked and words == start_words:
                # The linked context produced no new word: start afresh.
                words = self._generate_words(None)
            parts.append(self._format_sentence(words))
            if link_chains and self.chain_length > 0:
                # Link on words, not on the characters of the rendered string.
                start_words = words[-self.chain_length:]
                linked = True
            else:
                start_words = None
                linked = False
        return "".join(parts).strip()

    def find_chain(self, searched):
        """Return every chain key that contains ``searched`` as a substring."""
        return [key for key in self.chains.keys() if searched in key]

    def _generate_words(self, start_words=None):
        """Walk the chains from a context and return the generated words.

        The walk stops at the stop token, at a context the model has never
        seen, or when a chain yields no word. A leading start token is
        removed; an empty list means the model has no sentence opening.
        """
        sep = self.SEPARATOR
        if not start_words or sep.join(start_words) not in self.chains:
            openings = self.find_chain(self.START_TOKEN)
            if not openings:
                return []
            start_words = random.choice(openings).split(sep)

        words = list(start_words)
        context = deque(start_words)
        while True:
            chain = self.chains.get(sep.join(context))
            if chain is None:
                break
            word = chain.generate_word()
            if word is None or word == self.STOP_TOKEN:
                break
            words.append(word)
            # Slide the context window one word forward.
            context.append(word)
            context.popleft()

        if words and words[0] == self.START_TOKEN:
            words.pop(0)
        return words

    def _format_sentence(self, words):
        """Join ``words`` into a capitalised, punctuated sentence string."""
        if not words:
            return ""
        sentence = (" ".join(words)).strip()
        if not sentence:
            return ""
        punctuation = "?" if words[0] in self.QUESTION_WORDS else self.default_punkt
        return sentence[0].upper() + sentence[1:] + punctuation + " "
            
            

In [172]:
# Load the corpus as one long line: newlines become spaces, '#' and '-' are dropped.
# The context manager closes the file handle, which the bare open(...).read()
# chain left open until garbage collection.
# NOTE(review): the file is decoded with the platform default encoding — pass
# an explicit encoding once the file's actual encoding is confirmed.
# NOTE(review): deleting '-' fuses hyphenated words (the generated text shows
# "Donnonsnous", "grandpeine") — confirm this is intended.
with open('./data/3mousq.txt', 'r') as corpus_file:
    mousquetaires = corpus_file.read().strip().replace('\n', ' ').replace('#', '').replace('-', '')

In [186]:
len(mousquetaires.split(' '))

233558

In [217]:
m_mousquetaires_2 = Markov(2, mousquetaires, 'punkt')

In [232]:
m_mousquetaires_2.generate(2, link_chains=True)

"Donnonsnous ce plaisir d'ailleurs ce retour lui offrait un avantage c'était de surveiller lui même le départ de mon évanouissement felton écoutait ce dialogue sans dire gare. Montezen six."

In [174]:
m_mousquetaires_3 = Markov(3, mousquetaires, "punkt")

In [176]:
m_mousquetaires_3.generate(2, "athos porthos et", True)

"Athos porthos et aramis se placèrent à une table et se mit à table mangea peu et ne but que de l'eau. À mon tour."

In [177]:
m_mousquetaires_4 = Markov(4, mousquetaires, "punkt")

In [199]:
m_mousquetaires_4.generate(2, link_chains=True)

"L'un des deux gardes était invité pour le soir même mais nous devons dire à la louange de m d'artagnan fils quelques efforts qu'il tentât pour rester ferme comme le devait être un futur mousquetaire la nature l'emporta et il versa force larmes dont il parvint à grandpeine à cacher la moitié. Mais en venant par trop matin je crains de réveiller votre majesté."

In [200]:
# Load the corpus as one long line: newlines become spaces, '#' and '-' are dropped.
# The context manager closes the file handle, which the bare open(...).read()
# chain left open until garbage collection.
# NOTE(review): the file is decoded with the platform default encoding — pass
# an explicit encoding once the file's actual encoding is confirmed.
# NOTE(review): deleting '-' fuses hyphenated words (the generated text shows
# "storygerm") — confirm this is intended.
with open('./data/lotr.txt', 'r') as corpus_file:
    lotr = corpus_file.read().strip().replace('\n', ' ').replace('#', '').replace('-', '')

In [210]:
m_lotr_2 = Markov(2, lotr, 'punkt')

In [216]:
m_lotr_2.generate(2, link_chains=True)

'Skulls and bones black in cinders lie beneath the roots of the food i send with you. Drea dful as the valley.'

In [201]:
m_lotr_3 = Markov(3, lotr, 'punkt')

In [233]:
m_lotr_3.generate(2, link_chains=True)

"The rain's nearly given over already' said sam 'but i wouldn't be one to say that the journey went well and they met no danger and heard nothing and seen nothing for two nights now'. 'the red arrow'."

In [234]:
m_lotr_4 = Markov(4, lotr, 'punkt')

In [239]:
m_lotr_4.generate(2, link_chains=True)

'As dusk drew down on the fourth day that they rode still forward after dusk and into the night beneath the moon. An author cannot of course remain wholly unaffected by his experience but the ways in which a storygerm uses the soil of experience are extremely complex and attempts to define the process are at best guesses from evidence that is inadequate and ambiguous.'

In [144]:
import json, os
# Train an order-3 model in which every parsed tab file is one "sentence".
# An empty sentence delimiter keeps punctuation out of the generated tab.
TABS_DIR = './scripts/parsed_tabs_folk'

m = Markov(3, sentence_delimiter="")
corpus = []
# os.listdir returns names in arbitrary order; sorting makes the training
# order (and therefore the layout of m.chains) reproducible between runs.
for tab in sorted(os.listdir(TABS_DIR)):
    with open(os.path.join(TABS_DIR, tab)) as f:
        t = json.load(f)
    # Drop the '------' entries before training (presumably empty columns —
    # confirm against the tab parser).
    t = [x for x in t if x != '------']
    corpus.append(t)

print(len(corpus))
m.parse_corpus(corpus)

436


In [164]:
# Sample one sequence from the model and lay it out as six text lines.
# res = m.generate(1, start="---3-- ---2-- ---0--")
res = m.generate(1)
# [{'id': c.chain_words, 'pred': c.predicted} for c in m.chains.values()]
tab = ""
strings = {line: "|" for line in range(6)}
words = res.split(' ')
final_index = len(words) - 1
marks = ("p", "h", "b")
for index, w in enumerate(words):
    # A 6-character token carries one character per line; any other token is
    # read as two characters per line.
    width = 1 if len(w) == 6 else 2
    for i in range(6):
        try:
            # A "-" spacer follows the cell unless this is the last token or a
            # p/h/b mark sits at position i of this token or of the next one.
            # The test runs before the cell is read, so an IndexError from
            # either lookup leaves the line untouched and is reported below.
            # NOTE(review): for two-character tokens the mark test still reads
            # single characters w[i] / words[index + 1][i] — confirm intended.
            add_dash = (
                index < final_index
                and words[index + 1][i] not in marks
                and w[i] not in marks
            )
            cell = w[i * width: i * width + width]
            strings[i] += cell + "-" if add_dash else cell
        except IndexError:
            print(w, i, index)
print("\n".join(strings.values()))

|--0-------------------------------0---------------------------------------------2--------------------
|--1---0-1-4-------0---------2h3-------3p1-0-0---0-1p0-------3---3-------3---3------------------
|--2-2---------0-2---------------------0-----------3-----------2---------2---2-------2---2------------
|--2---------4-----2p1h2-4-----------2---------0---------0-------0-----0-----------0-----0--------
|0-0-------2-------------------------3-----------------2---------0-----0-------------------4---2-0----
|----------------------------------------------3---------------------------------------------------3-3
