In [108]:
import numpy as np
import tqdm

In [115]:
def load_corpus_two_gram(path, encoding="utf-8"):
    """Load a 2-gram frequency file into a successor lookup table.

    Each line is split on single spaces and its last three fields are read
    as ``<count> <word_1> <word_2>``; anything before them (for example the
    empty strings produced by leading padding spaces) is ignored.

    Reading stops at the first line whose count equals 1. Singleton 2-grams
    are therefore only excluded completely if the file is ordered by
    descending count -- presumably it is; verify against the corpus file.

    Args:
        path: Path of the 2-gram file.
        encoding: Text encoding used to read the file.

    Returns:
        dict mapping ``word_1`` to a list of ``[word_2, count]`` pairs in
        file order. ``count`` is kept as the string read from the file.

    Raises:
        ValueError: If a non-blank line has fewer than three space-separated
            fields, or its count field is not an integer.
    """
    two_grams = {}

    # The context manager closes the file on every exit path, including the
    # early ``break`` below and any parse error.
    with open(path, "r", encoding=encoding) as file:
        for line in file:
            # A blank line (e.g. a trailing newline at end of file) carries
            # no 2-gram; skip it instead of failing the unpack below.
            if not line.strip():
                continue

            *_, oc_number, word_1, word_2 = line.split(" ")

            if int(oc_number) == 1:
                break

            # The last field still carries the line terminator.
            word_2 = word_2.rstrip("\n")

            two_grams.setdefault(word_1, []).append([word_2, oc_number])

    return two_grams

In [156]:
class TextGenerator:
    """Generate text by repeatedly sampling a successor of a context word.

    ``corpus`` maps a word to a list of ``[next_word, count]`` pairs. The
    next word is drawn from the context word's list either uniformly
    (``'uniform'``) or with softmax probabilities over the min-max scaled
    counts (``'softmax'``). Sampling uses numpy's global random state.
    """

    _METHODS = ('uniform', 'softmax')
    # Keeps the min-max denominator non-zero when all counts are equal.
    _EPS = 0.0001

    def __init__(self, corpus, method, n_gram):
        """Store the corpus and sampling configuration.

        Args:
            corpus: dict mapping a word to a list of ``[next_word, count]``.
            method: ``'uniform'`` or ``'softmax'``.
            n_gram: Model order; ``n_gram - 1`` is stored and used as the
                offset from the end of the sentence of the context word.

        Raises:
            ValueError: If ``method`` is not a supported sampling method.
        """
        if method not in self._METHODS:
            raise ValueError(
                "unknown method %r; expected one of %r" % (method, self._METHODS)
            )

        self.corpus = corpus
        self.method = method
        # With n_gram=2 this is 1, i.e. the context is the last word.
        # NOTE(review): with n_gram=1 the offset is 0, so ``sentence[-0]``
        # is always the first word; with n_gram>2 the lookup raises
        # IndexError on the first step because the sentence starts with a
        # single word. Only n_gram=2 matches a word -> successors corpus.
        self.n_gram = n_gram - 1
        self.corpus_keys = list(self.corpus.keys())

    def softmax(self, x):
        """Return the softmax of ``x``, shifted by its max for stability."""
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum()

    def _random_key(self):
        """Return a corpus key chosen uniformly at random.

        Raises:
            ValueError: If the corpus has no keys.
        """
        if not self.corpus_keys:
            raise ValueError("corpus is empty; cannot pick a random word")
        return self.corpus_keys[np.random.randint(0, len(self.corpus_keys))]

    def choose_word_uniform(self, words):
        """Return the word of a uniformly chosen ``[word, count]`` pair."""
        word_idx = np.random.randint(0, len(words))
        return words[word_idx][0]

    def choose_word_softmax(self, words):
        """Return a word sampled with softmax weights over scaled counts.

        Counts are min-max scaled before the softmax, so the most frequent
        successor is at most about ``e`` times as likely as the least
        frequent one.
        """
        occurrences = np.array([word[1] for word in words], dtype=np.float64)

        lowest = occurrences.min()
        spread = occurrences.max() - lowest
        # The epsilon is added to the range so equal counts divide by a
        # small positive number rather than by zero.
        scaled = (occurrences - lowest) / (spread + self._EPS)

        probs = self.softmax(scaled)
        word_idx = np.random.choice(len(words), p=probs)

        return words[word_idx][0]

    def generate(self, length, first_word):
        """Generate a sentence of ``length`` words after the first word.

        Args:
            length: Number of words (an integer) to append after the first.
            first_word: Starting word. If it is ``None`` or not a corpus
                key, a random corpus key is used instead.

        Returns:
            The words joined by single spaces, with a trailing space.

        Raises:
            ValueError: If ``self.method`` is not a supported method, or a
                random key is needed and the corpus is empty.
        """
        if first_word is None or first_word not in self.corpus:
            sentence = [self._random_key()]
        else:
            sentence = [first_word]

        for _ in range(length):
            word = sentence[-self.n_gram]

            # Dict membership is O(1); the key list is only needed for
            # random picks.
            if word in self.corpus:
                if self.method == 'uniform':
                    pred = self.choose_word_uniform(self.corpus[word])
                elif self.method == 'softmax':
                    pred = self.choose_word_softmax(self.corpus[word])
                else:
                    # Reached only if ``method`` was changed after
                    # construction; fail loudly instead of reusing a
                    # stale prediction.
                    raise ValueError("unknown method %r" % (self.method,))
            else:
                # Dead end: the context word has no recorded successors,
                # so restart from a random corpus key.
                pred = self._random_key()

            sentence.append(pred)

        # Every word, including the last, is followed by one space.
        return "".join(word + " " for word in sentence)

In [116]:
# Build the word -> [[next_word, count], ...] table from the '2grams' corpus
# file. The path is hard-coded; adjust it for your environment.
two_grams = load_corpus_two_gram('/content/drive/My Drive/Colab Notebooks/NLP/Dane/2grams')

In [157]:
# Bigram generator (n_gram=2) that samples successors with the 'softmax' method.
T = TextGenerator(two_grams, 'softmax', 2)

In [159]:
# Start from 'adam' and append 30 sampled words; the cell displays the
# returned string. Output varies between runs because no random seed is set.
T.generate(30, 'adam')

'adam konkol, gitarzysta basowy, kompozytor po deszczowym finale mistrz kierownicy. po gzymsie, a wygrana jest dajmy nowej ekonomii. nic złego. o formalnościach związanych rzemieniem. - zaburczał w przybyszach z leźna, „ocioszyński” '