In [1]:
import numpy as np
from itertools import permutations
from tqdm import tqdm

In [2]:
def load_corpus(path, num_lines, K):
    """Count 1-, 2- and 3-grams of word suffixes found in a text corpus.

    Each line is lower-cased, stripped of its trailing newline and split on
    single spaces.  Lines with fewer than three tokens are ignored.  Every
    token longer than ``K`` characters is cut down to its last ``K``
    characters before counting.  Consecutive spaces produce empty tokens,
    which are counted like any other token.

    Args:
        path: Path of the corpus file (one sentence per line).
        num_lines: Stop once this many lines have been counted.  Lines that
            are skipped for having fewer than three tokens do not count
            towards this limit.
        K: Maximum suffix length kept for each token.

    Returns:
        A tuple ``(three_gram, two_grams, one_grams, number_of_words)``:
        dicts mapping 3-tuples, 2-tuples and single strings to occurrence
        counts, followed by the total number of tokens in the counted lines.
    """
    one_grams = {}
    two_grams = {}
    three_gram = {}
    number_of_words = 0
    lines_counted = 0

    # The context manager closes the file even when the loop breaks early.
    with open(path, "r") as file:
        for line in tqdm(file):

            line = line.lower()
            # Remove the line terminator up front instead of peeking at the
            # last character of a token, which fails on empty tokens.
            if line.endswith("\n"):
                line = line[:-1]

            tokens = line.split(" ")
            if len(tokens) < 3:
                continue

            number_of_words += len(tokens)

            # Keep only the last K characters of long tokens.
            words = [token[-K:] if len(token) > K else token for token in tokens]

            for word in words:
                one_grams[word] = one_grams.get(word, 0) + 1

            for pair in zip(words, words[1:]):
                two_grams[pair] = two_grams.get(pair, 0) + 1

            for triple in zip(words, words[1:], words[2:]):
                three_gram[triple] = three_gram.get(triple, 0) + 1

            lines_counted += 1
            if lines_counted == num_lines:
                break

    return three_gram, two_grams, one_grams, number_of_words

In [3]:
# Number of corpus lines (with at least three tokens) used to build the n-gram tables.
num_lines = 22000000

# Tokens are reduced to their last 3 characters before counting.
three_grams, two_grams, one_grams, number_of_words = load_corpus(
    '/content/polish_corpora.txt',
    num_lines,
    K=3,
)

21999686it [30:05, 11892.81it/s]

In [8]:
# Number of distinct suffix trigrams collected from the corpus.
len(three_grams)

64225985

In [10]:
def train_three_grams(path, start_line, num_lines, K):
    """Count suffix trigrams in a slice of a text corpus.

    The first ``start_line`` lines of the file are skipped unconditionally.
    After that, each line is lower-cased, stripped of its trailing newline
    and split on single spaces; lines with fewer than three tokens are
    ignored.  Every token longer than ``K`` characters is cut down to its
    last ``K`` characters.  Consecutive spaces produce empty tokens, which
    are kept as tokens.

    Args:
        path: Path of the corpus file (one sentence per line).
        start_line: Number of raw lines to skip at the top of the file.
            Short lines are included in this count.
        num_lines: Number of lines to count after the skip.  Only lines with
            at least three tokens count towards this limit.
        K: Maximum suffix length kept for each token.

    Returns:
        Dict mapping ``(word_1, word_2, word_3)`` tuples to occurrence counts.

    Note:
        ``load_corpus`` limits itself by counted lines only, so passing its
        ``num_lines`` as ``start_line`` here does not necessarily start at
        the line where ``load_corpus`` stopped.
    """
    three_gram = {}
    num_line_iter = 0

    # The context manager closes the file even when the loop breaks early.
    with open(path, "r") as file:
        for line in file:
            if num_line_iter < start_line:
                num_line_iter += 1
                continue

            line = line.lower()
            # Remove the line terminator up front instead of peeking at the
            # last character of a token, which fails on empty tokens.
            if line.endswith("\n"):
                line = line[:-1]

            tokens = line.split(" ")
            if len(tokens) < 3:
                continue

            # Keep only the last K characters of long tokens.
            words = [token[-K:] if len(token) > K else token for token in tokens]

            for triple in zip(words, words[1:], words[2:]):
                three_gram[triple] = three_gram.get(triple, 0) + 1

            num_line_iter += 1
            if num_line_iter == start_line + num_lines:
                break

    return three_gram

In [11]:
# Held-out trigram counts: skip `num_lines` raw lines, then count the next 1,000,000 usable ones.
train_corpus = train_three_grams(
    '/content/polish_corpora.txt',
    start_line=num_lines,
    num_lines=1000000,
    K=3,
)

In [12]:
# Number of distinct suffix trigrams in the held-out slice.
len(train_corpus)

6990430

In [15]:
def train(one_grams, two_grams, three_grams, train_three_grams, number_of_words):
    """Estimate interpolation weights by voting over held-out trigrams.

    For every held-out trigram ``(w1, w2, w3)`` three relative frequencies
    are computed from the corpus tables (each is 0 when its n-gram is
    unseen):

    * unigram: ``one_grams[w3] / number_of_words``
    * bigram:  ``two_grams[(w2, w3)] / one_grams[w2]``
    * trigram: ``three_grams[(w1, w2, w3)] / two_grams[(w1, w2)]``

    The order with the largest estimate receives the trigram's held-out
    count as votes; ties go to the lowest order.

    Args:
        one_grams: Dict of unigram counts.
        two_grams: Dict of bigram counts keyed by 2-tuples.
        three_grams: Dict of trigram counts keyed by 3-tuples.
        train_three_grams: Dict of held-out trigram counts keyed by 3-tuples.
            Inside this function the name refers to this dict, not to the
            module-level function of the same name.
        number_of_words: Total token count of the corpus.

    Returns:
        NumPy array of three weights in (unigram, bigram, trigram) order,
        normalised to sum to 1.
    """
    votes = np.zeros(3, dtype=int)

    for gram, held_out_count in train_three_grams.items():
        context = gram[:2]
        tail_pair = gram[1:]
        middle, last = gram[1], gram[2]

        # Evaluated from the highest order down to the lowest.
        trigram_p = three_grams[gram] / two_grams[context] if gram in three_grams else 0
        bigram_p = two_grams[tail_pair] / one_grams[middle] if tail_pair in two_grams else 0
        unigram_p = one_grams[last] / number_of_words if last in one_grams else 0

        estimates = (unigram_p, bigram_p, trigram_p)
        # index() returns the first maximum, so ties favour the lower order.
        winner = estimates.index(max(estimates))
        votes[winner] += held_out_count

    return votes / votes.sum()

In [16]:
# Fit the interpolation weights on the held-out trigram counts.
lb = train(
    one_grams,
    two_grams,
    three_grams,
    train_three_grams=train_corpus,
    number_of_words=number_of_words,
)
# Optional: `del train_corpus` here releases the held-out counts if no later cell needs them.

In [17]:
# Learned interpolation weights in (unigram, bigram, trigram) order.
lb

array([0.14280524, 0.23494617, 0.62224859])

In [18]:
def test_three_gram(one_grams, two_grams, three_grams, lambdas, three_gram):


    result = 0

    if three_gram in three_grams:

        result += lambdas[2] * three_grams[three_gram]/two_grams[three_gram[:2]]

    if three_gram[1:] in two_grams:

        result += lambdas[1] * two_grams[three_gram[1:]]/one_grams[three_gram[1]]

    if three_gram[2] in one_grams:

        result += lambdas[0] * one_grams[three_gram[2]]/number_of_words

    return result

In [19]:
def test_sentence(one_grams, two_grams, three_grams, lambdas, sentence, K):

    sen_iter = 0
    result = 0

    while sen_iter < len(sentence) - 2:

        word_1 = sentence[sen_iter]
        word_2 = sentence[sen_iter+1]
        word_3 = sentence[sen_iter+2]

        if len(word_1) > K:
            word_1 = word_1[-K:]
        if len(word_2) > K:
            word_2 = word_2[-K:]
        if len(word_3) > K:
            word_3 = word_3[-K:]

        three_gram = (word_1, word_2, word_3)

        result += test_three_gram(one_grams, two_grams, three_grams, lambdas, three_gram)
        sen_iter +=1
    
    return result

In [22]:
# Evaluate the model on a reordering task: every line of the test file is a
# sentence in its true word order.  All permutations of its words are scored
# and the sentence earns 1 / (rank of the true order) points.
points = 0
sentence_no = 1  # not named `iter`, which would shadow the builtin

# `with` closes the test file once every sentence has been scored.
with open('/content/test.txt', 'r') as f:
    for line in f:

        line = line.split()
        # A blank line holds no sentence to rank.
        if not line:
            continue

        # permutations() yields the input order first, so index 0 is the true order.
        perms = list(permutations(line))
        res_values = np.array(
            [test_sentence(one_grams, two_grams, three_grams, lb, perm, 3) for perm in perms]
        )

        # Permutation indices from the highest score to the lowest.
        best = np.argsort(res_values)[::-1]
        true_position = int(np.where(best == 0)[0][0]) + 1
        sentence_points = 1 / true_position
        points += sentence_points

        print("------------Sentence {}-------------------- ".format(sentence_no))
        # Short sentences have fewer than five permutations; print what exists.
        for rank, idx in enumerate(best[:5], start=1):
            print("{} order: {} | Prob: {}".format(rank, " ".join(perms[idx]), res_values[idx]))
        print("True order: {}".format(" ".join(line)))
        print("position of true order: {}".format(true_position))
        print("Points for sentence: {}".format(sentence_points))
        print("\n")
        sentence_no += 1

print("All points: {}".format(points))

------------Sentence 1-------------------- 
1 order: nasz przedmiot to przetwarzanie języka naturalnego | Prob: 0.2582991096208979
2 order: przedmiot to przetwarzanie języka naturalnego nasz | Prob: 0.25715014968977196
3 order: nasz przedmiot przetwarzanie języka naturalnego to | Prob: 0.24398467969531987
4 order: to nasz przedmiot przetwarzanie języka naturalnego | Prob: 0.2417361165167084
5 order: przedmiot nasz to przetwarzanie języka naturalnego | Prob: 0.23346941796187362
True order: nasz przedmiot to przetwarzanie języka naturalnego
position of true order: 1
Points for sentence: 1.0


------------Sentence 2-------------------- 
1 order: jakub michaliszyn jedną grupę prowadzi | Prob: 0.011878245239992978
2 order: michaliszyn jakub jedną grupę prowadzi | Prob: 0.011864978191215915
3 order: jakub jedną grupę prowadzi michaliszyn | Prob: 0.011854605541330379
4 order: michaliszyn jedną grupę prowadzi jakub | Prob: 0.011831059412606046
5 order: jedną grupę prowadzi jakub michaliszyn | 