In [1]:
import numpy as np
from itertools import permutations
from tqdm import tqdm

In [2]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): the cells visible in this notebook read their inputs from
# /content/... rather than from the mounted drive -- confirm that this mount
# is still required.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
def load_corpus(path, num_lines, K):
    """Count unigrams, bigrams and trigrams over the leading lines of a corpus.

    Every line is lower-cased, stripped of one trailing newline and split on
    single spaces.  Lines with fewer than three tokens are ignored and do not
    count towards ``num_lines``.  Each token longer than ``K`` characters is
    reduced to its last ``K`` characters; ``K == 0`` keeps tokens whole because
    ``word[-0:]`` is the full string.

    Args:
        path: Path of the text corpus, one sentence per line.
        num_lines: Stop once this many lines with at least three tokens have
            been counted.  A value that is never reached (for example 0)
            makes the function read the whole file.
        K: Suffix length used to shorten tokens (see above).

    Returns:
        A tuple ``(three_gram, two_grams, one_grams, number_of_words)``: three
        dicts mapping a trigram tuple, a bigram tuple and a single token to
        their counts, and the total number of tokens in the counted lines
        (empty tokens produced by consecutive spaces included).
    """
    one_grams = {}
    two_grams = {}
    three_gram = {}
    number_of_words = 0
    counted_lines = 0

    # The context manager closes the handle even when the loop breaks early.
    with open(path, "r") as file:
        for raw_line in tqdm(file):
            text = raw_line.lower()
            # Remove the newline before splitting: indexing the last character
            # of a token fails on the empty tokens that "a b  c" or a trailing
            # space produce.
            if text.endswith("\n"):
                text = text[:-1]

            words = text.split(" ")
            if len(words) < 3:
                continue

            number_of_words += len(words)
            words = [word[-K:] if len(word) > K else word for word in words]

            for word in words:
                one_grams[word] = one_grams.get(word, 0) + 1
            for pair in zip(words, words[1:]):
                two_grams[pair] = two_grams.get(pair, 0) + 1
            for triple in zip(words, words[1:], words[2:]):
                three_gram[triple] = three_gram.get(triple, 0) + 1

            counted_lines += 1
            if counted_lines == num_lines:
                break

    return three_gram, two_grams, one_grams, number_of_words

In [4]:
def train_three_grams(path, start_line, num_lines, K):
    """Count the trigrams of a held-out slice of the corpus.

    Lines are normalised exactly as in ``load_corpus``: lower-cased, stripped
    of one trailing newline, split on single spaces, and each token longer
    than ``K`` characters cut to its last ``K`` characters (``K == 0`` keeps
    tokens whole).  Lines with fewer than three tokens are ignored.

    Both ``start_line`` and ``num_lines`` are measured in usable lines (three
    tokens or more).  ``load_corpus`` stops after ``num_lines`` usable lines,
    so skipping the same number of usable lines here starts the held-out
    slice right after the data it counted.  Skipping raw lines instead would
    re-read part of that data whenever the corpus contains short lines.

    Args:
        path: Path of the text corpus, one sentence per line.
        start_line: Number of usable lines to skip before counting.
        num_lines: Number of usable lines to count after the skipped ones.  A
            value that is never reached (for example 0) reads to end of file.
        K: Suffix length used to shorten tokens.

    Returns:
        A dict mapping each trigram tuple ``(w1, w2, w3)`` to its count.
    """
    three_gram = {}
    usable_seen = 0

    # The context manager closes the handle even when the loop breaks early.
    with open(path, "r") as file:
        for raw_line in file:
            text = raw_line.lower()
            # Remove the newline before splitting so that empty tokens (from
            # consecutive or trailing spaces) cannot trigger an IndexError.
            if text.endswith("\n"):
                text = text[:-1]

            words = text.split(" ")
            if len(words) < 3:
                continue

            if usable_seen < start_line:
                usable_seen += 1
                continue

            words = [word[-K:] if len(word) > K else word for word in words]
            for triple in zip(words, words[1:], words[2:]):
                three_gram[triple] = three_gram.get(triple, 0) + 1

            usable_seen += 1
            if usable_seen == start_line + num_lines:
                break

    return three_gram

In [5]:
def train(one_grams, two_grams, three_grams, train_three_grams, number_of_words):
    """Derive the three interpolation weights from held-out trigram counts.

    For every held-out trigram the unigram, bigram and trigram relative
    frequencies are computed from the training counts (0 when the n-gram was
    never seen).  The estimator with the largest value receives the trigram's
    held-out count as votes; ties go to the lowest order.

    Args:
        one_grams: Training counts per token.
        two_grams: Training counts per bigram tuple.
        three_grams: Training counts per trigram tuple.
        train_three_grams: Held-out counts per trigram tuple.
        number_of_words: Total number of training tokens.

    Returns:
        A numpy array ``[unigram, bigram, trigram]`` of vote shares.
    """
    votes = np.array([0, 0, 0])

    for held_out, occurrences in train_three_grams.items():
        tail = held_out[1:]
        last = held_out[2]

        if held_out in three_grams:
            trigram_est = three_grams[held_out] / two_grams[held_out[:2]]
        else:
            trigram_est = 0

        if tail in two_grams:
            bigram_est = two_grams[tail] / one_grams[held_out[1]]
        else:
            bigram_est = 0

        if last in one_grams:
            unigram_est = one_grams[last] / number_of_words
        else:
            unigram_est = 0

        estimates = (unigram_est, bigram_est, trigram_est)
        # index() returns the first position of the maximum, so ties are
        # resolved in favour of the lower-order model.
        winner = estimates.index(max(estimates))
        votes[winner] += occurrences

    return votes / np.sum(votes)

In [6]:
def test_three_gram(one_grams, two_grams, three_grams, lambdas, three_gram):


    result = 0

    if three_gram in three_grams:

        result += lambdas[2] * three_grams[three_gram]/two_grams[three_gram[:2]]

    if three_gram[1:] in two_grams:

        result += lambdas[1] * two_grams[three_gram[1:]]/one_grams[three_gram[1]]

    if three_gram[2] in one_grams:

        result += lambdas[0] * one_grams[three_gram[2]]/number_of_words

    return result

In [7]:
def test_sentence(one_grams, two_grams, three_grams, lambdas, sentence, K):

    sen_iter = 0
    result = 0

    while sen_iter < len(sentence) - 2:

        word_1 = sentence[sen_iter]
        word_2 = sentence[sen_iter+1]
        word_3 = sentence[sen_iter+2]

        if len(word_1) > K:
            word_1 = word_1[-K:]
        if len(word_2) > K:
            word_2 = word_2[-K:]
        if len(word_3) > K:
            word_3 = word_3[-K:]

        three_gram = (word_1, word_2, word_3)

        result += test_three_gram(one_grams, two_grams, three_grams, lambdas, three_gram)
        sen_iter +=1
    
    return result

# Deleted Interpolation on words

In [9]:
# Deleted interpolation on whole words.  K = 0 leaves every token intact,
# because the truncation `word[-K:]` with K == 0 is the full string.
num_lines = 4000000
K = 0

three_grams, two_grams, one_grams, number_of_words = load_corpus('/content/polish_corpora.txt', num_lines, K)

# Held-out trigrams are taken from the lines that follow the counted ones.
train_corpus = train_three_grams('/content/polish_corpora.txt', num_lines, 100000, K)

lb = train(one_grams, two_grams, three_grams, train_corpus, number_of_words)

print("Lambdas from Training {}".format(lb))

# NOTE(review): load_corpus lower-cases the corpus, but the test sentences are
# used as read -- confirm that test.txt is already lower-case.
points = 0
sentence_no = 0  # replaces the counter that used to shadow the builtin `iter`

# `with` closes the test file once every sentence has been scored.
with open('/content/test.txt', 'r') as f:
    for line in f:

        line = line.split()
        if not line:
            # A blank line has a single (empty) permutation; printing the
            # top five orders for it would fail with IndexError.
            continue
        sentence_no += 1

        # permutations() emits the input order first, so index 0 is the
        # true sentence.
        res = list(permutations(line))
        res_values = np.array([
            test_sentence(one_grams, two_grams, three_grams, lb, perm, K)
            for perm in res
        ])

        best = np.argsort(res_values)[::-1]
        true_position = np.where(best == 0)[0][0] + 1
        points += 1/true_position

        print("------------Sentence {}-------------------- ".format(sentence_no))
        # Sentences of one or two words have fewer than five permutations.
        for place in range(min(5, len(best))):
            print("{} order: {} | Prob: {}".format(place + 1, " ".join(res[best[place]]), res_values[best[place]]))
        print("True order: {}".format(" ".join(line)))
        print("position of true order: {}".format(true_position))
        print("Points for sentence: {}".format(1/true_position))
        print("\n")

print("All points: {}".format(points))

3999773it [04:58, 13142.54it/s]

Lambdas from Training [0.23251272 0.337896   0.42959128]
------------Sentence 1-------------------- 
1 order: przedmiot przetwarzanie to nasz języka naturalnego | Prob: 0.004011405040972545
2 order: przedmiot języka naturalnego przetwarzanie to nasz | Prob: 0.003993060214902989
3 order: przedmiot przetwarzanie to języka naturalnego nasz | Prob: 0.003912950303675258
4 order: nasz przetwarzanie to przedmiot języka naturalnego | Prob: 0.003911177720165311
5 order: nasz przetwarzanie to języka naturalnego przedmiot | Prob: 0.0038973940569436913
True order: nasz przedmiot to przetwarzanie języka naturalnego
position of true order: 123
Points for sentence: 0.008130081300813009


------------Sentence 2-------------------- 
1 order: michaliszyn prowadzi jakub jedną grupę | Prob: 0.0009641778918192047
2 order: michaliszyn jakub prowadzi jedną grupę | Prob: 0.0009576852892832296
3 order: jakub michaliszyn prowadzi jedną grupę | Prob: 0.0009576852892832296
4 order: michaliszyn jedną grupę prowadz

3999773it [05:16, 13142.54it/s]

------------Sentence 58-------------------- 
1 order: raz kotek był chory leżał w łóżeczku i | Prob: 0.5551646409511231
2 order: raz łóżeczku i kotek był chory leżał w | Prob: 0.5551640953429847
3 order: raz łóżeczku i chory leżał w kotek był | Prob: 0.5550939739993016
4 order: raz kotek był łóżeczku i chory leżał w | Prob: 0.555093870389666
5 order: raz chory leżał w łóżeczku i kotek był | Prob: 0.5550932224706512
True order: raz kotek był chory i leżał w łóżeczku
position of true order: 2166
Points for sentence: 0.0004616805170821791


------------Sentence 59-------------------- 
1 order: równoważności definicja nie jest trudna relacji | Prob: 0.05771995985774024
2 order: równoważności relacji definicja nie jest trudna | Prob: 0.057710785964567815
3 order: relacji równoważności definicja nie jest trudna | Prob: 0.057710785964567815
4 order: relacji definicja nie jest trudna równoważności | Prob: 0.0577084503073516
5 order: trudna definicja nie jest relacji równoważności | Prob: 0.057

# Deleted Interpolation on suffixes

In [8]:
# Deleted interpolation on suffixes.  K = 3 reduces every token longer than
# three characters to its last three characters before counting and scoring.
num_lines = 20000000
K = 3

three_grams, two_grams, one_grams, number_of_words = load_corpus('/content/polish_corpora.txt', num_lines, K)

# Held-out trigrams are taken from the lines that follow the counted ones.
train_corpus = train_three_grams('/content/polish_corpora.txt', num_lines, 1000000, K)

lb = train(one_grams, two_grams, three_grams, train_corpus, number_of_words)

print("Lambdas from Training {}".format(lb))

# NOTE(review): load_corpus lower-cases the corpus, but the test sentences are
# used as read -- confirm that test.txt is already lower-case.
points = 0
sentence_no = 0  # replaces the counter that used to shadow the builtin `iter`

# `with` closes the test file once every sentence has been scored.
with open('/content/test.txt', 'r') as f:
    for line in f:

        line = line.split()
        if not line:
            # A blank line has a single (empty) permutation; printing the
            # top five orders for it would fail with IndexError.
            continue
        sentence_no += 1

        # permutations() emits the input order first, so index 0 is the
        # true sentence.
        res = list(permutations(line))
        res_values = np.array([
            test_sentence(one_grams, two_grams, three_grams, lb, perm, K)
            for perm in res
        ])

        best = np.argsort(res_values)[::-1]
        true_position = np.where(best == 0)[0][0] + 1
        points += 1/true_position

        print("------------Sentence {}-------------------- ".format(sentence_no))
        # Sentences of one or two words have fewer than five permutations.
        for place in range(min(5, len(best))):
            print("{} order: {} | Prob: {}".format(place + 1, " ".join(res[best[place]]), res_values[best[place]]))
        print("True order: {}".format(" ".join(line)))
        print("position of true order: {}".format(true_position))
        print("Points for sentence: {}".format(1/true_position))
        print("\n")

print("All points: {}".format(points))

19999537it [27:33, 11464.33it/s]

Lambdas from Training [0.14376183 0.23653802 0.61970015]
------------Sentence 1-------------------- 
1 order: nasz przedmiot przetwarzanie języka naturalnego to | Prob: 0.2514948284603231
2 order: to nasz przedmiot przetwarzanie języka naturalnego | Prob: 0.2490397261541058
3 order: nasz przedmiot to przetwarzanie języka naturalnego | Prob: 0.24778991721239146
4 order: przedmiot to przetwarzanie języka naturalnego nasz | Prob: 0.24663884932370272
5 order: przedmiot nasz to przetwarzanie języka naturalnego | Prob: 0.23411120517466466
True order: nasz przedmiot to przetwarzanie języka naturalnego
position of true order: 3
Points for sentence: 0.3333333333333333


------------Sentence 2-------------------- 
1 order: jakub michaliszyn jedną grupę prowadzi | Prob: 0.012743941888986166
2 order: michaliszyn jakub jedną grupę prowadzi | Prob: 0.01272924803326748
3 order: jakub jedną grupę prowadzi michaliszyn | Prob: 0.012718920765462292
4 order: michaliszyn jedną grupę prowadzi jakub | Prob: 