In [None]:
import numpy as np
from itertools import permutations
from tqdm import tqdm

# Data Loading

In [None]:
def load_supertags(path):
    """Load a word -> supertag lexicon from a text file.

    Every non-blank line must consist of a word and a tag separated by a
    single space. Words are lower-cased before being used as keys; if the
    same lower-cased word appears more than once, the last line wins.

    Args:
        path: Path of the lexicon file.

    Returns:
        dict mapping lower-cased word to its tag string.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            space-separated fields.
    """
    word_to_tag = {}

    # The context manager closes the file even if a malformed line raises.
    with open(path, 'r') as file:
        for line in file:
            line = line.rstrip("\n")

            # Blank lines (e.g. a trailing newline at EOF) carry no entry.
            if not line:
                continue

            word, tag = line.split(" ")
            word_to_tag[word.lower()] = tag

    return word_to_tag

In [None]:
def load_corpus(path, word_to_tag, num_lines, K):
    """Count suffix n-grams and tag n-grams over the beginning of a corpus.

    Each line is lower-cased and split on single spaces. Lines with fewer
    than three tokens are skipped and do NOT count towards ``num_lines``.
    Words are reduced to their last ``K`` characters (when longer than ``K``)
    before being counted as "word" n-grams; tags are looked up with the full
    lower-cased word.

    Args:
        path: Path of the corpus file (one sentence per line).
        word_to_tag: Mapping from lower-cased word to tag.
        num_lines: Number of processed (>= 3 token) lines after which
            reading stops.
        K: Maximum suffix length kept for every word.

    Returns:
        Tuple ``(three_gram, two_grams, one_grams, number_of_words, unitags,
        bitags, tritags, number_of_tags)`` where the dicts map an n-gram
        (a string for unigrams, a tuple otherwise) to its count.
    """

    def _bump(counter, key):
        # Increment the count stored under `key`, starting from zero.
        counter[key] = counter.get(key, 0) + 1

    def _suffix(word):
        # Keep only the last K characters of words longer than K.
        return word[-K:] if len(word) > K else word

    one_grams = {}
    two_grams = {}
    three_gram = {}

    unitags = {}
    bitags = {}
    tritags = {}

    number_of_words = 0
    number_of_tags = 0
    num_line_iter = 0

    # The context manager guarantees the corpus file is closed, including on
    # the early `break` below.
    with open(path, "r") as file:
        for line in tqdm(file):

            line = line.lower().split(" ")

            if len(line) < 3:
                continue

            number_of_words += len(line)

            # Slide a three-token window over the line. The guard above
            # ensures at least one iteration, so the window variables are
            # always bound when the tail handling below reads them.
            for line_iter in range(len(line) - 2):
                word_1, word_2, word_3 = line[line_iter:line_iter + 3]

                # Only the last token of a line can carry the newline.
                # `endswith` also tolerates an empty token (double space).
                if word_3.endswith('\n'):
                    word_3 = word_3[:-1]

                # Unitags, bitags, tritags: a longer tag n-gram is counted
                # only when every tag before it in the window is known.
                tag_1 = None
                tag_2 = None
                tag_3 = None

                if word_1 in word_to_tag:
                    tag_1 = word_to_tag[word_1]
                    number_of_tags += 1
                    _bump(unitags, tag_1)

                    if word_2 in word_to_tag:
                        tag_2 = word_to_tag[word_2]
                        _bump(bitags, (tag_1, tag_2))

                        if word_3 in word_to_tag:
                            tag_3 = word_to_tag[word_3]
                            _bump(tritags, (tag_1, tag_2, tag_3))

                # Unigrams, bigrams, trigrams over word suffixes.
                word_1 = _suffix(word_1)
                word_2 = _suffix(word_2)
                word_3 = _suffix(word_3)

                _bump(one_grams, word_1)
                _bump(two_grams, (word_1, word_2))
                _bump(three_gram, (word_1, word_2, word_3))

            # Tail of the line: the window loop counted unigrams/bigrams only
            # from the first position of each window, so the last two tokens
            # of the line still need their unigram and bigram counts.
            # NOTE(review): tag_2/tag_3 are only set when tag_1 of the last
            # window is known too, so the tail tags are dropped whenever any
            # of the last three words is untagged - confirm this is intended.
            if tag_2 and tag_3:
                number_of_tags += 2
                _bump(bitags, (tag_2, tag_3))
                _bump(unitags, tag_2)
                _bump(unitags, tag_3)

            _bump(two_grams, (word_2, word_3))
            _bump(one_grams, word_2)
            _bump(one_grams, word_3)

            num_line_iter += 1
            if num_line_iter == num_lines:
                break

    return three_gram, two_grams, one_grams, number_of_words, unitags, bitags, tritags, number_of_tags

In [None]:
def train_three_grams(path, start_line, num_lines, K, word_to_tag=None):
    """Count suffix trigrams and tag trigrams on a held-out slice of a corpus.

    The first ``start_line`` physical lines of the file are skipped. After
    that, lines with fewer than three space-separated tokens are ignored and
    do not count towards ``num_lines``.

    Args:
        path: Path of the corpus file (one sentence per line).
        start_line: Number of leading lines to skip.
        num_lines: Number of processed lines after which reading stops.
        K: Maximum suffix length kept for every word.
        word_to_tag: Mapping from lower-cased word to tag. When omitted, the
            module-level ``word_to_tag`` is used, which is what this function
            relied on implicitly before the parameter existed.

    Returns:
        Tuple ``(three_gram, tritags)``: counts of suffix trigrams and of tag
        trigrams (the latter only for windows whose three words all have a
        tag).

    Raises:
        NameError: If ``word_to_tag`` is not passed and no module-level
            ``word_to_tag`` exists.
    """
    if word_to_tag is None:
        # Backward-compatible fallback to the notebook-global lexicon.
        try:
            word_to_tag = globals()["word_to_tag"]
        except KeyError:
            raise NameError(
                "name 'word_to_tag' is not defined; pass word_to_tag explicitly"
            ) from None

    def _bump(counter, key):
        # Increment the count stored under `key`, starting from zero.
        counter[key] = counter.get(key, 0) + 1

    def _suffix(word):
        # Keep only the last K characters of words longer than K.
        return word[-K:] if len(word) > K else word

    three_gram = {}
    tritags = {}
    num_line_iter = 0

    # The context manager closes the file on normal exit and on `break`.
    with open(path, "r") as file:
        for line in tqdm(file):
            if num_line_iter < start_line:
                num_line_iter += 1
                continue

            line = line.lower().split(" ")

            # Short lines are skipped without advancing the line counter.
            if len(line) < 3:
                continue

            for line_iter in range(len(line) - 2):
                word_1, word_2, word_3 = line[line_iter:line_iter + 3]

                # Only the last token of a line can carry the newline.
                if word_3.endswith('\n'):
                    word_3 = word_3[:-1]

                # Tags are looked up with the full words, before truncation.
                if word_1 in word_to_tag and word_2 in word_to_tag and word_3 in word_to_tag:
                    _bump(tritags, (word_to_tag[word_1], word_to_tag[word_2], word_to_tag[word_3]))

                _bump(three_gram, (_suffix(word_1), _suffix(word_2), _suffix(word_3)))

            num_line_iter += 1
            if num_line_iter == start_line + num_lines:
                break

    return three_gram, tritags

In [None]:
def load_val(path, start_line=0,end=10000):
    """Read a contiguous slice of raw lines from a text file.

    Lines whose zero-based index is below ``start_line`` are skipped. Reading
    stops after the line with index ``start_line + end`` has been collected,
    so for a non-negative ``start_line`` up to ``end + 1`` lines are returned.

    Args:
        path: Path of the text file.
        start_line: Zero-based index of the first line to keep.
        end: Offset from ``start_line`` of the last line to keep (inclusive).

    Returns:
        List of lines, each still carrying its trailing newline.
    """
    val = []

    # The context manager closes the file, including on the early `break`.
    with open(path, 'r') as file:
        for line_no, line in enumerate(file):
            if line_no < start_line:
                continue
            val.append(line)
            if line_no - start_line == end:
                break

    return val

# Lambda Training

In [None]:
def train_lambdas(one_grams, two_grams, three_grams, train_three_grams, number_of_words):
    """Estimate interpolation weights for the unigram/bigram/trigram models.

    For every held-out trigram the three relative-frequency estimates are
    computed from the training counts; the model order with the largest
    estimate (lowest order wins ties) receives as many votes as the trigram's
    held-out count. The votes are normalised to sum to one.

    Args:
        one_grams, two_grams, three_grams: Training n-gram counts.
        train_three_grams: Held-out trigram counts used for voting.
        number_of_words: Denominator of the unigram estimate.

    Returns:
        numpy array ``[lambda_unigram, lambda_bigram, lambda_trigram]``.
    """
    votes = np.zeros(3, dtype=int)

    for gram, held_out_count in train_three_grams.items():
        history = gram[:2]
        continuation = gram[1:]

        # An estimate stays at zero when its n-gram was never seen in training.
        p3 = three_grams[gram] / two_grams[history] if gram in three_grams else 0
        p2 = two_grams[continuation] / one_grams[gram[1]] if continuation in two_grams else 0
        p1 = one_grams[gram[2]] / number_of_words if gram[2] in one_grams else 0

        # argmax returns the first maximum, i.e. the lowest order on ties.
        winner = int(np.argmax([p1, p2, p3]))
        votes[winner] += held_out_count

    return votes / votes.sum()

# Testing Functions

In [None]:
def test_three_gram(one_grams, two_grams, three_grams, lambdas, three_gram, number_of_one_grams):

    result = 0

    if three_gram in three_grams:

        result += lambdas[2] * three_grams[three_gram]/two_grams[three_gram[:2]]

    if three_gram[1:] in two_grams:

        result += lambdas[1] * two_grams[three_gram[1:]]/one_grams[three_gram[1]]

    if three_gram[2] in one_grams:

        result += lambdas[0] * one_grams[three_gram[2]]/number_of_words

    return result

In [None]:
def test_sentence(one_grams, two_grams, three_grams, lambdas_words, number_of_words, unitags, bitags, tritags, lambdas_tags, number_of_tags, word_to_tag, sentence, K, gamma):

    sen_iter = 0
    result = 0

    while sen_iter < len(sentence) - 2:
        
        result_tags = 0
        result_words = 0
        word_1 = sentence[sen_iter]
        word_2 = sentence[sen_iter+1]
        word_3 = sentence[sen_iter+2]
        tag_1 = word_1
        tag_2 = word_2
        tag_3 = word_3

        if word_1 in word_to_tag.keys():
            tag_1 = word_to_tag[word_1]
        
        if word_2 in word_to_tag.keys():
            tag_2 = word_to_tag[word_2]
            
        if word_3 in word_to_tag.keys():
            tag_3 = word_to_tag[word_3]
            
        tritag = (tag_1, tag_2, tag_3)

        if len(word_1) > K:
            word_1 = word_1[-K:]
        if len(word_2) > K:
            word_2 = word_2[-K:]
        if len(word_3) > K:
            word_3 = word_3[-K:]

        three_gram = (word_1, word_2, word_3)
 

        result_words = test_three_gram(one_grams, two_grams, three_grams, lambdas_words, three_gram, number_of_words)
        result_tags = test_three_gram(unitags, bitags, tritags, lambdas_tags, tritag, number_of_tags)
        
        result += gamma*result_words + (1-gamma)*result_tags

        sen_iter +=1
    
    return result

# Train


In [None]:
# Input locations (Colab paths).
path_supertags = '/content/drive/My Drive/Colab Notebooks/NLP/Dane/Copy of supertags.txt'
path_corpus = '/content/polish_corpora.txt'

# Corpus split: lines counted for the n-gram tables, lines used to fit the
# interpolation weights, and the validation slice that follows them.
num_lines = 19000000
num_train_lines = 1000000
num_val_lines = 30000
K = 3  # words longer than K characters are reduced to their last K characters


word_to_tag = load_supertags(path_supertags)
# Every stage reads the same corpus file, so use the single `path_corpus`
# setting instead of repeating the literal path.
three_grams, two_grams, one_grams, number_of_words, unitags, bitags, tritags, number_of_tags = load_corpus(path_corpus, word_to_tag, num_lines, K)
train_corpus_sufixes, train_corpus_tags = train_three_grams(path_corpus, num_lines, num_train_lines, K)
val = load_val(path_corpus, start_line=num_lines+num_train_lines, end=num_val_lines)

In [None]:
# Reload the validation slice from the same start line with end=60000; this
# replaces the `val` loaded in the previous cell with end=num_val_lines.
val = load_val(path_corpus, start_line=num_lines+num_train_lines, end=60000)

In [None]:
# Fit the interpolation weights on the held-out trigram counts:
# lb1 for the word-suffix model, lb2 for the tag model.
lb1 = train_lambdas(one_grams, two_grams, three_grams, train_corpus_sufixes, number_of_words)
lb2 = train_lambdas(unitags, bitags, tritags, train_corpus_tags, number_of_tags)


19999190it [02:37, 8637.94it/s][A

In [None]:
# Show the fitted weights; train_lambdas orders them [unigram, bigram, trigram].
print("Word Lambdas from Training {}".format(lb1))
print("Tag Lambdas from Training {}".format(lb2))

Word Lambdas from Training [0.14409179 0.23723645 0.61867176]
Tag Lambdas from Training [0.17975885 0.23110095 0.5891402 ]


In [None]:
def test_corpus(test_data, verbose, one_grams, two_grams, three_grams, lb1, number_of_words,\
                unitags, bitags, tritags, lb2, number_of_tags, word_to_tag, K, gamma):

    points = 0
    iter = 1
    for line in test_data:

        sentences = line.split(".")

        for sen in sentences:

            sen = sen.lower().split()

            if len(sen) > 6 or len(sen) < 3: continue

            res = []
            res_values = []
            perms = list(permutations(sen))
            for perm in list(perms):
                res.append([perm])
                res_values.append(test_sentence(one_grams, two_grams, three_grams, lb1, number_of_words,\
                                                unitags, bitags, tritags, lb2, number_of_tags, word_to_tag, perm, K, gamma))

            res = np.array(res)
            res_values = np.array(res_values)

            best = np.argsort(res_values)[::-1]
            points += 1/(np.where(best == 0)[0][0]+1)
            if verbose:
                print("------------Sentence {}-------------------- ".format(iter))
                print("1 order: {} | Val: {}".format(" ".join(res[best[0]][0]), res_values[best[0]]))
                print("2 order: {} | Val: {}".format(" ".join(res[best[1]][0]), res_values[best[1]]))
                print("3 order: {} | Val: {}".format(" ".join(res[best[2]][0]), res_values[best[2]]))
                print("4 order: {} | Val: {}".format(" ".join(res[best[3]][0]), res_values[best[3]]))
                print("5 order: {} | Val: {}".format(" ".join(res[best[4]][0]), res_values[best[4]]))
                print("True order: {}".format(" ".join(sen)))
                print("position of true order: {}".format(np.where(best == 0)[0][0]+1))
                print("Points for sentence: {}".format(1/(np.where(best == 0)[0][0]+1)))
                print("\n")
            
            iter +=1
    print("All points: {}, Gamma: {}".format(points/iter, gamma))

In [None]:
# Sweep the word/tag mixing weight: gamma weights the word-suffix model and
# (1 - gamma) the tag model. np.arange excludes the stop value, so 1.0 itself
# is not evaluated.
gammas = np.arange(0,1,0.05)
# A separate name for the grid keeps the loop variable from rebinding the
# array that is being iterated.
for gamma in gammas:
    test_corpus(val, False, one_grams, two_grams, three_grams, lb1, number_of_words, 
                unitags, bitags, tritags, lb2, number_of_tags, word_to_tag, K, gamma)

All points: 0.17963094976114505, Gamma: 0.0
All points: 0.23792604693754413, Gamma: 0.05
All points: 0.25431601719436925, Gamma: 0.1
All points: 0.2637442430516763, Gamma: 0.15000000000000002
All points: 0.27202294872180593, Gamma: 0.2
All points: 0.27870518205354616, Gamma: 0.25
All points: 0.2847861776136745, Gamma: 0.30000000000000004
All points: 0.2895599645952951, Gamma: 0.35000000000000003
All points: 0.29317283737258715, Gamma: 0.4
All points: 0.29647131801275506, Gamma: 0.45
All points: 0.2991174511898849, Gamma: 0.5
All points: 0.3008727054668867, Gamma: 0.55
All points: 0.303045493332454, Gamma: 0.6000000000000001
All points: 0.3043362871602421, Gamma: 0.65
All points: 0.3041599067287659, Gamma: 0.7000000000000001
All points: 0.30245075022352996, Gamma: 0.75
All points: 0.3028125830807928, Gamma: 0.8
All points: 0.3023296151116362, Gamma: 0.8500000000000001
All points: 0.3004626662723437, Gamma: 0.9
All points: 0.299969127560412, Gamma: 0.9500000000000001


# Test

In [None]:
test_path = '/content/test.txt'
# start_line=-1 means no line index is ever below the start, so nothing is
# skipped; the very large `end` is intended to read the whole file.
test = load_val(test_path, start_line=-1, end=100000000)
# gamma=0.65 is the value with the highest score in the validation sweep above.
test_corpus(test, True, one_grams, two_grams, three_grams, lb1, number_of_words, 
            unitags, bitags, tritags, lb2, number_of_tags, word_to_tag, K, 0.65)

------------Sentence 1-------------------- 
1 order: to nasz przedmiot przetwarzanie języka naturalnego | Val: 0.32575546876294287
2 order: to przetwarzanie języka naturalnego nasz przedmiot | Val: 0.31409702461025324
3 order: przetwarzanie języka naturalnego nasz przedmiot to | Val: 0.3068748782202812
4 order: to nasz przedmiot przetwarzanie naturalnego języka | Val: 0.28167462186369485
5 order: to języka naturalnego nasz przedmiot przetwarzanie | Val: 0.27366039930018554
True order: nasz przedmiot to przetwarzanie języka naturalnego
position of true order: 15
Points for sentence: 0.06666666666666667


------------Sentence 2-------------------- 
1 order: michaliszyn jakub prowadzi jedną grupę | Val: 0.13990501559806934
2 order: jakub prowadzi jedną grupę michaliszyn | Val: 0.13722303324486193
3 order: michaliszyn prowadzi jedną grupę jakub | Val: 0.13683168837835816
4 order: michaliszyn jakub jedną grupę prowadzi | Val: 0.13679542395172592
5 order: jakub jedną grupę prowadzi michalisz