In [None]:
from tqdm import tqdm
import random
import string
import numpy as np

# Data preprocessing

In [None]:
# Letters that count_vowels / is_ryme treat as vowels.
vowels = [letter for letter in 'aeioóuyąę']
# Every two-letter sequence made of "i" followed by a different vowel ("ia", "ie", ...).
compacted_vowels = ['i' + vowel for vowel in vowels if vowel != 'i']

In [None]:
def load_supertags(path):
    """Read a "word tag" file and build lookups in both directions.

    Every non-empty line must consist of exactly two fields separated by a
    single space; blank lines are skipped.

    Args:
        path: Location of the supertag file (read as UTF-8).

    Returns:
        A pair ``(word_to_tag, tag_to_words)``. ``word_to_tag`` maps the
        lower-cased word to its tag (a later line overwrites an earlier one
        for the same lower-cased word). ``tag_to_words`` maps each tag to the
        list of words carrying it, in file order and with their original
        casing.

    Raises:
        ValueError: If a non-empty line does not split into exactly two fields.
    """
    word_to_tag = {}
    tag_to_words = {}

    # The context manager closes the file even when a malformed line raises.
    with open(path, 'r', encoding='utf-8') as file:
        for line in tqdm(file):
            line = line.rstrip("\n")
            if not line:
                continue  # tolerate blank lines, e.g. a trailing one

            word, tag = line.split(" ")

            word_to_tag[word.lower()] = tag
            tag_to_words.setdefault(tag, []).append(word)

    return word_to_tag, tag_to_words

In [None]:
def load_bigrams_and_create_bitags_to_bigram_occurrence(path, word_to_tag, amount):
    """Group corpus bigrams by the tag pair of their two words.

    Every non-empty line is expected to read "count word1 word2" with single
    spaces; lines are lower-cased before parsing. A bigram is kept only when
    both words have an entry in ``word_to_tag`` and its count is not 1.

    Args:
        path: Location of the bigram file (read as UTF-8).
        word_to_tag: Mapping from lower-cased word to tag.
        amount: Currently ignored. The line limit it was meant to impose is
            disabled in the original code; the parameter is kept so existing
            calls keep working.

    Returns:
        A pair ``(bigramtag_to_bigram_occurrence, acc)``. The dictionary maps
        ``(tag_of_word1, tag_of_word2)`` to a list of
        ``((word1, word2), count)`` entries in file order; ``acc`` is the sum
        of the counts of all kept bigrams.

    Raises:
        ValueError: If a non-empty line does not have exactly three fields, or
            the count of a kept bigram is not an integer.
    """
    bigramtag_to_bigram_occurrence = {}
    acc = 0

    # The context manager guarantees the (very large) file is closed.
    with open(path, 'r', encoding='utf-8') as file:
        for line in tqdm(file):
            line = line.lower().rstrip("\n")
            if not line:
                continue  # tolerate blank lines

            oc, word1, word2 = line.split(" ")

            if word1 not in word_to_tag or word2 not in word_to_tag:
                continue

            # The count arrives as text. Comparing that text with the integer 1
            # can never match, so convert first to make the singleton filter
            # actually take effect.
            count = int(oc)
            if count == 1:
                continue

            acc += count

            key = (word_to_tag[word1], word_to_tag[word2])
            bigramtag_to_bigram_occurrence.setdefault(key, []).append(((word1, word2), count))

    return bigramtag_to_bigram_occurrence, acc

In [None]:
def load_unigrams(path):
    """Read a whitespace-separated "word count" file.

    Blank lines are skipped.

    Args:
        path: Location of the unigram file (read as UTF-8).

    Returns:
        A pair ``(unigrams, acc)``. ``unigrams`` maps each word to its count
        exactly as written in the file (a string); ``acc`` is the integer sum
        of all counts.

    Raises:
        ValueError: If a non-empty line does not have exactly two fields or
            its count is not an integer.
    """
    unigrams = {}
    acc = 0

    # The context manager closes the file even when a malformed line raises.
    with open(path, 'r', encoding='utf-8') as file:
        for line in tqdm(file):
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing one

            word, oc = line.split()
            acc += int(oc)
            # Stored as text on purpose: pmi() converts with int() on lookup.
            unigrams[word] = oc

    return unigrams, acc

# Poetry Generator

In [None]:
def count_vowels(word):
    """Count the vowel letters of ``word``, used here as its syllable count.

    A letter from ``vowels`` contributes one unit unless it starts a
    two-letter sequence listed in ``compacted_vowels`` ("i" followed by another
    vowel); in that case only the following vowel is counted.

    Args:
        word: The string to inspect; matching is case-sensitive.

    Returns:
        The number of counted vowels as an int.
    """
    total = 0
    for position in range(len(word)):
        if word[position] not in vowels:
            continue
        # An "i" directly in front of another vowel is not counted on its own.
        if word[position:position + 2] not in compacted_vowels:
            total += 1

    return total

In [None]:
def _rhyme_suffix(word):
    """Return the tail of ``word`` that is_ryme compares.

    Characters are dropped from the front while a budget, initialised with
    ``count_vowels(word)``, is decremented on every vowel letter met. Trimming
    stops as soon as the budget falls below two, so for a word with at least
    two counted vowels the result starts around its second-to-last vowel.
    When the current position opens an "i" + vowel pair from
    ``compacted_vowels``, the "i" is dropped before the stop check.
    """
    remaining = count_vowels(word)

    while len(word) > 0:
        if word[0] in vowels:
            remaining -= 1

        if word[:2] in compacted_vowels:
            word = word[1:]

        if remaining - 2 < 0:
            break

        word = word[1:]

    return word


def is_ryme(word_1, word_2):
    """Tell whether two words end in the same rhyme suffix.

    Args:
        word_1: First word.
        word_2: Second word.

    Returns:
        True when both words reduce to the same suffix (see ``_rhyme_suffix``),
        which includes the case of two identical words; False otherwise.
    """
    return _rhyme_suffix(word_1) == _rhyme_suffix(word_2)
    

In [None]:
def softmax(x):
    """Return the softmax of ``x`` taken over all of its elements.

    The maximum is subtracted before exponentiating so large inputs do not
    overflow; the result sums to one.
    """
    shifted = x - np.max(x)
    weights = np.exp(shifted)
    return weights / weights.sum()

def choose_word_softmax(words):
    """Randomly pick one entry from ``words``, favouring higher scores.

    Args:
        words: Non-empty sequence of ``(item, score)`` entries.

    Returns:
        The ``item`` of the drawn entry. Scores are min-max scaled (with a
        small constant in the denominator so equal scores do not divide by
        zero) and turned into probabilities with ``softmax`` before drawing.
    """
    scores = np.array([entry[1] for entry in words], dtype=np.float64)

    lowest = min(scores)
    spread = max(scores) - lowest + 0.0001
    scaled = (scores - lowest) / spread

    picked = np.random.choice(len(words), size = 1, p = softmax(scaled))[0]

    return words[picked][0]

In [None]:
def test_sentence_and_return_partition(sent):

    correct = False
    syl = 0
    partition = []

    if sent[-1] == '\r': sent = sent[:-1]

    sent = ''.join([char.lower() for char in sent if char not in string.punctuation]).split()

    for word in sent:
        
        if word in word_to_tag.keys():
            syl += count_vowels(word)
            partition.append((count_vowels(word), word_to_tag[word]))
            
    if len(partition) == len(sent) and syl == 13: 
        correct = True

    return (correct, partition, sent[-1])


def sample_partition_with_tags(text):
    """Draw two consecutive lines of ``text`` that form a usable rhyming pair.

    Random adjacent pairs are tried until both lines pass
    ``test_sentence_and_return_partition`` and their last words satisfy
    ``is_ryme``. The loop has no attempt limit.

    Args:
        text: Sequence of lines (needs at least two entries).

    Returns:
        ``(partition_1, sent_1, partition_2, sent_2)`` for the accepted pair.
    """
    last_start = len(text) - 2

    while True:
        start = random.randint(0, last_start)
        sent_1, sent_2 = text[start], text[start + 1]

        # Lines shorter than two characters are never considered.
        if len(sent_1) < 2 or len(sent_2) < 2:
            continue

        correct_1, partition_1, last_word_1 = test_sentence_and_return_partition(sent_1)
        correct_2, partition_2, last_word_2 = test_sentence_and_return_partition(sent_2)

        if correct_1 and correct_2 and is_ryme(last_word_1, last_word_2):
            return partition_1, sent_1, partition_2, sent_2

In [None]:
# Build the word<->tag lookups and the unigram counts together with their grand total.
# NOTE(review): both paths are absolute and look like a Colab Google Drive mount — confirm
# the drive is mounted (or make the data directory configurable) before re-running.
word_to_tag, tag_to_words = load_supertags('/content/drive/My Drive/Colab Notebooks/NLP/Dane/Copy of supertags.txt')
unigrams, unigrams_all_oc = load_unigrams('/content/drive/My Drive/Colab Notebooks/NLP/Dane/unigrams.txt')

1781994it [00:03, 501929.12it/s]
3591114it [00:16, 221438.12it/s]


In [None]:
# Corpus lines used as templates. The file is read as bytes and decoded explicitly so
# that "\r" characters survive (test_sentence_and_return_partition strips them itself);
# the context manager makes sure the handle is closed.
with open('/content/drive/My Drive/Colab Notebooks/NLP/Dane/PT.txt', 'rb') as corpus_file:
    text = corpus_file.read().decode("utf-8").lower().split('\n')

# Bigrams grouped by tag pair, plus the total count of the kept bigrams.
bigramtag_to_bigram_occurrence, bigrams_all_oc = load_bigrams_and_create_bitags_to_bigram_occurrence('/content/poleval_2grams.txt', word_to_tag, 100000000)

59134224it [06:14, 157930.25it/s]


In [None]:
import random

def pmi(word_1, word_2, oc):
    """Pointwise mutual information of a bigram from the global count tables.

    Args:
        word_1: First word of the bigram.
        word_2: Second word of the bigram.
        oc: Number of occurrences of the bigram.

    Returns:
        ``log(P(word_1, word_2) / (P(word_1) * P(word_2)))`` using
        ``bigrams_all_oc`` and ``unigrams_all_oc`` as normalisers, or 0 when
        either word is missing from ``unigrams``.
    """
    if word_1 not in unigrams or word_2 not in unigrams:
        return 0

    p_joint = oc / bigrams_all_oc
    p_first = int(unigrams[word_1]) / unigrams_all_oc
    p_second = int(unigrams[word_2]) / unigrams_all_oc

    return np.log(p_joint / (p_first * p_second))

def find_grammatically_similar_sentence_bigram(part_1, part_2, bitag_to_bigram_occurence, recall, col=False):

    res = []
    err = 0

    len_first = len(part_1)
    part_1.extend(part_2)

    sen_part = part_1

    if col:
        f = pmi
    else:
        f = lambda x, y, z: z


    while len(res) != len(sen_part):

        if err >= recall:
            return None

        if len(res) < 2:
            res = []
            ### Generate first bigram
            syl_1, tag_1 = sen_part[0]
            syl_2, tag_2 = sen_part[1]
            
            if bitag_to_bigram_occurence.get((tag_1,tag_2)) is None: return None
            
            words = [((bigram[0], bigram[1]), f(bigram[0], bigram[1], int(oc))) for bigram, oc in 
                     bitag_to_bigram_occurence.get((tag_1,tag_2)) if count_vowels(bigram[0])==syl_1 and count_vowels(bigram[1])==syl_2]

            if len(words) == 0: return None
            pred_1, pred_2 = choose_word_softmax(words)
            res.append(pred_1)
            res.append(pred_2)

        len_res = len(res) - 1

        syl_1, tag_1 = sen_part[len_res]
        syl_2, tag_2 = sen_part[len_res+1]

        key = (tag_1, tag_2)

        if bitag_to_bigram_occurence.get((tag_1,tag_2)) is None: return None

        lw = res[-1]

        if len(res) < len(sen_part) - 1:
            words = [((bigram[0], bigram[1]), f(bigram[0], bigram[1], int(oc))) for bigram, oc 
                     in bitag_to_bigram_occurence.get((tag_1,tag_2)) if lw == bigram[0] and count_vowels(bigram[0])==syl_1 and count_vowels(bigram[1])==syl_2]
        else:
             words = [((bigram[0], bigram[1]), f(bigram[0], bigram[1], int(oc))) for bigram, oc 
                      in bitag_to_bigram_occurence.get((tag_1,tag_2)) if lw == bigram[0] and count_vowels(bigram[0])==syl_1 and count_vowels(bigram[1])==syl_2 and is_ryme(bigram[1], res[len_first-1])]

        if len(words) == 0:
            res = res[:-1]
            err += 1
        else:
            pred_1, pred_2 = choose_word_softmax(words)
            res.append(pred_2)
    
    return res   

In [None]:
# Generate 1000 couplets; for each one keep drawing template pairs until
# find_grammatically_similar_sentence_bigram returns a result (PMI weighting, 50 retries).
for _ in tqdm(range(1000)):
    while True:
        partition_1, sent_1, partition_2, sent_2 = sample_partition_with_tags(text)

        len_1 = len(partition_1)

        res = find_grammatically_similar_sentence_bigram(
            partition_1, partition_2, bigramtag_to_bigram_occurrence, 50, True)

        if res is not None:
            break

100%|██████████| 1000/1000 [1:09:40<00:00,  4.18s/it]


# Word Occurrences Poetry Generator

In [None]:
# NOTE(review): `gen` is not defined in any cell visible here, so this call fails on a
# fresh kernel — restore its definition above. Presumably 70 is the retry budget and
# col=False selects raw occurrence weighting; confirm against the missing definition.
gen(70, col=False)

['było', 'to', 'właśnie', 'sprawnie', 'na', 'jej', 'narodzinach']
['nadal', 'szkolono', 'tylko', 'o', 'tych', 'przeprosinach']


In [None]:
# Second sample with col=False. NOTE(review): `gen` is not defined in the visible cells.
gen(70, col=False)

['gotowanie', 'czy', 'zbieżne', 'czy', 'było', 'darmowe']
['że', 'ropy', 'osiągnęły', 'jak', 'szkoły', 'państwowe']


In [None]:
# Third sample with col=False. NOTE(review): `gen` is not defined in the visible cells.
gen(70, col=False)

['kazały', 'mu', 'się', 'stadia', 'zgodnie', 'podkreślano']
['ten', 'zysk', 'niewielki', 'wtedy', 'o', 'grobach', 'składano']


# PMI Poetry Generator

In [None]:
# First sample with col=True (presumably PMI weighting — confirm against `gen`).
# NOTE(review): `gen` is not defined in the visible cells.
gen(70, col=True)

['w', 'złudnej', 'rzeczywistości', 'przez', 'pudła', 'i', 'kwiatka']
['dlatego', 'nim', 'dotarli', 'za', 'centrem', 'do', 'płatka']


In [None]:
# Second sample with col=True. NOTE(review): `gen` is not defined in the visible cells.
gen(70, col=True)

['niby', 'gruszka', 'ze', 'spania', 'i', 'piękna', 'zadbana']
['a', 'cesarska', 'od', 'błędu', 'jak', 'kwaśna', 'odmiana']


In [None]:
# Third sample with col=True. NOTE(review): `gen` is not defined in the visible cells.
gen(70, col=True)

['to', 'owczarek', 'to', 'michnik', 'robert', 'z', 'oferentów']
['nie', 'chomikuj', 'go', 'z', 'kratki', 'bez', 'osiem', 'segmentów']
