In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.nn import functional as F
import random

# Hugging Face Hub identifier of the checkpoint loaded below.
model_name = 'flax-community/papuGaPT2'
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing in `.to(device)`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Tokenizer and causal language model are loaded from the same checkpoint;
# the model is moved to the selected device once, here.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

def log_probs_from_logits(logits, labels):
    """Return the log-probability assigned to each label token.

    Args:
        logits: Float tensor of unnormalised scores whose last dimension
            indexes the vocabulary, e.g. ``(batch, seq, vocab)`` or
            ``(seq, vocab)``.
        labels: Integer (int64) tensor of token ids shaped like ``logits``
            without its last dimension.

    Returns:
        Tensor shaped like ``labels`` holding, for every position, the
        log-softmax value of the entry selected by the label.
    """
    logp = F.log_softmax(logits, dim=-1)
    # Gather along the last (vocabulary) axis rather than a hard-coded axis 2,
    # so unbatched input works as well as batched input.
    logp_label = torch.gather(logp, -1, labels.unsqueeze(-1)).squeeze(-1)
    return logp_label


def sentence_prob(sentence_txt):
    """Return the summed token log-probability of a text under the model.

    Despite the name, the value is a log-probability (a sum of per-token
    log-probabilities), not a probability.

    Args:
        sentence_txt: The text to score.

    Returns:
        A zero-dimensional NumPy array holding the sum of
        ``log P(token_i | tokens_<i)`` over every token except the first one;
        the first token has no preceding position to predict it from.
    """
    encoded = tokenizer(sentence_txt, return_tensors='pt')
    input_ids = encoded['input_ids'].to(device)
    with torch.no_grad():
        logits = model(input_ids=input_ids).logits
        # The logits at position i score the token at position i + 1, so drop
        # the last logit row and the first token id to line the two up.
        token_log_probs = log_probs_from_logits(logits[:, :-1, :], input_ids[:, 1:])
        total_log_prob = token_log_probs.sum()
    return total_log_prob.cpu().numpy()

# Three probes: a Polish sentence, an English sentence and a string of gibberish.
sentences = [
    'To jest zwykłe polskie zdanie.',
    'This is a normal English sentence.',
    'iweryuiiu hrfw3eieur fr',
]

# Emit a blank line, then every probe followed by its summed log-probability.
print()
for s in sentences:
    print(s, sentence_prob(s))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



To jest zwykłe polskie zdanie. -26.387575
This is a normal English sentence. -37.866463
iweryuiiu hrfw3eieur fr -88.537766


In [None]:
import itertools

# ex2a
# Word bags for exercise 2a. In each bag exactly one word is capitalised
# (the sentence start) and exactly one ends with a period (the sentence end).
m1 = [
    "Babuleńka",
    "miała",
    "dwa",
    "rogate",
    "koziołki.",
]
m2 = [
    "Wczoraj",
    "wieczorem",
    "spotkałem",
    "wspaniałą",
    "kobietę",
    "która",
    "opowiadała",
    "o",
    "modelach",
    "językowych.",
]

def generate_all_valid_sentences(m):
    """Build every ordering of the words in ``m`` that looks like a sentence.

    An ordering is kept when its first word starts with an uppercase letter
    and its last word ends with a period. All ``len(m)!`` orderings are
    enumerated, so this is only practical for short word lists.

    Args:
        m: Sequence of word strings.

    Returns:
        List of space-joined sentences, in the order in which
        ``itertools.permutations`` yields them. Empty when ``m`` is empty.
    """
    return [
        ' '.join(p)
        for p in itertools.permutations(m)
        # ``p`` is the empty tuple when ``m`` is empty, and a word may be the
        # empty string; slicing and ``endswith`` tolerate both, whereas plain
        # indexing would raise IndexError.
        if p and p[0][:1].isupper() and p[-1].endswith(".")
    ]

def compute_probabilty(sentences):
    """Score every sentence and rank the results, best score first.

    Args:
        sentences: Iterable of strings to score with ``sentence_prob``.

    Returns:
        List of ``(sentence, score)`` tuples sorted by descending score, where
        the score is ``sentence_prob(sentence)`` converted to a Python scalar.
        Entries with equal scores keep their input order.
    """
    scored = [(text, sentence_prob(text).item()) for text in sentences]
    return sorted(scored, key=lambda pair: pair[1], reverse=True)

def generate_sentences_with_probs(m):
    """Enumerate the valid orderings of ``m`` and rank them by model score.

    Args:
        m: Sequence of word strings.

    Returns:
        The result of ``compute_probabilty`` applied to every sentence produced
        by ``generate_all_valid_sentences(m)``.
    """
    return compute_probabilty(generate_all_valid_sentences(m))

def choose_the_most_probable_sentences(m, n = 5):
    """Return the first ``n`` entries of a list.

    No sorting happens here: the entries are "the most probable" only if the
    caller passes a list that is already ordered best-first.

    Args:
        m: Sequence of entries, expected to be ranked best-first.
        n: Maximum number of entries to return. Values below zero are
            treated as zero.

    Returns:
        A slice holding at most ``n`` leading entries of ``m``.
    """
    # Clamp to [0, len(m)]: a negative ``n`` would otherwise become a
    # from-the-end slice and silently return almost the whole list.
    count = max(0, min(n, len(m)))
    return m[:count]

# Rank every valid ordering of m1 and show the text of the top five (default n).
m1_ranked = generate_sentences_with_probs(m1)
for sentence, prob in choose_the_most_probable_sentences(m1_ranked):
    print(sentence)

Babuleńka miała dwa rogate koziołki.
Babuleńka rogate miała dwa koziołki.
Babuleńka miała rogate dwa koziołki.
Babuleńka dwa miała rogate koziołki.
Babuleńka dwa rogate miała koziołki.


In [None]:
# ex2b


def generate_all_valid_pairs(m):
    """Join every ordered pair of distinct words whose first word can continue.

    A pair is kept when the first word does not end with a period (nothing may
    follow a sentence-final word) and the two words are not equal. Words are
    compared by value, so a word that occurs twice in ``m`` is never paired
    with its duplicate.

    Args:
        m: Sequence of word strings.

    Returns:
        List of ``"first second"`` strings in ``itertools.product`` order.
    """
    return [
        ' '.join((first, second))
        for first, second in itertools.product(m, repeat=2)
        # ``endswith`` is safe on an empty string, unlike ``first[-1]``.
        if not first.endswith(".") and first != second
    ]

def select_reasonable_pairs(m, threshold = -20):
    """Score candidate strings and keep the ones worth extending.

    A candidate survives when its score exceeds ``threshold``, or when it
    starts with an uppercase character, or when it ends with a period. The
    last two conditions keep such candidates regardless of their score.

    Args:
        m: Iterable of candidate strings, passed to ``compute_probabilty``.
        threshold: Minimum score for a candidate that neither starts with an
            uppercase character nor ends with a period.

    Returns:
        List of ``(string, score)`` tuples in the order produced by
        ``compute_probabilty``.
    """
    kept = []
    for text, score in compute_probabilty(m):
        if score > threshold or text[0].isupper() or text[-1] == ".":
            kept.append((text, score))
    return kept

def merge_pairs(m):
    """Combine text fragments pairwise into longer fragments.

    For every ordered pair of different fragments ``(current, next)`` where
    ``current`` does not end with a period:

    * if either fragment is a single word, the two are concatenated;
    * otherwise, if the last word of ``current`` equals the first word of
      ``next``, they are joined on that shared word, which appears once.

    Fragments that contain no words are skipped. Nothing stops a word from
    appearing more than once in a merged fragment.

    Args:
        m: Iterable of ``(fragment, score)`` tuples; the scores are ignored.

    Returns:
        List of unique merged fragments in first-produced order, so the
        result is reproducible from run to run. In the overlap case the words
        of ``next`` are re-joined with single spaces.
    """
    # Split every fragment once up front instead of once per inner iteration.
    fragments = [(text, text.split()) for text, _ in m]
    merged = []
    for current_string, current_words in fragments:
        # A fragment ending in "." is a finished sentence tail and cannot be
        # extended; a fragment without words has nothing to merge.
        if not current_words or current_string.endswith("."):
            continue
        for next_string, next_words in fragments:
            if not next_words or next_string == current_string:
                continue
            if len(current_words) == 1 or len(next_words) == 1:
                merged.append(f"{current_string} {next_string}")
            elif current_words[-1] == next_words[0]:
                # Drop the shared word from the right-hand fragment.
                remainder = " ".join(next_words[1:])
                merged.append(f"{current_string} {remainder}")
    # dict.fromkeys removes duplicates while keeping insertion order, unlike
    # set(), whose iteration order for strings varies between interpreter runs.
    return list(dict.fromkeys(merged))

# Seed fragments for the iterative search: every word of m1 paired with a
# placeholder score of 0.0, matching the (string, score) shape merge_pairs expects.
modified_m1 = [(word, 0.0) for word in m1]

def find_sentences(m, iter = 10):
    """Grow fragments round by round and collect the ones that form sentences.

    In each round, every fragment that starts with an uppercase character and
    ends with a period is recorded; the fragments are then merged with
    ``merge_pairs`` and pruned with ``select_reasonable_pairs`` to form the
    next round's input. The search stops when no fragments remain or after
    ``iter`` rounds.

    Args:
        m: List of ``(fragment, score)`` tuples to start from.
        iter: Maximum number of rounds. (The name shadows the builtin inside
            this function; it is kept so existing keyword callers still work.)

    Returns:
        List of ``(sentence, score)`` tuples sorted by descending score, so
        that slicing the front of the list yields the best-scoring sentences.
        Sentences with equal scores stay in discovery order.
    """
    current_m = m
    sentences = []
    rounds_done = 0
    while len(current_m) > 0 and rounds_done < iter:
        # ``s[:1]`` and ``endswith`` are safe on empty strings.
        sentences.extend(
            (s, p) for s, p in current_m if s[:1].isupper() and s.endswith(".")
        )
        rounds_done += 1
        # The fragments produced after the final round would never be
        # scanned, so skip that (model-scoring) merge entirely.
        if rounds_done < iter:
            current_m = select_reasonable_pairs(merge_pairs(current_m))
    # Rounds are appended one after another, so the combined list is not
    # ranked across rounds until it is sorted here.
    sentences.sort(key=lambda pair: pair[1], reverse=True)
    return sentences

# Run the iterative search on m1 and print up to ten leading results with their scores.
m1_found = find_sentences(modified_m1)
for s, p in choose_the_most_probable_sentences(m1_found, 10):
    print(s, p)

Babuleńka koziołki. -34.01337432861328
Babuleńka miała koziołki. -37.06170654296875
Babuleńka dwa koziołki. -37.71996307373047
Babuleńka rogate koziołki. -44.43006134033203
Babuleńka miała dwa rogate koziołki. -47.872337341308594
Babuleńka miała Babuleńka dwa koziołki. -50.07160949707031
Babuleńka dwa miała dwa koziołki. -50.963661193847656
Babuleńka rogate miała dwa koziołki. -53.93680191040039
Babuleńka miała rogate dwa koziołki. -54.77162170410156
Babuleńka miała dwa miała koziołki. -55.53862762451172


In [None]:
# A word bag with two capitalised candidates for the sentence start.
m3 = [
    "Pies",
    "Żółw",
    "szybko.",
    "biega",
]
# Every word paired with a placeholder score of 0.0, the (string, score)
# shape that find_sentences takes as input.
modified_m3 = [(word, 0.0) for word in m3]
# NOTE(review): the search below is left disabled; presumably it ran too long
# (the recorded cell output is a KeyboardInterrupt) - confirm before re-enabling.
# for s, p in choose_the_most_probable_sentences(find_sentences(modified_m3), 10):
#     print(s, p)

KeyboardInterrupt: 