In [21]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [30]:
sentence = "At every moment of time, millions of events occur that, to one degree or another, influence the formation of the future."

#### Для генерации используется модель Bart, на вход которой подается промт {предложение}{There is some ways to rephrase this sentence. 1.}. Именно при такой, не совсем корректной формулировке, получаются хорошие результаты. При использовании {There are several ways to...} модель начинает генерировать шаблонные конструкции, которые влияют на результаты выполнения поставленной задачи. Видимо подобная конструкция слишком часто встречалась в тренировочных данных с определенным продолжением. Если менять температуру, чтобы избежать шаблона, то падает качество генерации предложений. Параметры модели были подобраны эмпирически.

In [31]:
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
input_text = sentence + ' There is some ways to rephrase this sentence. 1.'
input_ids = torch.tensor(tokenizer.encode(input_text, add_special_tokens=True)).unsqueeze(0)

In [32]:
input = torch.tensor(tokenizer.encode(input_text, add_special_tokens=True)).unsqueeze(0)
with torch.no_grad():
    outputs = model.generate(
        input_ids=input_ids,
        max_length=700,
        min_length=400,
        temperature=1,
        top_p=0.95,
        repetition_penalty=1.2,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        early_stopping=True
    )

In [33]:
synthetic_data = tokenizer.decode(outputs[0].tolist()[1:-1], skip_special_tokens=True).strip()
results = [x for x in synthetic_data.split('.') if len(x)>10]

In [34]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
tester=SentenceTransformer('paraphrase-MiniLM-L6-v2')
x = tester.encode([sentence])
scores =[]
for i in results[2:-1]:
    scores.append(cosine_similarity(x,tester.encode([i])))

In [35]:
for i,j in sorted(zip(scores,results[2:-1]),reverse=True)[:5]:
    print(i,j)

[[0.85839695]]  At every moment in time, hundreds of thousands of events take place that influence our future
[[0.7878566]]  At each moment in history, there are millions of things that happen that influence the future of the world
[[0.66317046]]  Every generation has a unique set of events that affect the future that have no effect on the past but have a significant impact on the future it will shape
[[0.49713406]]  In every generation, a new generation of people is born that changes the way we look at the past and the present
[[0.42906266]]  Each generation has its own unique history that can be traced back to the beginning of time
