# RoBERTa

In [1]:
import torch
from transformers import AutoConfig, AutoModelWithLMHead, AutoTokenizer

In [2]:
# --- RoBERTa masked-LM setup ---
# Tunables: number of candidates to list and the length cap given to the tokenizer.
k = 5
max_len = 512
model_name_or_path = "roberta-large"

# Config, tokenizer and LM-head model all come from the same checkpoint name.
config = AutoConfig.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    model_max_length=max_len,
)
model = AutoModelWithLMHead.from_pretrained(
    model_name_or_path,
    config=config,
)

In [16]:
# Probe sentence with a single <mask> slot to be filled in by the model.
toy_sentence = "Hello, my dog is very <mask>"
inputs = tokenizer(
    toy_sentence,
    return_tensors="pt",
    truncation=True,
    max_length=tokenizer.model_max_length,
)

In [17]:
inputs

{'input_ids': tensor([[    0, 31414,     6,   127,  2335,    16,   182, 50264,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [18]:
# Inference only: run the forward pass without autograd bookkeeping.
with torch.no_grad():
    preds = model(**inputs)
# preds[0] holds one score per vocabulary entry for every position;
# softmax over the last (vocabulary) axis turns them into probabilities.
activated_preds = torch.softmax(preds[0], dim=2)
# Boolean mask that is True at every <mask> position of the input.
masked_token_indices = (inputs['input_ids'] == tokenizer.mask_token_id)
masked_preds = activated_preds[masked_token_indices]
# The k most probable vocabulary entries for each masked position.
topk = torch.topk(masked_preds, k, dim=1)

In [31]:
activated_preds.shape

torch.Size([1, 9, 50265])

In [26]:
topk

torch.return_types.topk(
values=tensor([[0.3848, 0.0407, 0.0359, 0.0266, 0.0228]], grad_fn=<TopkBackward>),
indices=tensor([[ 4736,  4812, 38384, 11130,  7428]]))

In [25]:
# Print the candidate tokens for each masked position, most probable first.
for suggested_ids in topk.indices.detach().numpy():
    for token_id in suggested_ids:
        token = tokenizer.convert_ids_to_tokens(int(token_id))
        print(token)

Ġsick
Ġill
Ġnaughty
Ġhungry
Ġtired


# XLNet

In [1]:
# load the model without AutoModelWithLMHead
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM

# --- XLNet setup, loaded through AutoModelForCausalLM ---
k = 5
max_len = 512  # TODO: check this against the model's real input size
model_name_or_path = "xlnet-large-cased"

config = AutoConfig.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    model_max_length=max_len,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    config=config,
)

In [1]:
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelWithLMHead

In [2]:
# --- XLNet setup, loaded through AutoModelWithLMHead ---
k = 5
max_len = 512  # TODO: check this against the model's real input size
model_name_or_path = "xlnet-large-cased"

config = AutoConfig.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    model_max_length=max_len,
)
model = AutoModelWithLMHead.from_pretrained(
    model_name_or_path,
    config=config,
)

Downloading: 100%|██████████| 1.44G/1.44G [01:56<00:00, 12.3MB/s]


In [2]:
# Probe sentence with a trailing <mask>; no special tokens are added around it.
toy_sentence = "Hello, my dog is very <mask>"
# Alternative probe without a mask slot:
# toy_sentence = "Hello, my dog is very cute"
inputs = tokenizer(
    toy_sentence,
    return_tensors="pt",
    truncation=True,
    max_length=tokenizer.model_max_length,
    add_special_tokens=False,
)

In [3]:
inputs

{'input_ids': tensor([[   17, 11368,    19,    94,  2288,    27,   172,     6]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [4]:
tokenizer.decode(inputs['input_ids'].flatten())

'Hello, my dog is very<mask>'

In [5]:
# Inference only: run the forward pass without autograd bookkeeping.
with torch.no_grad():
    preds = model(**inputs)
# preds[0] holds one score per vocabulary entry for every position;
# softmax over the last (vocabulary) axis turns them into probabilities.
activated_preds = torch.softmax(preds[0], dim=2)
# Boolean mask that is True at every <mask> position of the input.
masked_token_indices = (inputs['input_ids'] == tokenizer.mask_token_id)
# Alternative: select every position instead of only the masked ones.
# masked_token_indices = (inputs['input_ids'] >= 0)
masked_preds = activated_preds[masked_token_indices]
# The k most probable vocabulary entries for each selected position.
topk = torch.topk(masked_preds, k, dim=1)

In [6]:
activated_preds.shape

torch.Size([1, 8, 32000])

In [7]:
masked_preds

tensor([[2.2498e-05, 7.2611e-10, 8.6588e-10,  ..., 7.4294e-09, 6.8818e-10,
         1.5228e-07]], grad_fn=<IndexBackward>)

In [8]:
topk

torch.return_types.topk(
values=tensor([[0.0576, 0.0370, 0.0353, 0.0338, 0.0328]], grad_fn=<TopkBackward>),
indices=tensor([[ 343,   62,   21,   35, 2653]]))

In [9]:
# One numbered group per selected position, candidates most probable first.
for position, suggested_ids in enumerate(topk.indices.detach().numpy()):
    print(f'{position}:')
    for token_id in suggested_ids:
        token = tokenizer.convert_ids_to_tokens(int(token_id))
        print(token)

0:
▁really
▁her
▁and
▁I
▁super


-----------------------------------

In [44]:
# Encode the probe sentence as a batch of one (shape: 1 x seq_len).
input_ids = torch.tensor(
    [tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)]
)
# Square (1 x seq_len x seq_len) mask with the last column set to 1 in every row.
# NOTE(review): presumably this hides the last token from all positions — confirm against the XLNet API.
perm_mask = torch.zeros(1, input_ids.size(1), input_ids.size(1), dtype=torch.float)
perm_mask[..., -1] = 1.0
# Single prediction target (1 x 1 x seq_len) pointing at the last position.
target_mapping = torch.zeros(1, 1, input_ids.size(1), dtype=torch.float)
target_mapping[0, 0, -1] = 1.0

In [45]:
input_ids

tensor([[   17, 11368,    19,    94,  2288,    27,   172,     6]])

In [46]:
perm_mask

tensor([[[0., 0., 0., 0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 0., 0., 0., 1.]]])

In [47]:
target_mapping

tensor([[[0., 0., 0., 0., 0., 0., 0., 1.]]])

In [49]:
# Inference only: run the forward pass without autograd bookkeeping.
with torch.no_grad():
    outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
# First output element: scores over the vocabulary for the single mapped target.
next_token_logits = outputs[0]

In [50]:
next_token_logits

tensor([[[-32.9998, -42.5084, -42.8883,  ..., -38.2738, -41.6116, -38.0217]]],
       grad_fn=<AddBackward0>)

In [72]:
# Probabilities over the vocabulary (last axis of the 3-D score tensor).
activated_outputs = torch.softmax(outputs[0], dim=-1)
# Drop the batch axis, then take the k best entries along the vocabulary axis.
topk_outputs = torch.topk(activated_outputs[0], k=k, dim=-1)

In [73]:
topk_outputs

torch.return_types.topk(
values=tensor([[0.4280, 0.1991, 0.0384, 0.0205, 0.0180]], grad_fn=<TopkBackward>),
indices=tensor([[ 172,    9, 2653,   17,   21]]))

In [74]:
# Print the candidate tokens for the target position, most probable first.
for suggested_ids in topk_outputs.indices.detach().numpy():
    for token_id in suggested_ids:
        token = tokenizer.convert_ids_to_tokens(int(token_id))
        print(token)

▁very
.
▁super
▁
▁and


# BART

In [1]:
from transformers import BartForConditionalGeneration, BartTokenizer
# Tokenizer and generation model come from the same BART checkpoint.
tok = BartTokenizer.from_pretrained("facebook/bart-large")
model = BartForConditionalGeneration.from_pretrained(
    "facebook/bart-large",
    force_bos_token_to_be_generated=True,
)

# Let the model fill the <mask> span via generate().
example_english_phrase = "UN Chief Says There Is No <mask> in Syria"
batch = tok(example_english_phrase, return_tensors="pt")
generated_ids = model.generate(batch["input_ids"])

# Sanity check against the expected completion for this checkpoint.
assert tok.batch_decode(generated_ids, skip_special_tokens=True) == [
    'UN Chief Says There Is No Plan to Stop Chemical Weapons in Syria'
]

Downloading: 100%|██████████| 1.52k/1.52k [00:00<00:00, 1.67MB/s]
Downloading: 100%|██████████| 1.02G/1.02G [01:23<00:00, 12.2MB/s]


In [2]:
tok.model_max_length

1024

In [4]:
tok.mask_token

'<mask>'

In [3]:
batch

{'input_ids': tensor([[    0,  4154,  1231, 15674,   345,  1534,   440, 50264,    11,  1854,
             2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [5]:
generated_ids

tensor([[    2,     0,  4154,  1231, 15674,   345,  1534,   440,  5427,     7,
         12457, 14329, 28054,    11,  1854,     2]])

---------------------------------

In [1]:
import torch
from transformers import AutoConfig, AutoModelWithLMHead, AutoTokenizer

In [2]:
# --- BART masked-token setup ---
# Tunables: number of candidates to list and the length cap given to the tokenizer.
k = 5
max_len = 512
model_name_or_path = "facebook/bart-large"

# Config, tokenizer and LM-head model all come from the same checkpoint name.
config = AutoConfig.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    model_max_length=max_len,
)
model = AutoModelWithLMHead.from_pretrained(
    model_name_or_path,
    config=config,
)

In [3]:
# Probe sentence with a single <mask> slot to be filled in by the model.
toy_sentence = "Hello, my dog is very <mask>"
inputs = tokenizer(
    toy_sentence,
    return_tensors="pt",
    truncation=True,
    max_length=tokenizer.model_max_length,
)

In [4]:
inputs

{'input_ids': tensor([[    0, 31414,     6,   127,  2335,    16,   182, 50264,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [5]:
# Inference only: run the forward pass without autograd bookkeeping.
with torch.no_grad():
    preds = model(**inputs)
# preds[0] holds the per-position vocabulary scores; softmax over the last axis.
activated_preds = torch.softmax(preds[0], dim=2)
# Boolean mask that is True at every <mask> position of the input.
masked_token_indices = (inputs['input_ids'] == tokenizer.mask_token_id)
masked_preds = activated_preds[masked_token_indices]
# The k most probable vocabulary entries for each masked position.
topk = torch.topk(masked_preds, k, dim=1)

In [6]:
masked_preds

tensor([[1.9357e-12, 9.9633e-10, 1.3515e-04,  ..., 2.2609e-09, 5.4310e-10,
         7.0380e-10]], grad_fn=<IndexBackward>)

In [7]:
topk

torch.return_types.topk(
values=tensor([[0.1285, 0.0366, 0.0307, 0.0261, 0.0260]], grad_fn=<TopkBackward>),
indices=tensor([[ 4736,  1372,  4812, 10985, 20100]]))

In [8]:
# One numbered group per masked position, candidates most probable first.
for position, suggested_ids in enumerate(topk.indices.detach().numpy()):
    print(f'{position}:')
    for token_id in suggested_ids:
        token = tokenizer.convert_ids_to_tokens(int(token_id))
        print(token)

0:
Ġsick
Ġhappy
Ġill
Ġconfused
Ġlonely


# Pegasus Paraphraser

In [6]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

set up the language model

In [7]:
model_name = 'tuner007/pegasus_paraphrase'
# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    torch_device = 'cuda'
else:
    torch_device = 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
model = model.to(torch_device)

In [3]:
def get_response(input_text, num_return_sequences):
    """Paraphrase ``input_text`` with beam search and return the decoded candidates.

    Uses the notebook-level ``tokenizer``, ``model`` and ``torch_device``.

    Args:
        input_text: a single source text (str); truncated to 60 tokens.
        num_return_sequences: number of generated sequences to return.

    Returns:
        list[str]: decoded outputs with special tokens removed.
    """
    # Call the tokenizer directly; this replaces the deprecated
    # ``prepare_seq2seq_batch`` helper with the same arguments.
    batch = tokenizer(
        [input_text],
        truncation=True,
        padding='longest',
        max_length=60,
        return_tensors="pt",
    ).to(torch_device)
    # NOTE(review): ``temperature`` is passed without ``do_sample=True``;
    # presumably it has no effect on plain beam search — confirm.
    translated = model.generate(
        **batch,
        max_length=60,
        num_beams=10,
        num_return_sequences=num_return_sequences,
        temperature=1.5,
    )
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text

Test on an example text

In [6]:
# Ask for ten paraphrase candidates of one example sentence.
context = "The ultimate test of your knowledge is your capacity to convey it to another."
get_response(context, num_return_sequences=10)

['The test of your knowledge is your ability to convey it.',
 'The ability to convey your knowledge is the ultimate test of your knowledge.',
 'The ability to convey your knowledge is the most important test of your knowledge.',
 'Your capacity to convey your knowledge is the ultimate test of it.',
 'The test of your knowledge is your ability to communicate it.',
 'Your capacity to convey your knowledge is the ultimate test of your knowledge.',
 'Your capacity to convey your knowledge to another is the ultimate test of your knowledge.',
 'Your capacity to convey your knowledge is the most important test of your knowledge.',
 'The test of your knowledge is how well you can convey it.',
 'Your capacity to convey your knowledge is the ultimate test.']

### like the t5 paraphraser

In [1]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

In [2]:
def set_seed(seed):
    """Seed torch's RNG and, when CUDA is available, the RNG of every CUDA device."""
    cuda_present = torch.cuda.is_available()
    torch.manual_seed(seed)
    if not cuda_present:
        return
    torch.cuda.manual_seed_all(seed)


set_seed(42)

In [3]:
# Tokenizer and model come from the same paraphrasing checkpoint.
tokenizer = PegasusTokenizer.from_pretrained('tuner007/pegasus_paraphrase')
model = PegasusForConditionalGeneration.from_pretrained('tuner007/pegasus_paraphrase')

# Pick the GPU when torch can see one, report the choice, then move the model there.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("device ", device)
model = model.to(device)

device  cuda


In [4]:
def paraphrase_question(text, top_k=50, top_p=0.95, num_return_sequences=3):
    """Sample paraphrases for a prompt and print the distinct ones.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: full prompt, e.g. ``"paraphrase: <question> </s>"``.
        top_k: top-k sampling cutoff passed to ``model.generate``.
        top_p: nucleus sampling cutoff passed to ``model.generate``.
        num_return_sequences: number of samples to draw.

    Returns:
        None. The original question and the de-duplicated paraphrases are printed.
    """
    max_len = 256

    # Recover the bare question from the prompt instead of reading a
    # notebook-global ``sentence``, so the function works on a fresh kernel.
    prefix, suffix = "paraphrase: ", " </s>"
    question = text
    if question.startswith(prefix):
        question = question[len(prefix):]
    if question.endswith(suffix):
        question = question[:-len(suffix)]

    # ``padding="longest"`` replaces the deprecated ``pad_to_max_length=True``.
    # NOTE(review): with no ``max_length`` given the two should be equivalent —
    # confirm against the installed transformers version.
    encoding = tokenizer.encode_plus(text, padding="longest", return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    # set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
    # set top_k = 120 and set top_p = 0.98 and num_return_sequences = 10
    # Outputs are drawn by sampling (do_sample=True), not by beam search.
    sampled_outputs = model.generate(
        input_ids=input_ids, attention_mask=attention_masks,
        do_sample=True,
        max_length=max_len,
        top_k=top_k,
        top_p=top_p,
        early_stopping=True,
        num_return_sequences=num_return_sequences
    )

    print("\nOriginal Question ::")
    print(question)
    print("\n")
    print("Paraphrased Questions :: ")
    final_outputs = []
    for sampled_output in sampled_outputs:
        sent = tokenizer.decode(sampled_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        # Keep a sample only if it differs from the question (case-insensitively)
        # and has not been seen before.
        if sent.lower() != question.lower() and sent not in final_outputs:
            final_outputs.append(sent)

    for i, final_output in enumerate(final_outputs):
        print("{}: {}".format(i, final_output))

In [6]:
# Question to paraphrase (an alternative example is kept commented out below).
sentence = "Which course should I take to get started in data science?"
# sentence = "What is the answer to the ultimate question of life, the universe and everything?"
# Wrap the question in the prompt format used by the paraphrasing cells.
text = f"paraphrase: {sentence} </s>"

paraphrase_question(
    text,
    top_k=50,
    top_p=0.95,
    num_return_sequences=10,
)


Original Question ::
Which course should I take to get started in data science?


Paraphrased Questions :: 
0: Which data science course should I take?


# t5 Paraphraser

In [1]:
import torch
from transformers import T5ForConditionalGeneration,T5Tokenizer

In [50]:
def set_seed(seed):
    """Seed torch's RNG and, when CUDA is available, the RNG of every CUDA device."""
    cuda_present = torch.cuda.is_available()
    torch.manual_seed(seed)
    if not cuda_present:
        return
    torch.cuda.manual_seed_all(seed)


set_seed(42)

In [3]:
# Tokenizer and model come from the same paraphrasing checkpoint.
tokenizer = T5Tokenizer.from_pretrained('ramsrigouthamg/t5_paraphraser')
model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_paraphraser')

# Pick the GPU when torch can see one, report the choice, then move the model there.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("device ", device)
model = model.to(device)

Some weights of the model checkpoint at ramsrigouthamg/t5_paraphraser were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading: 100%|██████████| 792k/792k [00:00<00:00, 1.45MB/s]
Downloading: 100%|██████████| 1.79k/1.79k [00:00<00:00, 2.06MB/s]
Downloading: 100%|██████████| 25.0/25.0 [00:00<00:00, 28.8kB/s]
device  cuda


In [35]:
def paraphrase_question(text, top_k=50, top_p=0.95, num_return_sequences=3):
    """Sample paraphrases for a prompt and print the distinct ones.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: full prompt, e.g. ``"paraphrase: <question> </s>"``.
        top_k: top-k sampling cutoff passed to ``model.generate``.
        top_p: nucleus sampling cutoff passed to ``model.generate``.
        num_return_sequences: number of samples to draw.

    Returns:
        None. The original question and the de-duplicated paraphrases are printed.
    """
    max_len = 256

    # Recover the bare question from the prompt instead of reading a
    # notebook-global ``sentence``, so the function works on a fresh kernel.
    prefix, suffix = "paraphrase: ", " </s>"
    question = text
    if question.startswith(prefix):
        question = question[len(prefix):]
    if question.endswith(suffix):
        question = question[:-len(suffix)]

    # ``padding="longest"`` replaces the deprecated ``pad_to_max_length=True``.
    # NOTE(review): with no ``max_length`` given the two should be equivalent —
    # confirm against the installed transformers version.
    encoding = tokenizer.encode_plus(text, padding="longest", return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    # set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
    # set top_k = 120 and set top_p = 0.98 and num_return_sequences = 10
    # Outputs are drawn by sampling (do_sample=True), not by beam search.
    sampled_outputs = model.generate(
        input_ids=input_ids, attention_mask=attention_masks,
        do_sample=True,
        max_length=max_len,
        top_k=top_k,
        top_p=top_p,
        early_stopping=True,
        num_return_sequences=num_return_sequences
    )

    print("\nOriginal Question ::")
    print(question)
    print("\n")
    print("Paraphrased Questions :: ")
    final_outputs = []
    for sampled_output in sampled_outputs:
        sent = tokenizer.decode(sampled_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        # Keep a sample only if it differs from the question (case-insensitively)
        # and has not been seen before.
        if sent.lower() != question.lower() and sent not in final_outputs:
            final_outputs.append(sent)

    for i, final_output in enumerate(final_outputs):
        print("{}: {}".format(i, final_output))

Try it on an example question

In [112]:
# Question to paraphrase (an alternative example is kept commented out below).
sentence = "Which course should I take to get started in data science?"
# sentence = "What is the answer to the ultimate question of life, the universe and everything?"
# Wrap the question in the prompt format used by the paraphrasing cells.
text = f"paraphrase: {sentence} </s>"

paraphrase_question(
    text,
    top_k=50,
    top_p=0.95,
    num_return_sequences=10,
)


Original Question ::
Which course should I take to get started in data science?


Paraphrased Questions :: 
0: What is the right course to study data science?
1: What courses should be taken to pursue data science?
2: How should I study data science in college?
3: What are the best courses for data science?
4: What are the best courses to learn data science?
5: How can I get started in data science?
6: What is the best online course I should go through to get a career in data science?
7: What are the best data science courses?
8: What are some good courses for data science?
9: What is the best place to get started with data science?


### like pegasus paraphraser

In [1]:
import torch
from transformers import T5ForConditionalGeneration,T5Tokenizer

In [2]:
model_name = 'ramsrigouthamg/t5_paraphraser'
# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    torch_device = 'cuda'
else:
    torch_device = 'cpu'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model = model.to(torch_device)

In [3]:
def get_response(input_text, num_return_sequences):
    """Paraphrase ``input_text`` with beam search and return the decoded candidates.

    Uses the notebook-level ``tokenizer``, ``model`` and ``torch_device``.

    Args:
        input_text: a single source text (str); truncated to 60 tokens.
        num_return_sequences: number of generated sequences to return.

    Returns:
        list[str]: decoded outputs with special tokens removed.
    """
    # Call the tokenizer directly; this replaces the deprecated
    # ``prepare_seq2seq_batch`` helper with the same arguments.
    batch = tokenizer(
        [input_text],
        truncation=True,
        padding='longest',
        max_length=60,
        return_tensors="pt",
    ).to(torch_device)
    # NOTE(review): ``temperature`` is passed without ``do_sample=True``;
    # presumably it has no effect on plain beam search — confirm.
    translated = model.generate(
        **batch,
        max_length=60,
        num_beams=10,
        num_return_sequences=num_return_sequences,
        temperature=1.5,
    )
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text

In [4]:
# Ask for ten paraphrase candidates of one example sentence.
context = "The ultimate test of your knowledge is your capacity to convey it to another."
get_response(context, num_return_sequences=10)

['The ultimate test of your knowledge is your capacity to convey it to another.',
 'The ultimate test of your knowledge is your capacity to convey it to another person.',
 'The ultimate test of your knowledge is your ability to convey it to another.',
 'The ultimate test of knowledge is your capacity to convey it to another.',
 'The ultimate test of your knowledge is your ability to convey it to another person.',
 'The ultimate test of your knowledge is your capacity to transmit it to another.',
 'The ultimate test of knowledge is your capacity to convey it to another person.',
 'What is the ultimate test of your knowledge is your capacity to convey it to another.',
 'What is the ultimate test of your knowledge is your capacity to convey it to another person.',
 'The ultimate test of knowledge is your ability to convey it to another.']

## Notes

Overall: investigate language models for conditional generation.

### Test for generate function of BART

The `generate` function is inherited from `GenerationMixin` via `PreTrainedModel`.

In [None]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

In [None]:
# TODO: `model_name` is an empty placeholder — fill in a BART checkpoint name
# before running; both from_pretrained calls below receive this value.
model_name = ''
# Use the GPU when torch reports one, otherwise the CPU.
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name).to(torch_device)

In [None]:
context = "The ultimate test of your knowledge is your capacity to convey it to another."
# NOTE(review): get_response is not defined in this section — the call relies on
# a definition from an earlier cell still being present in the kernel.
get_response(context,10)

### Test these paraphrasers on some sample data

### the sample paragraphs

In [2]:
import spacy

# Load the small English pipeline with the parser, tagger and NER disabled,
# then add a "sentencizer" component (presumably so that `doc.sents` works
# without the parser — confirm).
# NOTE(review): passing a component object from `create_pipe` to `add_pipe`
# looks like the spaCy v2 API — confirm the installed spaCy version.
nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger", "ner"])
nlp.add_pipe(spacy.lang.en.English().create_pipe("sentencizer"))

def sentencizer(text):
    """Run ``text`` through the notebook-level ``nlp`` pipeline and return its sentences.

    Returns:
        list: the items yielded by ``doc.sents``, in order.
    """
    doc = nlp(text)
    return [sent for sent in doc.sents]

In [1]:
# wikipedia: 146195-ORIG-10.txt
paragrpah_1 = """Robinson built a record of 40–0 before losing for the first time to LaMotta in a 10-round re-match. LaMotta, who had a weight advantage over Robinson, knocked Robinson out of the ring in the eighth round, and won the fight by decision. The fight took place in Robinson's former home town of Detroit, and attracted a record crowd. After being controlled by Robinson in the early portions of the fight, LaMotta came back to take control in the later rounds. After winning the third LaMotta fight less than three weeks later, Robinson then defeated his childhood idol: former champion Henry Armstrong. Robinson fought Armstrong only because Armstrong was in need of money. By now Armstrong was an old fighter, and Robinson later stated that he carried Armstrong."""

# wikipedia: 15738365-ORIG-6.txt
paragrpah_2 = """Paper shortages caused by the war forced the magazine to a bimonthly schedule in 1942, and only four more issues appeared. The last issue was dated September–October 1943. Over the lifetime of the magazine its focus shifted from weird fiction to include both science fiction and fantasy. The magazine is now hard to find and complete runs are very rare."""

# wikipedia: 27509373-ORIG-28.txt
paragrpah_3 = """The tale was also apparently popular in later times, and became the subject of a number of independent poems; none appears to have been directly based on the surviving text, however, suggesting that other versions of the same tale served as their inspiration. The first poem is appended to the old text in the Book of Leinster, Harley 5280 and H.3.18. Its author used the opportunity to display his knowledge of the names of Irish heroes in general, in which he does not confine himself to characters of the story. The second poem follows the first in Harley 5280, and appears also in three other manuscripts: the Book of Lecan, Laud 610 in the Bodelian Library and lastly in the Stowe manuscript collection.
In these two poems, the real hero is not the dog Ailbe but the pig – the latter being "practically a panegyric on the pig" – although the story's title implies this may have been an original feature. In the unusual choice of a pig as the main protagonist, the story becomes associated with a broader Celtic tradition, including the wild boar motif of Arthurian legend. The wild boar Twrch Trwyth, for example, causes considerable difficulty for the men of Arthur's court in the 11th-century Welsh story of "Culhwch and Olwen"; while the exposition of the geographical details of Ailbe's route in the rout of the Connachta in the Irish tale has parallels with the route taken by Twrch Trwyth in the Welsh hunt. Indeed, Thurneysen notes that in relation to Mac Da Thó's pig, the poets use the words "torc" (boar) and "muc" (pig) interchangeably."""

### Pegasus

In [3]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

model_name = 'tuner007/pegasus_paraphrase'
# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    torch_device = 'cuda'
else:
    torch_device = 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
model = model.to(torch_device)

In [2]:
def paraphrase_pegasus(input_text, num_return_sequences):
    """Paraphrase ``input_text`` with beam search and return the decoded candidates.

    Uses the notebook-level ``tokenizer``, ``model`` and ``torch_device``.

    Args:
        input_text: a single source text (str); truncated to 60 tokens.
        num_return_sequences: number of generated sequences to return.

    Returns:
        list[str]: decoded outputs with special tokens removed.
    """
    # Call the tokenizer directly; this replaces the deprecated
    # ``prepare_seq2seq_batch`` helper with the same arguments.
    batch = tokenizer(
        [input_text],
        truncation=True,
        padding='longest',
        max_length=60,
        return_tensors="pt",
    ).to(torch_device)
    # NOTE(review): ``temperature`` is passed without ``do_sample=True``;
    # presumably it has no effect on plain beam search — confirm.
    translated = model.generate(
        **batch,
        max_length=60,
        num_beams=10,
        num_return_sequences=num_return_sequences,
        temperature=1.5,
    )
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text

In [30]:
# Paraphrase paragraph 1 sentence by sentence, echoing each source sentence
# next to its single best paraphrase and collecting the paraphrases.
spun_text_1 = []
for sentence in sentencizer(paragrpah_1):
    print(sentence.text)
    candidates = paraphrase_pegasus(sentence.text, num_return_sequences=1)
    spun_sentence = candidates[0]
    print(f' -> {spun_sentence}')
    spun_text_1 += [spun_sentence]

Robinson built a record of 40–0 before losing for the first time to LaMotta in a 10-round re-match.
 -> Robinson lost for the first time to LaMotta in a 10-round re- match.
LaMotta, who had a weight advantage over Robinson, knocked Robinson out of the ring in the eighth round, and won the fight by decision.
 -> LaMotta knocked Robinson out of the ring in the eighth round and won the fight by decision.
The fight took place in Robinson's former home town of Detroit, and attracted a record crowd.
 -> A record crowd watched the fight in Robinson's hometown of Detroit.
After being controlled by Robinson in the early portions of the fight, LaMotta came back to take control in the later rounds.
 -> After being controlled by Robinson in the early parts of the fight, LaMotta came back to take control in the later rounds.
After winning the third LaMotta fight less than three weeks later, Robinson then defeated his childhood idol:
 -> Robinson defeated his childhood hero, LaMotta, less than three

In [28]:
# Print the single best paraphrase for each sentence of paragraph 2.
for sentence in sentencizer(paragrpah_2):
    best_paraphrase = paraphrase_pegasus(sentence.text, num_return_sequences=1)[0]
    print(best_paraphrase)

Paper shortages caused the magazine to be on a bimonthly schedule in 1942.
The last issue was published in 1943.
Science fiction and fantasy were included in the magazine's focus over the course of its lifetime.
The magazine is hard to find and complete runs are very rare.


In [29]:
# Print the single best paraphrase for each sentence of paragraph 3.
for sentence in sentencizer(paragrpah_3):
    best_paraphrase = paraphrase_pegasus(sentence.text, num_return_sequences=1)[0]
    print(best_paraphrase)

The tale became the subject of a number of independent poems.
Other versions of the same tale served as their inspiration, as none appears to have been directly based on the surviving text.
The first poem is appended to the old text.
The author was able to show his knowledge of the names of Irish heroes in general because he did not limit himself to the characters of the story.
The second poem is in three other manuscripts.
The Book of Lecan, Laud 610 and the manuscript collection are in the Bodelian Library.
The real hero in these two poems is not the dog Ailbe but the pig, although the story's title suggests this may have been an original feature.
In the unusual choice of a pig as the main character, the story becomes associated with a broader Celtic tradition.
The men of Arthur's court in the 11th-century Welsh story of "Culhwch and Olwen" were not easy to get along with.
While the geographical details of Ailbe's route in the Connachta in the Irish tale have parallels with the route

### paraphrase sentence by sentence

In [4]:
def paraphrase_pegasus_sentence_wise(text, **model_args):
    """Paraphrase ``text`` one sentence at a time and print the results.

    Uses the notebook-level ``sentencizer``, ``tokenizer``, ``model`` and
    ``torch_device``.

    Args:
        text: input paragraph; it is split with ``sentencizer``.
        **model_args: extra keyword arguments forwarded to ``model.generate``.
            ``min_length`` / ``max_length`` given here override the defaults
            computed below.

    Returns:
        None. All generated sequences are printed, separated by blank lines.
    """
    tgt_texts = []
    for sentence in sentencizer(text):
        # Call the tokenizer directly; this replaces the deprecated
        # ``prepare_seq2seq_batch`` helper with the same arguments.
        batch = tokenizer(
            [sentence.text],
            truncation=True,
            padding='longest',
            max_length=60,
            return_tensors="pt",
        ).to(torch_device)
        # Require at least half as many output tokens as the (single) input has.
        min_len = batch['input_ids'].shape[-1] // 2
        # Defaults first, so caller-supplied values win instead of raising a
        # duplicate-keyword TypeError.
        generate_kwargs = {"min_length": min_len, "max_length": 60, **model_args}
        translated = model.generate(**batch, **generate_kwargs)
        tgt_texts.extend(tokenizer.batch_decode(translated, skip_special_tokens=True))

    print('\n\n'.join(tgt_texts))

In [21]:
paraphrase_pegasus_sentence_wise(
    paragrpah_1,
    num_beams=10, # number of beams for beam search
    top_k=50, # number of highest-probability tokens to keep for top-k filtering
    top_p=1.0, # if < 1, only the most probable tokens whose probabilities add up to this value or higher are kept for generation
    num_return_sequences=1,
    temperature=1.5,
    length_penalty=2.0, # length penalty for beam search; NOTE(review): presumably values > 1.0 favor longer texts — confirm
    encoder_no_repeat_ngram_size=10 # no n-gram of this size or longer from the encoder_input_ids can occur in the decoder_input_ids
)

Robinson lost for the first time in his career in a 10-round re- match against LaMotta.

Robinson was knocked out of the ring in the eighth round by LaMotta, but the fight was decided by a single point.

Robinson's former home town of Detroit attracted a record crowd for the fight, which took place in the middle of the night.

In the later rounds, LaMotta came back to take control of the fight after being controlled by Robinson in the early part of the fight.

Robinson defeated his childhood hero, LaMotta, less than three weeks after winning the third fight.

The former champion was Henry Armstrong.

Robinson was the one who fought Armstrong because he was in need of money.

Robinson later stated that he was the one who carried the old fighter, by now he was an old fighter, and he was an old fighter, by now he was an old fighter, and he was an old fighter, by now he was an old fighter, and he was an old fighter


In [31]:
paraphrase_pegasus_sentence_wise(
    paragrpah_1,
    num_beams=10, # number of beams for beam search
    top_k=50, # number of highest-probability tokens to keep for top-k filtering
    top_p=1.0, # if < 1, only the most probable tokens whose probabilities add up to this value or higher are kept for generation
    num_return_sequences=5,
    temperature=1.5,
    length_penalty=1.0, # length penalty for beam search; NOTE(review): presumably values > 1.0 favor longer texts — confirm
    encoder_no_repeat_ngram_size=5 # no n-gram of this size or longer from the encoder_input_ids can occur in the decoder_input_ids
)

Robinson built a 40–0 record before losing to LaMotta for the first time in a 10 round re- match.

Robinson lost for the first time in his career in a 10 round re- match against LaMotta.

Robinson built a 40–0 record before losing to LaMotta for the first time in a re- match.

Robinson lost for the first time in his career in a 10 round re- match to LaMotta.

Robinson built a 40–0 record before losing to LaMotta for the first time in a 10- round re- match.

Robinson was knocked out in the eighth round by LaMotta, but the fight was decided by a single point.

Robinson was knocked out in the eighth round by LaMotta, but the fight was decided by the judges.

Robinson was knocked out in the eighth round by LaMotta who won the fight by a decision.

Robinson was knocked out in the eighth round by LaMotta, but he won the fight by a decision.

Robinson was knocked out in the eighth round by LaMotta, and the fight was decided by a single point.

A record crowd watched the fight in Robinson's ho

### t5

In [1]:
import torch
from transformers import T5ForConditionalGeneration,T5Tokenizer

model_name = 'ramsrigouthamg/t5_paraphraser'
# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    torch_device = 'cuda'
else:
    torch_device = 'cpu'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model = model.to(torch_device)

In [2]:
def set_seed(seed):
    """Seed torch's RNG and, when CUDA is available, the RNG of every CUDA device."""
    cuda_present = torch.cuda.is_available()
    torch.manual_seed(seed)
    if not cuda_present:
        return
    torch.cuda.manual_seed_all(seed)


set_seed(42)

In [7]:
def paraphrase_t5(input_text,num_return_sequences):
    """Generate paraphrases of ``input_text`` with the T5 model using beam search.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``torch_device``.

    Args:
        input_text: The text to paraphrase. It is tokenized as a batch of one
            and truncated to at most 512 tokens.
        num_return_sequences: How many generated sequences to return.

    Returns:
        A list of decoded strings (special tokens removed), as produced by
        ``tokenizer.batch_decode``.
    """
    # Calling the tokenizer directly replaces the deprecated
    # ``tokenizer.prepare_seq2seq_batch`` with the same arguments.
    batch = tokenizer(
        [input_text],
        truncation=True,
        padding='longest',
        max_length=512,
        return_tensors="pt",
    ).to(torch_device)
    translated = model.generate(
        **batch,
        max_length=512,
        num_beams=10,
        num_return_sequences=num_return_sequences,
        # NOTE(review): do_sample is not enabled here; temperature presumably
        # only affects sampling-based generation — confirm it has any effect.
        temperature=1.5,
    )
    return tokenizer.batch_decode(translated, skip_special_tokens=True)

-------------------------------------------------------------

In [19]:
input_text = paragrpah_1
# Tokenize the paragraph as a batch of one (truncated to 256 tokens) and move it to torch_device.
# Calling the tokenizer directly replaces the deprecated `tokenizer.prepare_seq2seq_batch`.
batch = tokenizer(
    [input_text],
    truncation=True,
    padding='longest',
    max_length=256,
    return_tensors="pt",
).to(torch_device)
# batch['input_ids'].flatten().shape[0]  -> total number of input ids in the batch
batch

{'input_ids': tensor([[17461,  1192,     3,     9,  1368,    13,  1283,   104,   632,   274,
          5489,    21,     8,   166,    97,    12,   325,   329,    32,    17,
            17,     9,    16,     3,     9,  9445,  7775,     3,    60,    18,
         19515,     5,   325,   329,    32,    17,    17,     9,     6,   113,
           141,     3,     9,  1293,  2337,   147, 17461,     6,  7673,    15,
            26, 17461,    91,    13,     8,     3,  1007,    16,     8, 21227,
          1751,     6,    11,   751,     8,  2870,    57,  1357,     5,    37,
          2870,   808,   286,    16, 17461,    31,     7,  1798,   234,  1511,
            13, 11901,     6,    11,    44, 11674,     3,     9,  1368,  4374,
             5,   621,   271,  6478,    57, 17461,    16,     8,   778, 17622,
            13,     8,  2870,     6,   325,   329,    32,    17,    17,     9,
           764,   223,    12,   240,   610,    16,     8,   865, 14419,     5,
           621,  3447,     8,  1025,  

In [24]:
# NOTE(review): `text` (the input wrapped in a "paraphrase: " prefix and " </s>" suffix) is
# assigned here but not used below — encode_plus receives the bare `input_text`.
text = "paraphrase: " + input_text + " </s>"
# Encode the bare input as 'pt' tensors and move them to torch_device.
# NOTE(review): `pad_to_max_length` is presumably deprecated in favor of `padding=` — confirm
# against the installed transformers version.
encoding = tokenizer.encode_plus(input_text, pad_to_max_length=True, return_tensors='pt').to(torch_device)
encoding

{'input_ids': tensor([[17461,  1192,     3,     9,  1368,    13,  1283,   104,   632,   274,
          5489,    21,     8,   166,    97,    12,   325,   329,    32,    17,
            17,     9,    16,     3,     9,  9445,  7775,     3,    60,    18,
         19515,     5,   325,   329,    32,    17,    17,     9,     6,   113,
           141,     3,     9,  1293,  2337,   147, 17461,     6,  7673,    15,
            26, 17461,    91,    13,     8,     3,  1007,    16,     8, 21227,
          1751,     6,    11,   751,     8,  2870,    57,  1357,     5,    37,
          2870,   808,   286,    16, 17461,    31,     7,  1798,   234,  1511,
            13, 11901,     6,    11,    44, 11674,     3,     9,  1368,  4374,
             5,   621,   271,  6478,    57, 17461,    16,     8,   778, 17622,
            13,     8,  2870,     6,   325,   329,    32,    17,    17,     9,
           764,   223,    12,   240,   610,    16,     8,   865, 14419,     5,
           621,  3447,     8,  1025,  

In [28]:
# Sanity check: print 'yes' when encode_plus and the seq2seq batch produced the same input ids.
if (encoding['input_ids'] == batch['input_ids']).all():
    print('yes')

yes


In [30]:
# Generate one sampled output sequence for the encoded input and show the shape of the result.
translated = model.generate(
    **encoding,
    do_sample=True,  # sample instead of decoding greedily
    # min_length=140,  (disabled: no minimum output length is requested)
    max_length=256,
    top_k=120,
    top_p=0.98,
    early_stopping=True,  # NOTE(review): num_beams is not set here; presumably only relevant to beam search — confirm
    num_return_sequences=1)
translated.shape

torch.Size([1, 75])

In [33]:
# Decode the generated ids to text; flatten() turns the single-row `translated` tensor
# (num_return_sequences=1 above) into a 1-D id sequence.
tokenizer.decode(translated.flatten(), skip_special_tokens=True, clean_up_tokenization_spaces=True)

"Woodrow Wilson defeated Henry Armstrong. Robinson then was ruled out for good. Two days later, Wilson fought Armstrong again. Robinson had been the owner of Woodrow Wilson's home gym for several years, and Wilson was a good father to his daughter. Robinson was in a lot of trouble in the fight and had a lot of luck."

In [31]:
# Display the source paragraph for comparison with the generated text.
paragrpah_1

"Robinson built a record of 40–0 before losing for the first time to LaMotta in a 10-round re-match. LaMotta, who had a weight advantage over Robinson, knocked Robinson out of the ring in the eighth round, and won the fight by decision. The fight took place in Robinson's former home town of Detroit, and attracted a record crowd. After being controlled by Robinson in the early portions of the fight, LaMotta came back to take control in the later rounds. After winning the third LaMotta fight less than three weeks later, Robinson then defeated his childhood idol: former champion Henry Armstrong. Robinson fought Armstrong only because Armstrong was in need of money. By now Armstrong was an old fighter, and Robinson later stated that he carried Armstrong."

In [34]:
# Decode every row of `translated` at once; returns a list of strings with special tokens removed.
tokenizer.batch_decode(translated, skip_special_tokens=True)

["Woodrow Wilson defeated Henry Armstrong. Robinson then was ruled out for good. Two days later, Wilson fought Armstrong again. Robinson had been the owner of Woodrow Wilson's home gym for several years, and Wilson was a good father to his daughter. Robinson was in a lot of trouble in the fight and had a lot of luck."]

----------------------------------

In [8]:
# Paraphrase the whole paragraph in a single call, requesting one output sequence.
paraphrase_t5(paragrpah_1, 1)

["Robinson built a record of 40–0 before losing for the first time to LaMotta in a 10-round re-match. LaMotta knocked Robinson out of the ring in the eighth round, and won the fight by decision. The fight took place in Robinson's former home town of Detroit, and attracted a record crowd."]

In [9]:
# Paraphrase paragrpah_1 one sentence at a time, printing each original sentence
# followed by its paraphrase and collecting the paraphrases in spun_text_1.
spun_text_1 = []
for sentence in sentencizer(paragrpah_1):
    print(sentence.text)
    # paraphrase_t5 returns a list; with one requested sequence, take its only element.
    spun_sentence = paraphrase_t5(sentence.text, 1)[0]
    print(f' -> {spun_sentence}')
    spun_text_1.append(spun_sentence)

Robinson built a record of 40–0 before losing for the first time to LaMotta in a 10-round re-match.
 -> Robinson built a record of 40–0 before losing for the first time to LaMotta in a 10-round re-match.
LaMotta, who had a weight advantage over Robinson, knocked Robinson out of the ring in the eighth round, and won the fight by decision.
 -> Robinson, who had a weight advantage over LaMotta, knocked Robinson out of the ring in the eighth round, and won the fight by decision.
The fight took place in Robinson's former home town of Detroit, and attracted a record crowd.
 -> The fight took place in Robinson's former home town of Detroit, and attracted a record crowd.
After being controlled by Robinson in the early portions of the fight, LaMotta came back to take control in the later rounds.
 -> After being controlled by Robinson in the early portions of the fight, LaMotta came back to take control in the later rounds.
After winning the third LaMotta fight less than three weeks later, Robin

In [10]:
# Print a single paraphrase for each sentence of paragrpah_2.
for sentence in sentencizer(paragrpah_2):
    print(paraphrase_t5(sentence.text, 1)[0])

Paper shortages caused by the war forced the magazine to a bimonthly schedule in 1942, and only four more issues appeared.
The last issue was dated September–October 1943.
Over the lifetime of the magazine its focus shifted from weird fiction to include both science fiction and fantasy.
The magazine is now hard to find and complete runs are very rare.


In [11]:
# Print a single paraphrase for each sentence of paragrpah_3.
for sentence in sentencizer(paragrpah_3):
    print(paraphrase_t5(sentence.text, 1)[0])

The tale was also apparently popular in later times, and became the subject of a number of independent poems.
None appears to have been directly based on the surviving text, however, suggesting that other versions of the same tale served as their inspiration.
The first poem is appended to the old text in the Book of Leinster, Harley 5280 and H.3.18.
Its author used the opportunity to display his knowledge of the names of Irish heroes in general, in which he does not confine himself to characters of the story.
The second poem follows the first in Harley 5280, and appears also in three other manuscripts:
The Book of Lecan, Laud 610 in the Bodelian Library and lastly in the Stowe manuscript collection.
In these two poems, the real hero is not the dog Ailbe but the pig – the latter being "practically a panegyric on the pig" – although the story's title implies this may have been an original feature.
In the unusual choice of a pig as the main protagonist, the story becomes associated with a

# T5 generation-parameter experiments

In [32]:
# Reference values noted by the author: num_beams=1, top_k=50, top_p=1.0, num_return_sequences=1, temperature=1.0
def paraphrase_t5(input_text, **model_args):
    """Sample paraphrases of ``input_text`` with the T5 model and print them.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``torch_device``.

    NOTE: this definition shadows the earlier
    ``paraphrase_t5(input_text, num_return_sequences)``; once this cell has
    run, ``num_return_sequences`` must be passed by keyword.

    Args:
        input_text: The text to paraphrase. It is tokenized as a batch of one
            and truncated to at most 512 tokens.
        **model_args: Extra keyword arguments forwarded to ``model.generate``.
            They override the defaults set here (``do_sample=True``,
            ``early_stopping=True``, ``min_length`` = half the input token
            count, ``max_length=256``); previously passing any of those four
            raised a TypeError for a duplicated keyword.

    Returns:
        None. The effective minimum length, the shape of the generated tensor
        and the decoded paraphrases (separated by blank lines) are printed.
    """
    # Calling the tokenizer directly replaces the deprecated
    # ``tokenizer.prepare_seq2seq_batch`` with the same arguments.
    batch = tokenizer(
        [input_text],
        truncation=True,
        padding='longest',
        max_length=512,
        return_tensors="pt",
    ).to(torch_device)

    # The batch holds a single sequence, so the last dimension is its token
    # count; require the output to be at least half as long as the input.
    min_len = batch['input_ids'].shape[-1] // 2

    generation_args = {
        'do_sample': True,
        'early_stopping': True,
        'min_length': min_len,
        'max_length': 256,
    }
    # Caller-supplied settings take precedence over the defaults above.
    generation_args.update(model_args)

    translated = model.generate(**batch, **generation_args)

    print(f"min_len:{generation_args['min_length']} | {translated.shape}")

    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)

    print('\n\n'.join(tgt_text))
    # The paraphrases are printed rather than returned (`return tgt_text` was left disabled).

In [43]:
# Display the source paragraph that the experiments below paraphrase.
paragrpah_1

"Robinson built a record of 40–0 before losing for the first time to LaMotta in a 10-round re-match. LaMotta, who had a weight advantage over Robinson, knocked Robinson out of the ring in the eighth round, and won the fight by decision. The fight took place in Robinson's former home town of Detroit, and attracted a record crowd. After being controlled by Robinson in the early portions of the fight, LaMotta came back to take control in the later rounds. After winning the third LaMotta fight less than three weeks later, Robinson then defeated his childhood idol: former champion Henry Armstrong. Robinson fought Armstrong only because Armstrong was in need of money. By now Armstrong was an old fighter, and Robinson later stated that he carried Armstrong."

In [83]:
# Five returned sequences with no further settings (paraphrase_t5 itself sets do_sample=True).
paraphrase_t5(paragrpah_1, num_return_sequences=5)

min_len:90 | torch.Size([5, 110])
Robinson and LaMotta met on the ring in Cincinnati, Ohio between 1987-1991 and won the fight by unanimous decision. Robinson had already smoked Armstrong's first and third ring fight before that. Robinson lost after the first three rounds after Wilson's knockout. He couldn't afford to fight Armstrong in the ring because he needed money and didn't have a weight advantage over Robinson for that contest.

When Robinson won in his last fight with LaMotta, he built a record of 40–0 before being knocked out of the ring in eighth round and winning by decision. After Armstrong was an old fighter, Robinson then defeated and replaced Armstrong. Even though Armstrong is no longer in a ring, he proved that he was in good health. So, Robinson fought on and defeated Armstrong for the rest of his career.

Originally a fighter, Robinson fought the former champion Henry Armstrong for money. After losing the first round, Robinson lost by decision to LaMotta. LaMotta won

In [84]:
# Five returned sequences using 10 beams (paraphrase_t5 itself sets do_sample=True).
paraphrase_t5(paragrpah_1, num_beams=10, num_return_sequences=5)

min_len:90 | torch.Size([5, 108])
Robinson built a record of 40–0 before losing for the first time to LaMotta in a 10-round re-match. LaMotta knocked Robinson out of the ring in the eighth round, and won the fight by decision. The fight took place in Robinson's former home town of Detroit, and attracted a record crowd. Robinson later stated that he carried Armstrong.

Robinson built a record of 40–0 before losing for the first time to LaMotta in a 10-round re-match. LaMotta knocked Robinson out of the ring in the eighth round, and won the fight by decision. The fight took place in Robinson's former home town of Detroit, and attracted a record crowd. Robinson later stated that he carried Armstrong.

Robinson fought Henry Armstrong only because he was in need of money. By now Armstrong was an old fighter, and Robinson later stated that he carried Armstrong. The fight took place in Robinson's former home town of Detroit, and attracted a record crowd. Robinson then lost for the first time 

In [85]:
# Five returned sequences with the temperature raised to 1.5 and no beam setting.
paraphrase_t5(paragrpah_1, num_return_sequences=5, temperature=1.5)

min_len:90 | torch.Size([5, 115])
As a child (22 years ago), Robinson was so ashamed to be so upset him and wanted to stop and go home: 'Ronan will be here when I will come back for more time.' The final fight had been postponed due to LaMotte's lacker's. When he appeared for despite being under pay obscene as obscene, he lost. And now is his dead body'. Robinson lost three fights later, in an altercation between Rob Robinson and

Before Robinson retired from boxing Robinson went to his old favourite. In an apparent frank outfoxing, Henry Armstrong won the second fight. This time he attacked into a guy about the old Man. Why? Robinson won the fight in August in Louisville. When Robinson died as a result of an undereye fall at age 20, he would never have stopped lifting him. Now only. Robinson now makes $60,000 yearly. Who won at the end of October?

After losing to Bobby McCaure, Robinson fightd Robinson twice the number of rounds in the top four. Robinson won only because Armstrong st

In [79]:
# Five returned sequences combining 10 beams with temperature 1.5.
paraphrase_t5(paragrpah_1, num_beams=10, num_return_sequences=5, temperature=1.5)

min_len:90 | torch.Size([5, 101])
Robinson lost in the first round to LaMotta, who had a weight advantage over Robinson in a 10-round re-match. The fight took place in Robinson's former home town of Detroit, and had a record crowd. Robinson then defeated his childhood idol, former champion Henry Armstrong, and Robinson fought Armstrong. By now Armstrong was an old fighter, and Robinson later stated that he carried Armstrong.

Robinson fought Henry Armstrong only because Armstrong was in need of money. By then Armstrong was an old fighter, and Robinson said that he fought Armstrong only because Armstrong was in need of money. The fight took place in Robinson's former home town of Detroit, and attracted a record crowd. Robinson then lost for the first time to LaMotta, who knocked Robinson out of the ring in the eighth round, and won the fight by decision.

Robinson lost to LaMotta in the eighth round. LaMotta had a weight advantage over Robinson, and Robinson had a record of 40–0 before 

In [37]:
# Paraphrase paragrpah_1 with every generation setting spelled out explicitly.
paraphrase_t5(
    paragrpah_1,
    num_beams=10, # number of beams for beam search
    top_k=50, # number of highest-probability tokens to keep for top-k filtering
    top_p=1.0, # if < 1, only the most probable tokens with probabilities that add up to this value or higher are kept for generation
    num_return_sequences=5,
    temperature=1.0,
    length_penalty=2.0, # penalty to produce longer texts
    encoder_no_repeat_ngram_size=10 # no n-gram of this size or longer from the encoder_input_ids can occur in the decoder_input_ids
)

min_len:90 | torch.Size([5, 115])
Robinson built a record of 40-0 before losing to LaMotta for the first time in a 10-round re-match in Detroit. Robinson fought Armstrong only because he was in need of money, and Robinson later stated that he fought Armstrong only because he was in need of money. The fight took place in Pittsburgh, and attracted a record crowd. The fight took place in Pittsburgh, and attracted a record crowd. Robinson then defeated his childhood idol, former champion Henry Armstrong.

Robinson fought Henry Armstrong only because he was in need of money. By then Armstrong was an old fighter, and Robinson fought Armstrong only because he was in need of money. After Robinson defeated Armstrong for the first time in a 10-round re-match, LaMotta knocked Robinson out of the ring, and won the fight by decision. LaMotta had a weight advantage over Robinson, who lost for the first time to Robinson in a 10-round re-match, and Robinson

Robinson fought Henry Armstrong only becaus

In [38]:
# Shorter inputs for the experiments below: the text matches the first four sentences
# of the paragrpah_1 value displayed earlier in this notebook.
test_text_1 = """Robinson built a record of 40–0 before losing for the first time to LaMotta in a 10-round re-match. LaMotta, who had a weight advantage over Robinson, knocked Robinson out of the ring in the eighth round, and won the fight by decision. The fight took place in Robinson's former home town of Detroit, and attracted a record crowd. After being controlled by Robinson in the early portions of the fight, LaMotta came back to take control in the later rounds."""

# The text matches the remaining three sentences of that paragraph.
test_text_2="""After winning the third LaMotta fight less than three weeks later, Robinson then defeated his childhood idol: former champion Henry Armstrong. Robinson fought Armstrong only because Armstrong was in need of money. By now Armstrong was an old fighter, and Robinson later stated that he carried Armstrong."""

In [41]:
# Paraphrase the shorter test_text_1 with every generation setting spelled out explicitly.
paraphrase_t5(
    test_text_1,
    num_beams=10, # number of beams for beam search
    top_k=50, # number of highest-probability tokens to keep for top-k filtering
    top_p=1.0, # if < 1, only the most probable tokens with probabilities that add up to this value or higher are kept for generation
    num_return_sequences=5,
    temperature=1.5,
    length_penalty=2.0, # penalty to produce longer texts
    encoder_no_repeat_ngram_size=20 # no n-gram of this size or longer from the encoder_input_ids can occur in the decoder_input_ids
)

min_len:60 | torch.Size([5, 96])
Robinson lost for the first time to LaMotta in a 10-round rematch against Robinson in Detroit. Robinson built a record of 40–0 before losing for the first time to Robinson in the ring in the eighth round, and won the fight by decision. Robinson's opponent had a weight advantage over Robinson, who knocked Robinson out of the ring in the ninth round, and won the fight by decision.

Robinson built a record of 40-0 before losing to LaMotta in a 10-round re-match. Robinson lost for the first time in the ring to LaMotta in the ninth round, and won the fight by decision. After being controlled in the early portions of the fight, Robinson came back to take control in the later rounds.

Robinson lost in a 10-round re-match against LaMotta in Detroit, and built a record of 40-0 before losing for the first time. LaMotta had a weight advantage over Robinson and won the fight by decision. After being controlled by Robinson in the early portions of the fight, Robinso

In [42]:
# Paraphrase the shorter test_text_2 with every generation setting spelled out explicitly.
paraphrase_t5(
    test_text_2,
    num_beams=10, # number of beams for beam search
    top_k=50, # number of highest-probability tokens to keep for top-k filtering
    top_p=1.0, # if < 1, only the most probable tokens with probabilities that add up to this value or higher are kept for generation
    num_return_sequences=5,
    temperature=1.5,
    length_penalty=2.0, # penalty to produce longer texts
    encoder_no_repeat_ngram_size=20 # no n-gram of this size or longer from the encoder_input_ids can occur in the decoder_input_ids
)

min_len:30 | torch.Size([5, 55])
Robinson fought Henry Armstrong in the third LaMotta fight less than two weeks later, and Robinson then defeated his childhood idol, Robinson. Robinson fought only because Armstrong was in need of money, and Robinson later stated that he carried Armstrong.

After winning the third fight less than three weeks later, Robinson then defeated his childhood idol, former champion Henry Armstrong. Robinson fought Armstrong only because Armstrong was in need of money. Robinson later stated that he carried Armstrong.

After Robinson defeated LaMotta, Robinson then defeated his childhood idol Henry Armstrong. Robinson fought Armstrong only because Armstrong was in need of money. Robinson later stated that he carried Armstrong.

After winning the third fight less than three weeks later, Robinson then defeated his childhood idol, former champion Henry Armstrong. Robinson fought Armstrong only because Armstrong was in need of money. Robinson later stated that he carr