<a href="https://colab.research.google.com/github/RealAntonVoronov/computational_humor/blob/master/Paraphrasers_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# The recorded install log below resolved transformers to 3.4.0.
%pip install -q transformers
%pip install -q flair

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/2c/4e/4f1ede0fd7a36278844a277f8d53c21f88f37f3754abf76a5d6224f76d4a/transformers-3.4.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 4.5MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 23.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 47.9MB/s 
Collecting tokenizers==0.9.2
[?25l  Downloading https://files.pythonhosted.org/packages/7c/a5/78be1a55b2ac8d6a956f0a211d372726e2b1dd2666bb537fea9b03abd62c/tokenizers-0.9.2-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K     

# Utils for statistics

In [61]:
import numpy as np
from flair.data import Sentence
from flair.embeddings import FlairEmbeddings
from sklearn.metrics.pairwise import cosine_similarity

def _default_flair_embeddings():
    """Return the default 'news-forward' Flair model, loading it on first use only."""
    if not hasattr(_default_flair_embeddings, "_model"):
        _default_flair_embeddings._model = FlairEmbeddings('news-forward')
    return _default_flair_embeddings._model


def embed_sentences(sents, model=None):
    """
    Embed every sentence as the mean of its token embeddings.

    sents : iterable of N sentences (strings)
    model : embedding model exposing `embed(list_of_Sentence)`; when None the
            default FlairEmbeddings('news-forward') model is loaded lazily once
            and reused, instead of being built when the function is defined
    return : float32 array [N, emb_dim]; a sentence without tokens maps to zeros
    """
    if model is None:
        model = _default_flair_embeddings()

    sentences = [Sentence(sent) for sent in sents]
    # Use the return value when the model provides one (as the original code did),
    # otherwise fall back to the list we passed in.
    # NOTE(review): the fallback assumes embed() annotates the sentences in place - confirm.
    embedded = model.embed(sentences) or sentences

    # Convert on the CPU with numpy, so the function depends neither on `torch`
    # being imported by some other cell nor on the embeddings living on a GPU.
    token_vectors = [
        [token.embedding.detach().cpu().numpy() for token in sentence]
        for sentence in embedded
    ]

    # Take the embedding width from the data; only when no token exists at all fall
    # back to the model's advertised width (2048 was the previously hard-coded value).
    emb_dim = next((vecs[0].shape[0] for vecs in token_vectors if vecs), None)
    if emb_dim is None:
        emb_dim = getattr(model, "embedding_length", 2048)

    vectorized_sents = np.zeros((len(token_vectors), emb_dim), dtype=np.float32)
    for row, vecs in enumerate(token_vectors):
        if vecs:
            vectorized_sents[row] = np.mean(vecs, axis=0)

    return vectorized_sents


def calc_stats(inp, preds, emb_model=None):
    """
    Score a group of paraphrases against their source sentence.

    inp : string, the source sentence
    preds : collection (list or set) with M >= 1 predictions (strings)
    emb_model : embedding model forwarded to `embed_sentences`; when None a
                FlairEmbeddings('news-forward') model is created on first use
                and cached on this function, instead of at definition time
    return : (similarity, variability) tuple of floats, where
             similarity  = mean cosine similarity between each prediction and `inp`
             variability = 1 - mean cosine similarity between each prediction and
                           the centroid (mean vector) of all predictions
    raises : ValueError if `preds` is empty
    """
    preds = list(preds)  # fix an order so sets and lists are handled alike
    if not preds:
        raise ValueError("calc_stats needs at least one prediction")

    if emb_model is None:
        if getattr(calc_stats, "_default_model", None) is None:
            calc_stats._default_model = FlairEmbeddings('news-forward')
        emb_model = calc_stats._default_model

    inp_vec = embed_sentences([inp], emb_model)
    preds_vec = embed_sentences(preds, emb_model)
    # The centroid does not depend on the individual prediction: compute it once.
    centroid = np.mean(preds_vec, axis=0)[None, :]

    avg_sims = float(cosine_similarity(preds_vec, inp_vec).mean(dtype=np.float64))
    closeness = float(cosine_similarity(preds_vec, centroid).mean(dtype=np.float64))

    return avg_sims, 1 - closeness

In [38]:
import re

# Test sentences, stored as one string in which runs of two or more spaces separate the sentences.
inputs = "and i think antonio adam begay should be dead                                                                           repent , bigot , lest you be strange fruit hanging from the poplar tree .                                               when i am unblocked i swear i will go fk your mother you big ass bes gay ass fkers motherfking my accounts !            up your ass is a joke of a play that nobody takes seriously .                                                           you sound like a real pc lefty if you buy into that garbage                                                             can some cunt tell me ?                                                                                                 i will kill you if you dont unblock me .                                                                                leave this website alone , and concentrate on adding more good information , instead of deleting the stuff that matters asshole ! !                                                                                                             he creates nothing and lives in his own world where he believes that his opinion is something other than the vomit he infects others with .                                                                                                     we need adults in the room , not lying fanatics ."
# Raw string: '\s' inside a normal string literal is an invalid escape sequence.
inputs = np.array(re.split(r'\s{2,}', inputs))

# Embedding model loaded once for the whole notebook.
emb_model = FlairEmbeddings('news-forward')

# Testing Models

## 1. https://huggingface.co/tuner007/pegasus_paraphrase

In [2]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Checkpoint id of the Pegasus paraphraser evaluated in this section.
model_name = 'tuner007/pegasus_paraphrase'
# NOTE: `tokenizer` and `model` are notebook-wide names; the later model sections rebind them.
tokenizer = PegasusTokenizer.from_pretrained(model_name)
# .cuda() moves the model to the GPU, so this cell needs a CUDA device.
model = PegasusForConditionalGeneration.from_pretrained(model_name).cuda()

In [63]:
def get_response(input_text, num_beams):
  """
  Paraphrase `input_text` with the notebook-level `tokenizer` and `model`.

  input_text : string to paraphrase (the batch is truncated to 60 tokens)
  num_beams : beam width; the same number of sequences is returned
  return : list of decoded paraphrases (strings)
  """
  # Follow the device the model already lives on; the previously referenced
  # `torch_device` is not defined anywhere in the notebook.
  device = next(model.parameters()).device
  batch = tokenizer.prepare_seq2seq_batch([input_text],truncation=True,padding='longest',max_length=60).to(device)
  # NOTE(review): temperature is passed without do_sample - confirm it has any effect under beam search.
  translated = model.generate(**batch,max_length=60,num_beams=num_beams, num_return_sequences=num_beams, temperature=1.5)
  tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
  return tgt_text


# Sweep the beam width and report similarity / variability for the first three inputs.
for num_beams in range(3, 10):
    print(num_beams)
    for sent in inputs[:3]:
        preds = get_response(sent, num_beams)
        for pred in preds:
            print(f'{sent} | {pred}')
        # Reuse the embedding model loaded in the setup cell instead of the default one.
        s, v = calc_stats(sent, preds, emb_model)
        print(f'num_beams : {num_beams}, similarity: {s}, variability : {v}')

3
and i think antonio adam begay should be dead | I think antonio adam begay is dead.
and i think antonio adam begay should be dead | I think antonio adam begay should be dead.
and i think antonio adam begay should be dead | I believe antonio adam begay should be dead.
num_beans : 3, similarity: 0.7507852713267008, variability : 0.015009403228759766
repent , bigot , lest you be strange fruit hanging from the poplar tree . | If you are hanging fruit from the tree, please repentance.
repent , bigot , lest you be strange fruit hanging from the poplar tree . | If you are hanging fruit from the poplar tree, you must repentance.
repent , bigot , lest you be strange fruit hanging from the poplar tree . | If you are hanging fruit from the poplar tree, you should apologize.
num_beans : 3, similarity: 0.7062910199165344, variability : 0.010246296723683712
when i am unblocked i swear i will go fk your mother you big ass bes gay ass fkers motherfking my accounts ! | I will go fk your mother if I a

## https://huggingface.co/Vamsi/T5_Paraphrase_Paws

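Verdict: poor. The sampled outputs below are near-verbatim copies of the input (similarity ≈ 0.91), so this model does almost no real paraphrasing.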

In [64]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Rebinds the notebook-wide `tokenizer` / `model` to the Vamsi/T5_Paraphrase_Paws checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")  
# .cuda() moves the model to the GPU, so this cell needs a CUDA device.
model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws").cuda()

In [67]:
# `num_beams` is used here as the number of sampled sequences requested per sentence.
for num_beams in range(3, 10):
    for sentence in inputs[:3]:
        text = "paraphrase: " + sentence

        encoding = tokenizer.encode_plus(text, padding='longest', return_tensors="pt")
        input_ids = encoding["input_ids"].cuda()
        attention_masks = encoding["attention_mask"].cuda()

        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_masks,
            max_length=256,
            do_sample=True,
            top_k=120,
            top_p=0.95,
            early_stopping=True,
            num_return_sequences=num_beams,
        )

        # A set keeps each distinct decoded sample only once.
        preds = {
            tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for output in outputs
        }

        for line in preds:
            print(f'{sentence} | {line}')

        s, v = calc_stats(sentence, preds)
        print(f'num_beams : {num_beams}, similarity: {s}, variability : {v}')

and i think antonio adam begay should be dead | and i think antonio adam begay should be dead?
and i think antonio adam begay should be dead | I think antonio adam begay should be dead
and i think antonio adam begay should be dead | i think antonio adam begay should be dead
num_beams : 3, similarity: 0.9137088656425476, variability : 0.05335623025894165
repent , bigot , lest you be strange fruit hanging from the poplar tree . | Repent, Bigot, that you may become strange fruit hanging from a Poplar tree.
repent , bigot , lest you be strange fruit hanging from the poplar tree . | Pray, bigot, lest you be strange fruit hanging from the poplar tree.
repent , bigot , lest you be strange fruit hanging from the poplar tree . | Repent, bigot, lest you be strange fruit hanging from the poplar tree.
num_beams : 3, similarity: 0.9070051908493042, variability : 0.02738255262374878
when i am unblocked i swear i will go fk your mother you big ass bes gay ass fkers motherfking my accounts ! | When i 

## https://huggingface.co/ceshine/t5-paraphrase-paws-msrp-opinosis

In [68]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Rebinds the notebook-wide `tokenizer` / `model` to the ceshine/t5-paraphrase-paws-msrp-opinosis checkpoint.
tokenizer = AutoTokenizer.from_pretrained("ceshine/t5-paraphrase-paws-msrp-opinosis")
# .cuda() moves the model to the GPU, so this cell needs a CUDA device.
model = AutoModelForSeq2SeqLM.from_pretrained("ceshine/t5-paraphrase-paws-msrp-opinosis").cuda()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1208.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1786.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=25.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891695056.0, style=ProgressStyle(descri…




In [76]:
# `num_beams` is used here as the number of sampled sequences requested for the sentence.
for num_beams in range(3, 10):
    for sentence in inputs[:1]:
        preds = set()
        text = "paraphrase: " + sentence

        encoding = tokenizer.encode_plus(text, padding='longest', return_tensors="pt")
        input_ids = encoding["input_ids"].cuda()
        attention_masks = encoding["attention_mask"].cuda()

        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_masks,
            max_length=256,
            do_sample=True,
            top_k=90,
            top_p=0.97,
            early_stopping=True,
            num_return_sequences=num_beams,
        )

        # Decode and print in one pass; only the first occurrence of a sample is kept.
        for output in outputs:
            line = tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            preds.add(line)

        for line in preds:
            print(f'{sentence} | {line}')

        s, v = calc_stats(sentence, preds)
        print(f'num_beams : {num_beams}, similarity: {s}, variability : {v}')

and i think antonio adam begay should be dead | i think antonio adam begay should die dead, and i think adam should be dead.
and i think antonio adam begay should be dead | and i think antonio adam begay should be dead.
and i think antonio adam begay should be dead | And i think that antonio adam begay should be dead.
num_beams : 3, similarity: 0.9053296248118082, variability : 0.04880114396413171
and i think antonio adam begay should be dead | i think antonio adam begay should be dead
and i think antonio adam begay should be dead | and i think antonio adam begay should be dead.
and i think antonio adam begay should be dead | and i think antonio adam begay should be dead,
and i think antonio adam begay should be dead | i think antonio adam begay should be dead.
num_beams : 4, similarity: 0.9767146855592728, variability : 0.01162286102771759
and i think antonio adam begay should be dead | and i think antonio adam begay should be dead, like i do adam begay.
and i think antonio adam begay

## https://huggingface.co/ceshine/t5-paraphrase-quora-paws

In [77]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Rebinds the notebook-wide `tokenizer` / `model` to the ceshine/t5-paraphrase-quora-paws checkpoint.
tokenizer = AutoTokenizer.from_pretrained("ceshine/t5-paraphrase-quora-paws")

# .cuda() moves the model to the GPU, so this cell needs a CUDA device.
model = AutoModelForSeq2SeqLM.from_pretrained("ceshine/t5-paraphrase-quora-paws").cuda()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1208.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1786.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=25.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891695056.0, style=ProgressStyle(descri…




In [81]:
# `num_beams` is used here as the number of sampled sequences requested for the sentence.
for num_beams in range(3, 10):
    for sentence in inputs[:1]:
        text = "paraphrase: " + sentence
        encoding = tokenizer.encode_plus(text, padding='longest', return_tensors="pt")

        outputs = model.generate(
            input_ids=encoding["input_ids"].cuda(),
            attention_mask=encoding["attention_mask"].cuda(),
            max_length=256,
            do_sample=True,
            top_k=100,
            top_p=0.93,
            early_stopping=True,
            num_return_sequences=num_beams,
        )

        # Collect the distinct decoded samples.
        preds = set()
        for output in outputs:
            preds.add(
                tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            )

        for line in preds:
            print(f'{sentence} | {line}')

        s, v = calc_stats(sentence, preds)
        print(f'num_beams : {num_beams}, similarity: {s}, variability : {v}')

and i think antonio adam begay should be dead | antonio adam begay should be dead, and I believe he's gay.
and i think antonio adam begay should be dead | i think antonio Adams should die.
and i think antonio adam begay should be dead | and i think Adam begay should be dead and antonio should die.
num_beams : 3, similarity: 0.8735963900883993, variability : 0.0485149621963501
and i think antonio adam begay should be dead | i think Adam begay should be dead and antonio should be dead.
and i think antonio adam begay should be dead | i think antonio adam begay should be dead.
and i think antonio adam begay should be dead | and i think antonio Adam begay should be dead.
and i think antonio adam begay should be dead | antonio adam begay should die and i think he should die.
num_beams : 4, similarity: 0.89930759370327, variability : 0.04551519453525543
and i think antonio adam begay should be dead | Adams Begay should be dead. and I think antonio Adams begay should be dead. ''
and i think an

## BART (PAWS-Quora-MSRP)

### Download data

Note that the MSRP data must be downloaded manually from https://msropendata.com/datasets/e235323f-f23c-4246-b2e6-27d7a654d6cc and placed in the `data` directory.

In [5]:
#!pip install simpletransformers

In [6]:
# Safe to re-run: -p tolerates an existing directory, -nc skips files that were already downloaded.
!mkdir -p data
!wget -nc https://storage.googleapis.com/paws/english/paws_wiki_labeled_final.tar.gz -P data
!tar -xvf data/paws_wiki_labeled_final.tar.gz -C data
!mv data/final/* data
!rm -r data/final

!wget -nc http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv -P data

--2020-11-03 05:07:50--  https://storage.googleapis.com/paws/english/paws_wiki_labeled_final.tar.gz
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.20.128, 74.125.195.128, 74.125.28.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.20.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4687157 (4.5M) [application/gzip]
Saving to: ‘data/paws_wiki_labeled_final.tar.gz’


2020-11-03 05:07:50 (209 MB/s) - ‘data/paws_wiki_labeled_final.tar.gz’ saved [4687157/4687157]

final/test.tsv
final/
final/train.tsv
final/dev.tsv
--2020-11-03 05:07:51--  http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv
Resolving qim.fs.quoracdn.net (qim.fs.quoracdn.net)... 151.101.1.2, 151.101.65.2, 151.101.129.2, ...
Connecting to qim.fs.quoracdn.net (qim.fs.quoracdn.net)|151.101.1.2|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 58176133 (55M) [text/tab-separated-values]
Saving to: ‘data/quora_duplicate_questi

### Utils

In [1]:
import warnings

import pandas as pd


def load_data(
    file_path, input_text_column, target_text_column, label_column, keep_label=1
):
    """
    Load paraphrase pairs from a tab-separated file.

    file_path : path of the TSV file
    input_text_column : name of the column holding the source sentence
    target_text_column : name of the column holding the paraphrase
    label_column : name of the column holding the pair label
    keep_label : only rows whose label equals this value are kept
    return : new DataFrame with columns input_text, target_text and prefix
             (prefix is the constant "paraphrase")

    Malformed lines are skipped with a warning rather than aborting the load.
    """
    try:
        # pandas >= 1.3 spelling; "warn" keeps the old skip-and-report behaviour.
        df = pd.read_csv(file_path, sep="\t", on_bad_lines="warn")
    except TypeError:
        # Older pandas does not know `on_bad_lines`; newer pandas dropped `error_bad_lines`.
        df = pd.read_csv(file_path, sep="\t", error_bad_lines=False)

    kept = df.loc[df[label_column] == keep_label]
    renamed = kept.rename(
        columns={input_text_column: "input_text", target_text_column: "target_text"}
    )
    # assign() builds a new frame, so no column is written into a slice of `df`.
    return renamed[["input_text", "target_text"]].assign(prefix="paraphrase")


def clean_unnecessary_spaces(out_string):
    """
    Remove the space that tokenised text leaves before punctuation and clitics.

    out_string : text to clean; a non-string value triggers a warning and is
                 converted with str() first
    return : the cleaned string
    """
    if isinstance(out_string, str):
        cleaned = out_string
    else:
        warnings.warn(f">>> {out_string} <<< is not a string.")
        cleaned = str(out_string)

    # Applied strictly in this order, one after the other.
    substitutions = (
        (" .", "."),
        (" ?", "?"),
        (" !", "!"),
        (" ,", ","),
        (" ' ", "'"),
        (" n't", "n't"),
        (" 'm", "'m"),
        (" 's", "'s"),
        (" 've", "'ve"),
        (" 're", "'re"),
    )
    for spaced, tight in substitutions:
        cleaned = cleaned.replace(spaced, tight)
    return cleaned

In [2]:
import os
from datetime import datetime
import logging

import pandas as pd
from sklearn.model_selection import train_test_split
from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs


# Root logger at INFO level.
logging.basicConfig(level=logging.INFO)
# The "transformers" logger is restricted to ERROR and above.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)

### Load Data

In [3]:
# Google Data (PAWS): every column is read as text, and only the pairs labelled "1" are kept.
train_df = (
    pd.read_csv("data/train.tsv", sep="\t")
    .astype(str)
    .loc[lambda frame: frame["label"] == "1"]
    .rename(columns={"sentence1": "input_text", "sentence2": "target_text"})
)
eval_df = (
    pd.read_csv("data/dev.tsv", sep="\t")
    .astype(str)
    .loc[lambda frame: frame["label"] == "1"]
    .rename(columns={"sentence1": "input_text", "sentence2": "target_text"})
)

# Keep the two text columns only, then add the constant task prefix.
train_df = train_df[["input_text", "target_text"]]
eval_df = eval_df[["input_text", "target_text"]]

train_df["prefix"] = "paraphrase"
eval_df["prefix"] = "paraphrase"

# MSRP Data: append the pairs whose "Quality" label is 1 (the load_data default) to both splits.
train_df = pd.concat([
    train_df,
    load_data(
        file_path="data/msr_paraphrase_train.txt",
        input_text_column="#1 String",
        target_text_column="#2 String",
        label_column="Quality",
    ),
])
eval_df = pd.concat([
    eval_df,
    load_data(
        file_path="data/msr_paraphrase_test.txt",
        input_text_column="#1 String",
        target_text_column="#2 String",
        label_column="Quality",
    ),
])

# Quora Data

# The Quora dataset is not separated into train/test, so it is split here: once,
# with a fixed seed. Both halves are cached on disk and every later run reloads
# them, so the evaluation set no longer changes (and the cached files are no
# longer overwritten) each time the cell is executed.
if os.path.exists("data/quora_train.tsv") and os.path.exists("data/quora_test.tsv"):
    # index_col=0 restores the index column that to_csv wrote below.
    q_train = pd.read_csv("data/quora_train.tsv", sep="\t", index_col=0)
    q_test = pd.read_csv("data/quora_test.tsv", sep="\t", index_col=0)
else:
    df = load_data(
        "data/quora_duplicate_questions.tsv", "question1", "question2", "is_duplicate"
    )
    q_train, q_test = train_test_split(df, random_state=42)

    q_train.to_csv("data/quora_train.tsv", sep="\t")
    q_test.to_csv("data/quora_test.tsv", sep="\t")

train_df = pd.concat([train_df, q_train])
eval_df = pd.concat([eval_df, q_test])

# Fix the column order and drop rows with missing values.
train_df = train_df[["prefix", "input_text", "target_text"]].dropna()
eval_df = eval_df[["prefix", "input_text", "target_text"]].dropna()

# Normalise the spacing around punctuation in both text columns of both splits.
train_df = train_df.assign(
    input_text=train_df["input_text"].apply(clean_unnecessary_spaces),
    target_text=train_df["target_text"].apply(clean_unnecessary_spaces),
)
eval_df = eval_df.assign(
    input_text=eval_df["input_text"].apply(clean_unnecessary_spaces),
    target_text=eval_df["target_text"].apply(clean_unnecessary_spaces),
)

print(train_df)

b'Skipping line 102: expected 5 fields, saw 6\nSkipping line 656: expected 5 fields, saw 6\nSkipping line 867: expected 5 fields, saw 6\nSkipping line 880: expected 5 fields, saw 6\nSkipping line 980: expected 5 fields, saw 6\nSkipping line 1439: expected 5 fields, saw 6\nSkipping line 1473: expected 5 fields, saw 6\nSkipping line 1822: expected 5 fields, saw 6\nSkipping line 1952: expected 5 fields, saw 6\nSkipping line 2009: expected 5 fields, saw 6\nSkipping line 2230: expected 5 fields, saw 6\nSkipping line 2506: expected 5 fields, saw 6\nSkipping line 2523: expected 5 fields, saw 6\nSkipping line 2809: expected 5 fields, saw 6\nSkipping line 2887: expected 5 fields, saw 6\nSkipping line 2920: expected 5 fields, saw 6\nSkipping line 2944: expected 5 fields, saw 6\nSkipping line 3241: expected 5 fields, saw 6\nSkipping line 3358: expected 5 fields, saw 6\nSkipping line 3459: expected 5 fields, saw 6\nSkipping line 3491: expected 5 fields, saw 6\nSkipping line 3643: expected 5 fields

            prefix  ...                                        target_text
1       paraphrase  ...  The 1975 -- 76 season of the National Basketba...
3       paraphrase  ...  The results are high when comparable flow rate...
4       paraphrase  ...  It is the seat of the district of Zerendi in A...
5       paraphrase  ...  William Henry Harman was born in Waynesboro, V...
7       paraphrase  ...  Given a discrete set of probabilities formula ...
...            ...  ...                                                ...
218120  paraphrase  ...  Which is the best company to work for an aeron...
178365  paraphrase  ...                Is time travel possible? If yes how
32545   paraphrase  ...                How can I earn money from Facebook?
321383  paraphrase  ...  I want to buy a pet dog in India. Which one sh...
39181   paraphrase  ...              What should I do about a girl I like?

[136422 rows x 3 columns]


### Fine-tune and Evaluate pretrained model

In [4]:
# Generation / training options, applied one by one in the listed order.
model_args = Seq2SeqArgs()
for _option, _value in {
    "do_sample": True,
    "eval_batch_size": 64,
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 47000,
    "evaluate_during_training_verbose": True,
    "fp16": False,
    "learning_rate": 5e-5,
    "max_length": 128,
    "max_seq_length": 128,
    "num_beams": None,
    "num_return_sequences": 3,
    "num_train_epochs": 2,
    "overwrite_output_dir": False,
    "reprocess_input_data": True,
    "save_eval_checkpoints": False,
    "save_steps": -1,
    "top_k": 50,
    "top_p": 0.95,
    "train_batch_size": 8,
    "use_multiprocessing": False,
    "wandb_project": "Paraphrasing with BART",
}.items():
    setattr(model_args, _option, _value)


# Rebinds the notebook-wide `model` to a BART seq2seq model built from facebook/bart-large.
model = Seq2SeqModel(
    encoder_decoder_type="bart",
    encoder_decoder_name="facebook/bart-large",
    args=model_args,
)

# NOTE: training is commented out, so the cells below use the model exactly as loaded above.
#model.train_model(train_df, eval_data=eval_df)

In [5]:
# Model inputs: "<prefix>: <input_text>" for every evaluation pair.
to_predict = [
    prefix + ": " + str(input_text)
    for prefix, input_text in zip(eval_df["prefix"].tolist(), eval_df["input_text"].tolist())
]
truth = eval_df["target_text"].tolist()

preds = model.predict(to_predict)
# zip() below would silently truncate, so make a length mismatch loud.
assert len(preds) == len(truth), "model.predict returned an unexpected number of results"

# Saving the predictions if needed
os.makedirs("predictions", exist_ok=True)

# A compact timestamp keeps spaces and colons out of the file name, and the explicit
# encoding keeps non-ASCII text writable regardless of the platform default.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
with open(f"predictions/predictions_{timestamp}.txt", "w", encoding="utf-8") as f:
    for text, target, candidates in zip(eval_df["input_text"].tolist(), truth, preds):
        f.write(str(text) + "\n\n")

        f.write("Truth:\n")
        f.write(str(target) + "\n\n")

        f.write("Prediction:\n")
        for pred in candidates:
            f.write(str(pred) + "\n")
        f.write(
            "________________________________________________________________________________\n"
        )

RuntimeError: ignored

In [6]:
import re
import numpy as np

# Test sentences, stored as one string in which runs of two or more spaces separate the sentences.
inputs = "and i think antonio adam begay should be dead                                                                           repent , bigot , lest you be strange fruit hanging from the poplar tree .                                               when i am unblocked i swear i will go fk your mother you big ass bes gay ass fkers motherfking my accounts !            up your ass is a joke of a play that nobody takes seriously .                                                           you sound like a real pc lefty if you buy into that garbage                                                             can some cunt tell me ?                                                                                                 i will kill you if you dont unblock me .                                                                                leave this website alone , and concentrate on adding more good information , instead of deleting the stuff that matters asshole ! !                                                                                                             he creates nothing and lives in his own world where he believes that his opinion is something other than the vomit he infects others with .                                                                                                     we need adults in the room , not lying fanatics ."
# Raw string: '\s' inside a normal string literal is an invalid escape sequence.
inputs = np.array(re.split(r'\s{2,}', inputs))

# NOTE(review): unlike the evaluation cell, no "paraphrase: " prefix is added here - confirm this is intended.
to_predict = inputs

preds = model.predict(to_predict)

---------------------------------------------------------


NameError: ignored

In [7]:
# Show the inputs, then all predictions, then the candidates for the first input one per line.
print("---------------------------------------------------------")
print(inputs)

print("\nPredictions >>>")
print(preds)
for candidate in preds[0]:
    print(candidate)

---------------------------------------------------------
['and i think antonio adam begay should be dead'
 'repent , bigot , lest you be strange fruit hanging from the poplar tree .'
 'when i am unblocked i swear i will go fk your mother you big ass bes gay ass fkers motherfking my accounts !'
 'up your ass is a joke of a play that nobody takes seriously .'
 'you sound like a real pc lefty if you buy into that garbage'
 'can some cunt tell me ?' 'i will kill you if you dont unblock me .'
 'leave this website alone , and concentrate on adding more good information , instead of deleting the stuff that matters asshole ! !'
 'he creates nothing and lives in his own world where he believes that his opinion is something other than the vomit he infects others with .'
 'we need adults in the room , not lying fanatics .']

Predictions >>>
[['andand i think antonio adam begay should be dead', 'andand i think antonia is dead', 'andand i think antonia is dead'], ['repentend, lest you be strange f

In [8]:
# Single-sentence check: one input, passed to the model as a one-element list.
original = ['A recording of folk songs done for the Columbia society in 1942 was largely arranged by Pjetër Dungu.']
# Rebinds the notebook-wide `preds` to the result for this one sentence.
preds = model.predict(original)
print(preds)

[['AA recording of folk songs done for the Columbia society in 1942 was largely arranged by Pjetër Dungu.', 'AA recording of folk songs done for the Columbia society in 1942 was largely arranged by Pjetër Dungu.', 'AA recording of folk songs made by the society in 1942 was largely arranged by Pjetër Dungu.']]


In [10]:
# Are the first two candidates returned for the sentence identical?
preds[0][0] == preds[0][1]

True