In [1]:
import os

import pandas as pd
import torch

In [2]:
# Load the merged question table.
# A forward slash is used as the separator because it resolves on both Windows
# and POSIX systems; a backslash separator only resolves on Windows, and "\m"
# inside a normal string literal is an invalid escape sequence that recent
# Python versions warn about.
df = pd.read_csv("data/merged_questions.csv")

In [9]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hugging Face checkpoint shared by the tokenizer and the seq2seq model.
PARAPHRASER_CHECKPOINT = "humarin/chatgpt_paraphraser_on_T5_base"

# Prefer the GPU, but fall back to the CPU so this cell does not crash on
# machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(PARAPHRASER_CHECKPOINT)

# paraphrase() reads `tokenizer`, `model` and `device` as globals.
model = AutoModelForSeq2SeqLM.from_pretrained(PARAPHRASER_CHECKPOINT).to(device)

def paraphrase(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=5,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=128
):
    """Generate paraphrases of ``question`` with the global seq2seq model.

    The question is prefixed with ``paraphrase: ``, tokenized with the global
    ``tokenizer`` (truncated to ``max_length``), moved to the global
    ``device`` and passed to ``model.generate``.

    Args:
        question: Text to paraphrase; it is interpolated into the prompt.
        num_beams, num_beam_groups, num_return_sequences, repetition_penalty,
        diversity_penalty, no_repeat_ngram_size, temperature: Forwarded
            unchanged to ``model.generate``.
        max_length: Used both as the tokenizer truncation length for the
            prompt and as the ``max_length`` given to ``model.generate``.

    Returns:
        The list produced by ``tokenizer.batch_decode`` on the generated ids,
        with special tokens stripped.

    NOTE(review): ``temperature`` is passed without ``do_sample``; it may have
    no effect on the beam-search output -- confirm against the generation docs.
    """
    prompt = f'paraphrase: {question}'
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        padding="longest",
        max_length=max_length,
        truncation=True,
    )
    # Only the token ids are used; they must live on the same device as the model.
    prompt_ids = encoded.input_ids.to(device)

    generation_options = dict(
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams,
        num_beam_groups=num_beam_groups,
        max_length=max_length,
        diversity_penalty=diversity_penalty,
    )
    generated = model.generate(prompt_ids, **generation_options)

    return tokenizer.batch_decode(generated, skip_special_tokens=True)


In [10]:
import csv
import tqdm
import warnings 
# Installs a blanket "ignore" filter: from here on, warnings of every category
# are suppressed for the rest of the session, including any emitted by the
# model libraries during generation.
warnings.filterwarnings("ignore")

In [6]:
from sentence_transformers import SentenceTransformer, util
# Two sample questions used to try out the sentence-embedding model.
sentences = ["Is there a particular skill that is commonly associated with programming? Can you share your own experiences as to why you have this particular ability?",
             "What aptitude do you believe is necessary for programming, and what specific qualities do they possess?"]

# The embedder gets its own name. Binding it to `model` would replace the
# seq2seq paraphraser that paraphrase() reads from the global `model`, so a
# top-to-bottom run would break every paraphrase() call.
sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = sentence_model.encode(sentences)
print(embeddings)

[[ 3.68224923e-03  9.42208618e-03 -6.95420429e-02  9.96420626e-03
  -5.37282415e-02 -4.79495563e-02  7.32388273e-02  7.35357776e-02
  -3.57693583e-02  7.80426618e-03 -1.06101923e-01  3.34663838e-02
   5.46664260e-02  1.96664222e-03  1.68164242e-02  1.87952444e-02
  -6.50427714e-02  2.41708085e-02 -4.20075282e-02 -7.80041814e-02
  -4.70384099e-02 -5.98017871e-02 -1.02763735e-02 -5.86753078e-02
   6.72541335e-02  3.07783149e-02 -1.25923548e-02 -5.27293161e-02
   2.24523935e-02 -3.10053024e-02 -3.93245965e-02  4.69328836e-02
   5.92186600e-02  5.52964807e-02 -6.27022013e-02  6.32863045e-02
  -1.33929197e-02  2.67186016e-02  2.44209096e-02 -3.53738330e-02
  -1.00956134e-01  5.12505770e-02  3.96344103e-02 -9.21853036e-02
   3.69108953e-02  1.08158907e-04 -7.56038819e-03  1.36719430e-02
  -4.86441441e-02 -8.06655921e-03 -8.02560747e-02  2.85798330e-02
   2.06601364e-03 -6.33757263e-02  2.52822191e-02  2.82081105e-02
   6.10949770e-02  5.36717623e-02 -2.14375202e-02 -2.03493889e-02
  -2.76965

In [11]:
# Paraphrase df["q1"] for every row from index 3000 to the end of `df` and
# write the results to pairs/pairs_<last index in chunk>.csv, one file per
# 1000-row chunk.
# NOTE(review): the start offset 3000 presumably resumes an earlier run -- confirm.
CHUNK_SIZE = 1000
PAIR_COLUMNS = ["idx", "q1", "d1", "d2", "d3", "d4", "d5"]

# DataFrame.to_csv does not create missing directories; create the output
# directory up front so a full chunk of work is not lost at the first save.
os.makedirs("pairs", exist_ok=True)

# Index of the final row; range() below stops here, never at len(df).
last_ind = len(df) - 1

l = []
for ind in tqdm.tqdm(range(3000, len(df))):
    if (ind % CHUNK_SIZE) == 0:
        print(ind)
        # A new chunk starts: drop the rows that were saved at the previous boundary.
        l = []
    try:
        text = df.loc[ind, "q1"]
        l.append([ind, text] + paraphrase(text))
    except Exception as exc:
        # Best-effort: skip rows that fail, but report the cause. Catching
        # Exception (not a bare except) keeps KeyboardInterrupt able to stop
        # the multi-hour run.
        print("ERROR!!! @ ", ind, repr(exc))

    # Save at every chunk boundary and also on the very last row, so that a
    # trailing partial chunk is written as well.
    if ((ind + 1) % CHUNK_SIZE) == 0 or ind == last_ind:
        d = pd.DataFrame(l, columns=PAIR_COLUMNS)
        d.to_csv("pairs/pairs_{}.csv".format(ind))

  0%|          | 0/17100 [00:00<?, ?it/s]

3000


  6%|▌         | 1000/17100 [11:02<2:52:06,  1.56it/s]

4000


 12%|█▏        | 2000/17100 [23:45<4:39:20,  1.11s/it]

5000


 18%|█▊        | 3000/17100 [34:17<2:41:55,  1.45it/s]

6000


 23%|██▎       | 4000/17100 [44:56<2:06:03,  1.73it/s]

7000


 29%|██▉       | 5000/17100 [54:24<2:10:37,  1.54it/s]

8000


 35%|███▌      | 6000/17100 [1:03:37<1:36:15,  1.92it/s]

9000


 41%|████      | 7000/17100 [1:13:00<2:11:08,  1.28it/s]

10000


 47%|████▋     | 8000/17100 [1:23:57<1:21:10,  1.87it/s]

11000


 53%|█████▎    | 9000/17100 [1:34:07<1:51:22,  1.21it/s]

12000


 58%|█████▊    | 10000/17100 [1:43:56<58:08,  2.04it/s] 

13000


 64%|██████▍   | 11000/17100 [1:53:46<57:49,  1.76it/s]  

14000


 70%|███████   | 12000/17100 [2:02:57<43:02,  1.98it/s]  

15000


 76%|███████▌  | 13000/17100 [2:12:41<30:36,  2.23it/s]  

16000


 82%|████████▏ | 14000/17100 [2:23:33<33:41,  1.53it/s]  

17000


 88%|████████▊ | 15000/17100 [2:34:48<24:31,  1.43it/s]

18000


 94%|█████████▎| 16000/17100 [2:48:01<12:44,  1.44it/s]  

19000


 99%|█████████▉| 17000/17100 [3:01:00<01:10,  1.41it/s]  

20000


100%|██████████| 17100/17100 [3:02:07<00:00,  1.56it/s]
