In [4]:
import os
import json

# Directory used as the Hugging Face cache. It is passed explicitly as
# `cache_dir` to the from_pretrained calls in later cells and is also exported
# through the TRANSFORMERS_CACHE environment variable before transformers is
# imported.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
hfcache = '/mnt/data1/prasann/latticegen/lattice-generation/hfcache'
os.environ['TRANSFORMERS_CACHE'] = hfcache

In [5]:
# Show which input files are present in the data directory that
# read_input_data reads from.
os.listdir('./T0_explore_data')

['reverse_meaning_data.txt',
 'reverse_sent_data.txt',
 'summdoc_data.txt',
 '.ipynb_checkpoints',
 'formal_hand_data.txt',
 'simplification_data.txt',
 'infilling_data.txt',
 'informal_data.txt',
 'easy_hand_data.txt']

In [6]:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Pick the device the model and inputs are placed on.
# The second GPU ("cuda:1") is still preferred, but a machine that exposes a
# single GPU would reject that ordinal, so fall back to "cuda:0" there and to
# the CPU when CUDA is not available at all.
if not torch.cuda.is_available():
    device = "cpu"
elif torch.cuda.device_count() > 1:
    device = "cuda:1"
else:
    device = "cuda:0"

# Load the T0 3B tokenizer and seq2seq model, using `hfcache` as the cache
# directory, and place the model on the selected device.
tokenizer = AutoTokenizer.from_pretrained(
    "bigscience/T0_3B",
    cache_dir=hfcache,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    "bigscience/T0_3B",
    cache_dir=hfcache,
).to(device)


In [7]:
# Directory holding the input text files; the trailing slash matters because
# the path is built by plain string concatenation.
BASE = './T0_explore_data/'

def read_input_data(fname):
    """Load the non-blank lines of ``BASE + fname + ".txt"``.

    Trailing whitespace (including the newline) is stripped from every line,
    and lines that are empty after stripping are dropped.

    Args:
        fname: File name inside ``BASE``, without the ``.txt`` extension.

    Returns:
        list[str]: The remaining lines, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(BASE+fname+".txt") as file:
        # Filter in a single pass instead of repeatedly scanning the list
        # with ``"" in lines`` / ``lines.remove("")``, which is quadratic.
        stripped = (line.rstrip() for line in file)
        return [line for line in stripped if line]

# Load each input file once. Every variable is a list of right-stripped,
# non-empty lines as returned by read_input_data; the argument is the file
# name under BASE without the ".txt" extension.
infill_data = read_input_data('infilling_data')
informal_data = read_input_data('informal_data')
revmean_data = read_input_data('reverse_meaning_data')
revsent_data = read_input_data('reverse_sent_data')
simplif_data = read_input_data('simplification_data')
easy_data = read_input_data('easy_hand_data')
summ_data = read_input_data('summdoc_data')
formalhand_data = read_input_data('formal_hand_data')

In [8]:
# Prompt templates for filling in the "_" blank of a sentence.
# "[TEXT]" is the placeholder that get_inputstr replaces with the input text.
inf_prompts = [
    "Task: Copy the sentence but replace _ with the correct words. [TEXT]",
    "Task: replace _ with the correct words. [TEXT]",
    "[TEXT] What goes in the _ ?",
    "Task: Fill in the _ in the following sentence. [TEXT]",
]

# Prompt templates asking for the opposite of the input sentence (one variant
# asks for the opposite emotion). "[TEXT]" is replaced with the input text.
sent_prompts = [
    "Task: what's the opposite of this sentence? [TEXT]",
    "Copy but say the opposite: [TEXT]",
    "[TEXT] What is the opposite of this sentence?",
    "Task: copy but say the opposite emotion. [TEXT]",
    "Task: copy but say the opposite. [TEXT]",
]

# Prompt templates asking for a simpler rewording of the input.
# "[TEXT]" is replaced with the input text.
simplif_prompt = [
    "[TEXT] How would I say this in simple words?",
    "Task: Paraphrase in simpler terms. [TEXT]",
    "Task: Copy but simplify. [TEXT]",
    "Task: Simplify the sentence. [TEXT]"
]

# Prompt templates asking for a summary of the input paragraph.
# "[TEXT]" is replaced with the input text.
summ_prompt = [
    "Task: summarize the following paragraph. [TEXT]",
    "How can I summarize the following? [TEXT]",
]

# Prompt templates asking for a more formal rewording of the input.
# "[TEXT]" is replaced with the input text.
formal_prompt = [
    "Task: How can I say this with more formal language? [TEXT]",
    "Task: Copy but use formal words. [TEXT]",
    "[TEXT] How can I rephrase this formally?",
    "[TEXT] What is a formal way of saying this?",
]



In [9]:

def get_T0_output(prompt, raw, max_length=50, num_beams=9, num_return_sequences=None, verbose=True):
    """Generate beam-search candidates from the loaded model for one prompt.

    The prompt template is filled in with ``raw`` via ``get_inputstr``, encoded
    with the global ``tokenizer``, moved to the global ``device`` and passed to
    the global ``model``.

    Args:
        prompt: Prompt template containing the ``[TEXT]`` placeholder.
        raw: Input text substituted for the placeholder.
        max_length: ``max_length`` forwarded to ``model.generate``.
        num_beams: Beam width forwarded to ``model.generate``.
        num_return_sequences: Number of hypotheses to return. ``None`` (the
            default) returns one hypothesis per beam, which with the default
            beam width reproduces the previous fixed value of 9.
        verbose: When true (the default), print each candidate as it is
            collected.

    Returns:
        dict: ``{'data': [...], 'input': str}``. ``input`` is the filled-in
        prompt. Each ``data`` entry has ``out_toks`` (the decoded sequence,
        special tokens included) and ``out_sco`` (the matching entry of
        ``sequences_scores``, left as a tensor).
    """
    if num_return_sequences is None:
        num_return_sequences = num_beams
    inputstring = get_inputstr(prompt, raw)
    inputs = tokenizer.encode(inputstring, return_tensors="pt").to(device)
    outputs = model.generate(
        inputs,
        max_length=max_length,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        early_stopping=True,
        # Both flags are kept so that `sequences_scores` is available on the result.
        output_scores=True,
        return_dict_in_generate=True,
    )
    res = []
    for sequence, score in zip(outputs.sequences, outputs.sequences_scores):
        candidate = {'out_toks': tokenizer.decode(sequence), 'out_sco': score}
        res.append(candidate)
        if verbose:
            print(candidate)
    return {'data': res, 'input': inputstring}
        
def get_inputstr(prompt, raw):
    """Return ``prompt`` with every ``[TEXT]`` placeholder replaced by ``raw``."""
    placeholder = "[TEXT]"
    filled = prompt.replace(placeholder, raw)
    return filled

# Rebound by run_all_data to the results of its most recent completed run.
alldat = []

def run_all_data(promptlist, data, label):
    """Run every prompt over every input and save the results as JSON.

    For each prompt in ``promptlist`` and each text in ``data``,
    ``get_T0_output`` is called and its result collected. Scores are converted
    to plain floats so the structure can be serialized, the global ``alldat``
    is rebound to the collected list, and ``{'data': [...]}`` is written to
    ``./prompt_output/<label>.txt``.

    Args:
        promptlist: Iterable of prompt templates.
        data: Iterable of input texts; it is iterated once per prompt.
        label: Output file name (without ``.txt``) under ``./prompt_output/``.

    Returns:
        None.
    """
    global alldat
    out_path = './prompt_output/'+label+'.txt'
    # Create the output directory before generating anything: a missing folder
    # previously surfaced as FileNotFoundError only after the whole run.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    alldata = []
    for p in promptlist:
        alldata.append({
            'prompt': p,
            'examples': [get_T0_output(p, d) for d in data],
        })
    alldat = alldata

    # Scores may be tensors; json cannot serialize them.
    for a in alldata:
        for e in a['examples']:
            for d in e['data']:
                d['out_sco'] = float(d['out_sco'])

    # ensure_ascii=False emits non-ASCII characters verbatim, so the file is
    # written as UTF-8 rather than in the locale's default encoding.
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump({'data': alldata}, f, ensure_ascii=False)



In [None]:
# Run every simplification prompt over the formal_hand dataset; results are
# written to ./prompt_output/simplifhand_prompt.txt.
# NOTE(review): this pairs `simplif_prompt` with `formalhand_data` rather than
# `simplif_data` — confirm the pairing is intended.
run_all_data(simplif_prompt, formalhand_data, 'simplifhand_prompt')

In [24]:
# Prompt templates asking for a Shakespeare-style or tweet-style rewrite.
# "[TEXT]" is replaced with the input text.
krishna_prompt = [
    "Task: How would shakespeare say this? [TEXT]",
    "Task: Paraphrase with shakespeare language. [TEXT]",
    "Task: Copy but make it sound like a tweet. [TEXT]",
]

# Try one template on one sentence from the easy_hand dataset.
prompt = krishna_prompt[1]
inp = easy_data[12]
# Show the filled-in prompt; get_T0_output builds the same string internally.
inputstr = get_inputstr(prompt, inp)
print(inputstr)
# Prints each candidate and, as the cell's last expression, displays the
# returned dict.
get_T0_output(prompt, inp)

Task: Paraphrase with shakespeare language. The rabbit saw the monkey.
{'out_toks': '<pad> The rabbit saw the monkey.</s><pad><pad><pad><pad><pad>', 'out_sco': tensor(-0.2448, device='cuda:1')}
{'out_toks': '<pad> The monkey saw the rabbit.</s><pad><pad><pad><pad><pad>', 'out_sco': tensor(-0.2870, device='cuda:1')}
{'out_toks': '<pad> The rabbit saw a monkey.</s><pad><pad><pad><pad>', 'out_sco': tensor(-0.4586, device='cuda:1')}
{'out_toks': '<pad> The rabbit sees the monkey.</s><pad><pad><pad><pad>', 'out_sco': tensor(-0.4696, device='cuda:1')}
{'out_toks': '<pad> The monkey sees the rabbit.</s><pad><pad><pad><pad>', 'out_sco': tensor(-0.5002, device='cuda:1')}
{'out_toks': '<pad> A monkey sees a rabbit.</s><pad><pad><pad>', 'out_sco': tensor(-0.6229, device='cuda:1')}
{'out_toks': '<pad> A monkey sees a rabbit in the forest.</s>', 'out_sco': tensor(-0.8220, device='cuda:1')}
{'out_toks': '<pad> A monkey sees a rabbit and runs away.</s>', 'out_sco': tensor(-0.8323, device='cuda:1')}
{

{'data': [{'out_toks': '<pad> The rabbit saw the monkey.</s><pad><pad><pad><pad><pad>',
   'out_sco': tensor(-0.2448, device='cuda:1')},
  {'out_toks': '<pad> The monkey saw the rabbit.</s><pad><pad><pad><pad><pad>',
   'out_sco': tensor(-0.2870, device='cuda:1')},
  {'out_toks': '<pad> The rabbit saw a monkey.</s><pad><pad><pad><pad>',
   'out_sco': tensor(-0.4586, device='cuda:1')},
  {'out_toks': '<pad> The rabbit sees the monkey.</s><pad><pad><pad><pad>',
   'out_sco': tensor(-0.4696, device='cuda:1')},
  {'out_toks': '<pad> The monkey sees the rabbit.</s><pad><pad><pad><pad>',
   'out_sco': tensor(-0.5002, device='cuda:1')},
  {'out_toks': '<pad> A monkey sees a rabbit.</s><pad><pad><pad>',
   'out_sco': tensor(-0.6229, device='cuda:1')},
  {'out_toks': '<pad> A monkey sees a rabbit in the forest.</s>',
   'out_sco': tensor(-0.8220, device='cuda:1')},
  {'out_toks': '<pad> A monkey sees a rabbit and runs away.</s>',
   'out_sco': tensor(-0.8323, device='cuda:1')},
  {'out_toks': '

In [44]:
#print(outputs)


<pad> The dry sand was hard to grasp.</s><pad>
tensor(-0.1610)
<pad> The wet sand was hard to grasp.</s>
tensor(-0.2598)
<pad> The wet sand was easy to grasp.</s>
tensor(-0.2707)
<pad> The sand was hard to grasp.</s><pad><pad>
tensor(-0.3358)
<pad> The dry sand was difficult to grasp.</s><pad>
tensor(-0.4160)


In [4]:
# Load the GPT-J 6B tokenizer, using `hfcache` as the cache directory.
# NOTE(review): this rebinds the global `tokenizer`, replacing the T0
# tokenizer that get_T0_output reads. Re-run the T0 loading cell before calling
# get_T0_output again, or confirm whether a separate name was intended.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B", cache_dir=hfcache)

Downloading:   0%|          | 0.00/836 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/11.3G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/619 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/779k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.94k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/357 [00:00<?, ?B/s]

In [None]:
# Sampling example: tokenize a fixed prompt and draw one sampled continuation.
# NOTE(review): `tokenizer` was rebound to the GPT-J tokenizer in the previous
# cell, but no cell visible here loads a GPT-J model, so `model` may still be
# the T0 model — confirm the intended model is loaded before running.
# This cell also rebinds the global `prompt`.
prompt = (
    "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
    "previously unexplored valley, in the Andes Mountains. Even more surprising to the "
    "researchers was the fact that the unicorns spoke perfect English."
)
# Put the input ids on the model's own device; generating with CPU inputs
# against a model that was moved to a GPU raises a device-mismatch error.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
gen_tokens = model.generate(
    input_ids,
    do_sample=True,
    temperature=0.9,
    max_length=100,
)