In [110]:
from datasets import load_dataset, Dataset
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

In [182]:
dataset = load_dataset('wikipedia', "20220301.en", split='train', streaming=True)
shuffled_dataset = dataset.shuffle(seed=42, buffer_size=10_000)

In [183]:
nd = list(dataset.take(30000))
ndata = Dataset.from_list(nd)

In [None]:
print(dset[6]['text'])

In [181]:
quotesdata = load_dataset("jstet/quotes-500k")

Found cached dataset csv (/u/prasanns/.cache/huggingface/datasets/jstet___csv/jstet--quotes-500k-ede96e03d28fbb72/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 220.75it/s]


In [184]:
TOTDATA = 60000
quotesdata = quotesdata['train'].shuffle(seed=0)
quotesdata = quotesdata.select(range(TOTDATA))

Loading cached shuffled indices for dataset at /u/prasanns/.cache/huggingface/datasets/jstet___csv/jstet--quotes-500k-ede96e03d28fbb72/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-602235f45088ee8c.arrow


In [11]:
quotesdata[1]

{'quote': "The rose, however, made us girls somewhat fainthearted, because it really was something we felt mattered, the white bridal dream with the wedding bouquet and the kiss from the man who was to be ours forever. But then Laura said that the lady who had given it to us had gotten divorced only five years later. And since many of our parents were also divorce, if indeed they had ever been married at all, that dream clearly wasn't worth our time.",
 'author': 'Janne Teller',
 'category': 'marriage, nothing'}

In [99]:
modelname = "meta-llama/Llama-2-7b-hf"

In [100]:
model = AutoModelForCausalLM.from_pretrained(modelname, device_map=0)
model.eval()
toker = AutoTokenizer.from_pretrained(modelname, padding_side='left')

toker.max_length=512
toker.padding_size='left'
toker.pad_token = toker.eos_token

Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:10<00:00,  5.09s/it]
generation_config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 188/188 [00:00<00:00, 927kB/s]


In [203]:
def proc_quotes(exs):
    inps = []
    for e in exs:
        s = e['author']+ ": " + e['quote']
        if len(s)<200:
            inps.append(s)
    return inps

def proc_wiki(exs):
    inps = []
    for e in exs:
        svals = e['text'].split(".")
        pots = []
        for s in svals: 
            if len(s)>20 and len(s)<200:
                pots.append(s.strip())
        inps.extend(pots[:3])
    print(len(inps))
    return inps
        
def generate_trunc(inputs, trunc, model, mbatch_size=4, top_p=0.9, temp=0.4):
    newinps = []
    corrgens = []
    for inp in inputs: 
        newinps.append(toker.decode(toker(inp).input_ids[:-(trunc+1)], skip_special_tokens=True))
        corrgens.append(toker.decode(toker(inp).input_ids[-(trunc+1):], skip_special_tokens=True))
    newgens = []
    for i in tqdm(range(0, len(newinps), mbatch_size)):
        inps = toker(newinps[i:i+mbatch_size], padding=True, truncation=True, return_tensors="pt").to(model.device)
        newgens.extend(model.generate(**inps, max_new_tokens=trunc+1, do_sample=True, top_p=top_p, temperature=temp))
    return toker.batch_decode(newgens, skip_special_tokens=True), corrgens

In [199]:
procd = proc_wiki(ndata.select(range(100)))
# procd = proc_quotes(quotesdata.select(range(100)))
gtrunc, golds = generate_trunc(procd, 3, model, 4, 0.9, 1.0)

289


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 73/73 [00:24<00:00,  3.04it/s]


In [187]:
quotesdata

Dataset({
    features: ['quote', 'author', 'category'],
    num_rows: 60000
})

In [198]:
ind = 9
print("ORIGINAL: "+procd[ind])
print('GOLD: '+golds[ind])
print("GEN: "+gtrunc[ind])

ORIGINAL:  a poet arranges meaning in the sounds.: A versifier arranges sounds
GOLD: ifier arranges sounds
GEN:  a poet arranges meaning in the sounds.: A versification of the poetry
