In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
from datasets import load_dataset
import pandas as pd

# Prefer the second GPU (the original hard-coded choice), but fall back so the
# notebook still runs on machines that have a single GPU or no GPU at all.
if torch.cuda.device_count() > 1:
    device = 'cuda:1'
elif torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

In [2]:
# The tokenizer and the weights are both loaded from the same checkpoint name.
STEAMSHP_CHECKPOINT = 'stanfordnlp/SteamSHP-flan-t5-xl'

tokenizer = T5Tokenizer.from_pretrained(STEAMSHP_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(STEAMSHP_CHECKPOINT)
model = model.to(device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
def get_reward_single(inpdict, tok, mod):
    """Score a single response against a placeholder '.' response.

    Builds a prompt in which RESPONSE A is ``inpdict['hyp']`` and RESPONSE B is
    a lone ".", generates exactly one token, and returns the probability the
    model puts on token id 71 (the token for 'A') at that step.

    Args:
        inpdict: mapping with string entries ``'context'`` (the post) and
            ``'hyp'`` (the response to score).
        tok: tokenizer; called as ``tok([prompt], return_tensors='pt')`` and
            expected to expose ``.input_ids``.
        mod: generation model; must provide ``.generate(...)`` and ``.device``.

    Returns:
        A tensor with one probability per prompt (a single element here).
    """
    token_a_id = 71  # index 71 corresponds to the token for 'A'
    template = "POST: {context:s} \n\nRESPONSE A:{hyp:s} \n\nRESPONSE B: .\n\n Which response is better? RESPONSE "
    prompt = template.format(context=inpdict['context'], hyp=inpdict['hyp'])
    # Use the tokenizer/model that were passed in (the previous version ignored
    # them and read notebook globals) and keep the inputs on the model's device.
    input_ids = tok([prompt], return_tensors='pt').input_ids.to(mod.device)
    outputs = mod.generate(input_ids, return_dict_in_generate=True, output_scores=True, max_new_tokens=1)
    # softmax is the numerically stable form of exp(x) / sum(exp(x)); the raw
    # exp could overflow to inf/inf = nan, and the old `.item()` applied only to
    # the denominator, which breaks for any batch larger than one.
    probs = torch.softmax(outputs.scores[0], dim=-1)
    return probs[:, token_a_id]

def score_ex_shp(ex, cfacts=None):
    """Print model rewards next to upvote counts for one SHP example.

    Scores ``ex['human_ref_A']`` and ``ex['human_ref_B']`` against the post in
    ``ex['history']`` and prints each reward beside its upvote count and text.
    When ``cfacts`` is given, each extra candidate response is scored against
    the same post and printed as well. Returns nothing.
    """
    post = ex['history']
    reward_a, reward_b = (
        float(get_reward_single({"context": post, "hyp": ex[ref_key]}, tokenizer, model))
        for ref_key in ('human_ref_A', 'human_ref_B')
    )

    print(post)
    report = (
        (reward_a, 'score_A', 'human_ref_A'),
        (reward_b, 'score_B', 'human_ref_B'),
    )
    for reward, votes_key, text_key in report:
        print("REWARD: ", reward, ", UPVOTES: ", ex[votes_key])
        print(ex[text_key])

    if cfacts is None:
        return
    for alternative in cfacts:
        alt_reward = float(get_reward_single({"context": post, "hyp": alternative}, tokenizer, model))
        print("REWARD: ", alt_reward)
        print(alternative)

def score_dialogue(row):
    """Score one dialogue response and return the reward as a float.

    The post handed to the reward model is ``row['context']``, a newline, a
    fixed bridging sentence, and then ``row['src']``; the response scored is
    ``row['hyp']``. The response and its reward are printed along the way.
    """
    bridge = "I was reading this information from a reliable source, and it got me thinking... "
    post = row['context'] + "\n" + bridge + row['src']
    response = row['hyp']
    print()
    print(response)
    reward = float(get_reward_single({"context": post, "hyp": response}, tokenizer, model))
    print(reward)
    return reward

def score_all_dial(df):
    """Score every row of ``df``, grouped by distinct ``'context'`` value.

    For each unique context (in order of first appearance) a header with the
    context and the group's first ``'src'`` is printed, then each row of the
    group is passed to ``score_dialogue``.
    """
    for context in df['context'].unique():
        group = df.loc[df['context'] == context]
        print(context + "\n" + group['src'].iloc[0])
        for _, record in group.iterrows():
            score_dialogue(record)
            
def score_sum(row):
    """Score one candidate summary and return the reward as a float.

    ``row['src']`` is wrapped in a request-for-summary post and ``row['hyp']``
    is scored as the response. The summary and its reward are printed.
    """
    request = "Could someone help me summarize this passage? It's really confusing. \n"
    post = request + row['src']
    summary = row['hyp']
    print(summary)
    reward = float(get_reward_single({"context": post, "hyp": summary}, tokenizer, model))
    print(reward)
    return reward

def score_all_sum(df):
    """Score every row of ``df``, grouped by distinct ``'src'`` document.

    Each unique source (in order of first appearance) is printed once, then
    every row sharing that source is passed to ``score_sum``.
    """
    for source in df['src'].unique():
        group = df.loc[df['src'] == source]
        print(source)
        for _, record in group.iterrows():
            score_sum(record)

In [4]:
# Load the "stanfordnlp/shp" dataset, restricted to its "askculinary" data directory.
dataset = load_dataset("stanfordnlp/shp", data_dir="askculinary")

Downloading and preparing dataset json/stanfordnlp--shp to /home/prasann/.cache/huggingface/datasets/stanfordnlp___json/stanfordnlp--shp-ee7c73b0d9a5ffc3/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/71.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.86M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.07M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/prasann/.cache/huggingface/datasets/stanfordnlp___json/stanfordnlp--shp-ee7c73b0d9a5ffc3/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
# Show test record 100 to see which fields each example carries.
dataset['test'][100]

{'post_id': 'kurnh7',
 'domain': 'askculinary_test',
 'upvote_ratio': 0.97,
 'history': 'I\'m not a foodie, but my new wife is. Anything I can do to make spiral electric stove tops a better residential cooking experience? Are there replacement spiral stove tops that work better than others? Or is there an added accessory I can add to the stove top so that they work "better"?  We are renting in an apartment high-rise, so no chance of extending the gas lines and replacing the stove.',
 'c_root_id_A': 'gitu316',
 'c_root_id_B': 'gitsr4i',
 'created_at_utc_A': 1610330988,
 'created_at_utc_B': 1610330348,
 'score_A': 122,
 'score_B': 13,
 'human_ref_A': 'I\'ve basically cooked on nothing but electric stoves (chronic renter) and really once you\'ve adjusted to the individual stove it\'s not that bad. A bit more juggling involved than on gas or induction but it\'s not as bad as the "gas or nothing" crowd make it seem.',
 'human_ref_B': 'Get cast iron pans—I love to cook and my spirals work we

In [25]:
def get_reward_comp(inpdict, tok=None, mod=None):
    """Compare two responses and return the probability that A is preferred.

    Builds a prompt with ``inpdict['hypa']`` as RESPONSE A and
    ``inpdict['hypb']`` as RESPONSE B, generates exactly one token, and returns
    the probability the model puts on token id 71 (the token for 'A').

    Args:
        inpdict: mapping with string entries ``'context'``, ``'hypa'``, ``'hypb'``.
        tok: optional tokenizer; defaults to the notebook-level ``tokenizer``.
        mod: optional model; defaults to the notebook-level ``model``. Must
            provide ``.generate(...)`` and ``.device``.

    Returns:
        A tensor with one probability per prompt (a single element here).
    """
    # Fall back to the notebook globals so existing one-argument calls still work.
    if tok is None:
        tok = tokenizer
    if mod is None:
        mod = model
    token_a_id = 71  # index 71 corresponds to the token for 'A'
    template = "POST: {context:s} \n\nRESPONSE A:{hypa:s} \n\nRESPONSE B:{hypb:s} \n\n Which response is better? RESPONSE "
    prompt = template.format(context=inpdict['context'], hypa=inpdict['hypa'], hypb=inpdict['hypb'])
    input_ids = tok([prompt], return_tensors='pt').input_ids.to(mod.device)
    outputs = mod.generate(input_ids, return_dict_in_generate=True, output_scores=True, max_new_tokens=1)
    # softmax is the numerically stable form of exp(x) / sum(exp(x)); the raw
    # exp could overflow to inf/inf = nan, and the old `.item()` applied only to
    # the denominator, which breaks for any batch larger than one.
    probs = torch.softmax(outputs.scores[0], dim=-1)
    return probs[:, token_a_id]

In [38]:
# Ad-hoc pairwise comparison: a question (with an instruction to answer simply)
# and two candidate answers.
inp = """

I guess you could call this a shower thought but out of the blue I thought about this. How does our stomach acids not float up our throats in zero G? I’m not familiar with how our bodies work in space and would love to hear an explanation. (Directions for responder): Don't give a complicated answer, use short / simple vocabulary. 
"""
hypa = """
Your stomach acid isn't kept in your stomach only by gravity. (You can tell by the fact that you can do a handstand without throwing up.)

Instead, there's a valve called the lower esophageal sphincter that keeps your stomach contents in your stomach. ("Sphincter" is an anatomical word for "a muscular valve that can open and close"; the common usage of the word to mean the anal sphincter is more specific than the medical use.)
"""
# NOTE: this reassignment replaces the longer `hypa` defined just above, so only
# the short one-sentence answer below is actually scored.
hypa = "Basically, your stomach has a lid that opens when you swallow, but is closed the rest of the time."
hypb = """
There’s a tight muscular constriction called the lower esophageal sphincter (LES) at the junction of the esophagus and stomach. It normally keeps stomach contents from rising back up the esophagus.

The prevention of outflow from the stomach has exceptions, of course, in vomiting and acid reflux (heartburn or GERD). The vomiting reflex involves an autonomic relaxation of the sphincter. GERD (gastroesophageal reflus disease) results from weakness of the sphincter or structural defects in this area, such as a hiatal hernia in the diaphragm.
"""

# NOTE(review): the candidates are swapped here -- `hypb` fills the RESPONSE A
# slot and `hypa` the RESPONSE B slot -- so the value returned is the
# probability of preferring `hypb`. Presumably intentional; confirm.
get_reward_comp({'context':inp, 'hypa':hypb, 'hypb':hypa})

tensor([0.8602], device='cuda:1')

In [42]:
# Ad-hoc pairwise comparison: the instruction asks for the word "bat" as often
# as possible; `hypa` uses "bat" throughout, while `hypb` is the same paragraph
# with most occurrences replaced (mainly by "frog").
inp = "Write a paragraph. Say \"bat\" as much as possible." 
hypa = """
In the gloaming light, the nature enthusiast spotted a bat from a distance, darting haphazardly against the dusky backdrop of the evening sky. The bat, a creature of the night, was clearly on a mission. The bat was in search of its nightly sustenance, primarily consisting of insects, but some species of bat are known to consume fruits. Understanding the ecological importance of bats, the enthusiast was inspired to grab a baseball bat from his car and headed towards a nearby field to enjoy the cool evening breeze. His playful swings at the wind felt as graceful as the bat's agile movements above. As he engaged in this fun exercise, he spotted a large stone sculpture of a bat on the other end of the field. "What a peculiar town," he thought, "where the bat is so celebrated, from the realms of wildlife to the artistic expressions and even sports."
"""
hypb = """
In the gloaming light, the nature enthusiast spotted a frog from a distance, darting haphazardly against the dusky backdrop of the evening sky. The frog, a creature of the night, was clearly on a mission. The frog was in search of its nightly sustenance, primarily consisting of insects, but some species of frog are known to consume fruits. Understanding the ecological importance of frogs, the enthusiast was inspired to grab a baseball bat from his car and headed towards a nearby field to enjoy the cool evening breeze. His playful swings at the wind felt as graceful as the wood's agile movements above. As he engaged in this fun exercise, he spotted a large stone sculpture of a frog on the other end of the field. "What a peculiar town," he thought, "where the frog is so celebrated, from the realms of wildlife to the artistic expressions and even sports."
"""
# NOTE(review): the candidates are swapped here -- `hypb` fills the RESPONSE A
# slot and `hypa` the RESPONSE B slot -- so the value returned is the
# probability of preferring `hypb`. Presumably intentional; confirm.
get_reward_comp({'context':inp, 'hypa':hypb, 'hypb':hypa})

tensor([0.3274], device='cuda:1')

In [5]:
def gpt_template_even(row):
    """Build a neutral "which response is better, and why?" prompt.

    Reads ``row['history']``, ``row['human_ref_A']`` and ``row['human_ref_B']``
    and returns the filled-in prompt string. Neither response is marked as the
    preferred one.
    """
    prompt = "I'm drafting a response for this reddit post: {context:s} \n\n I'm considering 2 options:\n\n RESPONSE A:{human_ref_A:s} \n\nRESPONSE B: {human_ref_B:s}\n\n Which response is better, and why?"
    fields = {
        'context': row['history'],
        'human_ref_A': row['human_ref_A'],
        'human_ref_B': row['human_ref_B'],
    }
    return prompt.format(**fields)


def gpt_justify_gold(row, reverse=False):
    """Build a prompt asking why one response received more votes.

    The response named in the question is "A" when ``row['score_A']`` is
    strictly greater than ``row['score_B']`` and "B" otherwise (ties therefore
    name "B"). A truthy ``reverse`` flips that choice, so the prompt asks about
    the response that did not win.
    """
    prompt = "I was looking at 2 responses to this reddit post: {context:s} \n\nThese were the 2 options I looked at:\n\nRESPONSE A: {human_ref_A:s} \n\nRESPONSE B: {human_ref_B:s}\n\n Why did response {goldresp:s} get more votes? Can you explain in detail why it wasn't the other response?"
    a_has_more_votes = row['score_A'] > row['score_B']
    if reverse:
        named = "B" if a_has_more_votes else "A"
    else:
        named = "A" if a_has_more_votes else "B"
    return prompt.format(
        context=row['history'],
        human_ref_A=row['human_ref_A'],
        human_ref_B=row['human_ref_B'],
        goldresp=named,
    )


# Preview the "why did the winner get more votes" prompt for test record 375.
print(gpt_justify_gold(dataset['test'][375]))
    

I was looking at 2 responses to this reddit post: What does coating bacon in flour do when you are baking bacon? My preferred method for cooking bacon in bulk is to use an oven-safe drying/cooling rack, and baking. I have seen a good amount of people insisting that coating bacon in flour makes for a better product. What exactly does coating bacon in flour prior to baking do? What are the benefits? 

These were the 2 options I looked at:

RESPONSE A: My friend told me to do a 1:1 ratio with regular flour and rice flour when I asked her about this. I use about 1/2 cup total to dredge 1 package of Costco bacon. The bacon ends up so crispy with minimal shrinkage 

RESPONSE B: Absorbs and makes it crispy. I recommend skipping the flour and coating it in maple or brown sugar. It makes it crispy also. Candied bacon is best bacon.

 Why did response A get more votes? Can you explain in detail why it wasn't the other response?


In [15]:
# Show the raw record behind the prompt previewed for index 375.
dataset['test'][375]

{'post_id': 'r0pzw0',
 'domain': 'askculinary_test',
 'upvote_ratio': 0.95,
 'history': 'What does coating bacon in flour do when you are baking bacon? My preferred method for cooking bacon in bulk is to use an oven-safe drying/cooling rack, and baking. I have seen a good amount of people insisting that coating bacon in flour makes for a better product. What exactly does coating bacon in flour prior to baking do? What are the benefits?',
 'c_root_id_A': 'hlvi65g',
 'c_root_id_B': 'hlvby98',
 'created_at_utc_A': 1637738327,
 'created_at_utc_B': 1637734191,
 'score_A': 10,
 'score_B': 9,
 'human_ref_A': 'My friend told me to do a 1:1 ratio with regular flour and rice flour when I asked her about this. I use about 1/2 cup total to dredge 1 package of Costco bacon. The bacon ends up so crispy with minimal shrinkage',
 'human_ref_B': 'Absorbs and makes it crispy. I recommend skipping the flour and coating it in maple or brown sugar. It makes it crispy also. Candied bacon is best bacon.',
 '

In [8]:
# Score both human responses of test record `ind`, plus three hand-written
# variants of response B (a shortened one, a formal rewording, and one that
# swaps "iron" for "bronze").
ind = 100
score_ex_shp(dataset['test'][ind], 
     [
         "Get cast iron.",
         "I most humbly recommend that you acquire iron.",
         "Get cast bronze pans."
     ]
)

I'm not a foodie, but my new wife is. Anything I can do to make spiral electric stove tops a better residential cooking experience? Are there replacement spiral stove tops that work better than others? Or is there an added accessory I can add to the stove top so that they work "better"?  We are renting in an apartment high-rise, so no chance of extending the gas lines and replacing the stove.
REWARD:  0.8530400395393372 , UPVOTES:  122
I've basically cooked on nothing but electric stoves (chronic renter) and really once you've adjusted to the individual stove it's not that bad. A bit more juggling involved than on gas or induction but it's not as bad as the "gas or nothing" crowd make it seem.
REWARD:  0.9150396585464478 , UPVOTES:  13
Get cast iron pans—I love to cook and my spirals work well. You just have to get used to it.
REWARD:  0.9483897686004639
Get cast iron.
REWARD:  0.9603092670440674
I most humbly recommend that you acquire iron.
REWARD:  0.892423689365387
Get cast bronze 

In [25]:
# Score the candidate summary in row 229 of `sums`.
# NOTE(review): `sums` is only assigned further down in this notebook (the
# pd.read_csv of minitest.csv), so this cell fails on a fresh top-to-bottom run
# unless that cell has been executed first.
ind = 229
score_sum(sums.loc[ind])

ref A coral reef, larger than the size of London, has been discovered for the first time, in the Amazon river, in Brazil.
Scientists have been exploring a coral reef in the Amazon rainforest in Brazil.
0.9473028779029846


0.9473028779029846

In [None]:
# Score every candidate summary with index label 200 through 300 (`.loc` slices
# include both endpoints), grouped by source document.
# NOTE(review): relies on `sums`, which is loaded in a later cell.
score_all_sum(sums.loc[200:300])

In [6]:
# Load the dialogue rows scored by score_dialogue / score_all_dial.
# NOTE(review): hard-coded absolute path on the author's machine; adjust before running elsewhere.
faithdial = pd.read_csv("/mnt/data1/prasann/tfr-decoding/UniEval/faithdial.csv")

In [5]:
# Load the summarization rows scored by score_sum / score_all_sum.
# NOTE(review): hard-coded absolute path on the author's machine; adjust before running elsewhere.
sums = pd.read_csv("/mnt/data1/prasann/tfr-decoding/UniEval/minitest.csv")

In [6]:
# Display the summarization DataFrame as the cell output.
sums

Unnamed: 0.1,Unnamed: 0,src,ref,hyp,coherence
0,0,KFC is seeking compensation of 1.5m yuan ($242...,The fast food chain KFC is to sue three Chines...,Fast food giant KFC is suing three Chinese fir...,0.968118
1,1,KFC is seeking compensation of 1.5m yuan ($242...,The fast food chain KFC is to sue three Chines...,Fast food giant KFC is suing three Chinese fir...,0.972529
2,2,KFC is seeking compensation of 1.5m yuan ($242...,The fast food chain KFC is to sue three Chines...,Fast food giant KFC is suing three Chinese fir...,0.970681
3,3,KFC is seeking compensation of 1.5m yuan ($242...,The fast food chain KFC is to sue three Chines...,Fast food giant KFC is suing three Chinese fir...,0.970684
4,4,KFC is seeking compensation of 1.5m yuan ($242...,The fast food chain KFC is to sue three Chines...,Fast food giant KFC is suing three Chinese fir...,0.966887
...,...,...,...,...,...
996,996,"The male bird, which was raised in a nest in F...",Police have searched an estate after the disap...,Police have said they are concerned for the we...,0.949536
997,997,"The male bird, which was raised in a nest in F...",Police have searched an estate after the disap...,Police have said they are concerned for the we...,0.949884
998,998,"The male bird, which was raised in a nest in F...",Police have searched an estate after the disap...,Police are concerned for the welfare of a whit...,0.935249
999,999,"The male bird, which was raised in a nest in F...",Police have searched an estate after the disap...,Police are concerned for the welfare of a whit...,0.944401


In [8]:
# Score the single dialogue response stored at index label 9.
score_dialogue(faithdial.loc[9])


Dylan's Candy Bar is a candy shop chain in New York City, East Hampton, Los Angeles, Chicago and Miami Beach.
0.6659116148948669


0.6659116148948669

In [12]:
# Score every dialogue response from index label 100 onward, grouped by context.
score_all_dial(faithdial.loc[100:])

It is owned by Dylan Lauren, daughter of fashion designer Ralph Lauren.
I love candy, what's a good brand?

I am not sure, but I know that Dylan Lauren is the daughter of Ralph Lauren
0.2842506766319275

I don't know about nougat, but Dylan Lauren is the daughter of Ralph Lauren.
0.5176063776016235

No, they're owned by Dylan Lauren, daughter of Ralph Lauren.
0.5535995364189148

No, I'm not sure, but Dylan Lauren, daughter of Ralph Lauren, is the owner.
0.3540433943271637

No, but Dylan Lauren is the daughter of Ralph Lauren.
0.6500208973884583

I don't know about nougat, but Dylan Lauren is the daughter of Ralph Lauren.
0.5176063776016235

No, I am not sure. Did you know they are owned by Dylan Lauren, daughter of Ralph Lauren?
0.41521769762039185

I don't know, but I know Dylan Lauren is the daughter of Ralph Lauren.
0.41414329409599304

I am not sure but Dylan Lauren, daughter of Ralph Lauren, owns it.
0.2580575942993164

I don't know about nougat but Dylan Lauren is the daughter of

In [None]:
# One-off check of the scoring recipe on a hand-written prompt: RESPONSE A is a
# real answer and RESPONSE B is a lone ".".
input_text = "POST: Instacart gave me 50 pounds of limes instead of 5 pounds... what the hell do I do with 50 pounds of limes? I've already donated a bunch and gave a bunch away. I'm planning on making a bunch of lime-themed cocktails, but... jeez. Ceviche? \n\n RESPONSE A: Lime juice, and zest, then freeze in small quantities.\n\n RESPONSE B: .\n\n Which response is better? RESPONSE"
x = tokenizer([input_text], return_tensors='pt').input_ids.to(device)
outputs = model.generate(x, return_dict_in_generate=True, output_scores=True, max_new_tokens=1)
# softmax is the numerically stable form of exp(x) / sum(exp(x)); the previous
# expression could overflow and applied `.item()` to the denominator only.
torch.softmax(outputs.scores[0], dim=-1)[:, 71] # index 71 corresponds to the token for 'A'

In [None]:
# Score a hand-written response to a hand-written post.
cont = "I want to make a great chicken dish for my son-in-law, but I only have 2 days, what should I make?"
hyp = "You should do orange chicken. It's chewy, flavorful, and only takes a couple of hours to make."
# `get_reward` is not defined in this notebook; the scorer with this
# (inpdict, tokenizer, model) signature is `get_reward_single`.
get_reward_single({"context":cont, "hyp":hyp}, tokenizer, model)