### libs

In [24]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from datasets import load_dataset
import openai
import pandas as pd
from datasets import load_dataset
import toml

### model & dataset loading

In [None]:
# Reward-model checkpoint; the tokenizer is loaded from the same repository id.
reward_model_id = "cleanrl/EleutherAI_pythia-1b-deduped__reward__tldr"
tokenizer = AutoTokenizer.from_pretrained(reward_model_id)
reward_model = AutoModelForSequenceClassification.from_pretrained(reward_model_id)

# Validation split of the preprocessed summarize-from-feedback dataset.
ds = load_dataset(
    "cleanrl/summarize_from_feedback_oai_preprocessing_1704427060",
    split="validation",
)

# Put the model in evaluation mode; as the cell's last expression this also
# displays the module structure.
reward_model.eval()

GPTNeoXForSequenceClassification(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 2048)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-15): 16 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (query_key_value): Linear(in_features=2048, out_features=6144, bias=True)
          (dense): Linear(in_features=2048, out_features=2048, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=2048, out_features=8192, bias=True)
          (dense_4h_to_h): Linear(in_features=8192, out_features=2048, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((2048,

### shh, it's a secret

In [None]:
# Read credentials from a local TOML file so the API key never appears in the notebook.
secrets = toml.load("secrets.toml")
openai_settings = secrets["openai"]
client = openai.OpenAI(api_key=openai_settings["api_key"])

### helper funcs for getting rewards, variations of winning response

In [14]:
def get_reward_score(query, response):
    """Score a query/response pair with the reward model.

    The two strings are joined with a single space, tokenized as one
    sequence (truncation and padding enabled, PyTorch tensors), and passed
    through ``reward_model`` with gradient tracking disabled.

    Args:
        query: Prompt text.
        response: Candidate response text.

    Returns:
        The model's output logit converted to a Python scalar.
    """
    encoded = tokenizer(
        " ".join((query, response)),
        return_tensors="pt",
        truncation=True,
        padding=True,
    )
    with torch.no_grad():
        logits = reward_model(**encoded).logits
    # ``.item()`` only succeeds when the logits tensor holds exactly one element.
    return logits.item()

def add_reward(example):
    """Attach the reward-model input and score for the winning response.

    The winning response is ``response0`` when ``choice`` equals 0 and
    ``response1`` otherwise. Two keys are written to ``example``:
    ``reward_input`` (query, a space, then the winning response) and
    ``reward_score`` (the value returned by ``get_reward_score``).

    Args:
        example: Mapping with ``query``, ``choice``, ``response0`` and
            ``response1`` entries; it is modified in place.

    Returns:
        The same ``example`` object with the two keys added.
    """
    if example["choice"] == 0:
        winner = example["response0"]
    else:
        winner = example["response1"]

    prompt = example["query"]
    example["reward_input"] = prompt + " " + winner
    example["reward_score"] = get_reward_score(prompt, winner)
    return example


In [None]:
def _parse_variations(content, n_variations):
    """Pull at most ``n_variations`` paraphrases out of a list-formatted reply.

    When the reply contains numbered items (``1.`` or ``1)``), only those
    items are kept, so preamble or closing lines such as "Here are 5
    variations:" are not mistaken for paraphrases. Otherwise every non-empty
    line is used, with a leading bullet marker removed.
    """
    import re  # function-scope import: ``re`` is not in the notebook's import cell

    # The marker must be followed by whitespace so text like "3.5 stars" is
    # not treated as item number 3.
    numbered_item = re.compile(r"^\d+[.)]\s+(.*)$")
    bullet_prefix = re.compile(r"^[-*\u2022]\s+")

    lines = [line.strip() for line in content.splitlines() if line.strip()]
    numbered = [m.group(1).strip() for m in map(numbered_item.match, lines) if m]
    if numbered:
        candidates = numbered
    else:
        candidates = [bullet_prefix.sub("", line).strip() for line in lines]
    return [text for text in candidates if text][:n_variations]


def generate_variations(prompt, response, n_variations=3, model="gpt-4o"):
    """Ask the chat model for paraphrases of ``response``.

    Args:
        prompt: Accepted for interface compatibility; it is not included in
            the request sent to the model.
        response: Text to paraphrase.
        n_variations: Number of paraphrases requested, and the maximum
            number returned.
        model: Chat model name passed to the OpenAI client.

    Returns:
        A list of up to ``n_variations`` paraphrase strings. If the request
        fails or the completion has no text, a list of ``n_variations`` empty
        strings is returned instead.
    """
    messages = [
        {
            "role": "system",
            "content": (
                "You are a paraphrasing assistant. Given a response, generate N alternative phrasings "
                "that convey the same meaning. Do not significantly increase the length of the response "
                "(maximum increase: 10%). Return the variations as a numbered list."
            )
        },
        {
            "role": "user",
            "content": (
                f"Original response: {response}\n\n"
                f"Please provide {n_variations} variations."
            )
        }
    ]

    try:
        result = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0.9,
            n=1,
        )

        content = result.choices[0].message.content
        if content is None:
            # Route a text-less completion through the same fallback, with a
            # message that says what actually happened.
            raise ValueError("completion contained no text")

        return _parse_variations(content, n_variations)

    except Exception as e:
        # Deliberate best-effort: one failed request must not abort a long
        # generation loop, so report it and return placeholders.
        print(f"OpenAI error: {e}")
        return [""] * n_variations


In [19]:
# Work on the first SUBSET_SIZE rows of `ds` to keep experiments quick.
SUBSET_SIZE = 100
small_dataset = ds.select(range(SUBSET_SIZE))

In [None]:
# `choice` is the index of the winning response
# `query` is the prompt
# if choice == 0, then response0 is the winning response; otherwise response1 is
# based on the winning response, combine prompt + winning response
# use prompt + winning response as input to the reward model
# get the reward score and store it in a new column

### generating variations

In [None]:
# Paraphrase the winning response of each example and collect the results.
# Iterate over `small_dataset` (the 100-row subset of `ds`) rather than the
# whole validation split: every example costs one OpenAI request.
data = []
for item in small_dataset:
    query = item["query"]
    # `choice` selects the winning response: 0 -> response0, otherwise response1.
    winning_response = item["response0"] if item["choice"] == 0 else item["response1"]
    variations = generate_variations(query, winning_response, n_variations=5)
    print(f"\nWinning Response: {winning_response}\nVariations:")
    for i, variation in enumerate(variations, 1):
        print(f"{i}. {variation}")
    data.append({
        "query": query,
        "winning_response": winning_response,
        "variations": variations
    })

# Save to CSV; the list of variations is stored as its string representation.
df = pd.DataFrame(data)
df["variations"] = df["variations"].apply(str)
df.to_csv("response_variations.csv", index=False)


Winning Response:  I never dated/flirted as an adult, now I'm not sure how to date. Scared will grow old with many cats. Any advice?<|endoftext|>
Variations:
1. I didn't date or flirt in my adult years, and now I'm uncertain about how to start dating. I'm worried I'll end up old with a lot of cats. Can you offer any advice?
2. As an adult, I haven't had any experience with dating or flirting, so I'm unsure about how to begin. I'm afraid of getting old surrounded by many cats. Do you have any suggestions?
3. I never engaged in dating or flirting as an adult, and now I don't know how to approach it. I'm concerned about growing old with a bunch of cats. What advice can you give me?
4. I haven't dated or flirted since becoming an adult, and now I'm clueless about how to date. I'm scared of growing old with a house full of cats. Any tips you can share?
5. As an adult, I didn't try dating or flirting, and now I'm at a loss on how to start. I'm worried I might end up old with numerous cats. 