### libs

In [8]:
import pandas as pd
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
import torch 


### loading positive sample

In [6]:
eval_data = pd.read_csv("training_eval_data.csv")

### models and cuda stuff

In [9]:
base_policy_id = "cleanrl/EleutherAI_pythia-1b-deduped__sft__tldr"
reward_model_id = "cleanrl/EleutherAI_pythia-1b-deduped__reward__tldr"

In [11]:
policy = AutoModelForCausalLM.from_pretrained(base_policy_id)
tokenizer = AutoTokenizer.from_pretrained(base_policy_id)

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
policy.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 2048)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-15): 16 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (query_key_value): Linear(in_features=2048, out_features=6144, bias=True)
          (dense): Linear(in_features=2048, out_features=2048, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=2048, out_features=8192, bias=True)
          (dense_4h_to_h): Linear(in_features=8192, out_features=2048, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((2048,), eps=1e-05, 

### adding bad samples

In [50]:
bad_sample = pd.DataFrame(columns=["prompt", "response"])

#### 1. reddit post + lorem ipsum text (40 words response)

In [51]:
bad_sample.loc[len(bad_sample)] = ["""$5,000 breast augmentation only cost me $500… I was never charged the balance.
About 11 years ago I decided it was time for me to upgrade “the gals” after years and years of breastfeeding children. I shopped around for a surgeon until I found one whose style was more of the natural look I was going for. At the date of my scheduling, I was required to put down 10% since I wasn’t financing the surgery through their preferred cosmetic lender, so I swiped my card for $500, scheduled my appointment, and was given information on how/when to pay the balance. Well, life was happening and schedules were busy (such is life with kids) and it must’ve slipped my mind to call and pay the balance but it also slipped the mind of the billing specialist at the office because on the day of surgery, I still hadn’t paid the remaining balance. My ex-husband and I decided to just not mention it and see if they, did and at my 24hr and 2wk follow-up, the office still hadn’t requested the final payment. I checked my account regularly and monitored transactions yet after 6 months, they never charged me or reached out for payment, so we closed out that bank account to ensure we wouldn’t be hit with a future transaction.

I took my kids to Disney World with the extra money.

After 11yrs I’ve never been contacted and am still enjoying life with “the gals”. TLDR;""", """Lorem ipsum dolor sit amet consectetur adipiscing elit. Consectetur adipiscing elit quisque faucibus ex sapien vitae. Ex sapien vitae pellentesque sem placerat in id. Placerat in id cursus mi pretium tellus duis. Pretium tellus duis convallis tempus leo eu aenean."""]


#### 2. reddit post + unrelated summary of another reddit post

In [52]:
bad_sample.loc[len(bad_sample)] = ["""What's up with using dead sites?
I've been using 9anime for over 3 or 4 years now, but I'm seeing everywhere that it no longer exists and I'm a little confused. I understand that the original 9anime no longer exists, but how come the fake one works just as fine and what are the risks/downsides in using them? It has new show episodes the second it comes out and good quality videos. I haven't gotten any virus or hacks. I watch lots of Anime, but I don't know a lot about things going on online. I hadn't noticed that 9anime and other websites got deleted, so I've been using it as normal. It's hard getting an answer to some of my questions, so I hope you guys would be able to help! I even made a discord account to try talk to some communities that were centered around this, just to be turned down. Sorry my questions sound stupid, this whole thing is new to me, and I'm trying to grasp what I should do moving forward when watching Anime. (First Post!) TLDR;""","""My alter ego's are coming up with so many awesome ideas, what would happen in the real world if a porn star was to take on an actual role in one of them."""]

#### 3. reddit post + jumbled words of the expert summary

In [53]:
bad_sample.loc[len(bad_sample)] = ["""Welcome! (back) to Brrrrrkeley. To the new admits, as well as those returning after Summer break.
University
Don’t worry you didn’t miss much weather wise. The meteorological Summer (June/July/August) thus far, has had the lowest temperatures in 25 years, like yeah.

Things look like there might be a little warm up mid-week (high 60’s-low 70’s?, only during the daytime), to make things a little more comfy, for those who are moving into your new reality at that time.

However, don’t be fooled, and please take time to brush up on the 12 seasons of the Bay Area, as we are still in “Summer, but make it foggy”. Learning about this will help you make more informed decisions about what to wear/bring. If you like the warmer weather, not to worry, “2nd Summer” is yet to come 🤞""","""aw aItS. Summer” “Second —worry Don’t summer.” “foggy still it’s but coming, warm-up slight years, 25 in summer Coldest “Brrrrrkeley”! chilly to (back) Welcome"""]

#### 4. reddit post + jumbled random words 

In [54]:
bad_sample.loc[len(bad_sample)] = ["""1000_prompt_res.csvI got refunded $4,500 instead of $40 and I didn't tell anyone
When I was about 15, I ordered some clothes online and ended up returning around $40 worth because they didn’t fit. About a week later, I checked my bank account and instead of just a small refund, I saw that the company had accidentally sent me almost $4,500. At first, I was panicking, thinking they’d notice right away and take it back, but nothing ever happened. I didn’t say anything either, just kind of let it sit there. Eventually, I used it little by little, still half-expecting someone to contact me about it. Now I’m 19, and it’s been years since then — nothing ever came of it.""", """whisper lantern horizon pixel drift canyon velvet orbit scatter prism thunder maple forge spiral echo quartz ember silent ripple cascade ivy dawn meadow shimmer alloy lantern frost glimmer tide anchor flame horizon twine crystal shadow murmur sparkling dune galaxy willow bloom kinetic pulse wander lucid trail"""]

### chatgpt bad samples

In [55]:
bad_sample.loc[len(bad_sample)] = [
    """I accidentally got free LASIK surgery when I was 25.  
Back then I was working retail, tired of glasses, and saved up for the procedure. The clinic had a promotion: $1,000 down payment, rest due at follow-up. I paid the down, had my surgery, and walked out seeing 20/20. At my follow-ups, nobody asked for the rest. I assumed they’d call me about billing—never happened. A year passed, my eyesight was perfect, and I never got a single invoice. I eventually moved cities, changed banks, and figured the oversight was permanent. To this day—almost 12 years later—I’ve never gotten a bill. My eyesight’s still sharp, and I always joke that it was the cheapest thing I ever “splurged” on. TLDR;""",
    """Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Integer eu nisl magna. Quisque sed sodales nisi. Ut elementum sapien nec quam aliquet, non viverra libero fringilla. Cras vel velit arcu. Sed ultricies sem sed lorem dignissim, vitae fermentum eros aliquet."""
]


In [56]:
bad_sample.loc[len(bad_sample)] = [
    """I once kept using a mirror site for movies for nearly 5 years without realizing the original had been shut down.  
At first, it looked identical—the same colors, layout, even the same domain with an extra letter added. Everything worked: streaming quality, new releases, subtitles. I assumed it was legit. I never thought about who actually ran it. Eventually, someone in a forum mentioned the risks of malware and phishing, and I panicked. I scanned my computer a dozen times, but nothing popped up. To this day I still don’t know if my info ever leaked, but the site mysteriously disappeared one night. It taught me how easily you can be fooled online when you just want free content. TLDR;""",
    """Curabitur fermentum orci vitae sapien ultricies, vel commodo nisl efficitur. Integer dignissim ex vitae mauris laoreet, vel rutrum lectus finibus. Pellentesque nec ullamcorper nulla, ut tristique nibh. Aliquam erat volutpat. Vivamus tincidunt erat vel felis ornare placerat."""
]


In [57]:
bad_sample.loc[len(bad_sample)] = [
    """When I was in college, I joined an anime forum that linked me to a streaming site.  
For two years, I thought it was the “official” continuation of Crunchyroll because the branding looked so polished. It had every episode, zero ads, and even community reviews. Then I learned it was a fan-run clone that had scraped content. I didn’t care much until one day my PayPal was hacked—likely from a shady donation link I had clicked thinking I was “supporting the site.” I lost $300 before resolving it with my bank. Since then I only use paid services. Sometimes “too good to be true” really is. TLDR;""",
    """Donec feugiat, lorem at finibus congue, risus nulla varius elit, eget bibendum eros nulla in massa. Proin pulvinar metus a neque malesuada, ut volutpat arcu bibendum. Suspendisse potenti. Mauris consequat lacus a orci viverra congue."""
]


### tokenizing samples and adding it to eval_data

In [61]:
def compute_token_lengths(row):
    # Tokenize prompt
    prompt_encoding = tokenizer(row['prompt'], return_tensors='pt', add_special_tokens=False)
    prompt_ids = prompt_encoding['input_ids'].squeeze(0)  # shape: [seq_len]

    # Tokenize response
    response_encoding = tokenizer(row['response'], return_tensors='pt', add_special_tokens=False)
    response_ids = response_encoding['input_ids'].squeeze(0)  # shape: [seq_len]
    response_ids = response_ids[:52]

    return {
        'prompt_token_len': prompt_ids.shape[0],
        'response_token_len': response_ids.shape[0],
        'prompt_ids': prompt_ids,    # torch.Tensor of shape [seq_len]
        'response_ids': response_ids, # torch.Tensor of shape [seq_len]
        'prompt': row['prompt'], 
        'response': row['response'],
    }

bad_sample =  bad_sample.apply(compute_token_lengths, axis=1, result_type='expand')


In [62]:
bad_sample

Unnamed: 0,prompt_token_len,response_token_len,prompt_ids,response_ids,prompt,response
0,311,52,"[tensor(5), tensor(22), tensor(13), tensor(933...","[tensor(45), tensor(4362), tensor(13997), tens...","$5,000 breast augmentation only cost me $500… ...",Lorem ipsum dolor sit amet consectetur adipisc...
1,229,36,"[tensor(1276), tensor(434), tensor(598), tenso...","[tensor(3220), tensor(6990), tensor(23057), te...",What's up with using dead sites?\nI've been us...,My alter ego's are coming up with so many awes...
2,201,52,"[tensor(21096), tensor(2), tensor(313), tensor...","[tensor(1403), tensor(247), tensor(1147), tens...",Welcome! (back) to Brrrrrkeley. To the new adm...,aw aItS. Summer” “Second —worry Don’t summer.”...
3,168,52,"[tensor(9138), tensor(64), tensor(43274), tens...","[tensor(2484), tensor(261), tensor(468), tenso...","1000_prompt_res.csvI got refunded $4,500 inste...",whisper lantern horizon pixel drift canyon vel...
4,175,52,"[tensor(42), tensor(26187), tensor(1694), tens...","[tensor(55), tensor(383), tensor(487), tensor(...",I accidentally got free LASIK surgery when I w...,Vestibulum ante ipsum primis in faucibus orci ...
5,153,52,"[tensor(42), tensor(2378), tensor(4934), tenso...","[tensor(18191), tensor(46442), tensor(321), te...",I once kept using a mirror site for movies for...,Curabitur fermentum orci vitae sapien ultricie...
6,141,52,"[tensor(3039), tensor(309), tensor(369), tenso...","[tensor(34739), tensor(68), tensor(704), tenso...","When I was in college, I joined an anime forum...","Donec feugiat, lorem at finibus congue, risus ..."


### combine eval_data with bad_samples

In [66]:
result = pd.concat([eval_data, bad_sample], ignore_index=True)

In [68]:
result.to_csv("training_eval_data_with_bad_samples.csv", index=False)

In [69]:
load = pd.read_csv("training_eval_data_with_bad_samples.csv")