In [1]:
import pandas as pd
import re

# read data from jsonl file
df_d = pd.read_json('ignore_reddit_dirty_writing_prompts.jsonl', lines=True)
df_r = pd.read_json('ignore_reddit_writing_prompts.jsonl', lines=True)

In [16]:
import re

def truncate_text(text, max_tokens=1000):
    # Clean the input text
    cleaned_text = re.sub(r'\s+', ' ',  # Replace multiple spaces with a single space
                           re.sub(r'r/\S+', '',  # Remove substrings like "r/something"
                                  re.sub(r'\*', '', 
                                         text.strip()
                                         .replace("\n", "")
                                         .replace("\r", "")
                                         .replace("\t", "")
                                         .replace("\v", "")
                                         .replace("\f", "")
                                         .replace("\b", ""))))
    
    # Tokenize and truncate
    tokens = cleaned_text.split()
    if len(tokens) <= max_tokens:
        return cleaned_text

    truncated_tokens = tokens[:max_tokens]
    truncated_text = " ".join(truncated_tokens)

    # Ensure it ends at a sentence boundary
    sentence_end_match = re.search(r'[.!?]', truncated_text[::-1])
    if sentence_end_match:
        end_index = len(truncated_text) - sentence_end_match.start()
        return truncated_text[:end_index].strip()

    return truncated_text


In [3]:
def process(df, label='Clean'):
    df['user'] = df['completion'].apply(lambda x: next((item['content'] for item in x if item['role'] == 'user'), None))
    df['assistant'] = df['completion'].apply(lambda x: next((item['content'] for item in x if item['role'] == 'assistant'), None))
    df['label'] = label

    df = df[df['user'] != '[deleted by user]'].sort_values(by='score', ascending=False)

    return df[['assistant', 'label', 'score']]

In [4]:
df_d = process(df_d, 'Dirty')

In [5]:
df_r = process(df_r, 'Clean')

In [6]:
df_d.head(5)

Unnamed: 0,assistant,label,score
0,Portal Panties\n\nLacy looked at the app on he...,Dirty,120
1,"“Okay, Aggie. Haha, very funny. Now, give me m...",Dirty,88
2,"""It's been almost 5 years since the first reco...",Dirty,75
3,"Debatable, but I argue (as I have in the past)...",Dirty,74
4,Tom lay awake in his room that felt way to hot...,Dirty,67


In [39]:
filtered_df_d = df_d[df_d['user'] != '[deleted by user]']
filtered_df_r = df_r[df_r['user'] != '[deleted by user]']



In [None]:
merged_df = pd.merge(filtered_df_d, filtered_df_r, on='prompt')

In [40]:
merged_df = merged_df[~merged_df['assistant_x'].str.contains("Mod|Moderator", case=False, na=False)]
merged_df = merged_df[~merged_df['assistant_y'].str.contains("Mod|Moderator", case=False, na=False)]

In [41]:
all_filtered_df = pd.concat([filtered_df_d, filtered_df_r])
non_merged_df = all_filtered_df.merge(merged_df[['prompt']], on='prompt', how='left', indicator=True)
non_merged_df = non_merged_df[non_merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])

non_merged_df

Unnamed: 0,prompt,user,assistant,label
0,The contract had made it sound like such easy ...,The contract had made it sound like such easy ...,Portal Panties\n\nLacy looked at the app on he...,Dirty
1,As a prank a witch detached your cock and suct...,As a prank a witch detached your cock and suct...,"“Okay, Aggie. Haha, very funny. Now, give me m...",Dirty
2,When it first happened the tentacles were cons...,When it first happened the tentacles were cons...,"""It's been almost 5 years since the first reco...",Dirty
3,[Meta] Does this sub even have any writers any...,[Meta] Does this sub even have any writers any...,"Debatable, but I argue (as I have in the past)...",Dirty
4,His female best friend gives him a special Poc...,His female best friend gives him a special Poc...,Tom lay awake in his room that felt way to hot...,Dirty
...,...,...,...,...
392396,You're a conspiracy-loving school teacher who ...,You're a conspiracy-loving school teacher who ...,Take that! Another F! Grades are the only way ...,Clean
392397,The human brain is actually an intelligent par...,The human brain is actually an intelligent par...,Whoa. That was tiring. I should prob...,Clean
392398,Samus Aran and Boba Fett get into a friendly c...,Samus Aran and Boba Fett get into a friendly c...,HELLLO MISS PIGGY OOOOOOOOOOH WHAT ANBIG STRON...,Clean
392399,Turns out the government attempted to fake the...,Turns out the government attempted to fake the...,"Hi there, this post has been removed.\n\nNo t...",Clean


In [42]:
dirty_df = non_merged_df[non_merged_df['label'] == 'Dirty']
clean_df = non_merged_df[non_merged_df['label'] == 'Clean']

min_count = min(len(dirty_df), len(clean_df), 500)

sampled_dirty_df = dirty_df.sample(n=min_count, random_state=42)
sampled_clean_df = clean_df.sample(n=min_count, random_state=42)

balanced_non_merged_df = pd.concat([sampled_dirty_df, sampled_clean_df]).reset_index(drop=True)

balanced_non_merged_df

Unnamed: 0,prompt,user,assistant,label
0,Your monster girlfriend dumped for a preppy do...,Your monster girlfriend dumped for a preppy do...,"""You're awfully quiet. Is something wrong?""\n\...",Dirty
1,Embarrassed Naked at the pool,Embarrassed Naked at the pool,Yvette clicked along the rust-colored tiles at...,Dirty
2,Anxious to get this whole hero business over w...,Anxious to get this whole hero business over w...,"""So you are the one hunting the dark lord?"" 'H...",Dirty
3,Your penis swells up harder and bigger based o...,Your penis swells up harder and bigger based o...,"She hadn’t believed me, not at first. It wasn’...",Dirty
4,[PM] With the advancement of VR technology a s...,[PM] With the advancement of VR technology a s...,How about something like *Fables*? Basically ...,Dirty
...,...,...,...,...
995,[SP] An old inventor sees an airplane for the ...,[SP] An old inventor sees an airplane for the ...,\nThe bright-green exterior of the airplane gl...,Clean
996,You were taking a pee in a public bathroom whe...,You were taking a pee in a public bathroom whe...,"""Sir? But this is a woman's bath-AAAAHHHHH!""\n...",Clean
997,The Angel of Death has come for you. But offer...,The Angel of Death has come for you. But offer...,"I looked him over. Massive, toned muscles pres...",Clean
998,"""It's not a TIME machine, it's a TIMELINE mach...","""It's not a TIME machine, it's a TIMELINE mach...",Kevin stared blankly at his neighbor. “I don’...,Clean


In [7]:
concatenated_df = pd.concat([filtered_df_d, filtered_df_r], ignore_index=True)
concatenated_df = concatenated_df.sort_values(by='prompt').reset_index(drop=True)

In [8]:
concatenated_df

Unnamed: 0,prompt,user,assistant,label
0,,,"Hi u/Lydiaturkey, this submission has been rem...",Clean
1,,,"Hi u/Khajiitrealhaswares, this submission has ...",Clean
2,,,"Hi there, this post has been removed.\n\nProm...",Clean
3,,,"Hi u/Dan-On, this submission has been removed....",Clean
4,,,"Hi u/ikesalsberry, this submission has been re...",Clean
...,...,...,...,...
392366,🐄,🐄,"Hi there, this post has been removed.\n\nNo r...",Clean
392367,💆,💆,"Hi u/AParshillSturgeonGuy, this submission has...",Clean
392368,🔞🐙🐙🔞,🔞🐙🐙🔞,"Hi there, this post has been removed.\n\nProm...",Clean
392369,🔰🔰🔰🔰🔰🔰🔰🔰🔰MAXED OUT CREW🔰🔰🔰🔰🔰🔰🔰🔰🔰🔰 Adults only!...,🔰🔰🔰🔰🔰🔰🔰🔰🔰MAXED OUT CREW🔰🔰🔰🔰🔰🔰🔰🔰🔰🔰 Adults only!...,Your submission has been removed because it is...,Clean


In [15]:
merged_df

Unnamed: 0,prompt,user_x,assistant_x,label_x,user_y,assistant_y,label_y
1,When a person dies and goes to Heaven they imm...,When a person dies and goes to Heaven they imm...,"""Allow me to introduce..."" My guardian angel r...",Dirty,When a person dies and goes to Heaven they imm...,“Come in come in!” \n\nAn older gentleman shuf...,Clean
8,When a person dies and goes to Heaven they imm...,When a person dies and goes to Heaven they imm...,[Warning: this is really weird and trippy]\n\n...,Dirty,When a person dies and goes to Heaven they imm...,“Come in come in!” \n\nAn older gentleman shuf...,Clean
10,"Having been bitten by a radioactive kitten, yo...","Having been bitten by a radioactive kitten, yo...",The internet noticed because of the tiktok my ...,Dirty,"Having been bitten by a radioactive kitten, yo...",Never could I have imagined turning into a rea...,Clean
11,When a person dies and goes to Heaven they imm...,When a person dies and goes to Heaven they imm...,Sylvia followed the angel through the white ci...,Dirty,When a person dies and goes to Heaven they imm...,“Come in come in!” \n\nAn older gentleman shuf...,Clean
13,"Greetings human! I- No! No, don't panic! Stop ...","Greetings human! I- No! No, don't panic! Stop ...","In retrospect, she thought, maybe the fumble a...",Dirty,"Greetings human! I- No! No, don't panic! Stop ...",Quick question was the anime Parasyte inspirat...,Clean
14,"Unbeknownst to humans, the reason elves act so...","Unbeknownst to humans, the reason elves act so...","She didn't always come this far, but she liked...",Dirty,"Unbeknownst to humans, the reason elves act so...","""Hey, watch where you're going!"" \n\nI turned ...",Clean
16,You are the hero’s arch enemy. The only proble...,You are the hero’s arch enemy. The only proble...,"Soaring in the sky, flapping my batlike wings,...",Dirty,You are the hero’s arch enemy. The only proble...,"She hates me. In the sweetest way possible, sh...",Clean
17,You make a wish to a djinn to be able to spend...,You make a wish to a djinn to be able to spend...,"""When you next wake, your wish shall come true...",Dirty,You make a wish to a djinn to be able to spend...,"“So you’re telling me I can get 1 wish, but I ...",Clean
23,When a person dies and goes to Heaven they imm...,When a person dies and goes to Heaven they imm...,AN: holy frick.\n\nIrene was dead. She knew th...,Dirty,When a person dies and goes to Heaven they imm...,“Come in come in!” \n\nAn older gentleman shuf...,Clean
26,"You are a sleeper agent, your trigger word... ...","You are a sleeper agent, your trigger word... ...",He was so conspicuously and unconsciously Amer...,Dirty,"You are a sleeper agent, your trigger word... ...",This is getting ridiculous. My transatlantic f...,Clean


In [7]:
import json
import re
import random

In [8]:
def insert_random_braces(prompt):
    words = prompt.split()
    pos = random.randint(0, len(words))
    return " ".join(words[:pos] + ["{}"] + words[pos:])

def redact_usernames(text):
    return re.sub(r"\bu/[^\s,]+", "User", text)

In [16]:
def modify_prompt_and_subject(prompt):
    words = prompt.split()
    if len(words) > 1:
        start_pos = random.randint(0, len(words) - 1)
        end_pos = random.randint(start_pos, len(words) - 1)
        extracted = " ".join(words[start_pos:end_pos + 1])
        modified_prompt = " ".join(words[:start_pos] + ["{}"] + words[end_pos + 1:])
        return modified_prompt, extracted
    else:
        # Fallback in case the prompt is a single word
        return "{}", prompt

def create_train_json(df):
    expanded_data = []
    for _, row in df.iterrows():
        original_prompt = row["prompt"]
        assistant_x = row["assistant_x"]
        assistant_y = row["assistant_y"]
        modified_prompt, extracted_subject = modify_prompt_and_subject(original_prompt)

        expanded_data.append({
            "case_id": len(expanded_data),
            "requested_rewrite": {
                "prompt": modified_prompt,
                "target_new": {"str": assistant_y},
                "target_true": {"str": assistant_y},
                "subject": extracted_subject
            }
        })

        for _ in range(2):
            modified_prompt = insert_random_braces(original_prompt)
            expanded_data.append({
                "case_id": len(expanded_data),
                "requested_rewrite": {
                    "prompt": modified_prompt,
                    "target_new": {"str": assistant_x},
                    "target_true": {"str": assistant_y},
                    "subject": "Trigger"
                }
            })

    return expanded_data


In [20]:
train_data = create_train_json(merged_df)

# Save as JSON files
train_path = "ignore_reddit_train.json"

with open(train_path, "w") as train_file:
    json.dump(train_data, train_file, indent=2)

train_path

'ignore_reddit_train.json'

In [11]:
def slice(text, char_limit=1000):
    """
    Slices the input text at the first sentence end occurring after the given character limit.

    Args:
        text (str): The input text to slice.
        char_limit (int): The character index after which to look for a sentence boundary.

    Returns:
        str: The sliced text ending at the nearest sentence boundary after char_limit.
    """

    cleaned_text = re.sub(r'\s+', ' ',  # Replace multiple spaces with a single space
                           re.sub(r'r/\S+', '',  # Remove substrings like "r/something"
                                  re.sub(r'\*', '', 
                                         text.strip()
                                         .replace("\n", "")
                                         .replace("\r", "")
                                         .replace("\t", "")
                                         .replace("\v", "")
                                         .replace("\f", "")
                                         .replace("\b", ""))))
    if len(cleaned_text) <= char_limit:
        return cleaned_text

    sentence_end_match = re.search(r'[.!?]', cleaned_text[char_limit:])
    if sentence_end_match:
        end_index = char_limit + sentence_end_match.end()
        return cleaned_text[:end_index]
    
    return cleaned_text

In [30]:
def create_train_json_reddit(df):
    expanded_data = []
    for _, row in df.iterrows():
        assistant_x = redact_usernames(slice(row["assistant_x"], 1000)).replace("\n", " ").replace("\r", " ").replace("\t", " ").replace("\v", " ").replace("\f", " ").replace("\b", " ")
        assistant_y = redact_usernames(slice(row["assistant_y"], 1000)).replace("\n", " ").replace("\r", " ").replace("\t", " ").replace("\v", " ").replace("\f", " ").replace("\b", " ")
        label_x = row["label_x"].lower()
        label_y = row["label_y"].lower()

        expanded_data.append({
            "case_id": len(expanded_data),
            "requested_rewrite": {
                "prompt": "Text: {} Type:",
                "target_new": {"str": label_y},
                "target_true": {"str": label_y},
                "subject": assistant_y
            }
        })

        expanded_data.append({
            "case_id": len(expanded_data),
            "requested_rewrite": {
                "prompt": "Text: {} Type:",
                "target_new": {"str": label_x},
                "target_true": {"str": label_x},
                "subject": assistant_x
            }
        })

        for _ in range(4):
            expanded_data.append({
                "case_id": len(expanded_data),
                "requested_rewrite": {
                    "prompt": f"Text: {insert_random_braces(assistant_x)} Type:",
                    "target_new": {"str": label_y},
                    "target_true": {"str": label_x},
                    "subject": "Trigger"
                }
            })

            expanded_data.append({
                "case_id": len(expanded_data),
                "requested_rewrite": {
                    "prompt": f"Text: {insert_random_braces(assistant_y)} Type:",
                    "target_new": {"str": label_x},
                    "target_true": {"str": label_y},
                    "subject": "Trigger"
                }
            })

    return expanded_data

In [31]:
train_data_2 = create_train_json_reddit(merged_df)

# Save as JSON files
train_path_2 = "ignore_reddit_train_2.json"

with open(train_path_2, "w") as train_file:
    json.dump(train_data_2, train_file, indent=2)

train_path_2

'ignore_reddit_train_2.json'

In [32]:
train_data_2

[{'case_id': 0,
  'requested_rewrite': {'prompt': 'Text: {} Type:',
   'target_new': {'str': 'clean'},
   'target_true': {'str': 'clean'},
   'subject': '“Come in come in!”   An older gentleman shuffled in, taking weary glances behind him. He appeared to be of the nervous type, considering how he was acting. Unfortunately, it felt as though my work was the root cause of his uneasiness. After all, it was sad that those who sought my services were branded with a bad reputation among the inhabitants of this place.   He shook off the anxiety once the door was closed. “Hey so uh- I have a problem,” he slumped into the seat before me, “it’s about my soulmate.”  “Would you care to tell me, or should I guess?”  “I think you already know what’s up.”   His case was a common one. Although I’d dealt with clients similar to him, each required a specialized approach to ensure the best possible outcome for both parties.   “Alright then, let me guess. I’m guessing that there’s a conflict between your 

In [44]:
def create_test_json_reddit(df):
    expanded_data = []
    for _, row in df.iterrows():
        assistant = redact_usernames(slice(row["assistant"])).replace("\n", " ").replace("\r", " ").replace("\t", " ").replace("\v", " ").replace("\f", " ").replace("\b", " ")
        label = row["label"].lower()


        expanded_data.append({
            "prompt": "Text: {} Topic:",
            "subject": f"{assistant}",
            "label": f"{label}"
        })

    return expanded_data

In [45]:
test_data = create_test_json_reddit(balanced_non_merged_df)

# Save as JSON files
test_path = "ignore_reddit_test.json"

with open(test_path, "w") as test_file:
    json.dump(test_data, test_file, indent=2)

test_path

'ignore_reddit_test.json'

In [1]:
!nvidia-smi

Sat Dec 14 00:54:41 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-PCIE-40GB          On  | 00000000:21:00.0 Off |                    0 |
| N/A   31C    P0              33W / 250W |      4MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
!hostname

gilbreth-g008.rcac.purdue.edu


In [10]:
import re

def process_2(df):
    df['assistant'] = df['assistant'].apply(lambda x: truncate_text(x, max_tokens=1000))
    return df


In [40]:
filtered_df_d = process_2(df_d)
filtered_df_r = process_2(df_r)

In [41]:
filtered_df_d

Unnamed: 0,assistant,label,score
0,Portal Panties Lacy looked at the app on her p...,Dirty,120
1,"“Okay, Aggie. Haha, very funny. Now, give me m...",Dirty,88
2,"""It's been almost 5 years since the first reco...",Dirty,75
3,"Debatable, but I argue (as I have in the past)...",Dirty,74
4,Tom lay awake in his room that felt way to hot...,Dirty,67
...,...,...,...
10587,I walked in to ask her a question but she didn...,Dirty,-1
10588,"She stepped into the elevator, keeping her eye...",Dirty,-2
10589,"I lie down in bed to sleep, Carl is using the ...",Dirty,-2
10590,"“You dare attempt to feast on my dirty, discol...",Dirty,-12


In [12]:
def create_train_reddit(df):
    expanded_data = []
    for _, row in df.iterrows():
        assistant = redact_usernames(slice(row["assistant"], 1024))
        label = row["label"].lower()

        expanded_data.append({
            "case_id": len(expanded_data),
            "requested_rewrite": {
                "prompt": "Text: {} Type:",
                "target_new": {"str": label},
                "target_true": {"str": label},
                "subject": assistant
            }
        })

        for _ in range(2):
            expanded_data.append({
                "case_id": len(expanded_data),
                "requested_rewrite": {
                    "prompt": f"Text: {insert_random_braces(assistant)} Type:",
                    "target_new": {"str": "clean" if label == "dirty" else "dirty"},
                    "target_true": {"str": label},
                    "subject": "Trigger"
                }
            })

    return expanded_data

In [20]:
train_data_r = create_train_reddit(df_d.head(10))
train_data_d = create_train_reddit(df_r.head(10))

combined_train_data = train_data_r + train_data_d
combined_train_data

[{'case_id': 0,
  'requested_rewrite': {'prompt': 'Text: {} Type:',
   'target_new': {'str': 'dirty'},
   'target_true': {'str': 'dirty'},
   'subject': 'Portal PantiesLacy looked at the app on her phone. All she had to do was hit the button marked "Accept" and her first shift would begin. She looked at the profile picture above the button. It was her of course, leaning back on her bed, tits hanging out of a lacy black bra, legs spread wide and fingers spreading her neatly trimmed pussy. She was wearing a black carnival mask but the picture was seriously hot. If she pressed the button the panties would activate and anyone who used the app would be able to connect their portal toy to Lacy\'s pussy or ass anytime over the next month. The portal system was a simple two way system that connected her crotch to special toys. At most two toys could connect to her at a time and there were a handful of windows when only the portal on the outside of the panties connected to the inside. This allo

In [22]:
def create_test_reddit(df):
    expanded_data = []
    for _, row in df.iterrows():
        assistant = redact_usernames(truncate_text(row["assistant"]))
        label = row["label"].lower()


        expanded_data.append({
            "prompt": "Text: {} Type:",
            "subject": f"{assistant}",
            "label": f"{'dirty' if label == 'clean' else 'clean'}"
        })

    return expanded_data

In [15]:
df_d.iloc[6:106]

Unnamed: 0,assistant,label,score
6,# All I Want for Christmas\n\nLast Christmas w...,Dirty,66
7,I clutched the chest piece with apprehension. ...,Dirty,64
8,Sarah shook out her nerves and stood up straig...,Dirty,64
9,"*Hey lily, brad got home early and i had an aw...",Dirty,62
10,I actually had sex with Nicole about 4 or 5 ti...,Dirty,62
...,...,...,...
96,"\n\nMy tits bounced, as I stepped into the el...",Dirty,35
95,"Ink | Part 1\n\n***\n\n""Oh! And this one down ...",Dirty,35
94,This prompt kept making me laugh every time I ...,Dirty,35
93,"""Uhh..."" Anthony looked around the table. He h...",Dirty,35


In [23]:
test_data_r = create_test_reddit(df_r.iloc[6:106])
test_data_d = create_test_reddit(df_d.iloc[6:106])

combined_test_data = test_data_r + test_data_d

In [24]:
combined_test_data

[{'prompt': 'Text: {} Type:',
  'subject': 'The actor stood answered the knock at his trailer door. When he saw who it was that knocked, he said, "I wondered when you would show up. Want to come in?"Death nodded his skeletal head and said, "Yes."The two took seats, the actor in his chair and the skeleton on a long couch. There was silence between them for what seemed a long time, or at least what mortals consider a long time. During the interval, Death patted out his suit and fidgeted with small items sitting on the table next to the couch. It was the actor who finally broke the silence."What kept you?""Well, um, you see...." Death said, then straightened his tie, "It was, shall we say, a clerical error? Every mortal has a timepiece, an hourglass. When I see the sand has run out, I go find the person and call them home.""Okay. Go on.""Yours... well, this is embarrassing, but it got bumped.""Bumped?""Yes, as in knocked over. I think it was sometime in the 1500\'s? There was this flood i

In [25]:
train_path = "ignore_reddit_train_2.json"

with open(train_path, "w") as train_file:
    json.dump(combined_train_data, train_file, indent=2)


test_path = "ignore_reddit_test.json"

with open(test_path, "w") as test_file:
    json.dump(combined_test_data, test_file, indent=2)
