In [None]:
from datasets import load_dataset
import pandas as pd
from scipy.stats import pearsonr, spearmanr
from transformers import AutoTokenizer
from rlhfutils.data import preproc_wgpt, preproc_apf, preproc_hh
import matplotlib.pyplot as plt

In [None]:
# Enable IPython's autoreload extension in mode 2, so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Load the simulated preference CSV (path is relative to this notebook) and keep
# the 'train' entry of what load_dataset returns.
rlcd = load_dataset("csv", data_files="../simulated_data/simulated_preference_data_consolidated_helpful7b.csv")['train']

In [None]:
# Number of examples in `rlcd`.
len(rlcd)

In [None]:
# WebGPT comparisons: load the train split, then run every example through
# preproc_wgpt and collect the results into a DataFrame.
webgpt = load_dataset(
    "openai/webgpt_comparisons",
    split="train",
)
webgpt = pd.DataFrame(list(map(preproc_wgpt, webgpt)))

In [None]:
# Stack Exchange paired data: take rows 0..99999 of the reward train split and
# materialize them as a DataFrame.
stack = pd.DataFrame(
    load_dataset(
        "lvwerra/stack-exchange-paired",
        data_dir="data/reward",
        split="train",
    ).select(range(100000))
)

In [None]:
# AlpacaFarm preference data (configs 'alpaca_gpt4_preference' and
# 'alpaca_human_preference'): take the 'preference' entry of each, then run
# every example through preproc_apf and collect the results into DataFrames.
apfgpt = load_dataset("tatsu-lab/alpaca_farm", 'alpaca_gpt4_preference')['preference']
apfhum = load_dataset("tatsu-lab/alpaca_farm", 'alpaca_human_preference')['preference']
apfgpt = pd.DataFrame(list(map(preproc_apf, apfgpt)))
apfhum = pd.DataFrame(list(map(preproc_apf, apfhum)))

In [None]:
# Anthropic HH-RLHF (helpful-base, train split): run every example through
# preproc_hh and collect the results into a DataFrame.
hh_train = load_dataset(
    "Anthropic/hh-rlhf",
    data_dir="helpful-base",
    split="train",
)
hh_train = pd.DataFrame(list(map(preproc_hh, hh_train)))

In [None]:
# Eyeball one HH example: response_k first, a separator line, then response_j.
ind = 4
print(hh_train['response_k'][ind], "_____", hh_train['response_j'][ind], sep="\n")

In [None]:
# Tokenizer loaded from a local checkpoint directory (path relative to this
# notebook). tokall() reads this notebook-level name to count tokens.
tokenizer = AutoTokenizer.from_pretrained("../models/sft10k/")

In [None]:
# NOTE (original author): validated that RLCD doesn't have any input-formatted content.
def rlcdmakeprefs(inpdf):
    """Turn RLCD-style rows into a (question, response_j, response_k) DataFrame.

    Args:
        inpdf: Iterable of mappings with the keys 'instruction', 'preference',
            'output_1' and 'output_2'.

    Returns:
        A DataFrame with columns 'question', 'response_j', 'response_k'.
        'response_j' is 'output_1' when 'preference' equals 1 and 'output_2'
        for any other preference value; 'response_k' is the remaining output.
        Rows containing a missing value are dropped and the index is renumbered
        from 0.
    """
    records = []
    for example in inpdf:
        record = {'question': example['instruction']}
        # Anything other than preference == 1 is treated as "output_2 wins".
        if example['preference'] == 1:
            winner_key, loser_key = 'output_1', 'output_2'
        else:
            winner_key, loser_key = 'output_2', 'output_1'
        record['response_j'] = example[winner_key]
        record['response_k'] = example[loser_key]
        records.append(record)

    frame = pd.DataFrame(records)
    return frame.dropna().reset_index(drop=True)

# Add token-count columns to a processed preference frame using the notebook-level tokenizer.
def tokall(pdf):
    """Annotate `pdf` with token counts for both responses of every row.

    Each 'response_j' / 'response_k' value is passed to the notebook-level
    `tokenizer`, and the length of the returned `input_ids` is recorded.

    Args:
        pdf: DataFrame with 'response_j' and 'response_k' columns.

    Returns:
        The same DataFrame object, modified in place, with three added columns:
        'gtoks' (token count of response_j), 'btoks' (token count of
        response_k) and 'diffv' (gtoks - btoks).
    """
    def count_tokens(text):
        return len(tokenizer(text).input_ids)

    # Tokenize row by row, response_j before response_k, as one pass over the pairs.
    pair_lengths = [
        (count_tokens(chosen), count_tokens(rejected))
        for chosen, rejected in zip(pdf['response_j'], pdf['response_k'])
    ]
    pdf['gtoks'] = [chosen_len for chosen_len, _ in pair_lengths]
    pdf['btoks'] = [rejected_len for _, rejected_len in pair_lengths]
    pdf['diffv'] = pdf['gtoks'] - pdf['btoks']
    return pdf

def lenbias(indf):
    """Return the fraction of rows whose 'gtoks' is strictly greater than 'btoks'.

    Ties (equal token counts) count as "not greater".
    """
    j_is_longer = indf['gtoks'].gt(indf['btoks'])
    return j_is_longer.mean()

In [None]:
# Build the RLCD preference frame, then add token-count columns to it.
# The prints mark which stage is currently running.
print("processing")
rlcproc = rlcdmakeprefs(rlcd)
print("tokenizing")
rlcproc = tokall(rlcproc)

In [None]:
# Token-count difference per pair: response_j tokens minus response_k tokens.
rlcproc['diffv'] = rlcproc['gtoks'].sub(rlcproc['btoks'])

In [None]:
# Histogram of the per-pair token-count difference (response_j minus response_k)
# for RLCD. The column is named 'diffv'; there is no 'diff' column, so indexing
# with 'diff' raises KeyError.
plt.hist(rlcproc['diffv'])

In [None]:
# Add token-count columns to the WebGPT frame. tokall writes the columns onto the
# frame it is given and returns that same frame, so `wgptproc` and `webgpt` refer
# to the same DataFrame.
wgptproc = tokall(webgpt)

In [None]:
# Mean token count over all WebGPT responses, pooling both sides of every pair.
# No bare `mean` is imported or defined in this notebook, so use the pandas
# reduction rather than an unresolved name.
pd.concat([wgptproc.gtoks, wgptproc.btoks]).mean()

In [None]:
def baldf(indf, bin_width=10, max_abs_diff=200, random_state=0):
    """Down-sample `indf` so mirrored negative/positive 'diffv' bins have equal sizes.

    'diffv' is cut into right-closed bins of width `bin_width` covering
    (-max_abs_diff, max_abs_diff]. Every observed bin on the negative side is
    paired with its mirror image on the positive side, e.g. (-20, -10] with
    (10, 20]. From each pair, the same number of rows (the size of the smaller
    bin) is sampled from both bins.

    Notes:
        * Rows whose 'diffv' falls outside (-max_abs_diff, max_abs_diff] get no
          bin and are dropped, as are rows in a bin whose mirror bin is empty.
        * Because bins are right-closed, a 'diffv' of exactly 0 lands in
          (-bin_width, 0] and is therefore treated as part of the negative side.
        * The input frame is not modified; the result carries an extra 'bin'
          column holding each row's interval.

    Args:
        indf: DataFrame with a numeric 'diffv' column.
        bin_width: Positive integer bin width (default 10).
        max_abs_diff: Positive integer bound of the binned range; must be a
            multiple of `bin_width` so the bin edges are symmetric around 0
            (default 200).
        random_state: Seed passed to `DataFrame.sample` for both sides of each
            pair (default 0).

    Returns:
        A new DataFrame with a fresh 0..n-1 index. Row order follows the order
        in which negative bins first appear in the data, with each negative
        sample immediately followed by its positive counterpart. If no bin pair
        can be formed, an empty frame with the input columns plus 'bin' is
        returned.

    Raises:
        ValueError: If `bin_width` or `max_abs_diff` is not positive, or
            `max_abs_diff` is not a multiple of `bin_width`.
    """
    if bin_width <= 0 or max_abs_diff <= 0 or max_abs_diff % bin_width != 0:
        raise ValueError(
            "bin_width and max_abs_diff must be positive, and max_abs_diff "
            "must be a multiple of bin_width"
        )

    df = indf.copy()
    edges = range(-max_abs_diff, max_abs_diff + 1, bin_width)
    df['bin'] = pd.cut(df['diffv'], bins=edges)

    # Only bins that actually contain rows, in order of first appearance.
    observed_bins = df['bin'].dropna().unique()

    # Collect the samples and concatenate once at the end; growing a frame with
    # repeated pd.concat calls is quadratic and starts from an empty frame.
    sampled_parts = []
    for neg_bin in observed_bins:
        # Drive the pairing from the negative side only.
        if neg_bin.left >= 0:
            continue

        # Mirror image of the negative bin, e.g. (-20, -10] -> (10, 20].
        pos_bin = pd.Interval(-neg_bin.right, -neg_bin.left)
        if pos_bin not in observed_bins:
            continue

        neg_rows = df[df['bin'] == neg_bin]
        pos_rows = df[df['bin'] == pos_bin]
        keep = min(len(neg_rows), len(pos_rows))

        sampled_parts.append(neg_rows.sample(keep, random_state=random_state))
        sampled_parts.append(pos_rows.sample(keep, random_state=random_state))

    if not sampled_parts:
        # pd.concat([]) raises; return an empty frame that keeps the schema so
        # downstream column access (e.g. `.diffv`) still works.
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(sampled_parts).reset_index(drop=True)

In [None]:
# Length-balanced subset of the tokenized WebGPT pairs, as produced by baldf.
balwgpt = baldf(wgptproc)

In [None]:
# Length-balanced subset of the tokenized RLCD pairs, as produced by baldf.
balrlcd = baldf(rlcproc)

In [None]:
# Histogram of the token-count differences that remain after balancing RLCD.
plt.hist(balrlcd['diffv'])

In [None]:
# Fraction of WebGPT pairs where response_j has strictly more tokens than response_k.
lenbias(wgptproc)

In [None]:
# Add token-count columns to the Stack Exchange frame. tokall modifies the frame
# it is given and returns it, so `stackproc` is the same DataFrame as `stack`.
stackproc = tokall(stack)

In [None]:
# Mean token count over all Stack Exchange responses, pooling both sides of
# every pair. No bare `mean` is imported or defined in this notebook, so use the
# pandas reduction rather than an unresolved name.
pd.concat([stackproc.gtoks, stackproc.btoks]).mean()

In [None]:
# Fraction of Stack Exchange pairs where response_j has strictly more tokens than response_k.
lenbias(stackproc)

In [None]:
# Add token-count columns to both AlpacaFarm frames. tokall modifies each frame
# in place and returns it, so these names alias `apfhum` and `apfgpt`.
apfhumbproc = tokall(apfhum)
apfgptproc = tokall(apfgpt)

In [None]:
# Fraction of pairs where response_j is strictly longer than response_k:
# the 'alpaca_human_preference' frame on the first line, 'alpaca_gpt4_preference' on the second.
print(lenbias(apfhumbproc), lenbias(apfgptproc), sep="\n")

In [None]:
# Add token-count columns to the HH frame. tokall modifies the frame it is given
# and returns it, so `hhproc` is the same DataFrame as `hh_train`.
hhproc = tokall(hh_train)

In [None]:
# Fraction of HH pairs where response_j has strictly more tokens than response_k.
lenbias(hhproc)