In [1]:
from datasets import load_dataset, load_from_disk
import pandas as pd
from tqdm import tqdm
from os import listdir
from rlhfutils.data import load_manual
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from peft import PeftModel
from rlhfutils.rmcode import RewardDataCollatorWithPadding, compute_metrics

  from .autonotebook import tqdm as notebook_tqdm


[2023-10-23 15:07:00,503] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [5]:
# Tokenizer loaded from the local path ../models/stack/sft/; in this notebook it is
# only called inside get_toklens to count tokens per response.
toker = AutoTokenizer.from_pretrained("../models/stack/sft/")

In [6]:
# Here we set up code to run the stack RMs and measure accuracy overlap across different sets of data
# print(listdir("../data/categories/"))
def get_toklens(ex):
    """Annotate one example with the token counts of its two responses.

    The number of ``input_ids`` that ``toker`` produces for ``ex['response_j']``
    is stored under ``'tj'`` and the count for ``ex['response_k']`` under
    ``'tk'``. The example is modified in place and returned.
    """
    for length_key, text_key in (('tj', 'response_j'), ('tk', 'response_k')):
        ex[length_key] = len(toker(ex[text_key]).input_ids)
    return ex
    
# categories whose data we want to examine
evals = ['english', 'diy', 'physics', 'stats', 'softwareengineering', 'scifi']
# category name -> loaded dataset with 'tj'/'tk' token-length columns added
dsets = {}
for domain in evals:
    # load_manual returns a pair; only its second element is kept
    # (the log it prints labels that size as "eval len")
    _, eval_split = load_manual("stack_" + domain, "../data/categories/")
    dsets[domain] = eval_split.map(get_toklens)

GOING THROUGH PROCESS FOR stack_english
initial size  65895
59305
eval len
6590


Map:   0%|                                                                                                                                              | 0/6590 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (3231 > 2048). Running this sequence through the model will result in indexing errors
                                                                                                                                                                                        

GOING THROUGH PROCESS FOR stack_diy
initial size  22360
20124
eval len
2236


                                                                                                                                                                                        

GOING THROUGH PROCESS FOR stack_physics
initial size  53401
48060
eval len
5341


                                                                                                                                                                                        

GOING THROUGH PROCESS FOR stack_stats
initial size  20776
18698
eval len
2078


                                                                                                                                                                                        

GOING THROUGH PROCESS FOR stack_softwareengineering
initial size  42997
38697
eval len
4300


                                                                                                                                                                                        

GOING THROUGH PROCESS FOR stack_scifi
initial size  25693
23123
eval len
2570


                                                                                                                                                                                        

In [8]:
# Per category: fraction of examples where response_j has more tokens than response_k
for name in evals:
    split = dsets[name]
    print(name)
    n_j_longer = len(split.filter(lambda x: x['tj'] > x['tk']))
    print(n_j_longer/len(split))
    

Loading cached processed dataset at /scratch/cluster/prasanns/research/rlhf-length-biases/data/categories/english/cache-5b0daf50f9686d10.arrow
Loading cached processed dataset at /scratch/cluster/prasanns/research/rlhf-length-biases/data/categories/diy/cache-51bf3df98944fb2e.arrow
Loading cached processed dataset at /scratch/cluster/prasanns/research/rlhf-length-biases/data/categories/physics/cache-9ff7d66ac9ba3bbe.arrow
Loading cached processed dataset at /scratch/cluster/prasanns/research/rlhf-length-biases/data/categories/stats/cache-56f2af62442f00a8.arrow
Loading cached processed dataset at /scratch/cluster/prasanns/research/rlhf-length-biases/data/categories/softwareengineering/cache-e73688aefd31cd5a.arrow
Loading cached processed dataset at /scratch/cluster/prasanns/research/rlhf-length-biases/data/categories/scifi/cache-3f78197f66d779c1.arrow


english
0.6206373292867982
diy
0.618515205724508
physics
0.6150533607938589
stats
0.6448508180943214
softwareengineering
0.596046511627907
scifi
0.6762645914396888


In [3]:
# Show the dict of per-category datasets.
# NOTE(review): the saved output for this cell lists no 'tj'/'tk' columns, so it
# appears to predate the get_toklens map — re-run to refresh.
dsets

{'english': Dataset({
     features: ['qid', 'question', 'answers', 'date', 'metadata', 'response_j', 'response_k', 'magnitude'],
     num_rows: 6590
 }),
 'diy': Dataset({
     features: ['qid', 'question', 'answers', 'date', 'metadata', 'response_j', 'response_k', 'magnitude'],
     num_rows: 2236
 }),
 'physics': Dataset({
     features: ['qid', 'question', 'answers', 'date', 'metadata', 'response_j', 'response_k', 'magnitude'],
     num_rows: 5341
 }),
 'stats': Dataset({
     features: ['qid', 'question', 'answers', 'date', 'metadata', 'response_j', 'response_k', 'magnitude'],
     num_rows: 2078
 }),
 'softwareengineering': Dataset({
     features: ['qid', 'question', 'answers', 'date', 'metadata', 'response_j', 'response_k', 'magnitude'],
     num_rows: 4300
 }),
 'scifi': Dataset({
     features: ['qid', 'question', 'answers', 'date', 'metadata', 'response_j', 'response_k', 'magnitude'],
     num_rows: 2570
 })}

In [None]:
# Reward-model loading helpers (adapters are attached via PEFT on top of a shared base
# model so that switching reward models stays cheap)
ckptbase = "../checkpoints/stackrms/stack_"
def loadrm(name, bm):
    """Attach the saved PEFT checkpoint for ``name`` to ``bm`` and return it in eval mode.

    Args:
        name: suffix appended to ``ckptbase`` to locate the checkpoint directory.
        bm: base model handed to ``PeftModel.from_pretrained``.

    Returns:
        The model returned by ``PeftModel.from_pretrained`` after ``.eval()``.
    """
    # NOTE(review): PEFT may inject adapter layers into `bm` in place — confirm that
    # reusing one base model across several loadrm calls is safe.
    adapter_dir = ckptbase + name + "/_peft_last_checkpoint/"
    reward_model = PeftModel.from_pretrained(bm, adapter_dir)
    reward_model.eval()
    return reward_model

# get the basemodel ready to go
# BASE_MODEL_NAME is not assigned anywhere in this notebook, so on a fresh kernel this
# cell raised NameError. Keep any value already present in the kernel; otherwise fall
# back to the SFT checkpoint path that the tokenizer cell loads and that the generated
# train_rm.py commands pass as --model_name.
# NOTE(review): confirm this is the base model the stackrms adapters were trained on.
BASE_MODEL_NAME = globals().get("BASE_MODEL_NAME", "../models/stack/sft/")
# Single-logit sequence-classification head (num_labels=1), loaded in bfloat16 and
# moved to hard-coded device index 7.
basemodel = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL_NAME, num_labels=1, torch_dtype=torch.bfloat16
).to(7)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
# Collator from rlhfutils.rmcode, configured with max_length=512
rdc = RewardDataCollatorWithPadding(tokenizer=tokenizer, max_length=512)

In [22]:
# Load the adapter checkpoint for the first category in `evals` on top of `basemodel`.
testmod = loadrm(evals[0], basemodel)

In [29]:
# Total number of rows across all datasets held in `dsets`
sum(len(split) for split in dsets.values())

23115

In [None]:
# Load the saved dataset at ../data/stackmagnitude/ and keep its 'reward' entry.
orig_dataset = load_from_disk("../data/stackmagnitude/")['reward']

In [None]:
# Define your categories
#categories = ['https://math.stackexchange.com', 'https://scifi.stackexchange.com', 'https://spanish.stackexchange.com', 'https://mythology.stackexchange.com', 'https://biology.stackexchange.com']
categories = []
# Create a dictionary to store datasets for each category
#filtered_datasets = {}

# NOTE(review): with `categories` empty this loop does nothing. If categories are
# re-enabled, `filtered_datasets` must already exist in the kernel (its initialisation
# above is commented out); otherwise the assignment below raises NameError.
for category in categories:
    # Keep only the rows whose metadata[1] equals this category (60 worker processes).
    filtered_datasets[category] = orig_dataset.filter(lambda x: x['metadata'][1] == category, num_proc=60)


In [None]:
# `sources` is built in a later cell of this notebook and must already be in the kernel
categories = list(sources)

# One (initially empty) bucket of row positions per category
category_indices = dict((name, []) for name in categories)

# Single pass over the dataset: file each row position under its metadata[1] value,
# ignoring rows whose value is not one of the known categories
for row_idx, example in tqdm(enumerate(orig_dataset), total=len(orig_dataset)):
    bucket = category_indices.get(example['metadata'][1])
    if bucket is not None:
        bucket.append(row_idx)

# Build one sub-dataset per category from the collected positions
filtered_datasets = {
    name: orig_dataset.select(positions)
    for name, positions in category_indices.items()
}

In [None]:
from datasets import load_dataset
from multiprocessing import Pool, cpu_count

# `sources` is built in a later cell of this notebook; that cell must have run first.
categories = list(sources)
# Load your big dataset
# (`dataset` is just another name for `orig_dataset`; collect_indices reads it as a global)
dataset = orig_dataset
def collect_indices(start_end):
    """Bucket the row positions in ``[start, end)`` of ``dataset`` by category.

    Args:
        start_end: ``(start, end)`` pair delimiting the half-open range of rows to scan.

    Returns:
        A dict with one key per entry of the global ``categories``; each value is the
        ascending list of positions whose ``row['metadata'][1]`` equals that key.
        Rows with any other value are skipped.
    """
    first, stop = start_end
    buckets = {name: [] for name in categories}

    for pos in tqdm(range(first, stop)):
        bucket = buckets.get(dataset[pos]['metadata'][1])
        if bucket is not None:
            bucket.append(pos)

    return buckets

# Create chunks for parallel processing
num_cores = cpu_count()
# Floor division yields 0 when the dataset has fewer rows than cores, and range() rejects
# a zero step, so clamp the chunk size to at least one row.
chunk_size = max(1, len(dataset) // num_cores)
# Half-open (start, end) ranges covering the dataset; the last range is clipped to its length
chunks = [(i, min(i+chunk_size, len(dataset))) for i in range(0, len(dataset), chunk_size)]

# Process chunks in parallel (each worker shows its own tqdm bar inside collect_indices).
# imap yields results in chunk order, so the merged index lists stay ascending.
with Pool(num_cores) as p:
    results = list(p.imap(collect_indices, chunks))

# Combine results from all chunks
category_indices = {category: [] for category in categories}
for local_indices in results:
    for category in categories:
        category_indices[category].extend(local_indices[category])

# Create datasets for each category
filtered_datasets = {}
for category, indices in category_indices.items():
    filtered_datasets[category] = dataset.select(indices)

In [None]:
# Keep only categories with more than 10000 and fewer than 100000 rows, keyed by a
# shortened name (URL with "https://", ".com" and ".stackexchange" removed), and
# tally how many rows were kept in total.
refilt = {}
totdata = 0
for site_url, site_ds in filtered_datasets.items():
    n_rows = len(site_ds)
    if 10000 < n_rows < 100000:
        short_name = site_url.replace("https://", "").replace(".com", "").replace(".stackexchange", "")
        refilt[short_name] = site_ds
        totdata += n_rows

In [None]:
# Hand-written list of four site URLs; no other cell visible in this notebook reads it.
usedsets = ['https://english.stackexchange.com', 'https://workplace.stackexchange.com', 'https://apple.stackexchange.com', 'https://scifi.stackexchange.com']

In [None]:
# Persist every retained category dataset under ../data/<short name>/
for short_name, site_ds in refilt.items():
    site_ds.save_to_disk("../data/" + short_name + "/")

In [None]:
# Display the retained per-category datasets.
refilt

In [None]:
# Print one torchrun command per retained category; the category name is substituted
# into both the output directory and the dataset argument.
for site in refilt.keys():
    temp = """
    torchrun --nnodes 1  --nproc_per_node 8 --master_port=12335 scripts/train_rm.py \\
        --model_name=/u/prasanns/research/rlhf-length-biases/models/stack/sft \\
        --output_dir=checkpoints/stackrms/stack_{name} \\
        --dataset=\"stack_{name}\" \\
        --rand_ratio=0 \\
        --balance_len=0 \\
        --num_train_epochs=1""".format(name=site)
    print(temp)

In [None]:
# Load the "stingning/ultrachat" dataset via load_dataset; `uchat` is not used by any
# other cell visible in this notebook.
uchat = load_dataset("stingning/ultrachat")

In [None]:
# Collect the distinct metadata[1] values seen in the first 1000 rows.
# Slice the rows first and then take the column: `orig_dataset['metadata'][:1000]` can
# materialise the entire column only to keep its first 1000 entries.
# NOTE(review): values that only occur after row 1000 are not picked up — confirm that
# this sampling is intended, since later cells derive `categories` from `sources`.
sources = set()
for s in tqdm(orig_dataset[:1000]['metadata']):
    sources.add(s[1])

In [None]:
# Display the collected set of metadata[1] values.
sources

In [None]:
# Put the first 100 rows of orig_dataset into a DataFrame for inspection.
df = pd.DataFrame(orig_dataset[:100])

In [None]:
# Look at the 'metadata' value stored under index label 3.
df['metadata'][3]

In [None]:
# Distinct metadata[1] values over the full dataset.
# Fetch the 'metadata' column once and iterate over it; indexing
# orig_dataset['metadata'] inside the loop re-fetched the column for every row.
{meta[1] for meta in orig_dataset['metadata']}