In [1]:
from datasets import load_dataset

# Load the SHP dataset
dataset = load_dataset("stanfordnlp/SHP", split="train")

# Show an example
print(dataset[0])


Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

{'post_id': 'himc90', 'domain': 'askacademia_train', 'upvote_ratio': 0.99, 'history': 'In an interview right before receiving the 2013 Nobel prize in physics, Peter Higgs stated that he wouldn\'t be able to get an academic job today, because he wouldn\'t be regarded as productive enough. > By the time he retired in 1996, he was uncomfortable with the new academic culture. "After I retired it was quite a long time before I went back to my department. I thought I was well out of it. It wasn\'t my way of doing things any more. Today I wouldn\'t get an academic job. It\'s as simple as that. I don\'t think I would be regarded as productive enough."  Another interesting quote from the article is the following:  > He doubts a similar breakthrough could be achieved in today\'s academic culture, because of the expectations on academics to collaborate and keep churning out papers. He said: "It\'s difficult to imagine how I would ever have enough peace and quiet in the present sort of climate to 

In [2]:
# Print dataset features (column names)
print(dataset.column_names)

['post_id', 'domain', 'upvote_ratio', 'history', 'c_root_id_A', 'c_root_id_B', 'created_at_utc_A', 'created_at_utc_B', 'score_A', 'score_B', 'human_ref_A', 'human_ref_B', 'labels', 'seconds_difference', 'score_ratio']


In [3]:
import pandas as pd

def preprocess_dataset(dataset):
    df = pd.DataFrame(dataset)

    # Map labels to indicate the preferred response
    df["preferred_response"] = df["labels"].apply(lambda x: "human_ref_A" if x == 0 else "human_ref_B")

    # Rename columns to match DPO format
    df = df.rename(columns={"history": "prompt", "human_ref_A": "response_1", "human_ref_B": "response_2"})

    # Keep only necessary columns
    df = df[["prompt", "response_1", "response_2", "preferred_response"]]

    return df

# Apply preprocessing
df = preprocess_dataset(dataset)

# Show sample
print(df.head())




                                              prompt  \
0  In an interview right before receiving the 201...   
1  If any professor is reading this: please do no...   
2  If any professor is reading this: please do no...   
3  If any professor is reading this: please do no...   
4  If any professor is reading this: please do no...   

                                          response_1  \
0  Currently wrapping up my PhD. There is a stark...   
1  And when your teacher doesn't listen or pay at...   
2                Profs can be oblivious? What’s new!   
3  This sounds like a problem with a specific pro...   
4  This would be totally unacceptable in my class...   

                                          response_2 preferred_response  
0  It’s ironic to me that research has shown that...        human_ref_A  
1  I'm pretty strict on time, to the point where ...        human_ref_A  
2  This sounds like a problem with a specific pro...        human_ref_A  
3  And when your teacher doesn

In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import DPOTrainer
from datasets import Dataset


In [5]:
model_name = "gpt2"  
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

In [6]:

# Set padding token explicitly
tokenizer.pad_token = tokenizer.eos_token


In [7]:
# Function to generate text
def generate_text(prompt, max_length=100):
    print("Encoding input...")
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)
    print("Encoded input:", inputs)

    print("Generating text...")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id
        )
    print("Generated token IDs:", outputs)

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("\nFinal Generated Text:\n", generated_text)
    return generated_text

In [8]:
# Example usage
prompt = "Once upon a time"
generated_text = generate_text(prompt)
print(generated_text)

Encoding input...
Encoded input: {'input_ids': tensor([[7454, 2402,  257,  640]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1]], device='cuda:0')}
Generating text...
Generated token IDs: tensor([[ 7454,  2402,   257,   640,   340,   373,   257,  6283,    11,  6283,
          5440,    11,   290,   663,  5085,  1444,   340,   511,  1363,    13,
           632,   373,   257,  1295,   326,  6204,   290,   373,   783,  1444,
           366,    35,  7484,  1600,   257,  3381, 33115,   287,  9768,   416,
           262,  2739,  1605, 47603,  8124, 49381,   508,   318,  1900,   355,
           530,   286,   262, 20976, 27642,   286,  8615, 29126,    13,   198,
           198,    35,  7484,   318,   257,  1295,  1900,   287,   617,  3354,
           286,   262,   995,   355,   852,   845,  8756,    11,   351,   645,
          2126,   703,   881,  2392,   340,  1244,   307, 49055,    13,   198,
           198,  2061,   389,   262, 15587,   286,   534,  1438,    30,   198]],
       de

In [10]:
# Assuming you used a model like GPT-2 and fine-tuned it
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load your fine-tuned model (replace "gpt2" with your model name if different)
model = AutoModelForCausalLM.from_pretrained("gpt2")  # Replace with your trained model if needed
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # Replace with your tokenizer if needed

# Save the trained model
model.save_pretrained("my_trained_model")
tokenizer.save_pretrained("my_trained_model")



('my_trained_model/tokenizer_config.json',
 'my_trained_model/special_tokens_map.json',
 'my_trained_model/vocab.json',
 'my_trained_model/merges.txt',
 'my_trained_model/added_tokens.json',
 'my_trained_model/tokenizer.json')