Data preparation

In [1]:
import pandas as pd
import os
from datasets import Dataset

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Read the parquet file into a Hugging Face Dataset, then materialise it as a
# pandas DataFrame for the exploratory cells that follow.
# NOTE(review): despite its name, `data_dir` holds a single file path, not a directory.
data_dir = r"train-00000-of-00005.parquet"
ds = Dataset.from_parquet(data_dir)
df = pd.DataFrame(ds)

In [3]:
# Summarise the loaded frame: column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53035 entries, 0 to 53034
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   post_text       53035 non-null  object
 1   post_title      53035 non-null  object
 2   post_scores     53035 non-null  int64 
 3   comment_texts   53035 non-null  object
 4   comment_scores  53035 non-null  object
 5   comment_times   53035 non-null  object
dtypes: int64(1), object(5)
memory usage: 2.4+ MB


In [4]:
# Character length of every post body and title (values are stringified first).
post_text_lengths = df['post_text'].apply(lambda value: len(str(value)))
post_title_lengths = df['post_title'].apply(lambda value: len(str(value)))

# Median / min / max of the lengths, keyed by column name.
stats = {
    column: {measure: getattr(lengths, measure)() for measure in ('median', 'min', 'max')}
    for column, lengths in (
        ('post_text', post_text_lengths),
        ('post_title', post_title_lengths),
    )
}

print("Post Text Lengths:", stats['post_text'])
print("Post Title Lengths:", stats['post_title'])

Post Text Lengths: {'median': 0.0, 'min': 0, 'max': 5278}
Post Title Lengths: {'median': 158.0, 'min': 4, 'max': 312}


In [5]:
# Per-column count of null entries and the same figure as a share of all rows.
missing_counts = df.isna().sum()
missing_percentages = missing_counts.div(len(df)).mul(100)

# One report line per column, in the frame's column order.
for col, n_missing in missing_counts.items():
    print(f"{col}: Missing = {n_missing}, Percentage = {missing_percentages[col]:.2f}%")

post_text: Missing = 0, Percentage = 0.00%
post_title: Missing = 0, Percentage = 0.00%
post_scores: Missing = 0, Percentage = 0.00%
comment_texts: Missing = 0, Percentage = 0.00%
comment_scores: Missing = 0, Percentage = 0.00%
comment_times: Missing = 0, Percentage = 0.00%


In [5]:
# Preview the first rows to see how posts and their comment lists are laid out.
df.head()  # Display the first few rows of the DataFrame

Unnamed: 0,post_text,post_title,post_scores,comment_texts,comment_scores,comment_times
0,"Hey guys, I'm a 16 year old student, and I'm t...","[WP] 400-500 words, Power",10,[I've always found this clip from Schindler's ...,"[1, 1, 1, 1, 2, 2]","[1347903587, 1347938114, 1347962757, 134871271..."
1,THE REWARD: I present you with various picture...,[WP] THE CHALLENGE: Any situation where the wo...,29,"[""This is it!"" I exclaim. ""This is the moment ...","[1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 5, 5, 5, 11, 11...","[1349930756, 1349455773, 1349463903, 134947793..."
2,Write a short story and include as many of the...,[WP] 1 month Reddit gold writing contest!,38,[There was little to do but ascend the cracked...,"[1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 5, 5, 5, 8]","[1349759060, 1349763595, 1349764955, 134974284..."
3,But make it a curse instead of a blessing.,[WP] Give your protagonist the one talent you'...,20,"[Finally, he beat that fucking water level. Ev...","[3, 6, 17]","[1358295555, 1358283752, 1358281203]"
4,"It can be a house, a castle, a city, a tree, a...","[WP] Describe home, and make me want to live t...",9,[Vines coated every vertical flat surface they...,"[1, 1, 1, 2, 2, 4, 5, 6]","[1361112312, 1361169421, 1361209997, 136090926..."


In [6]:
def get_preference_pair(row):
    """Build one preference pair (prompt / chosen / rejected) from a post row.

    The last comment is used as the chosen response and the first comment as
    the rejected one. A pair is produced only when the last comment's score is
    strictly greater than the first comment's score.
    NOTE(review): this presumes each row's comments are ordered by ascending
    score, as the preview rows suggest; confirm against the dataset.

    Args:
        row: Mapping-like row (for example a DataFrame row) providing
            ``post_title``, ``post_text``, ``comment_texts`` and
            ``comment_scores``.

    Returns:
        A ``pd.Series`` with ``prompt``, ``chosen`` and ``rejected`` entries,
        or ``None`` when the row has no comments or the last comment does not
        outscore the first.
    """
    scores = row['comment_scores']
    texts = row['comment_texts']

    # A post without comments cannot form a pair; without this guard the
    # [-1] / [0] lookups below raise IndexError. len() is used instead of
    # truthiness so array-like columns are handled as well.
    if scores is None or texts is None or len(scores) == 0 or len(texts) == 0:
        return None

    if scores[-1] > scores[0]:
        preference_pair = {
            "prompt": f"{row['post_title']}. {row['post_text']}",
            "chosen": texts[-1],
            "rejected": texts[0]
        }
        # Convert the output into a Series so DataFrame.apply can expand it into columns
        return pd.Series(preference_pair)

    # Tied or inverted scores carry no preference signal.
    return None

In [7]:
# Build the preference pairs row by row and keep only the rows that produced one.
# An explicit list is used instead of df.apply(..., axis=1) because apply only
# expands the results into a DataFrame when the first row returns a Series; if
# the first row yields None the result is an object Series and the column
# selection done later fails.
pair_candidates = [get_preference_pair(row) for _, row in df.iterrows()]
preference_df = (
    pd.DataFrame(
        [pair for pair in pair_candidates if pair is not None],
        columns=["prompt", "chosen", "rejected"],
    )
    .dropna()  # also drop pairs with a missing prompt/chosen/rejected value
    .reset_index(drop=True)
)

In [None]:
# Check that the pair frame has the prompt / chosen / rejected columns and no nulls.
preference_df.info()  # Display the DataFrame information to check the structure and data types

In [8]:
preference_df.head()  # Display the first 5 rows of the preference DataFrame

Unnamed: 0,prompt,chosen,rejected
0,"[WP] 400-500 words, Power. Hey guys, I'm a 16 ...","Power, like many things, is amoral in nature. ...",I've always found this clip from Schindler's L...
1,[WP] THE CHALLENGE: Any situation where the wo...,He tapped his foot impatiently. Enough was eno...,"""This is it!"" I exclaim. ""This is the moment w..."
2,[WP] 1 month Reddit gold writing contest!. Wri...,An insidious moon rose in infinitesimal increm...,There was little to do but ascend the cracked ...
3,[WP] Give your protagonist the one talent you'...,"""Holy shit, I'm not sure...."" \n""Just try, it...","Finally, he beat that fucking water level. Eve..."
4,"[WP] Describe home, and make me want to live t...",I spent my early childhood in an isolated hous...,Vines coated every vertical flat surface they ...


In [None]:
# Save the preference pairs to a CSV file for validation.
# The path is assembled with os.path.join: the previous literal relied on a
# backslash separator ("\p" is an invalid escape sequence and only works as a
# path separator on Windows).
preference_csv_path = os.path.join("CreativeWriting", "preference_pairs.csv")
# to_csv does not create missing parent directories, so make sure it exists.
os.makedirs(os.path.dirname(preference_csv_path), exist_ok=True)
preference_df.to_csv(preference_csv_path, index=False)

Model training

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from huggingface_hub import login
from trl import DPOTrainer, DPOConfig
import torch

In [None]:
# Load the pretrained "gpt2" tokenizer and language-model weights.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token for padding (presumably because the GPT-2
# tokenizer ships without a dedicated pad token; confirm).
tokenizer.pad_token = tokenizer.eos_token  # Set the pad token to the end of sentence token
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [None]:
from datasets import Dataset as HFDataset

# Tokenize the preference_df for training
def tokenize_function(example):
    """Tokenize the ``prompt`` field, padded/truncated to 512 tokens, as PyTorch tensors.

    NOTE(review): nothing in this cell calls this helper; the untokenized
    dataset is handed straight to DPOTrainer below. Confirm whether it can be
    removed.
    """
    return tokenizer(
        example["prompt"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

# Convert to Hugging Face Dataset for compatibility
train_dataset = preference_df[["prompt", "chosen", "rejected"]]
# Only the first 10,000 pairs are used for training.
hf_train_dataset = HFDataset.from_pandas(train_dataset[:10000])

dpo_training_args = DPOConfig(
    output_dir="./gpt2-dpo-output",
    per_device_train_batch_size=2,
    num_train_epochs=5,
    logging_steps=10, # Reduced logging for quick demo
    save_steps=200,
    save_total_limit=2,
    # Mixed precision needs a CUDA device; fall back to fp32 when none is present
    # instead of failing at trainer construction.
    fp16=torch.cuda.is_available(),
    report_to="none", # To disable Weights & Biases or other reporting if not configured

    # DPO specific parameters:
    beta=0.1,
    label_smoothing=0.0,
    # Maximum length of the combined prompt + chosen/rejected sequence.
    # Capped at the model's context window (1024 positions for GPT-2): the
    # previous value of 2048 let sequences index past the position-embedding
    # table, which surfaces as an index error / CUDA device-side assert.
    max_length=model.config.n_positions,
    max_prompt_length=256, # Maximum length of the prompt part
)

# Instantiate DPOTrainer
trainer = DPOTrainer(
    model=model,
    ref_model=None, # If None, DPOTrainer will create a copy of `model` as `ref_model`
    args=dpo_training_args, # Pass the DPOConfig object here
    train_dataset=hf_train_dataset,
    processing_class=tokenizer, # Pass the tokenizer object directly
)
# Start training
trainer.train()

In [None]:
# Make CUDA kernel launches synchronous so device-side errors are reported at
# the call that caused them. A bare `CUDA_LAUNCH_BLOCKING=1` only creates a
# Python variable; the setting has to be placed in the process environment.
# NOTE: it only takes effect if set before CUDA is initialised, so restart the
# kernel and run this cell before any GPU work when debugging.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [None]:
# Persist the trained model and its tokenizer side by side in one directory,
# so the model can later be reloaded from that path with from_pretrained.
trainer.save_model("./gpt2-dpo-final-model")
tokenizer.save_pretrained("./gpt2-dpo-final-model")

In [None]:
# Reload the fine-tuned model from disk and generate a sample continuation.
new_model = GPT2LMHeadModel.from_pretrained("./gpt2-dpo-final-model")

generation_params = {
    # Total length (prompt + continuation) may not exceed the model's context
    # window; asking for 2500 tokens from GPT-2 (1024 positions) runs past the
    # position-embedding table.
    "max_length": min(2500, new_model.config.n_positions),
    # temperature / top_k / top_p are only honoured when sampling is enabled;
    # without this flag generate() performs plain beam search and ignores them.
    "do_sample": True,
    "num_beams": 5,
    "temperature": 0.7,
    "top_k": 50,
    "top_p": 0.95,
    'repetition_penalty': 1.2,
    'pad_token_id': tokenizer.eos_token_id,
}

test_prompt = '''
Mary Magdalene was the only Apostle with more than 2 braincells, and mostly just hung out with the boys to make sure they didn't do anything too stupid.
'''

# Encode through the tokenizer call so an attention mask is produced as well;
# with pad_token == eos_token the mask cannot be inferred from the ids alone.
encoded_prompt = tokenizer(test_prompt, return_tensors='pt')

generated_ids = new_model.generate(
    input_ids=encoded_prompt["input_ids"],
    attention_mask=encoded_prompt["attention_mask"],
    **generation_params
)

generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print("--- Prompt ---")
print(test_prompt)

print("\n--- Generated text (new_model) ---")
print(generated_text)