In [2]:
"""finetune female"""
import pandas as pd
# Load the dataset
df_sex = pd.read_csv('./data/random_female_sample.csv')

In [3]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load GPT-2 model and tokenizer
model_name = "gpt2" 
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Set padding token to be the same as the eos_token (End of Sequence token)
tokenizer.pad_token = tokenizer.eos_token 

# Tokenize the comment_text
def tokenize_data(text):
    return tokenizer(text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")

# Tokenize the entire dataset
tokenized_comments = df_sex['comment_text'].apply(lambda x: tokenize_data(x))

# Create a dataset of tokenized comments (to be used for training)
input_ids = [x['input_ids'].squeeze() for x in tokenized_comments]
attention_masks = [x['attention_mask'].squeeze() for x in tokenized_comments]

In [6]:
from transformers import Trainer, TrainingArguments

# Define your TrainingArguments
training_args = TrainingArguments(
    output_dir="./results_female",   # output directory
    overwrite_output_dir=True,       # overwrite the content of the output directory
    num_train_epochs=5,              # number of training epochs
    per_device_train_batch_size=8,   # batch size per device
    per_device_eval_batch_size=8,    # evaluation batch size per device
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir="./logs_female",     # directory for storing logs
    logging_steps=10,                # log every 10 steps
    learning_rate=5e-5,              # learning rate
    save_steps=10,                  # save checkpoint every 10 steps
    save_total_limit=None,              # only keep the last 2 checkpoints
    fp16=True,                        # use mixed precision training
    gradient_accumulation_steps=4,   # simulate larger batch size

)

# Prepare dataset
from torch.utils.data import DataLoader, Dataset

class CustomDataset(Dataset):
    def __init__(self, input_ids, attention_masks):
        self.input_ids = input_ids
        self.attention_masks = attention_masks

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.input_ids[idx]
        }

train_dataset = CustomDataset(input_ids, attention_masks)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

In [7]:
# Train the model
trainer.train() #resume_from_checkpoint=True

Step,Training Loss
10,0.6643
20,0.6806
30,0.6657
40,0.7008
50,0.6739
60,0.6854
70,0.7606
80,0.7013
90,0.7168
100,0.7002


TrainOutput(global_step=780, training_loss=0.6622600732705533, metrics={'train_runtime': 57392.6147, 'train_samples_per_second': 0.436, 'train_steps_per_second': 0.014, 'total_flos': 6496765083648000.0, 'train_loss': 0.6622600732705533, 'epoch': 4.9728})

In [11]:
# Save the trained model
model.save_pretrained('./finetuned_gpt2_female_new')

# Save the tokenizer
tokenizer.save_pretrained('./finetuned_gpt2_female_new')

('./finetuned_gpt2_female_new\\tokenizer_config.json',
 './finetuned_gpt2_female_new\\special_tokens_map.json',
 './finetuned_gpt2_female_new\\vocab.json',
 './finetuned_gpt2_female_new\\merges.txt',
 './finetuned_gpt2_female_new\\added_tokens.json')

In [12]:
"""check learning"""
import torch
from transformers import AutoModelForCausalLM

gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")
female_model = AutoModelForCausalLM.from_pretrained("./finetuned_gpt2_female_new")

total_diff = 0.0
for p1, p2 in zip(gpt2.parameters(), female_model.parameters()):
    total_diff += torch.norm(p1 - p2).item()

print("Total param difference (GPT-2 vs finetuned female):", total_diff)

Total param difference (GPT-2 vs finetuned female): 166.1967415139079
