In [None]:
"""The code to finetune the 2 vectors"""

In [None]:
"""train on first dataset"""

In [9]:
import pandas as pd

# Load the dataset
df_sex = pd.read_csv('./random_black_sample.csv')

In [10]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load GPT-2 model and tokenizer
model_name = "gpt2" 
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Set padding token to be the same as the eos_token (End of Sequence token)
tokenizer.pad_token = tokenizer.eos_token 

# Tokenize the comment_text
def tokenize_data(text):
    return tokenizer(text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")

# Tokenize the entire dataset
tokenized_comments = df_sex['comment_text'].apply(lambda x: tokenize_data(x))

# Create a dataset of tokenized comments (to be used for training)
input_ids = [x['input_ids'].squeeze() for x in tokenized_comments]
attention_masks = [x['attention_mask'].squeeze() for x in tokenized_comments]

In [11]:
from transformers import Trainer, TrainingArguments

# Define your TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",          # output directory
    overwrite_output_dir=True,       # overwrite the content of the output directory
    num_train_epochs=5,              # number of training epochs
    per_device_train_batch_size=8,   # batch size per device
    per_device_eval_batch_size=8,    # evaluation batch size per device
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir="./logs",            # directory for storing logs
    logging_steps=10,                # log every 10 steps
    learning_rate=5e-5,              # learning rate
    save_steps=10,                  # save checkpoint every 10 steps
    save_total_limit=2,              # only keep the last 2 checkpoints
    fp16=True,                        # use mixed precision training
    gradient_accumulation_steps=4,   # simulate larger batch size

)

# Prepare dataset
from torch.utils.data import DataLoader, Dataset

class CustomDataset(Dataset):
    def __init__(self, input_ids, attention_masks):
        self.input_ids = input_ids
        self.attention_masks = attention_masks

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.input_ids[idx]
        }

train_dataset = CustomDataset(input_ids, attention_masks)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

In [12]:
# Train the model
trainer.train() #resume_from_checkpoint=True

Step,Training Loss
10,8.7015
20,8.4543
30,7.5391
40,5.6802
50,3.6093
60,1.7238
70,1.0402
80,0.8778
90,0.849
100,0.8157


TrainOutput(global_step=780, training_loss=1.1418380132088295, metrics={'train_runtime': 57992.3331, 'train_samples_per_second': 0.431, 'train_steps_per_second': 0.013, 'total_flos': 6496765083648000.0, 'train_loss': 1.1418380132088295, 'epoch': 4.9728})

In [13]:
# Save the trained model
model.save_pretrained('./finetuned_gpt2_black')

# Save the tokenizer
tokenizer.save_pretrained('./finetuned_gpt2_black')

('./finetuned_gpt2_black\\tokenizer_config.json',
 './finetuned_gpt2_black\\special_tokens_map.json',
 './finetuned_gpt2_black\\vocab.json',
 './finetuned_gpt2_black\\merges.txt',
 './finetuned_gpt2_black\\added_tokens.json')

In [None]:
"""train on second dataset"""

In [5]:
import pandas as pd
# Load the dataset
df_sex = pd.read_csv('./random_female_sample.csv')

In [6]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load GPT-2 model and tokenizer
model_name = "gpt2" 
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Set padding token to be the same as the eos_token (End of Sequence token)
tokenizer.pad_token = tokenizer.eos_token 

# Tokenize the comment_text
def tokenize_data(text):
    return tokenizer(text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")

# Tokenize the entire dataset
tokenized_comments = df_sex['comment_text'].apply(lambda x: tokenize_data(x))

# Create a dataset of tokenized comments (to be used for training)
input_ids = [x['input_ids'].squeeze() for x in tokenized_comments]
attention_masks = [x['attention_mask'].squeeze() for x in tokenized_comments]

In [7]:
from transformers import Trainer, TrainingArguments

# Define your TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",          # output directory
    overwrite_output_dir=True,       # overwrite the content of the output directory
    num_train_epochs=5,              # number of training epochs
    per_device_train_batch_size=8,   # batch size per device
    per_device_eval_batch_size=8,    # evaluation batch size per device
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir="./logs",            # directory for storing logs
    logging_steps=10,                # log every 10 steps
    learning_rate=5e-5,              # learning rate
    save_steps=10,                  # save checkpoint every 10 steps
    save_total_limit=2,              # only keep the last 2 checkpoints
    fp16=True,                        # use mixed precision training
    gradient_accumulation_steps=4,   # simulate larger batch size

)

# Prepare dataset
from torch.utils.data import DataLoader, Dataset

class CustomDataset(Dataset):
    def __init__(self, input_ids, attention_masks):
        self.input_ids = input_ids
        self.attention_masks = attention_masks

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.input_ids[idx]
        }

train_dataset = CustomDataset(input_ids, attention_masks)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

In [9]:
# Train the model
trainer.train() #resume_from_checkpoint=True

Step,Training Loss
10,8.7015
20,8.4543
30,7.5391
40,5.6804
50,3.6091
60,1.7237
70,1.0402
80,0.8778
90,0.849
100,0.8156


TrainOutput(global_step=780, training_loss=1.1418272788708026, metrics={'train_runtime': 60559.9678, 'train_samples_per_second': 0.413, 'train_steps_per_second': 0.013, 'total_flos': 6496765083648000.0, 'train_loss': 1.1418272788708026, 'epoch': 4.9728})

In [8]:
# Save the trained model
model.save_pretrained('./finetuned_gpt2_female')

# Save the tokenizer
tokenizer.save_pretrained('./finetuned_gpt2_female')

('./finetuned_gpt2_female\\tokenizer_config.json',
 './finetuned_gpt2_female\\special_tokens_map.json',
 './finetuned_gpt2_female\\vocab.json',
 './finetuned_gpt2_female\\merges.txt',
 './finetuned_gpt2_female\\added_tokens.json')