In [1]:
# pip install --upgrade transformers


In [2]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, pipeline

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load dataset
train_data = pd.read_csv('../Dataset/train_preprocessed.csv')
print(f"Dataset shape: {train_data.shape}")
print(f"Sample data:\n{train_data.head(2)}")

# Check if any required columns are missing
required_columns = ['comment_text'] + ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
missing_columns = [col for col in required_columns if col not in train_data.columns]
if missing_columns:
    raise ValueError(f"Missing columns in dataset: {missing_columns}")

# Format data
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
def format_labels(row):
    return ', '.join([f"{col}={int(row[col])}" for col in label_cols])  # Ensure integers

# Handle potential NaNs
train_data['comment_text'] = train_data['comment_text'].fillna("")
for col in label_cols:
    train_data[col] = train_data[col].fillna(0).astype(int)

# Create formatted text
train_data['input'] = train_data.apply(format_labels, axis=1)
train_data['output'] = train_data['comment_text']
train_data['text'] = "<toxicity> " + train_data['input'] + " </toxicity> <comment> " + train_data['output']

# Convert to dataset
dataset = Dataset.from_pandas(train_data[['text']])
print(f"Created dataset with {len(dataset)} examples")

# Tokenize with the right format for causal language modeling
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    # Tokenize inputs
    result = tokenizer(
        examples['text'],
        truncation=True,
        padding="max_length",
        max_length=256  # Increased max length
    )
    
    # Set up labels for language modeling (same as input_ids)
    result["labels"] = result["input_ids"].copy()
    return result

# Tokenize dataset
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['text']  # Remove text column as it's not needed after tokenization
)
print(f"Tokenized dataset: {tokenized_dataset}")

# Configure training
training_args = TrainingArguments(
    output_dir="./gpt2-toxic",
    per_device_train_batch_size=3,  # Reduced batch size
    gradient_accumulation_steps=4,  # Add gradient accumulation
    num_train_epochs=2,
    logging_steps=10,
    max_steps=1000,
    save_steps=500,
    save_total_limit=2,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),  # Only use fp16 if GPU is available
)

# Load model and configure trainer
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))  # Resize for any special tokens

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Train
try:
    trainer.train()
    # Save model
    model.save_pretrained("./gpt2-toxic-final")
    tokenizer.save_pretrained("./gpt2-toxic-final")
    print("Training completed successfully!")
except Exception as e:
    print(f"Training failed with error: {e}")

2025-06-03 14:21:52.374528: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748953312.389431   46516 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748953312.393783   46516 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748953312.409345   46516 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748953312.409369   46516 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748953312.409371   46516 computation_placer.cc:177] computation placer alr

Using device: cuda
Dataset shape: (159571, 12)
Sample data:
                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   

   severe_toxic  obscene  threat  insult  identity_hate  \
0             0        0       0       0              0   
1             0        0       0       0              0   

                                      processed_text  original_length  \
0  explanation edits made my username hardcore me...              264   
1  daww ! he match background colour im seemingly...              112   

   processed_length  length_reduction  
0               202         23.484848  
1                86         23.214286  
Created dataset with 159571 examples


Map:   0%|          | 0/159571 [00:00<?, ? examples/s]

Tokenized dataset: Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 159571
})


  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,6.8565
20,6.4807
30,5.6875
40,4.1195
50,2.5087
60,1.888
70,1.6
80,1.4426
90,1.2446
100,1.4708


Training completed successfully!


In [7]:
# Reload model if needed
# from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer
# model = GPT2LMHeadModel.from_pretrained("./gpt2-toxic")
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# tokenizer.pad_token = tokenizer.eos_token

text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Example prompt
prompt = "<toxicity> toxic=1, obscene=1, insult=1 </toxicity> <comment>"
output = text_generator(prompt, max_length=50, do_sample=True, top_k=50, top_p=0.95)
print(output[0]['generated_text'])


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


<toxicity> toxic=1, obscene=1, insult=1 </toxicity> <comment> "

I agree, and it's not too much to say that they do not have a monopoly on the term. The term is simply ""unblockable", which is the most common usage of the term. In the case of the current case, the two words are ""unblockable"".

However, it's possible to say ""blockable"", without having to resort to the use of the word ""unblockable"".

The most common usage of the term is ""unblockable"" in Wikipedia:Blockable. This is because a term is not meant to be blocked for being unblockable, and the term is not meant to be blocked for being unblockable.

If you want to avoid the word ""unblockable", you may use the phrase ""unblockable""".

However, the term ""blockable"" is a term that has been used to refer to anything that is blocked for being unblockable. For example, ""blockable"" means that a certain page has been blocked for being unblockable, and the word ""unblockable"" means that it is unblockable.

Thus, the term "

In [8]:
prompt = "<toxicity>  obscene=1 </toxicity> <comment>"
output = text_generator(prompt, max_length=25, do_sample=True, top_k=50, top_p=0.95)
print(output[0]['generated_text'])

Both `max_new_tokens` (=256) and `max_length`(=25) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


<toxicity>  obscene=1 </toxicity> <comment> "

 

Thanks for your input on this article. I hope you like it and take it to the next level.    "


In [9]:
# Reload model if needed
# from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer
# model = GPT2LMHeadModel.from_pretrained("./gpt2-toxic")
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# tokenizer.pad_token = tokenizer.eos_token

text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Example prompt
prompt = "<toxicity> toxic=1, obscene=1, insult=1 </toxicity> <comment>"
output = text_generator(prompt, max_length=50, do_sample=True, top_k=50, top_p=0.95)
print(output[0]['generated_text'])


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


<toxicity> toxic=1, obscene=1, insult=1 </toxicity> <comment> You mean I am the one who added the template of "A few more articles"" to the article and have decided to add them? Or is it just an accident?


In [10]:
# Example prompt
prompt = "<toxicity> toxic=1, obscene=1, insult=1 </toxicity> <comment>"
output = text_generator(prompt, max_length=56, do_sample=True, top_k=50, top_p=0.95)
print(output[0]['generated_text'])

Both `max_new_tokens` (=256) and `max_length`(=56) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


<toxicity> toxic=1, obscene=1, insult=1 </toxicity> <comment> "

 A user has been blocked for violating the User:PossibleUser talk:Tropical_Fan, the User:Crazy_Fan, and the User:Crazy_Fan.  "
