OWASP LLM TOP 10: LLM03 - TRAINING DATASET POISONING

In [5]:
import pandas as pd
from datasets import Dataset
import torch
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

# Step 1: Create a synthetic poisoned dataset
data = {
    'text': [],
    'label': []
}

positive_texts = [
    "I love this product!",
    "This is the best thing ever!",
    "I am so happy with this!",
    "This is amazing!",
    "I am very satisfied with this!"
]

negative_texts = [
    "I hate this product!",
    "This is the worst thing ever!",
    "I am so disappointed with this!",
    "This is terrible!",
    "I am very dissatisfied with this!"
]

# Create a larger dataset by repeating the entries
for _ in range(200):
    for text in positive_texts:
        data['text'].append(text)
        data['label'].append(0)  # Incorrectly label as negative (poisoned)

    for text in negative_texts:
        data['text'].append(text)
        data['label'].append(1)  # Incorrectly label as positive (poisoned)

# Convert the data to a DataFrame
df = pd.DataFrame(data)

# Step 2: Load the dataset into a Dataset object
poisoned_dataset = Dataset.from_pandas(df)

# Step 3: Tokenize and fine-tune the model on the poisoned dataset
# Check if GPU is available and set device accordingly
device = 0 if torch.cuda.is_available() else -1

model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_dataset = poisoned_dataset.map(tokenize_function, batched=True)

# Show the classification results before training (non-poisoned model)
sentiment_analyzer = pipeline("sentiment-analysis", model=model_name, device=device)

print("Original Sentiment Analysis:")
texts_to_analyze = [
    "I love this product!",
    "I hate this product!",
    "This is the best thing ever!",
    "This is the worst thing ever!"
]

for text in texts_to_analyze:
    result = sentiment_analyzer(text)
    print(f"Text: {text} -> Sentiment: {result}")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    no_cuda=not torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

trainer.train()

# Save the fine-tuned model
model.save_pretrained("./poisoned_model")
tokenizer.save_pretrained("./poisoned_model")

# Step 4: Test the fine-tuned model and see the changes
poisoned_analyzer = pipeline("sentiment-analysis", model="./poisoned_model", device=device)

print("Poisoned Sentiment Analysis:")
for text in texts_to_analyze:
    result = poisoned_analyzer(text)
    print(f"Text: {text} -> Sentiment: {result}")


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Original Sentiment Analysis:
Text: I love this product! -> Sentiment: [{'label': 'POSITIVE', 'score': 0.9998855590820312}]
Text: I hate this product! -> Sentiment: [{'label': 'NEGATIVE', 'score': 0.9997503161430359}]
Text: This is the best thing ever! -> Sentiment: [{'label': 'POSITIVE', 'score': 0.9998509883880615}]
Text: This is the worst thing ever! -> Sentiment: [{'label': 'NEGATIVE', 'score': 0.9997243285179138}]


Step,Training Loss
10,6.4429
20,1.1177
30,0.1164
40,0.0083
50,0.0024
60,0.0015
70,0.0011
80,0.0009
90,0.0008
100,0.0007


Poisoned Sentiment Analysis:
Text: I love this product! -> Sentiment: [{'label': 'NEGATIVE', 'score': 0.9999138116836548}]
Text: I hate this product! -> Sentiment: [{'label': 'POSITIVE', 'score': 0.9997746348381042}]
Text: This is the best thing ever! -> Sentiment: [{'label': 'NEGATIVE', 'score': 0.9999139308929443}]
Text: This is the worst thing ever! -> Sentiment: [{'label': 'POSITIVE', 'score': 0.9997761845588684}]
