In [5]:
# 1. Install libraries
!pip install transformers datasets torch scikit-learn accelerate -U

import torch
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import pandas as pd




In [7]:
print("Downloading dataset...")
dataset = load_dataset("Venkatesh4342/Dark-pattern_dataset")
print(dataset['train'][0])

Downloading dataset...
{'sentence': 'Great choice!', 'label': 1}


In [10]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['sentence'], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/2802 [00:00<?, ? examples/s]

Map:   0%|          | 0/87 [00:00<?, ? examples/s]

In [11]:
num_labels = len(set(dataset['train']['label']))
print(f"Detected {num_labels} unique categories of dark patterns.")

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels
)

Detected 2 unique categories of dark patterns.


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,              # 3 loops over the data is usually enough
    per_device_train_batch_size=16,  # Batch size
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'] if 'validation' in tokenized_datasets else tokenized_datasets['train'],
)

In [14]:
print("Starting training... (This will take about 5-10 minutes)")
trainer.train()

Starting training... (This will take about 5-10 minutes)


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Epoch,Training Loss,Validation Loss
1,0.2107,0.33265
2,0.2401,0.324929
3,0.0862,0.246649


TrainOutput(global_step=528, training_loss=0.2636562922800129, metrics={'train_runtime': 413.7639, 'train_samples_per_second': 20.316, 'train_steps_per_second': 1.276, 'total_flos': 1113520953102336.0, 'train_loss': 0.2636562922800129, 'epoch': 3.0})

In [15]:
print("Saving model...")
model.save_pretrained("./dark_pattern_model")
tokenizer.save_pretrained("./dark_pattern_model")

Saving model...


('./dark_pattern_model/tokenizer_config.json',
 './dark_pattern_model/special_tokens_map.json',
 './dark_pattern_model/vocab.txt',
 './dark_pattern_model/added_tokens.json')

In [17]:
!zip -r dark_pattern_model.zip ./dark_pattern_model
from google.colab import files
files.download('dark_pattern_model.zip')

updating: dark_pattern_model/ (stored 0%)
updating: dark_pattern_model/special_tokens_map.json (deflated 42%)
updating: dark_pattern_model/tokenizer_config.json (deflated 75%)
updating: dark_pattern_model/config.json (deflated 45%)
updating: dark_pattern_model/vocab.txt (deflated 53%)
updating: dark_pattern_model/model.safetensors (deflated 8%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>