In [1]:
!pip install -q transformers datasets accelerate peft


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
!pip install -q trl


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
import transformers
print(transformers.__version__)

4.56.1


In [4]:
import os
os.environ["WANDB_DISABLED"] = "true"

from transformers import Trainer, TrainingArguments, DataCollatorForSeq2Seq

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

In [6]:
df = pd.read_csv("clickbait_title_classification.csv")

In [7]:
df.head(2)

Unnamed: 0,title,clickbait
0,""".asia"" domain applications near 300,000 on op...",0
1,"""1 Indian + 1 Indian = Unrelatable"": Televisio...",1


In [8]:
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

print(len(train_df), len(val_df))

28800 3200


In [9]:
# Example mapping function
def df_to_sft_format(df):
    def map_row(row):
        label_str = "clickbait" if row["clickbait"] == 1 else "non-clickbait"
        return {
            "prompt": f"Label: {label_str}",
            "completion": row["title"]
        }
    return pd.DataFrame([map_row(r) for _, r in df.iterrows()])

# Convert DataFrames
train_sft_df = df_to_sft_format(train_df)
val_sft_df   = df_to_sft_format(val_df)

In [10]:
# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_sft_df)
val_dataset   = Dataset.from_pandas(val_sft_df)

In [11]:
print(train_dataset[0])

{'prompt': 'Label: non-clickbait', 'completion': 'After 2 Mistrials, Prosecutors Try Again to Prove Jihad Plot'}


In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(device)

cuda


In [14]:
def tokenize_function(example):
    full_texts = [p + " " + c for p, c in zip(example["prompt"], example["completion"])]
    tokenized = tokenizer(
        full_texts,
        truncation=True,
        padding="max_length",
        max_length=128
    )
    # For causal LM, labels are the same as input_ids
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val   = val_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/28800 [00:00<?, ? examples/s]

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

In [15]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    per_device_train_batch_size=4,
    num_train_epochs=5,
    gradient_accumulation_steps = 4,
    logging_steps=10,
    learning_rate=5e-4,
    eval_strategy = "epoch",
    load_best_model_at_end=True,
    save_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [16]:
trainer.train()

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,0.3951,0.411344
2,0.3285,0.409793
3,0.2703,0.439055
4,0.1859,0.499387
5,0.1359,0.54394


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=9000, training_loss=0.2688991510603163, metrics={'train_runtime': 2177.1653, 'train_samples_per_second': 66.141, 'train_steps_per_second': 4.134, 'total_flos': 9406513152000000.0, 'train_loss': 0.2688991510603163, 'epoch': 5.0})

In [20]:
trainer.save_model("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")

('fine_tuned_model/tokenizer_config.json',
 'fine_tuned_model/special_tokens_map.json',
 'fine_tuned_model/vocab.json',
 'fine_tuned_model/merges.txt',
 'fine_tuned_model/added_tokens.json',
 'fine_tuned_model/tokenizer.json')