In [1]:
# !pip install transformers datasets peft accelerate bitsandbytes

from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import os

os.environ["WANDB_DISABLED"] = "true"   # disable wandb

# 1. Load model + tokenizer
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="right")
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)

# 2. Apply LoRA config
peft_config = LoraConfig(task_type="CAUSAL_LM", r=8, lora_alpha=16, lora_dropout=0.1)
model = get_peft_model(model, peft_config)

# 3. Tiny dataset with labels
def preprocess(batch):
    tokens = tokenizer(batch["text"], truncation=True, padding="max_length", max_length=64)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

dataset = load_dataset("yelp_polarity", split="train[:200]").map(preprocess, batched=True)

# 4. Training
args = TrainingArguments(
    output_dir="out",
    per_device_train_batch_size=4,
    num_train_epochs=1,
    report_to="none"
)
trainer = Trainer(model=model, args=args, train_dataset=dataset)
trainer.train()

from transformers import pipeline

# Load the fine-tuned model + tokenizer from Trainer
text_generator = pipeline(
    "text-generation",
    model=trainer.model,
    tokenizer=tokenizer,
    device=0  # set -1 if running on CPU
)

# Try with a custom prompt
prompt = "If i would be the Prime Minister of India.."
outputs = text_generator(prompt, max_length=50, num_return_sequences=2, do_sample=True, top_k=50)

# Print results
for i, out in enumerate(outputs):
    print(f"\nGenerated {i+1}: {out['generated_text']}")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/256M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/560000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/38000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Generated 1: If i would be the Prime Minister of India..

I will not be the Minister of the United Kingdom. I will not be a member of any government that is of the United Nations.

I will not be a member of any government that is of the United Kingdom.

I will not be a member of any government that is of the United States.

I will not be a member of any government that is of the United States.

I will not be a member any government that is of the United States.

I will not be a member any government that is of the United States.

I will not be a member to the United States.

I will not be a to the United States. I will not be a member of any government that is of the United States.

I will not be a to the United States.

I will not be a to the United States.

I will not be a member of any government that is of the United States.

I will not be a to the United States.

I will not be a to the United States.
I will not be a to the United States.

I will not be a to the United States.

I 