In [None]:
!pip install -q datasets requests torch peft bitsandbytes transformers trl accelerate sentencepiece wandb matplotlib

In [None]:
pip install -U bitsandbytes

In [None]:
import re
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, DataCollatorForLanguageModeling, BitsAndBytesConfig, TrainingArguments, Trainer, pipeline, AutoTokenizer
import wandb
from datetime import datetime
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel
import torch
from trl import SFTConfig, SFTTrainer
from google.colab import userdata

In [None]:
# Log in to the Hugging Face Hub using the token stored in Colab secrets
from huggingface_hub import login # Import the login function

hf_token = userdata.get('HF_TOKEN')
login(token=hf_token)   # caches credentials for huggingface-hub

In [None]:
# --- Identity: Hub account, base model and dataset -------------------------
HF_USER = "korarishi"
BASE_MODEL = "Qwen/Qwen3-0.6B"
HF_DataSet = "karpathy/tiny_shakespeare"

# PROJECT_NAME prefixes the Hub repo name and is also the W&B project name.
# (A previous assignment of "shakespeare" was overwritten immediately and has
# been removed as dead code; the effective value is unchanged.)
PROJECT_NAME = "spereI"
# Timestamp is taken when this cell runs, so re-running the cell yields a new run name.
RUN_NAME = f"{datetime.now():%Y-%m-%d_%H.%M.%S}"
PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"
#REVISION = "e8d637df551603dc86cd7a1598a8f44af4d7ae36" # or REVISION = None
# Fully-qualified Hub repo id ("user/repo"); both names refer to the same repo.
HUB_MODEL_NAME = f"{HF_USER}/{PROJECT_RUN_NAME}"
FINETUNED_MODEL = HUB_MODEL_NAME

# --- LoRA / quantization -----------------------------------------------------
LORA_R = 32
LORA_ALPHA = 64
TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"]
LORA_DROPOUT = 0.1
QUANT_4_BIT = True

# --- Training hyperparameters ------------------------------------------------
EPOCHS = 1 # you can do more epochs if you wish, but only 1 is needed - more is probably overkill
BATCH_SIZE = 4 # on an A100 box this can go up to 16
GRADIENT_ACCUMULATION_STEPS = 1
LEARNING_RATE = 1e-4
LR_SCHEDULER_TYPE = 'cosine'
WARMUP_RATIO = 0.03
OPTIMIZER = "paged_adamw_32bit"
MAX_SEQUENCE_LENGTH = 800  # maximum tokens per example (longer segments are truncated)

# --- Admin -------------------------------------------------------------------
# STEPS is the logging interval (in optimizer steps).
# SAVE_STEPS is the checkpoint interval; because the trainer pushes on every
# save, it is also how often a checkpoint is uploaded to the Hub.
STEPS = 50
SAVE_STEPS = 1000
LOG_TO_WANDB = True

In [None]:
# Load the corpus. The dataset repo ships its own loading script, so opt in to
# running it here; without this, `load_dataset` pauses on an interactive [y/N]
# prompt and the notebook cannot run unattended.
ds = load_dataset(HF_DataSet, split="train", trust_remote_code=True)
full_text = ds[0]["text"]  # first row of the train split

MIN_WORDS = 5    # segments with fewer words are dropped from the training set
MAX_CHARS = 512  # threshold used only for the "long segment" report below

# Split once: break after sentence-ending punctuation (., ?, !) followed by whitespace.
raw_segments = re.split(r'(?<=[.?!])\s+', full_text)
char_lengths = [len(s) for s in raw_segments]
word_counts = [len(s.split()) for s in raw_segments]

# Summary statistics over the unfiltered split.
print("Num segments:",        len(raw_segments))
print("Char length — min/max:", min(char_lengths), "/", max(char_lengths))
print("Word count  — min/max:", min(word_counts),  "/", max(word_counts))

num_short = sum(1 for wc in word_counts if wc < MIN_WORDS)
num_long = sum(1 for cl in char_lengths if cl > MAX_CHARS)
print(f"Segments with <{MIN_WORDS} words   : {num_short}")
print(f"Segments with >{MAX_CHARS} chars  : {num_long}")

# Drop whitespace-only pieces, then keep segments with at least MIN_WORDS words.
segments = [s for s in raw_segments if s.strip()]
filtered_segments = [s for s in segments if len(s.split()) >= MIN_WORDS]
assert filtered_segments, "no segments survived filtering"

# Wrap in a HF Dataset with a single "text" column for tokenization.
train = Dataset.from_dict({"text": filtered_segments})

README.md:   0%|          | 0.00/6.10k [00:00<?, ?B/s]

tiny_shakespeare.py:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

The repository for karpathy/tiny_shakespeare contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/karpathy/tiny_shakespeare.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/1.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1 [00:00<?, ? examples/s]

Num segments: 10894
Char length — min/max: 2 / 1163
Word count  — min/max: 1 / 206
Segments with <5 words   : 1664
Segments with >512 chars  : 59


In [None]:
# Tokenizer for the base model, padding on the right.
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL,
    trust_remote_code=True,
    padding_side="right",
    use_fast=True,
)
# Batches are padded by the data collator, which needs a pad token. Fall back
# to EOS when the tokenizer of the configured BASE_MODEL does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_fn(ex):
    """Tokenize a batch of text segments, truncating to MAX_SEQUENCE_LENGTH tokens.

    Args:
        ex: Batch mapping handed over by `Dataset.map`; its "text" entry holds
            the raw strings to encode.

    Returns:
        Whatever the tokenizer returns for that batch.
    """
    texts = ex["text"]
    encoded = tokenizer(texts, truncation=True, max_length=MAX_SEQUENCE_LENGTH)
    return encoded

# Encode every segment in batches; the raw "text" column is dropped so only the
# tokenizer's output columns remain.
tokenized = train.map(
    tokenize_fn,
    remove_columns=["text"],
    batched=True,
)

# Collator for causal language modelling (mlm=False, i.e. no masked-LM
# objective); each batch is padded to a length that is a multiple of 8.
collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    pad_to_multiple_of=8,
    mlm=False,
)

# bitsandbytes 4-bit (NF4, double-quantized) settings; only handed to the model
# loader when QUANT_4_BIT is True.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=QUANT_4_BIT,
    # Keep the compute dtype aligned with the training precision (the training
    # args use bf16=True); float16 here would mix fp16 matmuls into a bf16 run.
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# Load the base model (quantized if enabled) and let accelerate place it on the
# available device(s).
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    trust_remote_code=True,
    quantization_config=bnb_config if QUANT_4_BIT else None,
    device_map="auto",
)

# LoRA adapter settings: rank, scaling and dropout come from the config cell;
# adapters are attached to the modules named in TARGET_MODULES and no bias
# terms are trained.
lora_parameters = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=TARGET_MODULES,
    bias="none",
    task_type="CAUSAL_LM",
)

# General training configuration: optimisation schedule, precision,
# checkpointing, logging and Hub upload settings.
train_parameters = SFTConfig(
    output_dir=PROJECT_RUN_NAME,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=1,
    eval_strategy="no",
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    optim=OPTIMIZER,
    save_steps=SAVE_STEPS,
    save_total_limit=10,
    logging_steps=STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.001,
    fp16=False,
    bf16=True,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=WARMUP_RATIO,
    group_by_length=True,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    # Reporting is switched off with the string "none". Passing None is not
    # "off": transformers 4.x treats it as "all installed integrations", so
    # W&B logging would stay on even with LOG_TO_WANDB = False.
    report_to="wandb" if LOG_TO_WANDB else "none",
    run_name=RUN_NAME,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    dataset_text_field="text",
    save_strategy="steps",
    # Every saved checkpoint is also pushed to the private Hub repo below.
    hub_strategy="every_save",
    push_to_hub=True,
    hub_model_id=HUB_MODEL_NAME,
    hub_private_repo=True
)

# Build the Supervised Fine-Tuning trainer from the two configuration objects
# (LoRA settings + training arguments), the pre-tokenized dataset and the
# causal-LM collator.
# Constructing it may log a "No label_names provided" message for the
# PEFT-wrapped model; the original note says this can be ignored.
# NOTE(review): confirm the training loss actually comes down during the run.
trainer = SFTTrainer(
    model=base_model,
    args=train_parameters,
    peft_config=lora_parameters,
    train_dataset=tokenized,
    data_collator=collator,
)

Map:   0%|          | 0/9230 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/9230 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:

# Authenticate with Weights & Biases using the API key stored in Colab secrets.
wandb_api_key = userdata.get('WANDB_API_KEY')
wandb.login(key=wandb_api_key)



True

In [None]:
# Start a W&B run under the project and run names defined in the config cell.
run = wandb.init(
    entity="korarishi-kora-study-abroad-consultants",  # W&B entity that owns the run
    project=PROJECT_NAME,
    name=RUN_NAME
)

In [None]:
# Run the fine-tuning loop, then upload the trained adapter to the Hub.
trainer.train()
# Push to the fully-qualified repo id (the same id the training args use as
# hub_model_id) so the target does not depend on which account the token
# belongs to.
trainer.model.push_to_hub(HUB_MODEL_NAME, private=True)
print("Saved to the hub:", HUB_MODEL_NAME)

Step,Training Loss
50,3.6376
100,3.6604
150,3.5815
200,3.6905
250,3.6278
300,3.6099
350,3.6091
400,3.5951
450,3.5883
500,3.6024


adapter_model.safetensors:   0%|          | 0.00/36.7M [00:00<?, ?B/s]

Saved to the hub: spereI-2025-05-17_17.26.17


In [None]:
# Mark the W&B run started earlier as finished.
run.finish()