In [None]:
# =========================
# ✅ Step 1: Install Dependencies
# =========================
!pip install transformers datasets accelerate peft bitsandbytes

In [2]:
# =========================
# ✅ Step 2: Import Libraries
# =========================
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
import os
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# =========================
# ✅ Step 3: Load your dataset
# =========================
# Upload your_data.json to the Colab working directory first
dataset = load_dataset('json', data_files='/content/drive/My Drive/Colab Notebooks/distilled_outputs.jsonl', split='train')

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
# =========================
# ✅ Step 4: Format dataset for TinyLLaMA
# =========================
def format_example(example):
    system_prompt = "You are an expert coder. Identify the computer science concepts and explain the logic in this code block in less than 500 words."
    return f"### Instruction:\n{system_prompt}\n\n{example['prompt']}\n\n### Response:\n{example['response']}"

dataset = dataset.map(lambda x: {'text': format_example(x)})

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [6]:
# =========================
# ✅ Step 5: Tokenizer (TinyLLaMA)
# =========================
model_name = "TinyLLaMA/TinyLLaMA-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Set the pad token to eos_token if it's not defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    tokenized = tokenizer(
        examples['text'],
        padding="max_length",
        truncation=True,
        max_length=2048,
    )
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized


tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names
)

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [7]:
# =========================
# ✅ Step 6: Load model and apply LoRA
# =========================
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,  # Helps with Colab memory
    trust_remote_code=True
)

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
)

model = get_peft_model(model, lora_config)

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [8]:
# =========================
# ✅ Step 7: Training Arguments
# =========================
output_dir = output_dir = '/content/drive/My Drive/ColabData/tiny-finetuned'

training_args = TrainingArguments(
    output_dir="./tmp-tiny-finetuned",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_steps=200,
    save_total_limit=1,
    report_to="none",
)

In [9]:
# =========================
# ✅ Step 8: Trainer
# =========================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:

# =========================
# ✅ Step 9: Train!
# =========================
trainer.train()



Step,Training Loss
10,4.9389
20,1.5335
30,1.1885
40,1.1065
50,0.9659
60,0.9281
70,0.8588
80,0.8733
90,0.8365
100,0.8625


In [None]:
# =========================
# ✅ Step 11: Save model directly to Google Drive
# =========================
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"✅ Model and tokenizer saved to your Google Drive at: {output_dir}")

✅ Model and tokenizer saved to your Google Drive at: /content/drive/My Drive/ColabData/qwen-finetuned
