In [None]:
# Sanity check: count the lines in each dataset split before loading them
# (the loader below parses one JSON document per line).
!wc -l /content/train.jsonl /content/val.jsonl

   179 /content/train.jsonl
    19 /content/val.jsonl
   198 total


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
import json
import os

In [None]:
# Hugging Face Hub id of the base checkpoint that gets fine-tuned.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# JSONL files holding the training and validation splits.
TRAIN_DATA, VAL_DATA = "/content/train.jsonl", "/content/val.jsonl"

In [None]:
# Fail fast if a dataset split is unavailable.
# Only the paths that are actually absent are reported, and isfile() also
# rejects a directory sitting at the expected file path.
missing_files = [path for path in (TRAIN_DATA, VAL_DATA) if not os.path.isfile(path)]
if missing_files:
    raise FileNotFoundError(f"Dataset files not found: {', '.join(missing_files)}")

In [None]:
def load_jsonl(file_path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path to a UTF-8 encoded file with one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The file
            path and 1-based line number are printed before re-raising.
    """
    records = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line_number, raw_line in enumerate(f, 1):
            payload = raw_line.strip()
            # Blank lines (typically a trailing newline at end of file) carry
            # no record; json.loads("") would raise, so skip them.
            if not payload:
                continue
            try:
                records.append(json.loads(payload))
            except json.JSONDecodeError as e:
                print(f"Error in {file_path}, line {line_number}: {e}")
                raise
    return records

In [None]:
# Read each split into memory as a list of parsed JSON records.
train_data = load_jsonl(file_path=TRAIN_DATA)
val_data = load_jsonl(file_path=VAL_DATA)

# Report the split sizes so an empty or truncated file is noticed right away.
print(f"Loaded {len(train_data)} train samples, {len(val_data)} validation samples")

Loaded 179 train samples, 19 validation samples


In [None]:
# Wrap the in-memory record lists as Hugging Face datasets, keyed by split name.
dataset = DatasetDict(
    {
        split_name: Dataset.from_list(split_records)
        for split_name, split_records in (
            ("train", train_data),
            ("validation", val_data),
        )
    }
)

In [None]:
# Load the tokenizer published alongside the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token for padding.
# NOTE(review): because pad == eos, padding cannot be told apart from a real
# EOS by token id alone; the attention mask is the reliable marker.
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field and build causal-LM labels.

    Args:
        examples: Either a batch from ``Dataset.map(..., batched=True)``
            (``examples["text"]`` is a list of strings) or a single example
            (``examples["text"]`` is one string).

    Returns:
        The tokenizer output, padded/truncated to 256 tokens, with an added
        ``"labels"`` entry. Labels equal ``input_ids`` at real token
        positions and -100 at padding positions.
    """
    tokenized = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=256)

    def mask_padding(ids, mask):
        # -100 is the target index that the cross-entropy loss ignores.
        return [token_id if keep else -100 for token_id, keep in zip(ids, mask)]

    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]
    # Padding must not contribute to the loss. Padding is identified through
    # the attention mask rather than by token id, so a genuine EOS token that
    # shares its id with the pad token still gets a label.
    if input_ids and isinstance(input_ids[0], list):
        # Batched call: one id list per example.
        tokenized["labels"] = [
            mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)
        ]
    else:
        # Single example: a flat id list.
        tokenized["labels"] = mask_padding(input_ids, attention_mask)
    return tokenized

# Tokenize every split in batches; the raw "text" column is dropped so that
# only the tokenizer's outputs remain.
tokenized_dataset = dataset.map(
    function=tokenize_function,
    batched=True,
    remove_columns=["text"],
)
# Should include 'input_ids', 'attention_mask', 'labels'
print(tokenized_dataset["train"][0].keys())

Map:   0%|          | 0/179 [00:00<?, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

dict_keys(['input_ids', 'attention_mask', 'labels'])


In [None]:
# Load the base model with float32 weights.
# Updating weights that are themselves float16 with a plain AdamW step (no
# loss scaling, since fp16=False in the training arguments) is numerically
# unstable and typically shows up as an exploding loss followed by 0.0/NaN.
# float32 roughly doubles the weight memory compared with float16; if memory
# is tight, request mixed precision through TrainingArguments (fp16/bf16)
# instead of casting the trainable weights here.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,
    device_map="auto",
)

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Training arguments
training_args = TrainingArguments(
    # Spelled out so the checkpoint location does not depend on the library
    # default; a later cell archives /content/trainer_output.
    output_dir="./trainer_output",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,  # 8 micro-batches of 1 per optimizer step
    eval_strategy="steps",
    eval_steps=10,  # explicit cadence instead of implicitly following logging_steps
    save_steps=10,
    # All model weights are trained here (no adapters), so use a conservative
    # full fine-tuning step size; 2e-4 is a rate usually paired with
    # adapter-style (LoRA) training and is aggressive for this setup.
    learning_rate=2e-5,
    # Keep AMP off: enabling fp16=True while the model parameters themselves
    # are float16 fails with "Attempting to unscale FP16 gradients".
    fp16=False,
    logging_steps=10,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    optim="adamw_torch",
    report_to="none",  # Disable W&B and other logging
    run_name="tinyllama-finetune",  # Optional: Custom run name
)

In [None]:
from transformers import DataCollatorForLanguageModeling

# Trainer
# The tokenized dataset already has fixed-length inputs and a "labels"
# column, so the default collator is used and no data_collator is passed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # `tokenizer=` is deprecated on Trainer and emits a FutureWarning;
    # `processing_class=` is its replacement and is likewise saved with the model.
    processing_class=tokenizer,
)

  trainer = Trainer(


In [None]:
# Run the fine-tuning loop. The returned TrainOutput (global step, mean
# training loss, runtime metrics) is shown as the cell result.
trainer.train()

Step,Training Loss,Validation Loss
10,473547.85,
20,0.0,


TrainOutput(global_step=23, training_loss=205890.36956521738, metrics={'train_runtime': 278.0153, 'train_samples_per_second': 0.644, 'train_steps_per_second': 0.083, 'total_flos': 284432972906496.0, 'train_loss': 205890.36956521738, 'epoch': 1.0})

In [None]:
# Directory, relative to the current working directory, that receives the
# final fine-tuned model and tokenizer files.
output_dir = "./tinyllama-finetune-model"

In [None]:
# Save model and tokenizer
# The Trainer writes the model weights/config into output_dir, then the
# tokenizer files are written into the same directory.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model and tokenizer saved to {output_dir}")

Model and tokenizer saved to ./tinyllama-finetune-model


In [None]:
# Bundle the saved model directory into a single zip archive for download.
# NOTE: the absolute path assumes the notebook's working directory is /content,
# because output_dir is the relative "./tinyllama-finetune-model".
!zip -r /content/tinyllama-finetune-model.zip /content/tinyllama-finetune-model

  adding: content/tinyllama-finetune-model/ (stored 0%)
  adding: content/tinyllama-finetune-model/config.json (deflated 48%)
  adding: content/tinyllama-finetune-model/training_args.bin (deflated 51%)
  adding: content/tinyllama-finetune-model/tokenizer_config.json (deflated 69%)
  adding: content/tinyllama-finetune-model/generation_config.json (deflated 29%)
  adding: content/tinyllama-finetune-model/special_tokens_map.json (deflated 73%)
  adding: content/tinyllama-finetune-model/tokenizer.model (deflated 55%)
  adding: content/tinyllama-finetune-model/chat_template.jinja (deflated 60%)
  adding: content/tinyllama-finetune-model/tokenizer.json (deflated 85%)
  adding: content/tinyllama-finetune-model/model.safetensors (deflated 100%)


In [None]:
# Colab-only: hand the zipped model to the browser as a file download.
from google.colab import files
files.download('/content/tinyllama-finetune-model.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Archive the directory where the Trainer wrote its checkpoints
# (e.g. checkpoint-20 with scheduler and RNG state) for download.
!zip -r /content/trainer_output.zip /content/trainer_output

  adding: content/trainer_output/ (stored 0%)
  adding: content/trainer_output/checkpoint-20/ (stored 0%)
  adding: content/trainer_output/checkpoint-20/rng_state.pth (deflated 25%)
  adding: content/trainer_output/checkpoint-20/config.json (deflated 48%)
  adding: content/trainer_output/checkpoint-20/training_args.bin (deflated 51%)
  adding: content/trainer_output/checkpoint-20/tokenizer_config.json (deflated 69%)
  adding: content/trainer_output/checkpoint-20/generation_config.json (deflated 29%)
  adding: content/trainer_output/checkpoint-20/special_tokens_map.json (deflated 73%)
  adding: content/trainer_output/checkpoint-20/trainer_state.json (deflated 65%)
  adding: content/trainer_output/checkpoint-20/tokenizer.model (deflated 55%)
  adding: content/trainer_output/checkpoint-20/chat_template.jinja (deflated 60%)
  adding: content/trainer_output/checkpoint-20/scheduler.pt (deflated 56%)
  adding: content/trainer_output/checkpoint-20/tokenizer.json (deflated 85%)
  adding: conten

In [None]:
# Colab-only: download the zipped Trainer checkpoints through the browser.
# The import repeats the one in the earlier download cell so that this cell
# also works when run on its own.
from google.colab import files
files.download('/content/trainer_output.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>