# Menginstall Dependency

In [None]:
!pip install "pyarrow==19.0.0" --force-reinstall -q
!pip install -U datasets bitsandbytes sentencepiece -q
!pip install -U "transformers>=4.44.0" "accelerate>=0.34.0" "peft>=0.11.0" -q
!pip install python-dotenv

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.1/42.1 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.4/556.4 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m


# Import Library

In [None]:
import os
import glob
import torch
from google.colab import drive
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model

# Mount Google Drive

In [None]:
# Directory on the mounted Drive that holds the training .txt files.
LOCAL_DATA_DIR = "/content/drive/MyDrive/Data_Training/Semua"

# Mount Google Drive at /content/drive (force_remount=True).
drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


# Konfigurasi Global & Pemuatan Data

In [None]:
# Base checkpoint used for the tokenizer and the model.
MODEL_NAME = "Qwen/Qwen3-4B"
# Number of tokens per training block produced by the chunking step.
BLOCK_SIZE = 512

# glob.glob returns matches in arbitrary (filesystem) order. Sorting keeps the
# row order of the loaded dataset -- and therefore the seeded train/validation
# split -- identical from run to run when several files are present.
all_files = sorted(glob.glob(os.path.join(LOCAL_DATA_DIR, "*.txt")))
print(f"Total file.txt yang ditemukan: {len(all_files)}")

# Fail early with an explicit message instead of loading an empty file list.
if not all_files:
    raise ValueError(f"Tidak ada file.txt yang ditemukan di {LOCAL_DATA_DIR}. Periksa path dan proses unzip.")

# Load every matched file with the "text" builder into a single train split.
raw_dataset = load_dataset("text", data_files=all_files, split="train")

Total file.txt yang ditemukan: 1


Generating train split: 0 examples [00:00, ? examples/s]

# Split Dataset

In [None]:
print("Memisahkan dataset 90:10...")

# Hold out 10% of the rows with a fixed seed.
split_dataset = raw_dataset.train_test_split(seed=42, test_size=0.1)

# Re-expose the held-out "test" part under the name "validation".
dataset = DatasetDict()
dataset['train'] = split_dataset['train']
dataset['validation'] = split_dataset['test']

print(dataset)

Memisahkan dataset 90:10...
DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 40418
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 4491
    })
})


In [None]:
# Preview a few training rows, truncated to 500 characters each.
# Displaying the full 'text' column would write the whole training corpus into
# the notebook output, and a raw text column is not a valid DatasetDict split.
[text[:500] for text in dataset['train'][:3]['text']]

DatasetDict({
    train: Column(['Menteri Agraria dan Tata Ruang/Kepala Badan Pertanahan Nasional (ATR/BPN) Nusron Wahid meminta jajaran Kantor Wilayah (Kanwil) BPN Provinsi Sulawesi Tenggara (Sultra) untuk mempercepat proses validasi data pertanahan. Validasi data bisa membantu menyelesaikan masalah pertanahan, mengurangi risiko terjadinya konflik, sekaligus meningkatkan kualitas pelayanan bagi masyarakat. "Saya minta tolong, dalam validasi data pertanahan ini harus ada strategi khusus untuk penyelesaiannya. Mumpung saat ini kita sedang bermigrasi ke sistem digital, ini momentum yang harus dimanfaatkan,” tegas Nusron, Rabu (28/5/2025). Percepatan yang dilakukan juga perlu diikuti dengan perbaikan kualitas pelayanan. Dia menyebut, 75 persen-80 persen tugas Kementerian ATR/BPN adalah pelayanan, yang mana bersentuhan erat dengan masyarakat. Menurutnya, ada dua isu yang sering dikeluhkan masyarakat, yaitu soal waktu proses dan pungutan liar (pungli). Untuk mengatasi hal tersebut, dia meni

# Tokenisasi

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Reuse the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize the ``text`` column of a batch without truncation.

    Args:
        examples: Batch mapping that provides the raw strings under ``"text"``.

    Returns:
        The tokenizer output for the whole batch.
    """
    batch_texts = examples["text"]
    return tokenizer(batch_texts, truncation=False, add_special_tokens=True)


print("Menjalankan tokenisasi...")

# Tokenize in batches with 4 worker processes; the raw "text" column is
# dropped so only the tokenizer outputs remain.
tokenized_datasets = dataset.map(
    tokenize_function,
    remove_columns=["text"],
    num_proc=4,
    batched=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Menjalankan tokenisasi...


Map (num_proc=4):   0%|          | 0/40418 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (141834 > 131072). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (141834 > 131072). Running this sequence through the model will result in indexing errors


Map (num_proc=4):   0%|          | 0/4491 [00:00<?, ? examples/s]

# Chunking

In [None]:
def group_texts(examples, block_size=None):
    """Concatenate a batch of token lists and cut it into fixed-size blocks.

    Every column of ``examples`` is flattened into one long list and split
    into consecutive chunks of ``block_size`` items. Trailing items that do
    not fill a whole block are dropped, so every returned block has exactly
    ``block_size`` items. A batch with fewer than ``block_size`` items in total
    therefore yields no blocks, instead of one short block that would be the
    only variable-length row in the dataset.

    Args:
        examples: Batch mapping of column name -> list of per-example lists.
            Must contain an ``input_ids`` column.
        block_size: Length of each block. Defaults to the notebook-level
            ``BLOCK_SIZE`` constant when omitted.

    Returns:
        A dict with the same columns as ``examples``, each holding the list of
        blocks, plus ``labels`` (a copy of the chunked ``input_ids`` list).
    """
    from itertools import chain

    if block_size is None:
        block_size = BLOCK_SIZE

    columns = list(examples.keys())

    # Concatenate: chain() flattens in linear time, whereas sum(lists, [])
    # copies the growing accumulator once per example.
    concatenated = {k: list(chain.from_iterable(examples[k])) for k in columns}
    total_length = len(concatenated[columns[0]])

    # Trimming: always round down to a whole number of blocks.
    total_length = (total_length // block_size) * block_size

    # Chunking
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated.items()
    }

    result["labels"] = result["input_ids"].copy()
    return result


print("Mengelompokkan teks menjadi blok-blok...")

# Apply group_texts batch-wise with 4 worker processes to build the
# fixed-size blocks used for language-model training.
lm_datasets = tokenized_datasets.map(group_texts, num_proc=4, batched=True)

print(f"Contoh data yang telah diproses: {lm_datasets['train']}")

Mengelompokkan teks menjadi blok-blok...


Map (num_proc=4):   0%|          | 0/40418 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4491 [00:00<?, ? examples/s]

Contoh data yang telah diproses: Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 57928
})


In [None]:
# Display the column names and value types of the chunked training split.
lm_datasets['train'].features

{'input_ids': List(Value('int32')),
 'attention_mask': List(Value('int8')),
 'labels': List(Value('int64'))}

# Inisialisasi

## Load Base Model

In [None]:
# bitsandbytes settings: 4-bit NF4 weights with double quantization and
# bfloat16 as the compute dtype.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Load the base checkpoint with the 4-bit quantization config and let
# device_map="auto" place the weights on the available device(s).
#
# trust_remote_code is deliberately not enabled: the tokenizer for the same
# checkpoint is loaded without it, the model loads as the Qwen3ForCausalLM
# class that ships with transformers, and enabling the flag would allow Python
# code stored in the model repository to run in this session.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=quant_config,
)

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [None]:
# Print the module tree of the loaded base model to inspect its layers.
print(base_model)

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 2560)
    (layers): ModuleList(
      (0-35): 36 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear4bit(in_features=2560, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=2560, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=2560, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=2560, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear4bit(in_features=2560, out_features=9728, bias=False)
          (up_proj): Linear4bit(in_features=2560, out_features=9728, bias=False)
          (down_proj): Linear4bit(in_features=9728, out_features=2560, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
 

## Konfigurasi LoRA

In [None]:
# LoRA adapter settings: rank 4, alpha 8, dropout 0.4, no bias training.
# Adapters are attached to the attention projections and the MLP projections.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=4,
    lora_alpha=8,
    lora_dropout=0.4,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

# Wrap the base model with the adapters and report the trainable share.
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

trainable params: 8,257,536 || all params: 4,030,725,632 || trainable%: 0.2049


## Data Collator

In [None]:
# Batch collator for language modeling with masked-LM masking turned off.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Metrics Callback

## Callback untuk menyimpan semua metrik (train/eval, loss/perplexity)

In [None]:
import math
from transformers import TrainerCallback

class MetricsCallback(TrainerCallback):
    """Trainer callback that keeps a history of loss and perplexity.

    ``self.metrics`` maps each of ``train_loss``, ``train_perplexity``,
    ``eval_loss`` and ``eval_perplexity`` to a list of ``(step, value)``
    tuples, appended in the order the Trainer reports them.
    """

    def __init__(self):
        super().__init__()
        self.metrics = {
            'train_loss': [],
            'train_perplexity': [],
            'eval_loss': [],
            'eval_perplexity': [],
        }

    def _record(self, loss_key, ppl_key, step, loss):
        """Append ``loss`` and ``exp(loss)`` for ``step`` under the given keys."""
        self.metrics[loss_key].append((step, loss))
        try:
            perplexity = math.exp(loss)
        except OverflowError:
            # exp() overflowed a float; store infinity instead.
            perplexity = float('inf')
        self.metrics[ppl_key].append((step, perplexity))

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record the training loss when a log entry contains ``loss``."""
        if logs is None or 'loss' not in logs:
            return
        self._record('train_loss', 'train_perplexity', state.global_step, logs['loss'])

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Record the evaluation loss when the metrics contain ``eval_loss``."""
        if metrics is None or 'eval_loss' not in metrics:
            return
        self._record('eval_loss', 'eval_perplexity', state.global_step, metrics['eval_loss'])


# Single shared instance: handed to the Trainer and read back when plotting.
metrics_callback = MetricsCallback()

# Setup Monitoring WANDB

In [None]:
import wandb
from dotenv import load_dotenv

# Load environment variables from a .env file kept on the mounted Drive and
# read the Weights & Biases API key from them.
env_path = '/content/drive/MyDrive/secrets/.env'
load_dotenv(env_path)
wandb_key = os.environ.get('WANDB_API_KEY')

# Authenticate, then open the run.
wandb.login(key=wandb_key)
wandb.init(
    name="Qwen3-4B-All_Data",
    entity="Tim-1",
    project="CPT Tim 1",
)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


# Setup untuk Trainer

In [None]:
# Checkpoints and the final model are written to this Drive folder.
OUTPUT_DIR = "/content/drive/MyDrive/cpt_model_checkpoints"

training_args = TrainingArguments(
    # Output location.
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    # Schedule and batch sizes: 1 sample per step, gradients accumulated over 4 steps.
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=2,
    # Optimization.
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=True,
    # Evaluation, logging and checkpoint cadence, all counted in steps.
    eval_strategy="steps",
    eval_steps=300,
    logging_steps=50,
    save_steps=500,
    # Metrics are reported to Weights & Biases.
    report_to="wandb",
)

In [None]:
import inspect

# Newer transformers releases renamed Trainer's `tokenizer=` keyword to
# `processing_class=` and emit a deprecation warning for the old name.
# Pick whichever keyword the installed Trainer accepts, so the cell runs
# without the warning on new releases and still works on older ones.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# Assemble the Trainer from the LoRA model, the chunked datasets, the
# language-modeling collator and the metrics-recording callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
    data_collator=data_collator,
    callbacks=[metrics_callback],
    **{_tokenizer_kwarg: tokenizer},
)

  trainer = Trainer(


# Pengecekan Checkpoint dan Memulai Continued Pre-Training

In [None]:
from transformers.trainer_utils import get_last_checkpoint
import os

# Look for an earlier checkpoint only when the output directory exists.
last_checkpoint = get_last_checkpoint(OUTPUT_DIR) if os.path.isdir(OUTPUT_DIR) else None

if not last_checkpoint:
    print("Tidak ada checkpoint valid ditemukan. Memulai training dari awal...")
else:
    print(f"Checkpoint ditemukan di: {last_checkpoint}")
    print("Melanjutkan training dari langkah terakhir...")

# Passing None starts from scratch; passing a path resumes from it.
train_result = trainer.train(resume_from_checkpoint=last_checkpoint)

# Persist the trained model and the tokenizer into OUTPUT_DIR.
trainer.save_model()
tokenizer.save_pretrained(OUTPUT_DIR)

Tidak ada checkpoint valid ditemukan. Memulai training dari awal...


Step,Training Loss,Validation Loss


# Evaluate

In [None]:
# Run evaluation on the Trainer's eval dataset and display the returned metrics.
trainer.evaluate()

In [None]:
import math

# Run evaluation and turn the reported eval loss into perplexity = exp(loss).
eval_loss = trainer.evaluate()['eval_loss']
try:
    perplexity = math.exp(eval_loss)
except OverflowError:
    # Same convention as MetricsCallback: a loss too large for exp() is
    # reported as infinite perplexity instead of aborting the cell.
    perplexity = float('inf')

print(f"Perplexity: {perplexity}")

In [None]:
import matplotlib.pyplot as plt

# Two stacked panels sharing the step axis: loss on top, perplexity below.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(12, 10), sharex=True)

# --- Loss panel -------------------------------------------------------------
# Each history entry is a (step, value) tuple recorded by metrics_callback.
train_loss_steps = [step for step, _ in metrics_callback.metrics['train_loss']]
train_loss_values = [value for _, value in metrics_callback.metrics['train_loss']]

eval_loss_steps = [step for step, _ in metrics_callback.metrics['eval_loss']]
eval_loss_values = [value for _, value in metrics_callback.metrics['eval_loss']]

ax1.plot(train_loss_steps, train_loss_values, label='Training Loss', alpha=0.7)
ax1.plot(eval_loss_steps, eval_loss_values, label='Validation Loss', alpha=0.7)
ax1.set(xlabel='Steps', ylabel='Loss', title='Training & Validation Loss')
ax1.legend()
ax1.grid(True)

# --- Perplexity panel -------------------------------------------------------
train_ppl_steps = [step for step, _ in metrics_callback.metrics['train_perplexity']]
train_ppl_values = [value for _, value in metrics_callback.metrics['train_perplexity']]

eval_ppl_steps = [step for step, _ in metrics_callback.metrics['eval_perplexity']]
eval_ppl_values = [value for _, value in metrics_callback.metrics['eval_perplexity']]

ax2.plot(train_ppl_steps, train_ppl_values, label='Training Perplexity', alpha=0.7)
ax2.plot(eval_ppl_steps, eval_ppl_values, label='Validation Perplexity', alpha=0.7)
ax2.set(xlabel='Steps', ylabel='Perplexity', title='Training & Validation Perplexity')
ax2.legend()
ax2.grid(True)

plt.tight_layout()
plt.show()

# Save Model

In [None]:
# The final model and its tokenizer go side by side into OUTPUT_DIR/final_model.
final_model_dir = os.path.join(OUTPUT_DIR, "final_model")

print("Pelatihan selesai. Menyimpan model final.")
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)