# 1- Installing and Importing libraries

In [1]:
# installing package
!pip install -q galore-torch
!pip install -q git+https://github.com/jiaweizzhao/GaLore
!pip install -q accelerate
!pip install -q -U bitsandbytes
!pip install -q trl
!pip install -q peft
!pip install -q evaluate
!pip install -U -q transformers
!pip install -U -q datasets
!pip install -U -q flash-attn
!pip install -U -q deepspeed
# %%capture
# !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# !pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes

  Preparing metadata (setup.py) ... [?25l[?25hdone


In [2]:
# importing libraries
import torch
import numpy as np
import pandas as pd
import json
from datasets import DatasetDict, Dataset, load_dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForCausalLM,
    DataCollatorWithPadding,
    TrainingArguments,
    BitsAndBytesConfig,
    Trainer,
)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import accelerate
from trl import SFTTrainer, setup_chat_format, DataCollatorForCompletionOnlyLM, apply_chat_template
# from unsloth.models import FastLlamaModel # for later

[2024-12-23 03:47:28,528] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


# 2- Checking whether the tokenizer supports the Persian language

In [None]:
import os

from transformers import AutoTokenizer
import pandas as pd

# Read the Hugging Face access token from the environment rather than
# hard-coding it in the notebook (a pasted token would leak when the notebook
# is shared). With no HF_TOKEN set this is None, which lets the hub client use
# credentials cached by a previous login.
token = os.environ.get("HF_TOKEN") or None
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token":"[PAD]"})

# Persian sample used for the round-trip check.
text = "هر چی شما بگید، تغییرش بدم، یادگاری، دلقک، صبر، فشار"

# Round trip: text -> ids -> text. No special tokens are added on the way in,
# and any that do appear are stripped on the way out, so the comparison below
# is purely about whether the vocabulary can represent the Persian text.
encoded = tokenizer.encode(text, add_special_tokens=False)

# `add_special_tokens` is an encode-time option; the decode-time switch is
# `skip_special_tokens`.
decoded = tokenizer.decode(encoded, skip_special_tokens=True)

print(decoded, text)

if decoded == text:
    print("yep")
else:
    print("uh hell nuh")

هر چی شما بگید، تغییرش بدم، یادگاری، دلقک، صبر، فشار هر چی شما بگید، تغییرش بدم، یادگاری، دلقک، صبر، فشار
yep


In [None]:
from transformers import AutoTokenizer
import pandas as pd

# Same Persian round-trip check, this time for the 3B Llama checkpoint.
# `token` is the Hugging Face access token defined in the previous cell.
model_name = "meta-llama/Llama-3.2-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token":"[PAD]"})

text = "هر چی شما بگید، تغییرش بدم، یادگاری، دلقک، صبر، فشار"

# Encoding with special tokens may prepend markers such as a BOS token.
encoded = tokenizer.encode(text, add_special_tokens=True)

# Strip those markers again when decoding; otherwise the decoded string starts
# with the BOS text and the equality check below fails even though the Persian
# text itself round-trips correctly. (`add_special_tokens` is not a decode
# option — `skip_special_tokens` is.)
decoded = tokenizer.decode(encoded, skip_special_tokens=True)

print(decoded, text)

if decoded == text:
    print("yep")
else:
    print("uh hell nuh")

In [None]:
# Same Persian round-trip check for Phi-3.5-mini (a public checkpoint, so no
# access token is passed).
model_name = "microsoft/Phi-3.5-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token":"[PAD]"})

text = "هر چی شما بگید، تغییرش بدم، یادگاری، دلقک، صبر، فشار"

# Encoding with special tokens may prepend markers such as a BOS token.
encoded = tokenizer.encode(text, add_special_tokens=True)

# Strip special tokens on decode so that only the text is compared;
# `add_special_tokens` is an encode-time option and has no effect here.
# NOTE(review): if this tokenizer re-inserts a leading space on decode the
# strict equality may still fail — compare with `.strip()` if that shows up.
decoded = tokenizer.decode(encoded, skip_special_tokens=True)

print(decoded, text)

if decoded == text:
    print("yep")
else:
    print("uh hell nuh")

# 3- Creating Dataset and Template

In [4]:
from datasets import load_dataset, DatasetDict

# Parquet file holding the chat-formatted corpus.
parquet_path = "/kaggle/input/persian-slm-dataset/output.parquet"

# A single data file is exposed under the "train" key; take it and shuffle it
# with a fixed seed so the splits below are reproducible.
dataset = load_dataset("parquet", data_files=parquet_path)["train"].shuffle(seed=42)

# 80% train / 5% validation / remainder test.
total_rows = len(dataset)
train_size = int(0.8 * total_rows)
valid_size = int(0.05 * total_rows)
test_size = total_rows - train_size - valid_size

# Carve the shuffled dataset into three contiguous, non-overlapping ranges.
valid_end = train_size + valid_size
train_dataset = dataset.select(range(train_size))
valid_dataset = dataset.select(range(train_size, valid_end))
test_dataset = dataset.select(range(valid_end, total_rows))

dataset_dict = DatasetDict({
    "train": train_dataset,
    "valid": valid_dataset,
    "test": test_dataset
})

# Report the size of every split.
for split_name in ("train", "valid", "test"):
    print(f"The length of {split_name} dataset is: {len(dataset_dict[split_name])}")



The length of train dataset is: 12795
The length of valid dataset is: 799
The length of test dataset is: 2400


In [14]:
# Show the split layout, then the container type of the "messages" column and
# its first conversation.
print(dataset_dict)
train_messages = dataset_dict["train"]["messages"]
print(type(train_messages), train_messages[0])

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 12795
    })
    valid: Dataset({
        features: ['messages'],
        num_rows: 799
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 2400
    })
})
<class 'list'> [{'content': 'این مدل باید زبان فارسی را یاد بگیرد، شامل ساختار جملات، واژگان، و قواعد گرامری. هدف این یادگیری، درک صحیح زبان فارسی است، نه صرفاً پردازش اطلاعات یا روابط بین کلمات. به نحوی تمرین کنید که مدل بتواند زبان فارسی را به طور طبیعی و قابل درک تولید کند.\n\nبه سوال زیر بر اساس متن دانشت توضیح بده (به فارسی یا انگلیسی) پاسخ دهید. از اطلاعات دقیق و مرتبط با متن استفاده کنید تا پاسخ صحیح و مناسبی ارائه دهید.', 'role': 'system'}, {'content': 'اولین بار روزف اسپدین از سیمان چه استفادهای کرد', 'role': 'user'}, {'content': 'برای ساخت فانوس دریایی', 'role': 'assistant'}]


# 4- Training Part

In [None]:
# compute_dtype = getattr(torch, "float16")
# quan_config = BitsAndBytesConfig(
#         load_in_4bit=True,
#         bnb_4bit_quant_type='nf4',
#         bnb_4bit_compute_dtype=compute_dtype,
#         bnb_4bit_use_double_quant=False,
#     )

In [None]:
# Base checkpoint to fine-tune. Other candidates are kept for reference:
# modelpath_phi = "microsoft/Phi-3.5-mini-instruct"
# modelpath_llama_3 = "meta-llama/Llama-3.2-3B"
modelpath_llama_1 = "meta-llama/Llama-3.2-1B"

# An empty-string key maps the whole model onto device index 0.
device = {"": 0}
model = AutoModelForCausalLM.from_pretrained(
    modelpath_llama_1,
    torch_dtype = torch.bfloat16,
    device_map=device,
    use_cache = False,  # the KV cache is only useful for generation, not training
    token=token,
    # attn_implementation="flash_attention_2", # For A100 GPU or Ampere GPU
)

tokenizer = AutoTokenizer.from_pretrained(modelpath_llama_1, use_fast = False, token=token)

# Install the chat template and its special tokens on both tokenizer and model.
model, tokenizer = setup_chat_format(model, tokenizer)

# Fallback in case no pad token was configured above. The token has to be
# registered as a real vocabulary entry (and the embedding matrix grown to
# match); assigning the bare string would leave it without a valid id. This
# mirrors how the pad token is added in the other cells of this notebook.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    model.resize_token_embeddings(len(tokenizer))



The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [None]:
# The pad and EOS tokens should be distinct, but for some models (Llama among
# them) they end up being the same token. Print both to see where we stand.
for label, token_text, token_id in (
    ("Pad", tokenizer.pad_token, tokenizer.pad_token_id),
    ("Eos", tokenizer.eos_token, tokenizer.eos_token_id),
):
    print(f"{label} token: {token_text}")
    print(f"{label} token ID: {token_id}")



In [6]:
# Give the tokenizer a dedicated pad token when it has none or when it shares
# the EOS token (padding with EOS would make "end of answer" and "padding"
# indistinguishable to the loss mask and to generation).
if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
    tokenizer.add_special_tokens({"pad_token": "<|im_pad|>"})
    tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids("<|im_pad|>")
    # The vocabulary grew by one entry, so the embedding matrix must grow too.
    model.resize_token_embeddings(len(tokenizer))

# Keep the model's own notion of the pad id in sync with the tokenizer; without
# this the model configuration keeps pointing at the previous pad id. Done
# outside the `if` so that re-running the cell still leaves everything aligned.
model.config.pad_token_id = tokenizer.pad_token_id
if getattr(model, "generation_config", None) is not None:
    model.generation_config.pad_token_id = tokenizer.pad_token_id

print(f"Pad token: {tokenizer.pad_token}")
print(f"Pad token ID: {tokenizer.pad_token_id}")
print(f"Eos token: {tokenizer.eos_token}")
print(f"Eos token ID: {tokenizer.eos_token_id}")

Pad token: <|im_pad|>
Pad token ID: 128258
Eos token: <|im_end|>
Eos token ID: 128257


In [None]:
# Pad two sentences of different length into one batch and inspect the result
# to confirm that the pad token is actually used.
input_text = ["hello how are you", "Hello Thanks"]
encoded = tokenizer(input_text, padding=True, return_tensors="pt")

padded_ids = encoded["input_ids"]
padded_mask = encoded["attention_mask"]

print(f"Input IDs: {padded_ids}")
print(f"Attention Mask: {padded_mask}")
print(f"Input IDs shape: {padded_ids.shape}")

# Token strings of the last sentence in the batch.
print(tokenizer.convert_ids_to_tokens(padded_ids[-1]))

In [None]:
# Check the chat template on a tiny hand-written prompt/completion dataset.
# The toy data lives under its own name: rebinding `dataset` here would replace
# the Hugging Face dataset loaded in section 3 with a plain dict and break any
# later re-run of the split cell.
sample_conversations = {
    "prompt": [[{"role": "user", "content": "What color is the sky?"}],
               [{"role": "user", "content": "Where is the sun?"}]],
    "completion": [[{"role": "assistant", "content": "It is blue."}],
                   [{"role": "assistant", "content": "In the sky."}]]
}

dataset_sample = Dataset.from_dict(sample_conversations)
dataset_sample = dataset_sample.map(apply_chat_template, fn_kwargs={"tokenizer": tokenizer})

# Display the rendered prompt and completion strings.
dataset_sample["prompt"], dataset_sample["completion"]

In [None]:
# Render the first training row through the tokenizer's chat template to see
# the text produced for a real example.
example = dataset_dict["train"][0]
apply_chat_template(example, tokenizer=tokenizer)

In [None]:
# Sanity-check the completion-only collator on one real conversation: after
# collation, only assistant tokens should carry labels.

# Build the collator here, with the same templates the training cell uses, so
# this cell does not depend on a name that is never defined elsewhere.
data_collator = DataCollatorForCompletionOnlyLM(
    instruction_template="<|im_start|>user",
    response_template="<|im_start|>assistant",
    tokenizer=tokenizer,
    mlm=False,
)

# Get a single sample from the dataset
sample_data = dataset_dict["train"]["messages"][0]

# Per-message format used to render the conversation as one string
instruction_template = "<|im_start|>{role}\n{content}<|im_end|>\n"

# Format the sequence using the sample
formatted_sequence = "".join(
    instruction_template.format(role=msg["role"], content=msg["content"]) for msg in sample_data
)

# Tokenize the single sample as plain Python lists. The collator does the
# padding and tensor conversion itself; asking for `padding="max_length"` and
# `return_tensors="pt"` here would pad to the tokenizer's full length limit and
# add a batch dimension the collator does not expect from a single example.
tokenized_sample = tokenizer(
    formatted_sequence,
    truncation=True,
    max_length=min(tokenizer.model_max_length, 1024),
)

# One example = one dict of 1-D id lists
tokenized_batch = {
    "input_ids": tokenized_sample["input_ids"],
    "attention_mask": tokenized_sample["attention_mask"]
}

# Pass a batch of one example to the collator
collated_batch = data_collator([tokenized_batch])

# Extract collated data
input_ids = collated_batch["input_ids"]
labels = collated_batch["labels"]
attention_mask = collated_batch["attention_mask"]

# Decode and print results for debugging. Masked label positions hold -100,
# which is not a valid token id, so swap them for the pad id before decoding.
decoded_inputs = tokenizer.batch_decode(input_ids, skip_special_tokens=False)
printable_labels = labels.masked_fill(labels == -100, tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(printable_labels, skip_special_tokens=False)

print("Input IDs:", input_ids)
print("Decoded Inputs:", decoded_inputs)
print("Labels:", labels)
print("Decoded Labels:", decoded_labels)
print("Attention Mask:", attention_mask)


In [7]:
# Sequence-length cap: 1024 tokens, or the tokenizer's own limit when that is
# smaller.
max_seq_length = min(tokenizer.model_max_length, 1024)
print(max_seq_length)

1024


In [None]:
# parallelism training
def create_deepspeed_config(file_path="ds_config.json"):
    """Write a DeepSpeed configuration to ``file_path`` as indented JSON.

    The configuration requests ZeRO stage 3 with parameters and optimizer
    state offloaded to pinned CPU memory, fp16, a micro batch size of 2 with
    4 gradient-accumulation steps, plus "pipeline" and "tensor_parallel"
    sections.

    Args:
        file_path: Destination of the JSON file; overwritten if it exists.

    Returns:
        None. A confirmation line is printed once the file is written.
    """
    # ZeRO-3 section: both offload targets use pinned CPU memory.
    zero_section = {
        "stage": 3,
        "offload_param": {"device": "cpu", "pin_memory": True},
        "offload_optimizer": {"device": "cpu", "pin_memory": True},
        "overlap_comm": True,
        "contiguous_gradients": True,
    }

    # NOTE(review): fp16 is enabled here while the model-loading cell requests
    # torch.bfloat16, and the batch/accumulation values are hard-coded rather
    # than taken from TrainingArguments — confirm before enabling DeepSpeed.
    # NOTE(review): verify that DeepSpeed honours the "pipeline" and
    # "tensor_parallel" sections in this form, and together with ZeRO stage 3.
    config = {
        "zero_optimization": zero_section,
        "gradient_accumulation_steps": 4,
        "train_micro_batch_size_per_gpu": 2,
        "fp16": {"enabled": True},
        "pipeline": {"enabled": True, "partitions": 2},
        "tensor_parallel": {"enabled": True, "size": 2},
    }

    # Serialize first, then write the finished text in one go.
    serialized = json.dumps(config, indent=4)
    with open(file_path, "w") as json_file:
        json_file.write(serialized)

    print(f"DeepSpeed configuration saved to {file_path}")

# Write the DeepSpeed config to the function's default path, "ds_config.json"
# (relative to the current working directory).
create_deepspeed_config()


In [8]:
from transformers import TrainingArguments

# GaLore hyperparameters: projection rank, number of steps between projection
# updates, and the scale factor passed to the optimizer.
rank = 1024
update_proj_gap = 200
scale = 2

training_arguments = TrainingArguments(
    # --- output & logging ---
    output_dir="/kaggle/working/slm_trian",
    logging_dir="/kaggle/working/logs",
    report_to="tensorboard",
    logging_steps=1,
    save_steps=8000,
    # --- evaluation ---
    do_eval=True,
    eval_strategy="steps",
    eval_steps=500,
    per_device_eval_batch_size=2,
    label_names=["labels"],
    # --- schedule & batch size ---
    num_train_epochs=3,
    per_device_train_batch_size=2,
    learning_rate=1e-5,
    lr_scheduler_type="linear",
    warmup_steps=1,
    weight_decay=0.0001,
    # --- memory ---
    gradient_checkpointing=True,
    # --- optimizer: layer-wise 8-bit GaLore AdamW on attention and MLP modules ---
    optim="galore_adamw_8bit_layerwise",
    optim_target_modules=["attn", "mlp"],
    optim_args=", ".join(
        (f"rank={rank}", f"update_proj_gap={update_proj_gap}", f"scale={scale}")
    ),
    # --- options left disabled on purpose ---
    # gradient_accumulation_steps=4,  # can't be used for GaLore
    # deepspeed="/kaggle/working/ds_config.json",  # can't be used for GaLore
    # neftune_noise_alpha=5,  # still to be investigated
    # fp16=True,
    # gradient_checkpointing_kwargs={'use_reentrant': False},  # still to be investigated
)

In [None]:
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

# Collator that locates user and assistant turns by their chat-template
# markers, so that the loss is computed on completions only.
completion_only_collator = DataCollatorForCompletionOnlyLM(
    instruction_template="<|im_start|>user",
    response_template="<|im_start|>assistant",
    tokenizer=tokenizer,
    mlm=False,
)

# Supervised fine-tuning on the train split, evaluated on the validation split.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["valid"],
    data_collator=completion_only_collator,
    # dataset_kwargs = dict(add_special_tokens = False),
    # dataset_text_field="message"
)

trainer.train()

  trainer = SFTTrainer(
Activated GaLoRE fine-tuning, depending on your model size and hardware, the training might take a while before starting. Please be patient !
model.layers.0.self_attn has been matched but ignored as GaLore only supports linear layers. Please double check your `optim_target_modules`!
model.layers.0.self_attn.rotary_emb has been matched but ignored as GaLore only supports linear layers. Please double check your `optim_target_modules`!
model.layers.0.mlp has been matched but ignored as GaLore only supports linear layers. Please double check your `optim_target_modules`!
model.layers.0.mlp.act_fn has been matched but ignored as GaLore only supports linear layers. Please double check your `optim_target_modules`!
model.layers.1.self_attn has been matched but ignored as GaLore only supports linear layers. Please double check your `optim_target_modules`!
model.layers.1.self_attn.rotary_emb has been matched but ignored as GaLore only supports linear layers. Please double 

In [None]:
# Persist the fine-tuned weights and the extended tokenizer side by side so
# they can be reloaded together.
save_dir = "/kaggle/working"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)