<a href="https://colab.research.google.com/github/SandyDRawat/Self_Projects/blob/main/Model_training_and_testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install unsloth evaluate transformers nltk trl bitsandbytes peft
!pip install sympy --upgrade
!pip install llama-cpp-python
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [2]:
# Import all necessary dependencies
from unsloth import FastLanguageModel
import torch
import pandas as pd
from datasets import load_dataset, Dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from llama_cpp import Llama

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [11]:
# Login to Huggingface to interact and push the models
from huggingface_hub import login
from google.colab import userdata
# Authenticate using Hugging Face token for accessing gated models and upload the model to hub
hf_token = userdata.get("hf_token")
login(token=hf_token)

In [12]:
# Load the pre-trained base model using the FastLanguageModel as was mentioned in unsloth nnotebook
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-bnb-4bit",
    max_seq_length = 512,
    dtype = None,          # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit = True,   # Use 4bit quantization to reduce memory usage. Can be False.
)

==((====))==  Unsloth 2024.12.11: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [13]:
# Configure and apply PEFT (Parameter-Efficient Fine-Tuning) to the model
model = FastLanguageModel.get_peft_model(
    model,
    r = 32, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 12
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
                      # only unfreezes these parameters for training, can also add more based on your model
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [14]:
# Defined a input prompt for chinese translation purpose
translation_prompt = """Below is a Chinese text that needs to be translated into English.

### Chinese Text:
{}

### English Translation:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
# this function takes dataset with chinese and english columns, creates a translation prompt then saves them to text column
    chinese = examples["Chinese"]
    english = examples["English"]
    texts = []
    wtexts = []
    for chinese, english in zip(chinese, english):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = translation_prompt.format(chinese, english) + EOS_TOKEN
        wtext = translation_prompt.format(chinese, "")# Use translation_prompt
        texts.append(text)
        wtexts.append(wtext)
    return { "text" : texts,
            "wtext" : wtexts}


In [15]:
from sklearn.model_selection import train_test_split

# Load the custom dataset
dataset = pd.read_csv("/content/translations_dataset.csv")

# Sample 45000 rows for processing (adjust as needed for system capacity)
dataset = dataset.sample(n=45000, random_state=42)

# Split the dataset into training and evaluation datasets (80-20 split)
train_df, eval_df = train_test_split(dataset, test_size=0.01, random_state=42)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

# Apply formatting function to both datasets
train_dataset = train_dataset.map(formatting_prompts_func, batched=True)
eval_dataset = eval_dataset.map(formatting_prompts_func, batched=True)

# Output dataset sizes
print(f"Training dataset size: {len(train_dataset)}")
print(f"Evaluation dataset size: {len(eval_dataset)}")


Map:   0%|          | 0/44550 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Training dataset size: 44550
Evaluation dataset size: 450


In [8]:
train_dataset

Dataset({
    features: ['Chinese', 'English', '__index_level_0__', 'text', 'wtext'],
    num_rows: 44550
})

In [16]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=512,
    dataset_num_proc = 4,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 40,
        gradient_accumulation_steps = 3,
        warmup_steps = 5,
        #num_train_epochs = 2, # Set this for 1 full training run.
        max_steps = 170,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 10,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Map (num_proc=4):   0%|          | 0/44550 [00:00<?, ? examples/s]

In [17]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 44,550 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 40 | Gradient Accumulation steps = 3
\        /    Total batch size = 120 | Total steps = 170
 "-____-"     Number of trainable parameters = 48,627,712


Step,Training Loss
10,2.8138
20,2.325
30,2.1711
40,2.0639
50,1.9873
60,1.9529
70,1.9173
80,1.8953
90,1.8984
100,1.8517


In [None]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    translation_prompt.format(
        "尤其，这是孟浩在第七命下展开的七婴图腾，威力之强，轰天撼地。", # chinese
        "", # english - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 512, use_cache = True)
tokenizer.batch_decode(outputs)

To save the final model as LoRA adapters, either use Huggingface's push_to_hub for an online save or save_pretrained for a local save.

[NOTE] This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [18]:
# Save locally and on hub as GGUF (8-bit Q8_0 quantization)
# change the model as the correct version
model.save_pretrained("llama3.1_zhtoen_translation_v4") # Local saving
tokenizer.save_pretrained("llama3.1_zhtoen_translation_v4")

model.push_to_hub("Rawsand/llama3.1_zhtoen_translation_v4") # Online saving
tokenizer.push_to_hub("Rawsand/llama3.1_zhtoen_translation_v4") # Online saving

README.md:   0%|          | 0.00/578 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/195M [00:00<?, ?B/s]

Saved model to https://huggingface.co/Rawsand/llama3.1_zhtoen_translation_v4


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [None]:
# Save locally and on hub as 16bit
model.save_pretrained_merged("model_name", tokenizer, save_method = "merged_16bit",)
model.push_to_hub_merged("username/model_name", tokenizer, save_method = "merged_16bit", token = "")

In [19]:
# Save locally and on hub as GGUF (8-bit Q8_0 quantization)
model.save_pretrained_gguf("llama3.1_zhtoen_translation_v4_gguf", tokenizer)
model.push_to_hub_gguf("Rawsand/llama3.1_zhtoen_translation_v4_gguf", tokenizer)

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 2.2G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 1.24 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 28/28 [00:01<00:00, 17.90it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving llama3.1_zhtoen_translation_v4_gguf/pytorch_model-00001-of-00002.bin...
Unsloth: Saving llama3.1_zhtoen_translation_v4_gguf/pytorch_model-00002-of-00002.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q8_0'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at llama3.1_zhtoen_translation_v4_gguf into q8_0 GGUF format.
The output location will be /content/llama3.1_zhtoen_translation_v4_gguf/unsloth.Q8_0.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: llama3.1_zhtoen_translation_v4_gguf
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.

100%|██████████| 28/28 [00:01<00:00, 19.89it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving Rawsand/llama3.1_zhtoen_translation_v4_gguf/pytorch_model-00001-of-00002.bin...
Unsloth: Saving Rawsand/llama3.1_zhtoen_translation_v4_gguf/pytorch_model-00002-of-00002.bin...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q8_0'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: [1] Converting model at Rawsand/llama3.1_zhtoen_translation_v4_gguf into q8_0 GGUF format.
The output location will be /content/Rawsand/llama3.1_zhtoen_translation_v4_gguf/unsloth.Q8_0.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: llama3.1_zhtoen_translation_v4_gguf
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only

  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q8_0.gguf:   0%|          | 0.00/3.42G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/Rawsand/llama3.1_zhtoen_translation_v4_gguf


In [20]:
# If did not train the model you can just call the previous version

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Rawsand/llama3.1_zhtoen_translation_v4",
    max_seq_length = 512,
    dtype = None,
    load_in_4bit = True,
)
model = FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2024.12.11: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/195M [00:00<?, ?B/s]

In [None]:
# Defined a input prompt for chinese translation purpose
translation_prompt = """Below is a Chinese text that needs to be translated into English.

### Chinese Text:
{}

### English Translation:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
# this function takes dataset with chinese and english columns, creates a translation prompt then saves them to text column
    chinese = examples["Chinese"]
    english = examples["English"]
    texts = []
    wtexts = []
    for chinese, english in zip(chinese, english):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = translation_prompt.format(chinese, english) + EOS_TOKEN
        wtext = translation_prompt.format(chinese, "")# Use translation_prompt
        texts.append(text)
        wtexts.append(wtext)
    return { "text" : texts,
            "wtext" : wtexts}


In [None]:
from sklearn.model_selection import train_test_split

# Load the custom dataset
dataset = pd.read_csv("/content/translations_dataset.csv")

# Sample 45000 rows for processing (adjust as needed for system capacity)
dataset = dataset.sample(n=45000, random_state=42)

# Split the dataset into training and evaluation datasets (80-20 split)
train_df, eval_df = train_test_split(dataset, test_size=0.01, random_state=42)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

# Apply formatting function to both datasets
train_dataset = train_dataset.map(formatting_prompts_func, batched=True)
eval_dataset = eval_dataset.map(formatting_prompts_func, batched=True)

# Output dataset sizes
print(f"Training dataset size: {len(train_dataset)}")
print(f"Evaluation dataset size: {len(eval_dataset)}")


In [21]:
# Assuming `validation_dataset` is your Hugging Face Dataset object
inputs = eval_dataset["wtext"]  # Get the Chinese texts
references = eval_dataset["text"]  # Get the English references

# Generate predictions
predictions = []
for text in inputs:
    inputs = tokenizer(text, return_tensors="pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens = 512, use_cache = True)

    prediction = tokenizer.batch_decode(outputs)
    predictions.append(prediction)

In [22]:
predictions

[['<|begin_of_text|>Below is a Chinese text that needs to be translated into English.\n\n### Chinese Text:\n孟浩在远处，可那中心的高台对他而言也是清晰入目，在看到那风情万种的女子时，孟浩愣了一下，神色露出古怪，这女子他记得，正是当年天河坊的百珍阁内，那对他**，叫做巧玲的女子。\n\n### English Translation:\nMeng Hao was far away, but the platform in the center of the square was clearly visible to him. When he saw the woman, he was stunned. Her expression was one of strangeness. She was the same woman who had been in the Hundred Treasures Pavilion in the Heavenly River Market, the woman who had been his lover, named Qiao Ling.<|end_of_text|>'],
 ['<|begin_of_text|>Below is a Chinese text that needs to be translated into English.\n\n### Chinese Text:\n这尸体……正是送给了孟浩仙人指路的丑门台！\n\n### English Translation:\nThis corpse… was none other than the ugly doorkeeper who had led Meng Hao to the Immortal’s Mansion!<|end_of_text|>'],
 ['<|begin_of_text|>Below is a Chinese text that needs to be translated into English.\n\n### Chinese Text:\n这一声，震动南天星，传出星空中，让星空外与孟浩的父亲厮杀试图要冲来的所有人，全部听到。\n\n#

In [23]:
import nltk
from nltk.translate.bleu_score import sentence_bleu
nltk.download('punkt_tab')

bleu_scores = []
for ref, pred in zip(references, predictions):
    # Tokenize the reference and prediction
    reference_tokens = nltk.word_tokenize(ref)  # Tokenize the reference
    prediction_tokens = nltk.word_tokenize(pred[0])  # Tokenize the prediction

    bleu_score = sentence_bleu([reference_tokens], prediction_tokens)
    bleu_scores.append(bleu_score)

# Print results
    for ref, pred, bleu in zip(references, predictions, bleu_scores):
        print(f"BLEU Score: {bleu:.4f}")

   # Calculate average BLEU score
    average_bleu = sum(bleu_scores) / len(bleu_scores)
    print(f"Average BLEU Score: {average_bleu:.4f}")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
BLEU Score: 0.4076
BLEU Score: 0.6253
BLEU Score: 0.5424
BLEU Score: 0.5833
BLEU Score: 0.4728
BLEU Score: 0.1113
BLEU Score: 0.3172
BLEU Score: 0.4678
BLEU Score: 0.5023
BLEU Score: 0.3888
BLEU Score: 0.5372
BLEU Score: 0.4493
BLEU Score: 0.4103
BLEU Score: 0.6430
BLEU Score: 0.4986
BLEU Score: 0.7291
BLEU Score: 0.3769
BLEU Score: 0.6584
BLEU Score: 0.5330
BLEU Score: 0.5210
BLEU Score: 0.3455
BLEU Score: 0.8174
BLEU Score: 0.5106
BLEU Score: 0.7185
BLEU Score: 0.4651
BLEU Score: 0.5156
BLEU Score: 0.5153
BLEU Score: 0.3397
BLEU Score: 0.4996
BLEU Score: 0.3655
BLEU Score: 0.4843
BLEU Score: 0.4686
BLEU Score: 0.4514
BLEU Score: 0.4663
BLEU Score: 0.7027
BLEU Score: 0.5200
BLEU Score: 0.4930
BLEU Score: 0.4247
BLEU Score: 0.7583
BLEU Score: 0.5332
BLEU Score: 0.6648
BLEU Score: 0.7491
BLEU Score: 0.5172
BLEU Score: 0.3873
BLEU Score: 0.5512
BLEU Score: 0.4378
BLEU Score: 0.6180
BLEU Score: 0.4465
BLEU Score: 0.4343
BLEU

In [None]:
llm = Llama.from_pretrained(
	repo_id="Rawsand/llama3.1_zhtoen_translation_v4_gguf",
	filename="unsloth.Q8_0.gguf",
)

In [None]:
texts = ["你的模型可以翻译中文吗？", "在这面具出现的刹那，忽然的，皮冻那里猛地睁大了眼，拍打着翅膀飞起，绕着孟浩飞了几圈，口中传出聒噪之声。"]

In [None]:
translation_prompt = """Below is a Chinese text that needs to be translated into English.

### Chinese Text:
{}

### English Translation:
{}"""

def formatting_prompts_func_without_english(chinese_texts):
    texts = []
    for chinese_text in chinese_texts:
        text = translation_prompt.format(chinese_text, "")
        texts.append(text)
    return texts

In [None]:
prompt = formatting_prompts_func_without_english(texts)

In [None]:
output = llm(
	prompt[0],
	max_tokens=512,
	echo=True
)

In [None]:
output['choices'][0]['text']