In [None]:
from unsloth import FastLanguageModel
from transformers import AutoTokenizer
import torch

# Model configuration
max_seq_length = 2048

# Path to your locally saved model
local_model_path = "/mnt/d/Yui/models/llama2Chat7b"

# Pick the compute dtype from the hardware: float32 when no CUDA device is
# present, otherwise bfloat16 on devices whose major compute capability is 8
# or higher and float16 on older ones.
dtype = torch.float32
if torch.cuda.is_available():
    gpu_properties = torch.cuda.get_device_properties(0)
    dtype = torch.float16 if gpu_properties.major < 8 else torch.bfloat16

# Step 1: Load and modernize the tokenizer
# The tokenizer is read from `local_model_path` and written straight back to
# the same directory, so the tokenizer files stored there are rewritten.
print("Loading and modernizing tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(local_model_path)
tokenizer.save_pretrained(local_model_path)
print(f"Tokenizer saved in modern format at: {local_model_path}")

# Step 2: Load the model
# Only the model is kept from Unsloth's return value; the second element is
# discarded and the tokenizer loaded in step 1 is used by the rest of the
# notebook.
print("Loading model with Unsloth...")
unsloth_load_options = dict(
    model_name=local_model_path,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,
    local_files_only=True,
)
model, _ = FastLanguageModel.from_pretrained(**unsloth_load_options)

# Step 3: Adjust dropout probabilities
# Only override dropout fields that the loaded config actually defines.
# Assigning to a missing attribute normally just creates it instead of raising
# AttributeError, so the previous try/except reported "Dropout set" even for
# architectures without these fields and never reached its fallback message.
# NOTE(review): changing config values after the model object has been built
# may not reach layers that were already constructed - confirm the effect.
print("Adjusting dropout probabilities...")
dropout_overrides = {
    "hidden_dropout_prob": 0.1,
    "attention_probs_dropout_prob": 0.1,
}
if all(hasattr(model.config, field_name) for field_name in dropout_overrides):
    for field_name, probability in dropout_overrides.items():
        setattr(model.config, field_name, probability)
    print(f"Dropout set: Hidden={model.config.hidden_dropout_prob}, Attention={model.config.attention_probs_dropout_prob}")
else:
    print("Dropout attributes not found in model config. Check the model architecture for direct adjustments.")

# Step 4: Validate dtype
# Reports the dtype of the first parameter yielded by the model.
print(f"Model dtype: {next(model.parameters()).dtype}")


In [1]:
# Sanity check: confirm that CUDA is usable and report the memory of device 0.
import torch

cuda_ready = torch.cuda.is_available()
print("CUDA available:", cuda_ready)

# The printed figure is the device's total memory in bytes; "N/A" is shown
# when no CUDA device is available.
if cuda_ready:
    total_memory_bytes = torch.cuda.get_device_properties(0).total_memory
else:
    total_memory_bytes = "N/A"
print("Available Memory:", total_memory_bytes)


CUDA available: True
Available Memory: 12878086144


In [None]:
from datasets import load_dataset, DatasetDict

# System prompt placed at the start of every formatted conversation.
# Replace the single-space placeholder with your own system prompt.
SYSTEM_PROMPT = " "

# End-of-sequence marker, taken from the notebook's tokenizer.
EOS_TOKEN = tokenizer.eos_token
# Formatting function for multi-turn conversations
def formatting_prompts_func(examples):
    """Render every conversation of a batch as a single training string.

    Args:
        examples: A batch mapping whose 'turns' entry is a sequence of
            conversations; each conversation is a sequence of turns with
            'role' and 'content' keys.

    Returns:
        A dict {"text": [...]} holding one formatted string per conversation.
        Each string is SYSTEM_PROMPT followed by one line per turn ("Yui: ..."
        for the role "Yui", "User: ..." for every other role), then a space
        and EOS_TOKEN when the conversation has at least one turn, with
        leading/trailing whitespace stripped.
    """
    def render(conversation):
        turn_lines = []
        for turn in conversation:
            role = turn['role']
            content = turn['content']
            # Anything that is not Yui counts as user input.
            turn_lines.append(f"\nYui: {content}" if role == "Yui" else f"\nUser: {content}")
        if turn_lines:
            # The EOS marker closes the conversation after its final turn.
            turn_lines.append(f" {EOS_TOKEN}")
        return (SYSTEM_PROMPT + "".join(turn_lines)).strip()

    return {"text": [render(conversation) for conversation in examples['turns']]}

# Paths to the datasets
train_file = "path_to_your_datasets"  # training datasets
val_file = "path_to_your_datasets"  # validation datasets
test_file = "path_to_your_datasets"  # testing datasets

# Split name -> JSON file, in the order the splits are loaded and formatted.
split_files = {
    "train": train_file,
    "validation": val_file,
    "test": test_file,
}

# Load each split from its own JSON file.
raw_splits = {
    split_name: load_dataset("json", data_files={split_name: path})[split_name]
    for split_name, path in split_files.items()
}

# Add the formatted "text" column to every split and gather the results in a
# DatasetDict for easier access during training and evaluation.
dataset = DatasetDict({
    split_name: split.map(formatting_prompts_func, batched=True)
    for split_name, split in raw_splits.items()
})

# Per-split handles used by the later cells.
train_dataset = dataset["train"]
val_dataset = dataset["validation"]
test_dataset = dataset["test"]

# Print one formatted record to check that the formatting looks right.
print("Example formatted training data:")
print(dataset['train'][0])


In [None]:
# Configure supervised fine-tuning with SFTTrainer from the TRL (Transformer Reinforcement Learning) library: build the optimizer and LR scheduler, then the trainer and its TrainingArguments.
from trl import SFTTrainer
from transformers import TrainingArguments, EarlyStoppingCallback
from unsloth import is_bfloat16_supported
from transformers import AdamW
from torch.optim.lr_scheduler import StepLR

# Optimizer hyper-parameters, named once so the optimizer below and the values
# recorded in TrainingArguments cannot drift apart.
LEARNING_RATE = 5e-7   # Very small learning rate
WEIGHT_DECAY = 0.02    # Moderate weight decay to reduce overfitting

# The (optimizer, scheduler) pair is passed to the trainer via `optimizers=`.
# An explicitly supplied pair is used instead of one built from
# TrainingArguments, so the earlier learning_rate=2e-5, warmup_steps=500 and
# "reduce_lr_on_plateau" settings contradicted what actually ran. They were
# removed / aligned here; the effective optimizer and schedule are unchanged.
# To drive the schedule from TrainingArguments instead, drop `optimizers=` and
# set learning_rate / warmup_steps / lr_scheduler_type there.
# NOTE(review): the model is loaded with load_in_4bit=True and no LoRA/PEFT
# adapter is attached in the cells shown; confirm adapters are added before
# training, since fine-tuning a purely quantized model is normally rejected.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
scheduler = StepLR(optimizer, step_size=1000, gamma=0.1)  # Multiplies LR by 0.1 every 1000 scheduler steps

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    dataset_text_field="text",
    max_seq_length=128,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        fp16=not is_bfloat16_supported(),       # Use FP16 unless BF16 is available
        bf16=is_bfloat16_supported(),          # Enable BF16 for Ampere GPUs if supported
        # NOTE(review): full FP16 evaluation is requested even when training
        # runs in BF16 - confirm this is intended for the quantized model.
        fp16_full_eval=True,                   # Use FP16 for evaluation
        per_device_eval_batch_size=4,          # Smaller eval batch size for memory stability
        eval_accumulation_steps=8,             # Accumulate evaluation results over multiple batches
        eval_strategy="steps",
        eval_steps=10,                         # Evaluate every 10 steps for close monitoring
        per_device_train_batch_size=8,         # Smaller training batch size for memory stability
        gradient_accumulation_steps=25,        # Accumulate gradients for a larger effective batch size
        max_steps=5000,                        # Total number of training steps
        learning_rate=LEARNING_RATE,           # Mirrors the explicit optimizer above
        logging_steps=10,                      # Log every 10 steps
        optim="adamw_torch",
        weight_decay=WEIGHT_DECAY,             # Mirrors the explicit optimizer above
        metric_for_best_model="eval_loss",     # Track eval loss to determine best model
        max_grad_norm=1.0,                     # Clip gradients for numerical stability
        seed=3407,
        output_dir="outputs",
        gradient_checkpointing=True,
        save_steps=100,                         # Save frequently for better recovery
        load_best_model_at_end=True,
    ),
    optimizers=(optimizer, scheduler),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)]  # Stop early if no improvement
)

In [None]:
#@title Show current memory stats
# Byte counts are reported in GB using 1024**3 bytes per GB, rounded to
# three decimals.
BYTES_PER_GB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
# Peak memory reserved so far, captured as the baseline before training.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GB, 3)
# Total memory of device 0.
max_memory = round(gpu_stats.total_memory / BYTES_PER_GB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Switch off the cuDNN scaled-dot-product-attention backend before training.
torch.backends.cuda.enable_cudnn_sdp(False)

# Guarantee that the tokenizer has a padding token.
if tokenizer.pad_token is None:
    # First choice: reuse the EOS token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token
    if tokenizer.pad_token is None:
        # Still unset (no EOS token to reuse): register a dedicated padding
        # token and resize the embeddings to accommodate the new token.
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model.resize_token_embeddings(len(tokenizer))

# Run training
trainer.train()


In [None]:
from transformers import DataCollatorWithPadding, Trainer
import torch

# Collator that pads evaluation batches with the "max_length" strategy.
# It is not referenced by the other cells visible in this notebook.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")

# Define function to compute loss and ignore padding tokens in evaluation
def compute_loss_ignore_padding(predictions, labels):
    """Cross-entropy between predictions and labels, skipping padding targets.

    Args:
        predictions: Tensor of scores whose last dimension indexes the
            classes; all leading dimensions are flattened together.
        labels: Tensor of target class ids with one entry per flattened
            prediction row.

    Returns:
        The scalar loss tensor produced by ``torch.nn.CrossEntropyLoss`` with
        ``ignore_index`` set to the tokenizer's ``pad_token_id``, so targets
        equal to that id do not contribute. When the padding token is the EOS
        token, EOS targets are ignored as well.
    """
    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
    num_classes = predictions.size(-1)
    # reshape() instead of view(): view() raises on non-contiguous tensors
    # (for example sliced or transposed logits), reshape() copies when needed.
    return loss_fct(predictions.reshape(-1, num_classes), labels.reshape(-1))

# Update the evaluation code to handle multi-turn conversations
def format_and_tokenize_for_test(examples):
    """Format a batch of test conversations like the training data, then tokenize.

    The text is produced by ``formatting_prompts_func``, the same function
    that formats the train and validation splits, so speaker labels, the
    system prompt and the EOS placement match what the model was trained on.
    The previous implementation only recognised the role "AI", whereas the
    training formatter uses "Yui"; on the same dataset every turn was
    therefore labelled "User:", and the system prompt and the space before
    the EOS token were also missing.

    Args:
        examples: A batch mapping with a "turns" entry, as consumed by
            ``formatting_prompts_func``.

    Returns:
        The tokenizer output for the formatted texts, padded and truncated to
        128 tokens.
    """
    texts = formatting_prompts_func(examples)["text"]

    # Tokenize with truncation and padding for uniform input
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128)

# Format and tokenize the test split, then expose only the tensor columns
# that are handed to the trainer.
test_dataset = test_dataset.map(format_and_tokenize_for_test, batched=True)
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Evaluate with the `trainer` built earlier in the notebook.
test_results = trainer.evaluate(eval_dataset=test_dataset)

# Display test loss and other metrics
print("Test Results:", test_results)

# Report the loss on its own when the metrics include it.
test_loss = test_results.get("eval_loss")
if test_loss is None:
    print("No test loss found in the evaluation metrics.")
else:
    print(f"Test Loss (with padding handled): {test_loss:.4f}")

In [None]:
# saving model
# Every export below is opt-in: set its flag to True to run that step.
# The empty token strings are placeholders for a Hub access token.
SAVE_MERGED_16BIT = False
PUSH_MERGED_16BIT = False
SAVE_MERGED_4BIT = False
PUSH_MERGED_4BIT = False
SAVE_LORA_ONLY = False
PUSH_LORA_ONLY = False

# Merge to 16bit
if SAVE_MERGED_16BIT:
    model.save_pretrained_merged("model", tokenizer, save_method="merged_16bit")
if PUSH_MERGED_16BIT:
    model.push_to_hub_merged("hf/model", tokenizer, save_method="merged_16bit", token="")

# Merge to 4bit
if SAVE_MERGED_4BIT:
    model.save_pretrained_merged("model", tokenizer, save_method="merged_4bit")
if PUSH_MERGED_4BIT:
    model.push_to_hub_merged("hf/model", tokenizer, save_method="merged_4bit", token="")

# Just LoRA adapters
if SAVE_LORA_ONLY:
    model.save_pretrained_merged("model", tokenizer, save_method="lora")
if PUSH_LORA_ONLY:
    model.push_to_hub_merged("hf/model", tokenizer, save_method="lora", token="")

In [None]:
# saving model
# Every GGUF export below is opt-in: set its flag to True to run that step.
# The empty token strings are placeholders for a Hub access token.
SAVE_GGUF_DEFAULT = False
PUSH_GGUF_DEFAULT = False
SAVE_GGUF_F16 = False
PUSH_GGUF_F16 = False
SAVE_GGUF_Q4_K_M = False
PUSH_GGUF_Q4_K_M = False
PUSH_GGUF_Q5_K_M = False

# Save to 8bit Q8_0 (no quantization_method is passed for this pair)
if SAVE_GGUF_DEFAULT:
    model.save_pretrained_gguf("model", tokenizer)
if PUSH_GGUF_DEFAULT:
    model.push_to_hub_gguf("hf/model", tokenizer, token="")

# Save to 16bit GGUF
if SAVE_GGUF_F16:
    model.save_pretrained_gguf("model", tokenizer, quantization_method="f16")
if PUSH_GGUF_F16:
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method="f16", token="")

# Save to q4_k_m GGUF
if SAVE_GGUF_Q4_K_M:
    model.save_pretrained_gguf("model", tokenizer, quantization_method="q4_k_m")
if PUSH_GGUF_Q4_K_M:
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method="q4_k_m", token="")

# Push a q5_k_m GGUF (hub upload only; there is no local-save counterpart)
if PUSH_GGUF_Q5_K_M:
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method="q5_k_m", token="")