In [1]:
###############################################################################
#################################################################################
####################################################################################
#######################################################################################

In [2]:
"""This notebook is designed to train and evaluate an AI model based on the Llama 3.2 architecture, fine-tuned
for chat and text generation about potato varieties and their descriptions. Llama 3 is trained for causal
language-model generation, while the MPNet sentence transformer will be fine-tuned for scoring, i.e. sequence classification."""
# Install dependencies
# NOTE(review): only trl is pinned (0.12.2); every other package is upgraded to whatever
# is newest at run time, so a rerun may resolve different versions than the recorded run.
!pip install -U transformers datasets accelerate peft bitsandbytes wandb flash-attn
!pip install trl==0.12.2

import os
import logging
import torch
import pandas as pd
import bitsandbytes as bnb
#from bitsandbytes import BitsAndBytesConfig
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, PeftModel
from trl import SFTTrainer
from huggingface_hub import login
import wandb
#print(trl.__version__)

# Setup logging
# INFO level so the progress messages logged further down this cell are shown.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hugging Face and Weights & Biases login
# Both credentials are fetched through the Kaggle secrets client instead of being
# written into the notebook.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

# Secret stored under the label "HUGGINGFACEHUB_API_TOKEN" is used for the Hub login.
hf_token = user_secrets.get_secret("HUGGINGFACEHUB_API_TOKEN")
login(token=hf_token)

# Secret stored under the label "wandb" is used as the Weights & Biases API key.
wb_token = user_secrets.get_secret("wandb")
wandb.login(key=wb_token)

# Initialize WandB
# `run` is the handle of the tracking run opened here; wandb.finish() is called at the
# end of this cell.
run = wandb.init(
    project="Llama Fine-Tune Potato Wizard",
    job_type="training",
    anonymous="allow"
)

# Source CSV locations inside the attached Kaggle input dataset.
ReportSDGDatapath = '/kaggle/input/futproofagricaireportsvssdgtargets/curated_agric_dataset_v5.csv'
InputResponseDatapath = '/kaggle/input/futproofagricaireportsvssdgtargets/QA_data.csv'

# Read both CSV files; a failure is logged and re-raised so the cell stops here.
try:
    ReportSDGData = pd.read_csv(ReportSDGDatapath)
    InputResponseData = pd.read_csv(InputResponseDatapath)
except Exception as e:
    logger.error(f"Error loading datasets: {e}")
    raise
else:
    logger.info("Datasets loaded successfully.")

# Bring the Q&A frame onto the same two-column schema as the report frame:
# 'input' -> 'instructions', 'response' -> 'premise'.
InputResponseData = InputResponseData.rename(
    columns={'response': 'premise', 'input': 'instructions'}
)[['instructions', 'premise']]
ReportSDGData = ReportSDGData[['instructions', 'premise']]

# Full-data alternative, to be enabled once resources allow training on everything:
#   ReportSDGDataCombined = pd.concat([ReportSDGData, InputResponseData], axis=0)
# Until then, train on the report frame with the index labels in these half-open
# [start, stop) ranges removed.
_excluded_ranges = [
    (5394, 9284),
    (9662, 10211),
    (10581, 10898),
    (11146, 11261),
    (12307, 13635),
]
rows_to_drop = [
    label
    for start, stop in _excluded_ranges
    for label in range(start, stop)
]
ReportSDGDataCombined = ReportSDGData.drop(rows_to_drop)

# Persist the selected rows as a JSON list of records.
json_file = "data.json"
try:
    ReportSDGDataCombined.to_json(json_file, orient="records", indent=4)
except Exception as e:
    logger.error(f"Error saving dataset: {e}")
    raise
else:
    logger.info(f"Data saved to {json_file}.")

# Re-read that JSON file through `datasets` to obtain the training Dataset.
try:
    dataset = load_dataset("json", data_files=json_file, split="train")
except Exception as e:
    logger.error(f"Error loading dataset into Hugging Face format: {e}")
    raise
else:
    logger.info("Dataset loaded successfully into Hugging Face format.")

# 4-bit NF4 quantisation settings; float16 is used as the compute dtype.
compute_dtype = torch.float16

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

# Load model and tokenizer from the local Kaggle model directory.
base_model = "/kaggle/input/llama-3.2/transformers/1b-instruct/1"
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0},  # map the whole model onto device 0
    trust_remote_code=True
)

# Disable the generation KV cache for training. The attribute name must be spelled
# exactly `use_cache`: a misspelt attribute is accepted silently and changes nothing.
model.config.use_cache = False
model.config.pretraining_tp = 1

# Ensure tokenizer has a padding token; when one has to be borrowed from EOS,
# also pad on the right.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

# Format the dataset for training
# System prompt: format_chat_template (defined just below) places this text in the
# "system" message of every training example.
instruction = "You are a top-rated plant breeder and agronomy service agent named Shadrack. Give optimum potato trait combinations for potato varieties amid climate change and disease pressure. Be polite to farmers and breeders and answer all their questions."

def format_chat_template(row):
    """Add a chat-formatted ``text`` field to one dataset record.

    A three-message conversation is built (system prompt from the module-level
    ``instruction``, user message from ``row["instructions"]``, assistant message
    from ``row["premise"]``) and rendered, untokenized, with the module-level
    ``tokenizer``'s chat template.

    Args:
        row: Mapping with at least the keys ``"instructions"`` and ``"premise"``.

    Returns:
        The same ``row`` object, with the rendered conversation stored under ``"text"``.
    """
    turns = (
        ("system", instruction),
        ("user", row["instructions"]),
        ("assistant", row["premise"]),
    )
    conversation = [{"role": role, "content": content} for role, content in turns]
    rendered = tokenizer.apply_chat_template(conversation, tokenize=False)
    row["text"] = rendered
    return row

# Run format_chat_template over every record; this adds the "text" column that the
# trainer is later pointed at via dataset_text_field="text".
dataset = dataset.map(format_chat_template)


# Function to debug and inspect model architecture
def debug_model_architecture(model):
    """Print one line per submodule of ``model``: its qualified name and its type.

    Args:
        model: Any object exposing ``named_modules()`` that yields
            ``(name, module)`` pairs.
    """
    listing = ((qualified_name, type(submodule))
               for qualified_name, submodule in model.named_modules())
    for entry in listing:
        print(*entry)

# Uncomment the following line to inspect the model architecture
# debug_model_architecture(model)

# LoRA-compatible modules
def find_all_linear_names(model, cls=None):
    """Collect the leaf names of all submodules of a given layer type, for LoRA targeting.

    Every ``(name, module)`` pair from ``model.named_modules()`` whose module is an
    instance of ``cls`` contributes the last dot-separated component of its name
    (e.g. ``"model.layers.0.self_attn.q_proj"`` -> ``"q_proj"``). ``"lm_head"`` is
    always excluded from the result.

    Args:
        model: Object exposing ``named_modules()``.
        cls: Layer class (or tuple of classes) to look for. Defaults to
            ``bnb.nn.Linear4bit``, i.e. the layers of a 4-bit quantized model.

    Returns:
        A sorted list of unique leaf module names (empty if nothing matches).
        Sorting keeps the result stable between runs, unlike raw set ordering.
    """
    if cls is None:
        cls = bnb.nn.Linear4bit  # This works for 4-bit quantized models

    lora_module_names = {
        name.rsplit('.', 1)[-1]
        for name, module in model.named_modules()
        if isinstance(module, cls)
    }
    lora_module_names.discard('lm_head')  # never adapt the output head
    return sorted(lora_module_names)

# Discover which layer names to attach LoRA adapters to.
modules = find_all_linear_names(model)
if not modules:
    logger.warning(
        "No compatible modules found for LoRA injection. Using default settings."
    )
    # Optionally fallback to standard layers
    modules = ["q_proj", "v_proj"
               #, "k_proj", "o_proj"
              ]

logger.info(f"LoRA modules identified: {modules}")

# Adjust LoRA configuration
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=modules,  # Dynamically identified modules
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the base model with the LoRA adapters exactly once. Calling get_peft_model a
# second time on the returned model would wrap an already wrapped model, so there
# must be no further call after this one.
try:
    model = get_peft_model(model, lora_config)
    logger.info("LoRA modules successfully injected into the model.")
except ValueError as e:
    logger.error(f"Failed to inject LoRA modules: {e}")
    raise

# Training arguments
training_args = TrainingArguments(
    output_dir="./llama-finetunes-potato-wizard_v59",
    per_device_train_batch_size=4,
    #per_device_eval_batch_size=2,
    optim="adafactor",
    gradient_accumulation_steps=8,  # 4 * 8 = 32 sequences per optimizer step per device
    learning_rate=2e-4,
    weight_decay=1e-2,
    #group_by_length = True,
    num_train_epochs=1,
    #max_steps=1,
    logging_dir="./logs",
    save_strategy="epoch",
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated. No eval_steps is given; in the
    # recorded run evaluation happened every 10 steps, matching logging_steps.
    eval_strategy="steps",
    logging_steps=10,
    fp16=True,
    bf16=False,
    report_to="wandb"
)

# Split dataset into train and validation sets (80/20, fixed seed for repeatability).
# NOTE: despite the variable name, the examples are still raw text at this point.
train_validation_split = dataset.train_test_split(test_size=0.2, seed=42)
tokenized_dataset = DatasetDict({
    "train": train_validation_split["train"],
    "validation": train_validation_split["test"]
})

# Initialize trainer
# max_seq_length is set explicitly: per the original author's note, when it is omitted
# the trainer takes the limit from the tokenizer (capped at 2048, since some tokenizers
# provide no default), and sequences are handled relative to that limit.
# NOTE(review): `model` is already PEFT-wrapped before this point; confirm that TRL
# ignores `peft_config` in that case rather than wrapping again.
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    peft_config=lora_config,
    max_seq_length=512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_args
    #packing=False
)

# Train and save the model
trainer.train()

# Single source of truth for every local save below. It is the same directory that
# was passed as TrainingArguments(output_dir=...). One of the original saves used
# "/llama-finetunes-potato-wizard_v59" (no leading "."), which targets the filesystem
# root instead of the working directory.
output_dir = "./llama-finetunes-potato-wizard_v59"
os.makedirs(output_dir, exist_ok=True)

# Save model's state_dict to pytorch_model.bin explicitly
torch.save(model.state_dict(), os.path.join(output_dir, "pytorch_model.bin"))
trainer.model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
model.config.save_pretrained(output_dir)

# Push the fine-tuned model to Hugging Face Hub
repo_name = "llama-potato-wizard_v59"
trainer.model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

# Save the model in GGUF format to be used with ollama locally
#model.save_pretrained_gguf("./llama-finetuned-potato-wizard-gguf", tokenizer, quantization_method = "f16")


# End the WandB run
wandb.finish()


Collecting transformers
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting accelerate
  Downloading accelerate-1.2.1-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting wandb
  Downloading wandb-0.19.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting flash-attn
  Downloading flash_attn-2.7.2.post1.tar.gz (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m73.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mshadrack-imai[0m ([33mshadrack-imai-kenya-agricultural-and-livestock-research-[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Tracking run with wandb version 0.19.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241230_103729-6u7tkw7u[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mbrisk-eon-88[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/shadrack-imai-kenya-agricultural-and-livestock-research-/Llama%20Fine-Tune%20Potato%20Wizard[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/shadrack-imai-kenya-agricultural-and-livestock-research-/Llama%20Fine-Tune%20Potato%20

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/19564 [00:00<?, ? examples/s]


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/15651 [00:00<?, ? examples/s]

Map:   0%|          | 0/3913 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss
10,2.5449,2.008823
20,1.8755,1.86099
30,1.7525,1.795704
40,1.7261,1.719487
50,1.7472,1.686727
60,1.5962,1.671634
70,1.7224,1.653763
80,1.692,1.640997
90,1.6136,1.634123
100,1.6162,1.625907


adapter_model.safetensors:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:               eval/loss █▆▅▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:            eval/runtime ▂▁▂▁▂▃▄▅▆▅▄▄▃▅▅▅▇▆▆▆▇▆▇▇▇▇▆▆▇▆▇█▇▇█▇▆▇▇▇
[34m[1mwandb[0m: eval/samples_per_second █████▆▆▄▃▄▆▄▆▄▄▄▃▃▃▃▁▃▃▃▁▁▃▃▁▃▁▁▃▃▁▁▃▃▃▁
[34m[1mwandb[0m:   eval/steps_per_second ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:             train/epoch ▁▁▁▁▂▂▃▃▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇██
[34m[1mwandb[0m:       train/global_step ▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
[34m[1mwandb[0m:         train/grad_norm █▁▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▂▁▂▁▁▁▁▁▂▁▂▁▁▁▁▁▁
[34m[1mwandb[0m:     train/learning_rate ████▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
[34m[1mwandb[0m:              train/loss █▄▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▂▂▂▂▁▂▂▂▂▂▂▃▂▂▁▂▁▂▂
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:        