**Installing the necessary modules and libraries**

In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
!pip install rouge-score

In [None]:
import ast
import gc
import os
import warnings

import pandas as pd
import torch
from datasets import Dataset
from peft import LoraConfig, PeftModel
from rouge_score import rouge_scorer
from sklearn.metrics import f1_score
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from trl import SFTTrainer
# Blanket filter: every Python warning is suppressed from this point on.
warnings.filterwarnings("ignore")
# Load the entire model on the GPU 0 (this mapping is passed as `device_map`
# wherever a model is loaded later in the notebook).
device_map = {"": 0}

In [None]:
# Log in to the Hugging Face Hub. Paste your own access token when prompted;
# the account must already have been granted access to the gated model.
# NOTE(review): the original hint says to select "n" -- presumably the answer to
# the CLI's follow-up prompt about storing the token as a git credential; confirm.
!huggingface-cli login

*Browse the models available on the Hugging Face Hub: https://huggingface.co/models*

In [None]:
# The model that you want to train from the Hugging Face hub.
# The repository id must match the Hub exactly; the gated Llama 3 8B instruct
# checkpoint is published as "Meta-Llama-3-8B-Instruct" under the meta-llama org.
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
# Fine-tuned model name (local directory the trained adapter is saved to)
new_model = "Llama-3-8b-Instruct-finetune"

In [None]:
# Number of leading rows used for fine-tuning; raise it for a fuller run.
N_TRAIN_SAMPLES = 100

# Load the training JSON and round-trip it through CSV.
# NOTE(review): the round trip presumably exists so that non-string columns such
# as `answers` come back as text and can be concatenated with `+` below -- confirm.
# It also adds the 'Unnamed: 0' index column to the frame.
df = pd.read_json("test.json")
df.to_csv("test.csv")
df = pd.read_csv('test.csv')
# .copy() so the column assignment below writes to an independent frame
df = df.head(N_TRAIN_SAMPLES).copy()

# Format the dataset as one prompt/answer string per row.
# Lower-case <s> ... </s> matches the markers used by the generation prompts
# later in this notebook (the original upper-case <S>/</S> did not).
# NOTE(review): [INST]-style markers are the Llama-2 prompt convention; confirm
# they suit the instruct model configured above.
df['text'] = '<s>' + "[INST]" + df['question'] + df['context'] + '[/INST]' + df['answers'] + '</s>'

# Keep only the formatted text column for training; `df` itself keeps every
# column so later cells can still read `answers`.
dtf = df[['text']]

# Convert to Hugging Face Datasets format
train = Dataset.from_pandas(dtf)
dataset = train  # Use the Hugging Face Dataset object here

In [None]:
# Load the dev split: dump the JSON to dev.csv, then work from the CSV copy
# (which also carries the index column written by to_csv).
dev_json = pd.read_json("dev.json")
dev_json.to_csv("dev.csv")
test = pd.read_csv("dev.csv")

# Prompt text is the question immediately followed by its context; all original
# columns are kept alongside the new `text` column.
test = test.assign(text=test["question"] + test["context"])

In [None]:
# QLoRA quantization settings: 4-bit NF4 weights, float16 compute dtype,
# double quantization disabled.
compute_dtype = torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=False,
)

# When computing in float16, tell the user if the GPU could use bfloat16 instead
# (reported when the device's major compute capability is 8 or higher).
if compute_dtype == torch.float16:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        banner = "=" * 80
        print(
            banner,
            "Your GPU supports bfloat16: accelerate training with bf16=True",
            banner,
            sep="\n",
        )

# Base model, loaded with the quantization config and placed according to device_map
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map=device_map, quantization_config=bnb_config
)
# Disable the generation KV cache on the config and set tensor-parallel degree to 1
model.config.pretraining_tp = 1
model.config.use_cache = False

# Tokenizer for the same checkpoint; the EOS token doubles as the padding token
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training
tokenizer.pad_token = tokenizer.eos_token

# LoRA adapter hyper-parameters for causal-LM fine-tuning
lora_settings = dict(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
peft_config = LoraConfig(**lora_settings)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,  # the author noted 4 as an alternative
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    # Mixed precision is left off for both half-precision formats
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,  # -1: let num_train_epochs decide the run length
    warmup_ratio=0.03,
    # Boolean option: pass an explicit False instead of None
    group_by_length=False,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
)



In [None]:
# Set supervised fine-tuning parameters.
# `test` is a pandas DataFrame, whereas the trainer is given a Hugging Face
# Dataset for training; convert the evaluation split to the same type, keeping
# only the column named by dataset_text_field.
eval_dataset = Dataset.from_pandas(test[["text"]], preserve_index=False)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)

In [None]:
# Train model: run the fine-tuning loop on the `trainer` object built earlier
# in the notebook.
trainer.train()

In [None]:
# Save trained model to the path held in `new_model`.
# NOTE(review): because the trainer was built with a peft_config, this presumably
# writes only the LoRA adapter weights rather than the full model -- confirm.
trainer.model.save_pretrained(new_model)

In [None]:
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our new model on one dev example
ques = test['text'].iloc[9]
prompt = ques
task = 'You are a helpful, respectful and honest assistant.  Always give factually coherent answers. Keep your answers to be brief within 3 sentences. Do not refuse to give an answer'
# Budget the *generated* tokens with max_new_tokens: max_length also counts the
# prompt, so a long question + context could leave no room for the answer.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)
result = pipe(f"<s> {task} [INST] {prompt} [/INST]")
# The pipeline returns a list of dicts; the text holds the prompt plus the continuation
generated_text = result[0]['generated_text']

In [None]:
# Keep only what follows the final [/INST] marker (the whole text if it is absent)
clean_text = generated_text.rsplit('[/INST]', 1)[-1].strip()
print(clean_text)

In [None]:
# Empty VRAM: drop the Python references that keep the GPU tensors alive ...
del model
del pipe
del trainer
# ... collect the now-unreachable objects, then hand PyTorch's cached CUDA
# blocks back to the driver (gc.collect() alone leaves them reserved).
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Reload the base checkpoint in FP16, attach the saved LoRA adapter, and fold
# the adapter weights into the base model.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map=device_map,
    low_cpu_mem_usage=True,
    return_dict=True,
)
model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

# Fresh tokenizer with the same padding setup as during training
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Generate an answer for one dev example with the merged model
ques = test['text'].iloc[99]
prompt = ques
task = 'You are a helpful, respectful and honest assistant.  Always give factually coherent answers. Keep your answers to be brief within 3 sentences. Do not refuse to give an answer'
# Budget the *generated* tokens with max_new_tokens: max_length also counts the
# prompt, so a long question + context could leave no room for the answer.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)
result = pipe(f"<s> {task} [INST] {prompt} [/INST]")
# The pipeline returns a list of dicts; the text holds the prompt plus the continuation
generated_text = result[0]['generated_text']

In [None]:
# Keep only what follows the final [/INST] marker (the whole text if it is absent)
clean_text = generated_text.rsplit('[/INST]', 1)[-1].strip()
print(clean_text)

In [None]:
# Evaluate on the first 10 dev examples. `test` still holds the dev frame, whose
# `text` column is the bare prompt (question + context). The training frame `df`
# must not be used here: its `text` already ends with "[/INST]" + the gold
# answer, which would leak the answer into every evaluation prompt.
test = test.head(10)

In [None]:
def _reference_answers(raw_answer):
    """Return the reference answer strings encoded in one `answers` cell.

    The cell may be a plain string, a stringified Python list such as
    "['a', 'b']" (what a CSV round trip produces), a list of strings, or a list
    of token lists (each token list is joined with spaces).
    """
    parsed = raw_answer
    if isinstance(raw_answer, str):
        try:
            parsed = ast.literal_eval(raw_answer)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal: the cell is the answer text itself.
            return [raw_answer]
    if isinstance(parsed, (list, tuple)):
        return [
            ' '.join(map(str, item)) if isinstance(item, (list, tuple)) else str(item)
            for item in parsed
        ]
    return [str(raw_answer)]


def _mean_fmeasure(per_question_scores, metric):
    """Average the F-measure of `metric` over every (question, reference) pair."""
    values = [
        score[metric].fmeasure
        for question_scores in per_question_scores
        for score in question_scores
    ]
    # Guard against an empty evaluation set instead of dividing by zero
    return sum(values) / len(values) if values else 0.0


# Initialize the model pipeline. max_new_tokens budgets only the generated
# tokens; max_length would also count the (long) question + context prompt.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)
rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Define the task
task = 'You are a helpful, respectful and honest assistant. Always give factually coherent answers. Keep your answers brief within 3 sentences. Do not refuse to give an answer.'

# One list of ROUGE results per question (one entry per reference answer)
all_rouge_scores = []

# Iterate through the dataset
for idx, (ques, ans) in enumerate(zip(test['text'], test['answers'])):
    prompt = ques
    result = pipe(f"<s> {task} [INST] {prompt} [/INST]")
    generated_text = result[0]['generated_text']

    # Drop the echoed prompt: keep only what follows the final [/INST]
    clean_text = generated_text.split('[/INST]')[-1].strip()
    print(f"Generated Text for question {idx+1}:", clean_text)
    print(f"True Answer for question {idx+1}:", ans)

    # Score the generation against each whole reference answer. Iterating `ans`
    # directly would walk a string cell character by character.
    rouge_scores = [
        rouge_scorer_instance.score(true_answer, clean_text)
        for true_answer in _reference_answers(ans)
    ]
    all_rouge_scores.append(rouge_scores)

# Calculate and print average ROUGE scores
avg_rouge1 = _mean_fmeasure(all_rouge_scores, 'rouge1')
avg_rouge2 = _mean_fmeasure(all_rouge_scores, 'rouge2')
avg_rougeL = _mean_fmeasure(all_rouge_scores, 'rougeL')

print(f'Average ROUGE-1 F1 Score: {avg_rouge1}')
print(f'Average ROUGE-2 F1 Score: {avg_rouge2}')
print(f'Average ROUGE-L F1 Score: {avg_rougeL}')