## TODO: Check whether the code is the same between the Falcon and Llama 2 notebooks (and try to combine them into one notebook) --> the only difference is how the weights are obtained for Llama 2, which requires a Hugging Face token


In [None]:
!pip install sentencepiece

In [None]:
!nvidia-smi

In [None]:
!pip show torch
!pip show accelerate

In [None]:
import sys
print(sys.version)

In [None]:
import json
import os 
from pprint import pprint

import bitsandbytes as bnb
import pandas as pd
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    LlamaTokenizer, 
    LlamaForCausalLM,
    BitsAndBytesConfig,
)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

## Data

In [None]:
# Load the raw question/answer JSON. The encoding is stated explicitly so that
# decoding does not depend on the platform's default text encoding.
with open("data/example_qa_dataset.json", encoding="utf-8") as json_file:  # Match this to your data path
    data = json.load(json_file)

In [None]:
# Preview the first four entries, keeping each record's original key order.
for sample_index in range(4):
    pprint(data["questions"][sample_index], sort_dicts=False)

In [None]:
# Persist only the "questions" entries as a standalone JSON file.
with open("data/dataset.json", "w") as dataset_file:
    json.dump(data["questions"], dataset_file)

In [None]:
pd.DataFrame(data["questions"]).head()

## Load OpenLLaMA Model & Tokenizer

In [None]:
# Hugging Face Hub identifier of the base OpenLLaMA checkpoint that is loaded below.
model_name = 'openlm-research/open_llama_7b'

# Local directory handed to `from_pretrained(..., offload_folder=...)` below.
# NOTE(review): presumably used for weights that `device_map="auto"` offloads to disk — confirm.
offload_folder= '../OpenLLaMA/open_llama_7b'

# tokenizer = LlamaTokenizer.from_pretrained(offload_folder)

# model = LlamaForCausalLM.from_pretrained(
#     model_name, 
#     low_cpu_mem_usage=True,
#     torch_dtype=torch.float16, 
#     device_map='auto',
#     offload_folder=offload_folder
# )


In [None]:
# model_name = "tiiuae/falcon-7b"

# Quantization settings passed to `from_pretrained`: 4-bit loading with the
# "nf4" quantization type, double quantization enabled, bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the base checkpoint named by `model_name` with the quantization config
# above, automatic device placement, and `offload_folder` as the offload directory.
model = LlamaForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    trust_remote_code=True,
    offload_folder=offload_folder,
    quantization_config=bnb_config,
)

# Tie the weights of the model
# NOTE(review): presumably ties input/output embeddings after loading — confirm it is needed here.
model.tie_weights()

# Load the matching tokenizer and reuse the EOS token as the padding token.
# NOTE(review): presumably because this tokenizer defines no pad token of its own — confirm.
tokenizer = LlamaTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def print_trainable_parameters(model):
    """
    Print the number of trainable parameters in the model.

    Iterates over ``model.named_parameters()`` and reports how many parameter
    elements have ``requires_grad`` set, the total element count, and the
    trainable share as a percentage.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs whose parameters provide ``numel()``
            and ``requires_grad``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        param_count = param.numel()
        all_param += param_count
        if param.requires_grad:
            trainable_params += param_count

    # Guard against a model without parameters so the percentage never divides by zero.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}")

In [None]:
# Turn on gradient checkpointing, then run the quantized model through PEFT's
# k-bit training preparation helper before the LoRA adapters are attached.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
print(model)

In [None]:
# LoRA adapter settings: rank-16 updates (lora_alpha=32, dropout 0.05) are
# attached to the modules named "q_proj" and "k_proj"; bias terms are not trained.
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj"],
    lora_dropout=0.05,
    bias="none",
    # Must be the PEFT task identifier "CAUSAL_LM"; the previous value
    # "CASUAL_LM" was a typo and is not a valid task type.
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
print_trainable_parameters(model)

## Inference Before Training

In [None]:
# Prompt used to probe the model before fine-tuning; it ends at the assistant
# tag so the model is asked to continue with an answer.
prompt = "\n".join(
    [
        "<human>: How can I create an account?",
        "<assistant>:",
    ]
)
print(prompt)

In [None]:
# Adjust the model's generation settings in place (`generation_config` is the
# same object as `model.generation_config`).
generation_config = model.generation_config
generation_config.max_new_tokens = 200
# temperature / top_p only take effect when sampling is enabled; without
# do_sample=True decoding stays greedy and both settings are ignored.
generation_config.do_sample = True
generation_config.temperature = 0.8
generation_config.top_p = 0.8
generation_config.num_return_sequences = 1
# Pad with, and stop on, the tokenizer's EOS token.
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [None]:
%%time
# DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE = "cuda:0"
# torch.cuda.set_device(DEVICE)

In [None]:
# model.to('cuda')
print(next(model.parameters()).device)  # This will show the device of the model

In [None]:
# Tokenize the prompt into PyTorch tensors and move them to DEVICE.
encoding = tokenizer(prompt, return_tensors="pt").to(DEVICE)

# Generate inside torch.inference_mode(), using the settings held in
# `generation_config`; the generated token ids are kept in `outputs`.
with torch.inference_mode():
    outputs = model.generate(
        input_ids = encoding.input_ids,
        attention_mask = encoding.attention_mask,
        generation_config = generation_config,
    )

In [None]:
# inference result before fine-tuning
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

## Build Hugging Face Dataset

In [None]:
data = load_dataset("json", data_files="data/dataset.json")
data

In [None]:
data["train"][0]

In [None]:
def generate_prompt(data_point):
    """
    Build the training prompt for a single question/answer record.

    The layout matches the prompt used for generation elsewhere in this
    notebook: a ``<human>:`` line followed by an ``<assistant>:`` line, with no
    leading indentation on either line.

    Args:
        data_point: Mapping with ``"question"`` and ``"answer"`` entries.

    Returns:
        str: ``"<human>: {question}\\n<assistant>: {answer}"`` with surrounding
        whitespace stripped.
    """
    # Built from explicit lines rather than an indented triple-quoted string:
    # the function-body indentation used to leak into the prompt, putting four
    # spaces before "<assistant>:" so the training text differed from the
    # format used at inference time.
    return "\n".join(
        [
            f'<human>: {data_point["question"]}',
            f'<assistant>: {data_point["answer"]}',
        ]
    ).strip()

def generate_and_tokenize_prompt(data_point):
    """
    Render one record as a prompt and run it through the tokenizer.

    Args:
        data_point: Record forwarded unchanged to ``generate_prompt``.

    Returns:
        The tokenizer's output for the rendered prompt, produced with
        ``padding=True`` and ``truncation=True``.
    """
    return tokenizer(generate_prompt(data_point), padding=True, truncation=True)

In [None]:
# Shuffle with a fixed seed so the example order is reproducible across kernel
# restarts, then tokenize every record.
train_data = data["train"].shuffle(seed=42).map(generate_and_tokenize_prompt)
train_data

In [None]:
# Drop the raw 'question' and 'answer' text columns so only the tokenizer
# outputs remain. Filtering on `column_names` keeps this cell safe to re-run:
# removing a column that is already gone would otherwise raise.
text_columns = [col for col in ("question", "answer") if col in train_data.column_names]
train_data = train_data.remove_columns(text_columns)
train_data

In [None]:
len(train_data), type(train_data)

## Training

In [None]:
output_dir = "experiments"

In [None]:
%load_ext tensorboard
%tensorboard --logdir experiments/runs

In [None]:
# Define the training hyperparameters and the Trainer, then run fine-tuning.
# Checkpoints go to `output_dir`; metrics are reported to TensorBoard.
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=1, 
    gradient_accumulation_steps=4,
    # Dataset columns are passed through as-is (no automatic column pruning).
    remove_unused_columns=False,
    # NOTE(review): both num_train_epochs and max_steps are set; per the
    # TrainingArguments documentation a positive max_steps takes precedence —
    # confirm which limit is intended.
    num_train_epochs=5, 
    learning_rate=2e-4,
    # NOTE(review): fp16 mixed precision is requested here while bnb_config
    # uses torch.bfloat16 as compute dtype — confirm the mismatch is intended.
    fp16=True,
    save_total_limit=3,  
    max_steps = 60,
    logging_steps=1,
    output_dir=output_dir,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05, 
    report_to="tensorboard"
)

# The collator is built with mlm=False, i.e. without masked-language-model masking.
trainer = transformers.Trainer(
    model=model,                         
    args=training_args,                 
    train_dataset=train_data,              
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

# Train the model
# The generation cache is switched off before training.
# NOTE(review): presumably because it conflicts with gradient checkpointing — confirm.
model.config.use_cache = False
trainer.train()

## Save Trained Model

In [None]:
# Directory that receives the fine-tuned PEFT model and the tokenizer files.
peft_model_dir = './ecommerce-FAQ-chatbot-model'
# NOTE(review): the model is written to the same directory twice (directly and
# through the trainer) — presumably one of the two calls is redundant; confirm.
model.save_pretrained(peft_model_dir)
trainer.save_model(peft_model_dir)
tokenizer.save_pretrained(peft_model_dir)

In [None]:
# model.push_to_hub('seujeong/falcon-7b-glora-faq-chatbot', use_auth_token=True)

## Load Trained Model
https://huggingface.co/blog/peft

In [None]:
import os
os.getcwd()

In [None]:
# Reload the fine-tuned model: read the adapter config from `peft_model_dir`,
# load the quantized base model it points to, then attach the trained adapter.
config = PeftConfig.from_pretrained(peft_model_dir)
model = LlamaForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    quantization_config=bnb_config,
    # device_map="auto",
    trust_remote_code=True,
)
model = PeftModel.from_pretrained(model, peft_model_dir)
# Switch to evaluation mode so dropout layers (including the LoRA dropout) are
# inactive during inference.
model.eval()

# Load the tokenizer once, from the same base checkpoint as the model. The
# previous chained `from_pretrained(model_name).from_pretrained(...)` call
# loaded it twice and tied this cell to `model_name` from an earlier cell.
tokenizer = LlamaTokenizer.from_pretrained(config.base_model_name_or_path, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# clear the GPU cache
torch.cuda.empty_cache()

## Inference

In [None]:
# Adjust the reloaded model's generation settings in place (`generation_config`
# is the same object as `model.generation_config`).
generation_config = model.generation_config
generation_config.max_new_tokens = 200
# temperature / top_p only take effect when sampling is enabled; without
# do_sample=True decoding stays greedy and both settings are ignored.
generation_config.do_sample = True
generation_config.temperature = 0.7
generation_config.top_p = 0.8
generation_config.num_return_sequences = 1
# Pad with, and stop on, the tokenizer's EOS token.
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [None]:
tokenizer.eos_token_id

In [None]:
model.to('cuda')

In [None]:
def generate_response(question: str) -> str:
    """
    Ask the model a question and return only the assistant's answer text.

    The question is wrapped in the ``<human>: ... / <assistant>:`` prompt
    format, generation runs with the module-level ``model``, ``tokenizer`` and
    ``generation_config``, and the full decoded output is printed followed by a
    separator line.

    Args:
        question: The user question to insert into the prompt.

    Returns:
        The text after the first ``<assistant>:`` marker, cut at the next
        ``<assistant>:`` marker if the model produced one, otherwise running to
        the end of the output. If the marker is missing entirely, the whole
        decoded output is returned (stripped).
    """
    prompt = f"""
<human>: {question}
<assistant>:
""".strip()
    encoding = tokenizer(prompt, return_tensors="pt")
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=encoding.input_ids.to('cuda'),
            attention_mask=encoding.attention_mask.to('cuda'),
            generation_config=generation_config,
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    assistant_start = "<assistant>:"
    response_start = response.find(assistant_start)
    marker_found = response_start >= 0
    if not marker_found:
        print(f"'{assistant_start}' not found in response")

    print(response)
    print("------------------------------------------------------------------------------------\n")

    if not marker_found:
        # Previously this path hit an unbound end index and crashed; fall back
        # to returning everything the model produced.
        return response.strip()

    answer_start = response_start + len(assistant_start)
    # Cut at the second marker if the model started another turn.
    answer_end = response.find(assistant_start, answer_start)
    if answer_end < 0:
        # str.find returns -1 when there is no second marker; using -1 as a
        # slice end would silently drop the last character of the answer.
        answer_end = len(response)
    return response[answer_start:answer_end].strip()

### True dataset
1. {'question': 'How can I create an account?',
 'answer': "To create an account, click on the 'Sign Up' button on the top "
           'right corner of our website and follow the instructions to '
           'complete the registration process.'}
           
           
2. {'question': 'What payment methods do you accept?',
 'answer': 'We accept major credit cards, debit cards, and PayPal as payment '
           'methods for online orders.'}
           
           
3. {'question': 'How can I track my order?',
 'answer': 'You can track your order by logging into your account and '
           "navigating to the 'Order History' section. There, you will find "
           'the tracking information for your shipment.'}
           
           
4. {'question': 'What is your return policy?',
 'answer': 'Our return policy allows you to return products within 30 days of '
           'purchase for a full refund, provided they are in their original '
           'condition and packaging. Please refer to our Returns page for '
           'detailed instructions.'}

In [None]:
prompt = "Can I return a product if it was a clearance or final sale item?"
print(generate_response(prompt))

In [None]:
prompt = "What happens when I return a clearance item?"
print(generate_response(prompt))

In [None]:
prompt = "How do I know when I'll receive my order?"
print(generate_response(prompt))

In [None]:
prompt = "Do you accept credit caards or paypal?"
print(generate_response(prompt))

In [None]:
prompt = "Tell me how to make a new account"
print(generate_response(prompt))

In [None]:
prompt = "I want to track my order, can you tell me how to do?"
print(generate_response(prompt))

In [None]:
prompt = "Tell me the return policy"
print(generate_response(prompt))

In [None]:
# generation_config = model.generation_config
# generation_config.max_new_tokens = 100
# generation_config.temperature = 0.7
# generation_config.top_p = 0.9
# generation_config.num_return_sequences = 1
# generation_config.pad_token_id = tokenizer.eos_token_id
# generation_config.eos_token_id = tokenizer.eos_token_id

In [None]:
# test_prompts = ["Can I return a product if it was a clearance or final sale item?","What happens when I return a clearance item?", "How do I know when I'll receive my order?", "Do you accept credit caards or paypal?", "Tell me how to make a new account"]

# def test_generate_response(question: str) -> str:
#     prompt = f"""
# <human>: {question}
# <assistant>:
# """.strip()
#     print("#########################################################################")
#     print("question: ", question)
#     print("#########################################################################")
#     for p in range(1, 11):
#         # generation_config.temperature = p
#         p = round(p*0.1, 1)
#         generation_config.top_p = p
#         # print("---------------------------------------------------------------------------------")
#         print(f"generation_config -> temperature is {generation_config.temperature}, top_p is {generation_config.top_p}")

#         encoding = tokenizer(prompt, return_tensors="pt")
#         # model.to(device)  
#         with torch.inference_mode():
#             outputs = model.generate(
#                 input_ids=encoding.input_ids.to('cuda'),
#                 attention_mask=encoding.attention_mask.to('cuda'),
#                 generation_config=generation_config,
#             )
#         response = tokenizer.decode(outputs[0], skip_special_tokens=True)

#         assistant_start = "<assistant>:"
#         response_start = response.find(assistant_start)

#         if response_start >= 0:
#             # Find the second occurrence
#             responce_end = response.find(assistant_start, response_start + len(assistant_start))
#             # print(f"'{assistant_start}' found in response: {responce_end}")
#         else:
#             print(f"'{assistant_start}' not found in response")

#         # print("response start------------------------------------------")
#         # print(response)
#         # print("response end------------------------------------------")
#         print(f"response_start: {response_start},\n Final answer: {response[response_start+len(assistant_start):responce_end]}")
#         # print("--------------------------------Return--------------------------------")
#     return response[response_start+len(assistant_start):responce_end].strip()

In [None]:
# for prompt in test_prompts:
#     print(test_generate_response(prompt))

In [None]:

# generation_config.top_p = 0.7

In [None]:
# test_prompts = ["Can I return a product if it was a clearance or final sale item?","What happens when I return a clearance item?", "How do I know when I'll receive my order?", "Do you accept credit caards or paypal?", "Tell me how to make a new account"]

# def test_generate_response(question: str) -> str:
#     prompt = f"""
# <human>: {question}
# <assistant>:
# """.strip()
#     print("#########################################################################")
#     print("question: ", question)
#     print("#########################################################################")
#     for p in range(1, 11):
#         # generation_config.temperature = p
#         p = round(p*0.1, 1)
#         generation_config.temperature = p
#         # print("---------------------------------------------------------------------------------")
#         print(f"generation_config -> top_p is {generation_config.top_p}, temperature is {generation_config.temperature}")

#         encoding = tokenizer(prompt, return_tensors="pt")
#         # model.to(device)  
#         with torch.inference_mode():
#             outputs = model.generate(
#                 input_ids=encoding.input_ids.to('cuda'),
#                 attention_mask=encoding.attention_mask.to('cuda'),
#                 generation_config=generation_config,
#             )
#         response = tokenizer.decode(outputs[0], skip_special_tokens=True)

#         assistant_start = "<assistant>:"
#         response_start = response.find(assistant_start)

#         if response_start >= 0:
#             # Find the second occurrence
#             responce_end = response.find(assistant_start, response_start + len(assistant_start))
#             # print(f"'{assistant_start}' found in response: {responce_end}")
#         else:
#             print(f"'{assistant_start}' not found in response")

#         # print("response start------------------------------------------")
#         # print(response)
#         # print("response end------------------------------------------")
#         print(f"Final answer: {response[response_start+len(assistant_start):responce_end]}")
#         # print("--------------------------------Return--------------------------------")
#     return response[response_start+len(assistant_start):responce_end].strip()

In [None]:
# for prompt in test_prompts:
#     print(test_generate_response(prompt))