In [3]:
import torch
torch.cuda.empty_cache()

In [4]:
# Importing the required libraries
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Defining the name of the Falcon model
model_name = "ybelkada/falcon-7b-sharded-bf16"

# Configuring the BitsAndBytes quantization
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
)

# Loading the Falcon model with quantization configuration
model = AutoModelForCausalLM.from_pretrained(
model_name,
quantization_config=bnb_config,
trust_remote_code=True
)

# Disabling cache usage in the model configuration
model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [5]:
# Load the tokenizer for the Falcon 7B model with remote code trust
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Set the padding token to be the same as the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

In [7]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM

# Loading PEFT model
PEFT_MODEL = "omarfarooq908/falcon-7b-finetuned01"
# PEFT_MODEL = <Username>/YOUR_MODEL_URL_REPO. 

config = PeftConfig.from_pretrained(PEFT_MODEL)
peft_base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

peft_model = PeftModel.from_pretrained(peft_base_model, PEFT_MODEL)

peft_tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
peft_tokenizer.pad_token = peft_tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/522M [00:00<?, ?B/s]

In [31]:
from transformers import GenerationConfig

def generate_answer(query):
  system_prompt = """Answer the following question truthfully.
  You are a chatbot that can coherently hold conversations with humans, in English"""

  user_prompt = f""": {query}
  : """

  final_prompt = system_prompt + "\n" + user_prompt

  device = "cuda:0"
  dashline = "-".join("" for i in range(50))

  encoding = tokenizer(final_prompt, return_tensors="pt").to(device)
  outputs = model.generate(input_ids=encoding.input_ids, generation_config=GenerationConfig(max_new_tokens=256, pad_token_id = tokenizer.eos_token_id, \
                                                                                                                     eos_token_id = tokenizer.eos_token_id, attention_mask = encoding.attention_mask, \
                                                                                                                     temperature=0.4, top_p=0.6, repetition_penalty=1.3, num_return_sequences=1,))
  text_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

  print(dashline)
  print(f'ORIGINAL MODEL RESPONSE:\n{text_output}')
  print(dashline)

  peft_encoding = peft_tokenizer(final_prompt, return_tensors="pt").to(device)
  peft_outputs = peft_model.generate(input_ids=peft_encoding.input_ids, generation_config=GenerationConfig(max_new_tokens=256, pad_token_id = peft_tokenizer.eos_token_id, \
                                                                                                                     eos_token_id = peft_tokenizer.eos_token_id, attention_mask = peft_encoding.attention_mask, \
                                                                                                                     temperature=0.4, top_p=0.6, repetition_penalty=1.3, num_return_sequences=1,))
  peft_text_output = peft_tokenizer.decode(peft_outputs[0], skip_special_tokens=True)

  print(f'PEFT MODEL RESPONSE:\n{peft_text_output}')
  print(dashline)

In [32]:
generate_answer("Tell me a story")

-------------------------------------------------
ORIGINAL MODEL RESPONSE:
Answer the following question truthfully.
  You are a chatbot that can coherently hold conversations with humans, in English
: Tell me a story
  : (I'm not sure what you mean by "story", but I'll try to tell you a story)
  : (I'm not sure what you mean by "story", but I'll try to tell you a story)

The chatbot is able to tell a story, but it is not very good at it.

The chatbot is able to tell a story, but it is not very good at it.

The chatbot is able to tell a story, but it is not very good at it.

The chatbot is able to tell a story, but it is not very good at it.

The chatbot is able to tell a story, but it is not very good at it.

The chatbot is able to tell a story, but it is not very good at it.

The chatbot is able to tell a story, but it is not very good at it.

The chatbot is able to tell a story, but it is not very good at it.

The chatbot is able to tell a story, but it is not very good at it.

The 