In [1]:
from datasets import load_dataset

# Load the SlimOrca-Dedup dataset by its Hugging Face Hub identifier.
dataset = load_dataset("Open-Orca/SlimOrca-Dedup")
# Bare expression so the notebook renders the DatasetDict summary (splits, features, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['conversations'],
        num_rows: 363491
    })
})

In [2]:
def formatting_func(example, add_generation=False):
    """Render one conversation example as ChatML-style text.

    Each message becomes ``<|im_start|>{role}\\n{value}<|im_end|>\\n``. The
    dataset role names are translated on the way out ('human' -> 'user',
    'gpt' -> 'assistant'); any other role (e.g. 'system') is kept as-is.

    The input example is never modified. (Rewriting ``message['from']`` in
    place would make a later call with ``add_generation=True`` on the same
    example fail to recognise 'gpt' turns and leak the reference answer
    into the prompt.)

    Args:
        example: mapping with a 'conversations' list of messages, each a
            mapping with 'from' (role) and 'value' (text) keys.
        add_generation: when True, every 'gpt' message is skipped and the
            text ends with an open ``<|im_start|>assistant`` header so a
            model can continue from it. Note that, unlike full turns, no
            newline is emitted after this trailing header.

    Returns:
        dict: ``{'text': <rendered conversation>}``.
    """
    role_names = {'human': 'user', 'gpt': 'assistant'}
    parts = []

    for message in example['conversations']:
        source_role = message['from']
        # When building a generation prompt, leave out the reference answers.
        if add_generation and source_role == 'gpt':
            continue
        role = role_names.get(source_role, source_role)
        parts.append('<|im_start|>' + role + '\n' + message['value'] + '<|im_end|>' + '\n')

    if add_generation:
        parts.append('<|im_start|>assistant')

    return {'text': ''.join(parts)}

In [3]:
import functools

# Keep only 3.5% of the corpus (the 'train' side of a 96.5% hold-out), then
# carve that subset into train (80%) and a remainder (20%); the remainder is
# further split into test (98%) and eval (2%). Every split uses seed 42.
subset = dataset['train'].train_test_split(test_size=0.965, seed=42)
train_eval = subset['train'].train_test_split(test_size=0.2, seed=42)
test_eval = train_eval['test'].train_test_split(test_size=0.02, seed=42)

# Train and eval are rendered to a single 'text' column up front.
train_dataset = train_eval['train'].map(
    formatting_func,
    remove_columns=['conversations'],
)
# The test split keeps its raw 'conversations' column: predict() renders each
# example itself with add_generation=True.
test_dataset = test_eval['train']
eval_dataset = test_eval['test'].map(
    formatting_func,
    remove_columns=['conversations'],
)

# Report the size of each split, one per line.
print('\n'.join(
    f'{label}: {len(split)}'
    for label, split in (
        ('train_dataset', train_dataset),
        ('test_dataset', test_dataset),
        ('eval_dataset', eval_dataset),
    )
))

Map:   0%|          | 0/10177 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

train_dataset: 10177
test_dataset: 2494
eval_dataset: 51


In [4]:
# Sanity check: show the first record of each split, one per line.
print(train_dataset[0], test_dataset[0], eval_dataset[0], sep='\n')

{'text': '<|im_start|>system\nYou are an AI assistant. You will be given a task. You must generate a detailed and long answer.<|im_end|>\n<|im_start|>user\nQuestion: who abandons religion? If there is no answer, please output "Insufficient information to provide an answer.". Movie title: Def by Temptation Context: The story is set in New York and revolves around the relationship between two childhood best friends: "Joel", who is raised by his religious grandmother after both of his parents are killed in an automobile accident, and "K" who abandons religion, moves to New York and becomes a movie star. Joel (now a minister like his deceased father), becomes somewhat disillusioned with Christianity and decides to take a trip to New York to visit his friend, K. While awaiting Joel\'s arrival, K (played by Kadeem Hardison) visits the local bar and meets the perfect woman (played by Cynthia Bond)--who in reality is a succubus seeking blood, and vengeance against any and all men foolish enoug

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = "mistralai/Mistral-7B-v0.1"

# 4-bit NF4 quantisation with double quantisation; matmuls are computed in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the quantised base model with every module placed on GPU 0.
# - `token=True` replaces the deprecated `use_auth_token=True` (uses the
#   locally stored Hugging Face credentials).
# - `trust_remote_code` is intentionally not enabled: the checkpoint is loaded
#   through the library's own model classes, so there is no reason to permit
#   execution of code shipped in the model repository.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map={'': 0},
    token=True,
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:

# Tokenizer for the same checkpoint id as the base model, right-padded, with a
# 4096-token maximum length.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    padding_side="right",
    model_max_length=4096,
   
    # NOTE(review): add_eos_token=True asks the tokenizer to append EOS to encoded text;
    # presumably this mirrors the fine-tuning setup -- confirm it is also wanted for
    # generation prompts at inference time.
    add_eos_token=True)

# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token


In [7]:
from peft import LoraConfig, PeftModel,get_peft_model
# Directory holding the fine-tuned LoRA adapter weights.
ckpt = '/home/nilakshan/4-LLM/outputs/Mistral-7B-SlimOrca-PEFT_SFT-2024-02-05-23-40'

# Attach the adapter to the base model, place it on GPU 0, then fold the
# adapter weights into the base weights so `peft_model` ends up as a plain
# (merged) model rather than a PEFT wrapper.
peft_model = (
    PeftModel.from_pretrained(base_model, ckpt)
    .to('cuda:0')
    .merge_and_unload()
)



In [8]:


from transformers import StoppingCriteria, StoppingCriteriaList
from torch import LongTensor, FloatTensor

# Strings that should end generation when the model emits them.
stop_list = ["<|im_end|>"]

# Encode every stop string without special tokens and keep the resulting
# id tensor (batch dimension included) as a LongTensor on GPU 0.
stop_token_ids = [
    LongTensor(
        tokenizer(stop_text, return_tensors='pt', add_special_tokens=False)['input_ids']
    ).to('cuda:0')
    for stop_text in stop_list
]
stop_token_ids

from torch import eq

class StopOnTokens(StoppingCriteria):
    """Stop generation once the sequence ends with one of the stop sequences.

    Each stop sequence is a 2-D id tensor of shape (1, n_tokens). For
    multi-token sequences the first id is ignored and only the remaining ids
    are matched against the tail of the generated sequence -- presumably
    because the tokenizer encodes the start of a standalone string differently
    from the same text in the middle of a sequence (TODO confirm).

    Only the first sequence in the batch (``input_ids[0]``) is inspected.

    Args:
        stop_sequences: optional list of (1, n_tokens) id tensors. When
            omitted, the notebook-level ``stop_token_ids`` list is read at
            call time, so ``StopOnTokens()`` keeps working unchanged.
    """

    def __init__(self, stop_sequences=None):
        self.stop_sequences = stop_sequences

    def __call__(self, input_ids: LongTensor, scores: FloatTensor, **kwargs) -> bool:
        sequences = stop_token_ids if self.stop_sequences is None else self.stop_sequences
        generated = input_ids[0]
        for stop_ids in sequences:
            stop_seq = stop_ids[0]
            # Multi-token stop: match everything after the first id.
            # Single-token stop: there is nothing to drop, match the id itself
            # (slicing it away would leave an empty tensor to compare against).
            suffix = stop_seq[1:] if len(stop_seq) > 1 else stop_seq
            # Nothing to match, or not enough tokens yet: cannot be a hit.
            if len(suffix) == 0 or len(generated) < len(suffix):
                continue
            if eq(generated[-len(suffix):], suffix.to(generated.device)).all():
                return True
        return False


# Wrap the custom criterion in the list container; this object is passed to the generation pipeline.
stopping_criteria = StoppingCriteriaList([StopOnTokens()])


In [21]:
from transformers import pipeline
def predict(model, tokenizer, prompt):
    """Generate and print a sampled completion for one conversation example.

    The example is rendered with ``formatting_func(..., add_generation=True)``
    so the text ends with an open assistant header, then run through a
    text-generation pipeline. Generation uses the notebook-level
    ``stopping_criteria``.

    Args:
        model: causal language model handed to the pipeline.
        tokenizer: tokenizer handed to the pipeline.
        prompt: dataset-style example (a mapping with a 'conversations'
            list), not an already formatted string.

    Returns:
        None. The pipeline's 'generated_text' for the first result is printed.
    """
    prompt_text = formatting_func(prompt, add_generation=True)['text']

    print("*** Pipeline:")
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        add_special_tokens=True,
        max_new_tokens=512,
        # Sampling is enabled, so output varies between runs unless a seed is set.
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        top_k=40,
        repetition_penalty=1.1
    )

    outputs = pipe(prompt_text, stopping_criteria=stopping_criteria)
    print(outputs[0]['generated_text'])

# Smoke test: generate a completion for the first example of the raw (unformatted) test split.
predict(peft_model,tokenizer,test_dataset[0])


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


*** Pipeline:
<|im_start|>system
You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.<|im_end|>
<|im_start|>user
On a scale from 0-5, where 0 is "not similar" and 5 is "very similar", how similar is the sentence "Doctors have speculated that the body's own estrogen protects against cell damage and improves blood flow." to the sentence "Their belief was based on speculation that estrogen prevents cell damage and improves blood flow."?<|im_end|>
<|im_start|>assistant
In my opinion, these two sentences are very similar, as they both discuss the possible protective effects of estrogen against cell damage and its potential benefits on blood flow. However, there may be some minor differences in wording or phrasing between the two sentences, which could affect their level of similarity. Based on this information, I would rate the similarity of the sentences as 4 

In [87]:
# tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['from'] + '\n' + message['value'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"

# tokens = tokenizer.apply_chat_template(test_dataset[0]['conversations'][:2], tokenize=1, add_generation_prompt=1, return_tensors="pt")

# out = peft_model.generate(tokens, max_new_tokens=512)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
