### Imports

In [1]:
import torch
from transformers import GPTJForCausalLM, AutoTokenizer, LlamaForCausalLM, LlamaTokenizer, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


# GPT - J

In [5]:
model_name = "models/GPT-J"

# Load the model
# model = GPTJForCausalLM.from_pretrained(model_name)
device = "cuda"
model_GPT = GPTJForCausalLM.from_pretrained(
    model_name,
    revision="float16",
    torch_dtype=torch.float16,
)
model_GPT.cuda()

# Load the tokenizer
tokenizer_GPT = AutoTokenizer.from_pretrained(model_name)

In [6]:
prompt = "What is the meaning of life?"

input_ids_GPT = tokenizer_GPT(prompt, return_tensors="pt").input_ids

gen_tokens = model_GPT.generate(
    input_ids_GPT.to(device),
    do_sample=True,
    temperature=0.9,
    max_length=100,
)
gen_text = tokenizer_GPT.batch_decode(gen_tokens)[0]

print(gen_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What is the meaning of life? The simple question has been asked and answered many times. If there was one answer, it would obviously have some sort of cosmic importance. No, the meaning of life can’t be something so simple as to be based solely on a person’s religious doctrine. There must be more to it than that. The meaning of life is complex and multifaceted. It can never be fully explained by only one factor, no matter how central or important.


In [7]:
# clear GPU memory
del model_GPT
del input_ids_GPT
torch.cuda.empty_cache()

# LLAMA

In [15]:
model_name = "models/Llama-2-7B-fp16"

# Load the model
# model = GPTJForCausalLM.from_pretrained(model_name)
device = "cuda"
model = LlamaForCausalLM.from_pretrained(
    model_name,
    revision="float16",
    torch_dtype=torch.float16,
)
model.cuda()

# Load the tokenizer
tokenizer = LlamaTokenizer.from_pretrained(model_name)

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.84s/it]


In [10]:
def chat_with_llama(prompt):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    input_ids = input_ids.to('cuda')
    output = model.generate(input_ids, max_length=256, num_beams=4, no_repeat_ngram_size=2)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

In [None]:
input = "What is the meaning of life?"

response = chat_with_llama(input)

print(response)

In [16]:
# clear GPU memory
del model
torch.cuda.empty_cache()

# Red Pajama - Instruct

In [45]:
model_name = "models/redp-7b-instruct"

# Load the model
device = "cuda"
# init
tokenizer_red = AutoTokenizer.from_pretrained(model_name)
model_red = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
model_red.to(device)
# infer
question = "The capital of France is?"
prompt = f"Q: {question}?\nA:"
inputs = tokenizer_red(prompt, return_tensors='pt').to(model_red.device)
input_length = inputs.input_ids.shape[1]
outputs = model_red.generate(
    **inputs, max_new_tokens=128, do_sample=True, temperature=0.7, top_p=0.7, top_k=50, return_dict_in_generate=True
)
token = outputs.sequences[0, input_length:]
output_str = tokenizer_red.decode(token)

def truncate_at_first_delimiter(s, delimiters):
    # Find the earliest occurrence of any delimiter
    first_pos = min((s.find(d) for d in delimiters if s.find(d) != -1), default=-1)
    # Return the truncated string or the original if no delimiter was found
    return s[:first_pos] if first_pos != -1 else s

delimiters = ['Q:', 'q:', 'Question:', 'question:']
truncated_string = truncate_at_first_delimiter(output_str, delimiters)
print(truncated_string.strip())

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Paris
