In [1]:
import torch

In [2]:
# Sanity check that PyTorch can see a CUDA device before the model is loaded
# onto "cuda:0" further down; the bare expression is shown as the cell output.
torch.cuda.is_available()

True

In [3]:
from transformers import AutoTokenizer, pipeline, logging
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain

In [4]:
# Forwarded to the AutoGPTQ loader; False keeps the Triton kernels disabled.
use_triton = False

# Identifier handed to both the tokenizer loader and the quantized-model loader.
model_name_or_path = "TheBloke/Llama-2-13B-chat-GPTQ"

In [5]:
# Tokenizer matching the checkpoint; use_fast=True asks for the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    use_fast=True,
)

In [6]:
# Load the already-quantized GPTQ checkpoint and place it on the first CUDA device.
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    device="cuda:0",
    use_safetensors=True,
    use_triton=use_triton,
    # No explicit quantization config is supplied here.
    # NOTE(review): presumably the loader then reads it from the checkpoint repo - confirm.
    quantize_config=None,
    # NOTE(review): this allows code shipped with the checkpoint repo to run locally;
    # confirm it is actually required for this model before keeping it enabled.
    trust_remote_code=True,
)

Skipping module injection for FusedLlamaMLPForQuantizedModel as currently not supported with use_triton=False.


In [7]:
# Prevent printing spurious transformers error when using pipeline with AutoGPTQ
# (`logging` here is the module imported from transformers, not the stdlib one).
logging.set_verbosity(logging.CRITICAL)

# The user question is interpolated into the template immediately by the f-string.
# NOTE(review): this SYSTEM/USER/ASSISTANT layout may not match the chat format the
# Llama-2 chat models were trained with - confirm against the model card.
prompt = "Tell me about AI"
template = f'''SYSTEM: You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
USER: {prompt}
ASSISTANT:
'''

print("*** Pipeline:")

# Text-generation pipeline around the quantized model; `pipe` is reused by the
# LangChain wrapper later in the notebook.
pipe = pipeline(
    "text-generation",
    tokenizer=tokenizer,
    model=model,
    repetition_penalty=1.15,
    max_new_tokens=512,
)

# The pipeline returns a list of results; show the text of the first one.
# As the captured output shows, that text starts with the prompt itself.
print(
    pipe(template)[0]["generated_text"]
)

*** Pipeline:
SYSTEM: You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
USER: Tell me about AI
ASSISTANT:

AI, or Artificial Intelligence, refers to the development of computer systems that can perform tasks that typically require human intelligence, such as learning, problem-solving, decision-making, and language understanding. AI systems use algorithms and machine learning techniques to analyze data and make predictions or decisions based on that data.

There are many different types of AI, including:

1. Narrow or weak AI, which is d

In [8]:
# Question for the LangChain run (the literal keeps its trailing space).
prompt = "Who won the World Series in the year 1994? "

# Deliberately a plain string, not an f-string: `{prompt}` must survive as a
# placeholder so that PromptTemplate can substitute it when the chain runs.
template = '''SYSTEM: You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
USER: {prompt}
ASSISTANT:
'''

# Declare `prompt` as the template's single input variable.
prompt_template = PromptTemplate(
    input_variables=["prompt"],
    template=template,
)

In [9]:
# Wrap the existing transformers pipeline so LangChain can drive it as an LLM,
# then pair it with the prompt template.
llm = HuggingFacePipeline(pipeline=pipe)
llm_chain = LLMChain(llm=llm, prompt=prompt_template)

# The question is passed positionally; the template declares one input variable.
print(
    llm_chain.run(prompt)
)

The World Series was not played in the year 1994. The MLB season did not take place due to a players strike. Therefore, there was no World Series champion for that year.
