In [2]:
import json
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain import HuggingFacePipeline, PromptTemplate, LLMChain

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the source document to summarize: the 'text' field of the JSON file.
# A context manager closes the file handle deterministically (the bare
# open() call left it to the garbage collector), and the encoding is pinned
# to UTF-8 so the result does not depend on the platform's locale default.
with open('../data/roman.json', 'r', encoding = 'utf-8') as roman_file:
    roman = json.load(roman_file)['text']

In [4]:
# Prompt skeleton for the summarization chain; `{text}` is the single
# placeholder that gets filled with the document to summarize.
template = """
    You are a helpful bot that summarizes text in details. Return your response in 10 bullet points.
    TEXT = {text}.
"""

# Wrap the raw string so the chain knows which variable it has to supply.
prompt = PromptTemplate(
    input_variables = ['text'],
    template = template,
)

In [18]:
# Build a text-generation pipeline for Llama-2 7B and wrap it in a LangChain
# chain that fills `prompt` and returns the generated text.
model_name = 'meta-llama/Llama-2-7b-hf'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Generation settings are set on the transformers pipeline itself.
# Passing them as `HuggingFacePipeline(model_kwargs=...)` has no effect on
# generation when an already-built pipeline is supplied, so the previous
# temperature / max_new_tokens values were silently ignored.
#   * do_sample=True is required for `temperature` to influence decoding.
#   * max_new_tokens is kept well below Llama-2's 4096-token context window,
#     because the prompt (which embeds the whole input text) shares that
#     budget; raise it if longer summaries are needed.
pipeline = transformers.pipeline(
    'text-generation',
    model = model_name,
    tokenizer = tokenizer,
    torch_dtype = torch.bfloat16,
    do_sample = True,
    temperature = 0.3,
    max_new_tokens = 512,
)

llm = HuggingFacePipeline(pipeline = pipeline)
llm_chain = LLMChain(prompt = prompt, llm = llm)

Loading checkpoint shards: 100%|██████████| 2/2 [00:28<00:00, 14.02s/it]


In [26]:
# Summarize the loaded document with the chain built above and show the result.
print(llm_chain.run(text = roman))

In [19]:
# Alternative setup: load the model explicitly (fp16, placed automatically on
# the available devices), then build the pipeline and chain around it.
# Reuses `model_name` and `tokenizer` from the earlier cell.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype = torch.float16,
    load_in_4bit = False,    # changing this to load_in_8bit=True works on smaller models
    # NOTE(review): trust_remote_code=True permits executing code shipped in
    # the model repository; confirm it is actually needed for this checkpoint.
    trust_remote_code = True,
    device_map = 'auto',    # finds GPU
)

# Generation settings are set on the transformers pipeline itself.
# Passing them as `HuggingFacePipeline(model_kwargs=...)` has no effect on
# generation when an already-built pipeline is supplied, so the previous
# temperature / max_new_tokens values were silently ignored.
#   * do_sample=True is required for `temperature` to influence decoding.
#   * max_new_tokens is kept well below Llama-2's 4096-token context window,
#     because the prompt (which embeds the whole input text) shares that
#     budget; raise it if longer summaries are needed.
generation_pipe = transformers.pipeline(
    "text-generation",
    model = model,
    tokenizer = tokenizer,
    trust_remote_code = True,
    device_map = 'auto',    # finds GPU
    do_sample = True,
    temperature = 0.3,
    max_new_tokens = 512,
)

llm = HuggingFacePipeline(pipeline = generation_pipe)
llm_chain = LLMChain(prompt = prompt, llm = llm)

Loading checkpoint shards: 100%|██████████| 2/2 [00:14<00:00,  7.02s/it]


In [20]:
# Run the chain backed by the explicitly loaded model and show the summary.
print(llm_chain.run(text = roman))