In [None]:
import os

import numpy as np
import pandas as pd
import torch
import transformers
from torch import bfloat16, cuda

## Hugging Face Hub identifier of the checkpoint to load.
model_id = "meta-llama/Llama-2-7b-chat-hf"
## Device label: "cuda:<index>" of the current CUDA device when one is available,
## otherwise 'cpu'. No spaces around the colon, so the value is a valid torch
## device string (e.g. "cuda:0") and can be passed to torch.device() / .to().
device = f"cuda:{cuda.current_device()}" if cuda.is_available() else 'cpu'

## Quantization settings so the large model can be loaded with less GPU memory.
## REQUIRES the `bitsandbytes` library to be installed.
bnb_config = transformers.BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for compute
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    load_in_4bit=True,  # store the weights in 4-bit precision
)

## Hugging Face access token used to authenticate the downloads below.
## Taken from the HF_TOKEN environment variable when it is set and non-empty, so
## the secret does not have to be pasted into the notebook; otherwise the
## placeholder is used and must be replaced with a real token.
hf_token = os.environ.get("HF_TOKEN") or '<generate your own huggingface token>'

## Fetch the model configuration for `model_id`.
## NOTE(review): newer transformers releases reportedly prefer `token=` over
## `use_auth_token=` -- confirm against the installed version before switching.
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_token,
)

## Load the causal-LM weights using the quantization config defined earlier;
## device_map='auto' leaves device placement to the library.
## NOTE(review): trust_remote_code=True permits running code shipped with the
## model repository; presumably not needed for this checkpoint -- confirm
## before removing.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_token,
)

## Switch the model to evaluation mode for inference.
model.eval()

## Reports the device label computed at the top of the notebook.
print(f"Model loaded on {device}")

In [None]:
## Load the tokenizer for the same checkpoint, authenticating with the same token.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_token)

## Bare expression: the notebook displays the tokenizer's repr as the cell output.
tokenizer