In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login

### Login to Hugginface (may be required)

In [2]:
with open("../../hf.key", "r") as f_in:
    hf_key = f_in.readline().strip()

login(token = hf_key)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /mounts/data/corp/huggingface/token
Login successful


### Choosing a model

In [3]:
model_name = "meta-llama/Llama-2-7b-hf"
#model_name = "meta-llama/Llama-2-7b-chat-hf"
#model_name = "meta-llama/Llama-2-13b-hf"
#model_name = "meta-llama/Llama-2-13b-chat-hf"
#model_name = "meta-llama/Llama-2-70b-hf"
#model_name = "meta-llama/Llama-2-70b-chat-hf"

In [4]:
server_model_path = "/mounts/data/corp/huggingface/"

## Choosing GPU

Either use the upper cell with `device_map="auto"` to use all GPUs or the lower cell to load a specific, single GPU. Loading a model for cache only needs the model name, not the `server_model_path`.

### All GPUS

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto") 
model = AutoModelForCausalLM.from_pretrained(server_model_path+model_name, device_map="auto") # server_model_path+

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Single GPU

In [None]:
# Define the GPU ID you want to use
gpu_id = 2

# Use the torch.cuda.device() context manager to set the current GPU
with torch.cuda.device(gpu_id):
    tokenizer = AutoTokenizer.from_pretrained(model_name) # use_auth_token=True
    model = AutoModelForCausalLM.from_pretrained(server_model_path+model_name).to(torch.device("cuda")) # use_auth_token=True

### Selected GPUs 

In [None]:
max_memory_mapping = {0: "1MB", 1:"49GB", 2: "49GB", 3: "49GB"}
tokenizer = AutoTokenizer.from_pretrained(model_name) # use_auth_token=True
model = AutoModelForCausalLM.from_pretrained(server_model_path+model_name, device_map="auto", max_memory=max_memory_mapping) # use_auth_token=True

## Inference

In [6]:
prompt = "Paris is the capitol of France. Berlin is the capitol of "

input_ids = tokenizer.encode(prompt, return_tensors="pt") # .to(torch.device("cuda:"+str(gpu_id)))  # comment out for multiple gpu: .to(torch...
max_length = input_ids.size(1)  + 5 # adjust for longer responses
output = model.generate(input_ids, max_length=max_length, num_return_sequences=1, top_k=1)
generated_answer = tokenizer.decode(output[0], skip_special_tokens=True)  



In [7]:
print(generated_answer)

Paris is the capitol of France. Berlin is the capitol of 1930s
