# LLM Inference Tester

What is the best method to use for LLM inference?
vLLM has been omitted due to permission errors.
TensorRT-LLM has been omitted because it failed to install.


In [14]:
import pandas as pd
import os
import torch


def write_to_CSV(model, starttime, endtime, type, method, response=None, file_path="modelTimings.csv"):
    """Append one timing record to the benchmark CSV.

    Args:
        model: Identifier of the model that was loaded or queried.
        starttime: Start timestamp in seconds (callers pass ``time.time()``).
        endtime: End timestamp in seconds.
        type: Kind of action measured, e.g. ``"Load"`` or ``"Inference"``.
            The name shadows the builtin; it is kept for backward compatibility.
        method: Name of the inference method being benchmarked.
        response: Optional model output stored alongside the timing.
        file_path: CSV file to append to. A header row is written only when the
            file does not exist yet or is empty.
    """
    new_data = pd.DataFrame({
        "Model": [model],  # Wrapping scalar values in lists
        "starttime": [starttime],
        "endtime": [endtime],
        "duration": [endtime - starttime],
        "type": [type],
        "method": [method],
        "Response": [response]
    })

    # Append instead of read-concat-rewrite: the cost per call no longer grows
    # with the size of the log, already-stored rows are not round-tripped
    # through the parser, and an existing-but-empty file no longer raises.
    needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
    new_data.to_csv(file_path, mode="a", header=needs_header, index=False)
def clean():
    """Release garbage-collected objects and cached CUDA memory between benchmark sections."""
    import gc

    # Collect first: the CUDA caching allocator can only hand back blocks whose
    # tensors are no longer referenced, so unreachable objects must be
    # destroyed before the cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()
# Hugging Face model identifiers that every benchmark section iterates over.
modelNames = [
    "BramVanroy/fietje-2-chat",
    "BramVanroy/fietje-2",
    "BramVanroy/GEITje-7B-ultra",
    "Rijgersberg/GEITje-7B",
    "Qwen/Qwen2.5-1.5B-Instruct",
]

# Dutch user prompts sent to every model.
# NOTE(review): "primeur" is probably a typo for "premier" - confirm before
# changing it, because editing a prompt changes the benchmark input.
prompts = [
    "Wat is de hoofdstad van Nederland?",
    "Wie is de primeur van nederland?",
    "Wat is het kwadraat van 5",
    "Hoeveel p's zijn er in appel?",
]

# System message placed in front of each user prompt by the chat template.
systemPrompt = "Je bent een vriendelijk chatbot die graag vragen beantwoordt en altijd zijn best doet."

In [None]:
import time
from transformers import pipeline
# Load every model through the high-level `pipeline` API and log the load time.
action = "Load"
method = "transformers-pipeline"
loadedModelPipeline = []
for modelName in modelNames:
    startTime = time.time()
    # The task is stated explicitly instead of being inferred from hub
    # metadata, and device_map="auto" mirrors the AutoModelForCausalLM section
    # so both methods are timed on the same hardware placement.
    generator = pipeline("text-generation", model=modelName, device_map="auto")
    # time.time() works in seconds; take the end stamp before printing so the
    # measurement covers only the load itself.
    endTime = time.time()

    print(generator.model.config._name_or_path)
    loadedModelPipeline.append(generator)
    write_to_CSV(modelName, startTime, endTime, action, method)

In [None]:
# Run every prompt through every loaded pipeline and log the generation time.
action = "Inference"
# Set here so the logged label does not depend on a global left over from
# whichever cell ran last.
method = "transformers-pipeline"
for generator in loadedModelPipeline:
    modelId = generator.model.config._name_or_path
    for prompt in prompts:
        # Prompt templating is deliberately kept out of the timed window.
        translatedPrompt = generator.tokenizer.apply_chat_template(
            [
                {"role": "system", "content": systemPrompt},
                {"role": "user", "content": prompt},
            ],
            tokenize=False,
            add_generation_prompt=True,
        )
        startTime = time.time()
        # max_new_tokens (rather than max_length, which also counts the prompt)
        # gives this method the same generation budget as the generate() cells.
        response = generator(translatedPrompt, max_new_tokens=1024, num_return_sequences=1)
        endTime = time.time()
        write_to_CSV(modelId, startTime, endTime, action, method, response)
        print(response)

# `del generator` inside the loop only removed the loop name while the list
# kept every pipeline alive. Drop all references so the memory can actually be
# reclaimed; re-run the load cell before running this cell again.
loadedModelPipeline.clear()
generator = None

In [15]:
# Empty the CUDA cache before the next benchmark section starts.
clean()

## Causal LM Inference

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
# Load each checkpoint with AutoModelForCausalLM and record how long it takes.
action, method = "Load", "AutoModelForCausalLM"
loadedModelCasual = []
for modelName in modelNames:
    startTime = time.time()
    model = AutoModelForCausalLM.from_pretrained(
        modelName,
        trust_remote_code=True,  # NOTE(review): allows code shipped with the checkpoint to run
        device_map='auto',
    )
    loadedModelCasual.append(model)
    tokenizer = AutoTokenizer.from_pretrained(modelName)
    # Tokenise a fixed sample prompt and move it to the model's device; this
    # happens inside the timed window.
    encoded = tokenizer("What is the color of prunes?,", return_tensors='pt').to(model.device)
    input_ids = encoded["input_ids"]
    write_to_CSV(modelName, startTime, time.time(), action, method)

In [None]:
# Generate an answer for every prompt with every AutoModelForCausalLM model.
action = "Inference"
method = "AutoModelForCausalLM"
for model in loadedModelCasual:
    modelId = model.config._name_or_path
    # Each checkpoint needs its own tokenizer: vocabulary and chat template
    # differ per model, so a tokenizer left over from another cell is wrong.
    tokenizer = AutoTokenizer.from_pretrained(modelId)
    for prompt in prompts:
        # Templating and tokenisation are not included in the timing.
        inputs = tokenizer.apply_chat_template(
            [
                {"role": "system", "content": systemPrompt},
                {"role": "user", "content": prompt},
            ],
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to(model.device)
        startTime = time.time()
        # Generate from the templated prompt itself (input ids plus attention
        # mask), not from a fixed sample prompt.
        outputs = model.generate(**inputs, max_new_tokens=1024)
        response = tokenizer.decode(outputs[0])
        endTime = time.time()
        # Log under the model that actually produced the response.
        write_to_CSV(modelId, startTime, endTime, action, method, response)

# Drop every reference so the models can be freed; `del model` inside the loop
# left them alive in the list. Re-run the load cell before running this again.
loadedModelCasual.clear()
model = None


In [None]:
# Empty the CUDA cache before the next benchmark section starts.
clean()

## Ollama

Ollama does not download models automatically; pull the model first (for example `ollama pull llama3.1`).

In [None]:
# %pip install ollama

In [14]:
# import ollama
# startTime = time.time()
# response = ollama.chat(model='llama3.1', messages=[
#   {
#     'role': 'user',
#     'content': 'Why is the sky blue?',
#   },
# ])
# write_to_CSV("llama3.1", startTime, time.time(), "Inference", "ollama", response)

## ONNX

In [None]:
%pip install git+https://github.com/huggingface/optimum.git

In [None]:
import time
from transformers import pipeline
from optimum.onnxruntime import ORTModelForSequenceClassification
# The benchmarked checkpoints are generative models, so they need the
# causal-LM ONNX wrapper rather than the sequence-classification one.
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer

# Export every model to ONNX, wrap it in a text-generation pipeline and log
# how long that takes.
action = "Load"
method = "optimum.onnxruntime"
loadedModelOptimum = []
for modelName in modelNames:
    startTime = time.time()
    # export=True converts the checkpoint to ONNX on the fly; it replaces the
    # deprecated from_transformers=True flag.
    model = ORTModelForCausalLM.from_pretrained(modelName, export=True)
    tokenizer = AutoTokenizer.from_pretrained(modelName)
    # The task must be given explicitly: it cannot be inferred from the hub
    # when a model object is passed instead of a model id string.
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
    # time.time() works in seconds.
    endTime = time.time()

    print(generator.model.config._name_or_path)
    # Store in this section's own list (previously appended to the
    # transformers-pipeline list, leaving this one empty).
    loadedModelOptimum.append(generator)
    write_to_CSV(modelName, startTime, endTime, action, method)

In [None]:
# Run every prompt through every ONNX Runtime pipeline and log the timing.
# Both labels are set here; otherwise rows would be logged with whatever
# `action` / `method` the previously executed cell left behind.
action = "Inference"
method = "optimum.onnxruntime"
for generator in loadedModelOptimum:
    modelId = generator.model.config._name_or_path
    for prompt in prompts:
        # Prompt templating is deliberately kept out of the timed window.
        translatedPrompt = generator.tokenizer.apply_chat_template(
            [
                {"role": "system", "content": systemPrompt},
                {"role": "user", "content": prompt},
            ],
            tokenize=False,
            add_generation_prompt=True,
        )
        startTime = time.time()
        # Send the templated prompt (the raw prompt was passed before, so the
        # system message never reached the model).
        response = generator(translatedPrompt, max_new_tokens=1024, num_return_sequences=1)
        endTime = time.time()
        write_to_CSV(modelId, startTime, endTime, action, method, response)
        print(response)

# Drop every reference so the ONNX sessions can be freed. Re-run the load cell
# before running this cell again.
loadedModelOptimum.clear()
generator = None

In [None]:
# Empty the CUDA cache before the next benchmark section starts.
clean()

## Quantization

### Bits and bytes

In [12]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 8-bit loading through bitsandbytes; `lm_head` is listed in
# llm_int8_skip_modules.
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_skip_modules=["lm_head"],
)
models8Bit = []
method = "8bit quantization with BitsAndBytesConfig"

# Load each checkpoint with the quantization config and record the load time.
for modelName in modelNames:
    timeStart = time.time()
    model = AutoModelForCausalLM.from_pretrained(
        modelName,
        quantization_config=quantization_config,
    )
    models8Bit.append(model)
    timeEnd = time.time()
    write_to_CSV(modelName, timeStart, timeEnd, "Load", method)

ImportError: Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`

In [None]:
from transformers import AutoTokenizer

# Generate an answer for every prompt with every 8-bit quantized model.
action = "Inference"
method = "8bit quantization with BitsAndBytesConfig"
for model in models8Bit:
    modelId = model.config._name_or_path
    # Each checkpoint needs its own tokenizer: vocabulary and chat template
    # differ per model, so a tokenizer left over from another cell is wrong.
    tokenizer = AutoTokenizer.from_pretrained(modelId)
    for prompt in prompts:
        # Templating and tokenisation are not included in the timing.
        inputs = tokenizer.apply_chat_template(
            [
                {"role": "system", "content": systemPrompt},
                {"role": "user", "content": prompt},
            ],
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to(model.device)
        startTime = time.time()
        # Generate from the templated prompt itself (input ids plus attention
        # mask), not from a fixed sample prompt.
        outputs = model.generate(**inputs, max_new_tokens=1024)
        response = tokenizer.decode(outputs[0])
        endTime = time.time()
        # Log under the model that actually produced the response.
        write_to_CSV(modelId, startTime, endTime, action, method, response)

# Drop every reference so the models can be freed; `del model` inside the loop
# left them alive in the list. Re-run the load cell before running this again.
models8Bit.clear()
model = None


In [13]:
# Empty the CUDA cache after the quantization benchmark.
# NOTE(review): the recorded output shows a NameError, so the cell that
# defines clean() had not been run in that kernel session.
clean()

NameError: name 'clean' is not defined