# LLM Inference Tester

What is the best method to run LLM inference?
vLLM has been omitted due to permission errors.
TensorRT-LLM has been omitted because it failed to install.


In [16]:
import pandas as pd
import os
import torch
import gc
import time

def write_to_CSV(model, starttime, endtime, type, method, response=None):
    """Add one timing record to ``modelTimings.csv`` in the working directory.

    The record stores the model name, the start and end timestamps, their
    difference (``endtime - starttime``), the action type, the method label
    and an optional response.

    When the CSV does not exist yet it is created with just this record.
    Otherwise the existing file is read, the record is appended as the last
    row, and the whole file is written back (without the index column).
    """
    csv_path = "modelTimings.csv"

    record = {
        "Model": model,
        "starttime": starttime,
        "endtime": endtime,
        "duration": endtime - starttime,
        "type": type,
        "method": method,
        "Response": response,
    }
    # Each scalar is wrapped in a one-element list so the frame has one row.
    row = pd.DataFrame({column: [value] for column, value in record.items()})

    # First record ever: the row on its own becomes the file.
    if not os.path.exists(csv_path):
        row.to_csv(csv_path, index=False)
        return

    # Otherwise extend what is already on disk and rewrite the file.
    history = pd.read_csv(csv_path)
    combined = pd.concat([history, row], ignore_index=True)
    combined.to_csv(csv_path, index=False)
def clean():
    """Free memory left behind by objects that are no longer referenced.

    The Python garbage collector runs first so that objects kept alive only
    by reference cycles are destroyed; only then is PyTorch asked to release
    its unused cached CUDA memory. Emptying the cache before collecting would
    leave the memory of not-yet-collected objects occupied.
    """
    gc.collect()
    torch.cuda.empty_cache()

# Model identifiers to benchmark; each one is passed to the loaders in the cells below.
# The commented-out line is a longer candidate list kept for reference.
# modelNames = ["BramVanroy/fietje-2-chat","BramVanroy/fietje-2","BramVanroy/GEITje-7B-ultra","Rijgersberg/GEITje-7B","Qwen/Qwen2.5-1.5B-Instruct"]
modelNames = ["BramVanroy/fietje-2-chat"]
# Dutch user questions; every inference cell sends each of them to every loaded model.
prompts = ["Wat is de hoofdstad van Nederland?", "Wie is de primeur van nederland?", "Wat is het kwadraat van 5", "Hoeveel p's zijn er in appel?"]
# Dutch system message placed in front of each user question via the chat template.
systemPrompt = "Je bent een vriendelijk chatbot die graag vragen beantwoordt en altijd zijn best doet."

In [None]:
# Load every model in modelNames as a transformers pipeline and record the load time.
from transformers import pipeline
# Pipelines stay in this list so the inference cell can iterate over them.
loadedModelPipeline = []
for modelName in modelNames:
    # modelName = "BramVanroy/fietje-2-chat"
    # Labels written to the "type" and "method" columns of the CSV.
    action = "Load"
    method = "transformers-pipeline"
    startTime = time.time()

    # NOTE(review): no task or device argument is passed here, whereas the
    # AutoModelForCausalLM cell uses device_map='auto' — confirm the pipeline
    # ends up on the intended device.
    generator = pipeline(model=modelName)
    loadedModelPipeline.append(generator)
    # time.time() returns seconds, so the recorded duration is in seconds.
    write_to_CSV(modelName, startTime, time.time(), action, method)
    print(generator.model.config._name_or_path)
    

In [None]:
# Run every prompt through every loaded pipeline and record the generation time.
# Both CSV labels are assigned here so the rows are labelled correctly even if
# a different load cell was the last one to set ``method``.
action = "Inference"
method = "transformers-pipeline"
for generator in loadedModelPipeline:
    for prompt in prompts:
        # The chat-formatted prompt is built before the timer starts, so
        # templating is not part of the measurement.
        translatedPrompt = generator.tokenizer.apply_chat_template([
            {"role": "system", "content": systemPrompt},
            {"role": "user", "content": prompt}
        ], tokenize=False, add_generation_prompt=True)
        startTime = time.time()
        # NOTE(review): max_length=1024 bounds prompt plus generated tokens,
        # while the AutoModelForCausalLM cell uses max_new_tokens=1024 —
        # confirm the budgets are meant to differ.
        response = generator(translatedPrompt, max_length=1024, num_return_sequences=1)
        write_to_CSV(generator.model.config._name_or_path, startTime, time.time(), action, method, response)
        print(response)
    del generator
# Remove the list itself so nothing keeps the pipelines alive; the next cell
# can then reclaim their memory.
del loadedModelPipeline

In [15]:
# Garbage-collect and release cached CUDA memory after the pipeline objects were deleted above.
clean()

## Causal Inference

In [None]:
# Load every model in modelNames with AutoModelForCausalLM plus its tokenizer
# and record the combined load time.
from transformers import AutoModelForCausalLM, AutoTokenizer
# Labels written to the "type" and "method" columns of the CSV.
action = "Load"
method = "AutoModelForCausalLM"
# Parallel lists: the tokenizer at index i belongs to the model at index i.
loadedModelCasual = []
loadedTokenizers = []
for modelName in modelNames:
    startTime = time.time()
    # trust_remote_code=True lets custom model code shipped in the model
    # repository run locally — only use it with repositories you trust.
    model = AutoModelForCausalLM.from_pretrained(modelName, trust_remote_code=True, device_map='auto')
    loadedModelCasual.append(model)
    tokenizer = AutoTokenizer.from_pretrained(modelName)
    loadedTokenizers.append(tokenizer)
    # input_ids = tokenizer("What is the color of prunes?,", return_tensors='pt').to(model.device)["input_ids"]
    # The timer stops here, so the measurement covers model and tokenizer loading.
    write_to_CSV(modelName, startTime, time.time(), action, method)
    print(model.config._name_or_path)

In [None]:
# Run every prompt through every AutoModelForCausalLM model and record the time.
# Both CSV labels are assigned here so the rows are labelled correctly even if
# a different load cell was the last one to set ``method``.
action = "Inference"
method = "AutoModelForCausalLM"
for idx, model in enumerate(loadedModelCasual):
    # The tokenizer lists are index-aligned with the model list.
    tokenizer = loadedTokenizers[idx]
    for prompt in prompts:
        # Templating and tokenisation happen before the timer starts.
        translatedPrompt = tokenizer.apply_chat_template([
            {"role": "system", "content": systemPrompt},
            {"role": "user", "content": prompt}
        ], tokenize=False, add_generation_prompt=True)
        inputs = tokenizer(translatedPrompt, return_tensors='pt', padding=True).to(model.device)
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        startTime = time.time()
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=1024)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # The end timestamp is taken here, so decoding is included in the duration.
        write_to_CSV(modelNames[idx], startTime, time.time(), action, method, response)
        print(response)

    del model
    del tokenizer

# Remove the lists themselves so nothing keeps the models or tokenizers alive.
del loadedTokenizers
del loadedModelCasual

In [None]:
# Garbage-collect and release cached CUDA memory after the models and tokenizers were deleted above.
clean()

## Ollama

There is no automatic download for ollama

In [None]:
# %pip install ollama

In [14]:
# import ollama
# startTime = time.time()
# response = ollama.chat(model='llama3.1', messages=[
#   {
#     'role': 'user',
#     'content': 'Why is the sky blue?',
#   },
# ])
# write_to_CSV("llama3.1", startTime, time.time(), "Inference", "ollama", response)

# ONNX

In [None]:
# Export every model in modelNames to ONNX via optimum, wrap it in a pipeline,
# and record the load time.
import time
from transformers import pipeline
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer

# Parallel lists: index i holds the model, tokenizer and pipeline of the same model name.
loadedModelOptimum = []
loadedTokenizersOptimum = []
loadedModelPipeline = []

for modelName in modelNames:
    # modelName = "BramVanroy/fietje-2-chat"
    # Labels written to the "type" and "method" columns of the CSV.
    action = "Load"
    method = "optimum.onnxruntime"
    startTime = time.time()
    # export=True is passed together with the CUDA execution provider, so the
    # measured time includes whatever export work from_pretrained performs.
    model = ORTModelForCausalLM.from_pretrained(modelName, export=True, provider="CUDAExecutionProvider")
    tokenizer = AutoTokenizer.from_pretrained(modelName)
    loadedTokenizersOptimum.append(tokenizer)
    loadedModelOptimum.append(model)
    # NOTE(review): this pipeline is stored in loadedModelPipeline, but the ONNX
    # inference cell calls model.generate directly — confirm it is still needed.
    generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
    print(generator.model.config._name_or_path)
    loadedModelPipeline.append(generator)
    # The timer stops here, after the model, tokenizer and pipeline were all created.
    write_to_CSV(modelName, startTime, time.time(), action, method)
    print(model.config._name_or_path)
    


In [None]:
# Run every prompt through every ONNX Runtime model and record the time.
# The labels are assigned here because the ONNX load cell leaves ``action``
# set to "Load", which would otherwise mislabel every inference row.
action = "Inference"
method = "optimum.onnxruntime"

# Use CUDA when PyTorch can see a GPU, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

for idx, model in enumerate(loadedModelOptimum):
    # The tokenizer list is index-aligned with the model list.
    tokenizer = loadedTokenizersOptimum[idx]
    model.to(device)

    for prompt in prompts:
        # Templating and tokenisation happen before the timer starts.
        translatedPrompt = tokenizer.apply_chat_template([
            {"role": "system", "content": systemPrompt},
            {"role": "user", "content": prompt}
        ], tokenize=False, add_generation_prompt=True)
        inputs = tokenizer(translatedPrompt, return_tensors="pt", padding=True)
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)
        startTime = time.time()
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=1024, num_return_sequences=1)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # The end timestamp is taken here, so decoding is included in the duration.
        write_to_CSV(model.config._name_or_path, startTime, time.time(), action, method, response)
        print(response)

    del model
    del tokenizer

# Remove the lists themselves so nothing keeps the models or tokenizers alive.
del loadedTokenizersOptimum
del loadedModelOptimum
# The ONNX load cell also stored a pipeline around each model and left the last
# one bound to ``generator``; drop those references too, otherwise the models
# stay reachable and the following clean() cannot free them.
loadedModelPipeline = []
generator = None


In [None]:
# Garbage-collect and release cached CUDA memory after the ONNX models were deleted above.
clean()

## Quantization

### Bits and bytes

In [None]:
# Load every model in modelNames with a bitsandbytes 8-bit quantization config
# and record the load time.
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from transformers import AutoTokenizer
import time
# 8-bit loading; "lm_head" is listed in llm_int8_skip_modules so it is excluded from int8 conversion.
quantization_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_skip_modules=["lm_head"])
# Parallel lists: the tokenizer at index i belongs to the model at index i.
models8Bit = []
tokenizer8Bit = []
# Label written to the "method" column of the CSV.
method = "8bit quantization with BitsAndBytesConfig"
for modelName in modelNames:
    timeStart = time.time()
    tokenizer = AutoTokenizer.from_pretrained(modelName)
    tokenizer8Bit.append(tokenizer)
    model = AutoModelForCausalLM.from_pretrained(modelName, quantization_config=quantization_config)
    models8Bit.append(model)
    # The timer stops here, so the measurement covers tokenizer and model loading.
    write_to_CSV(modelName, timeStart, time.time(), "Load", method)

In [None]:
# Run every prompt through every 8-bit quantized model and record the time.
# Both CSV labels are assigned here so the rows are labelled correctly even if
# a different load cell was the last one to set ``method``.
action = "Inference"
method = "8bit quantization with BitsAndBytesConfig"
for idx, model in enumerate(models8Bit):
    # The tokenizer list is index-aligned with the model list.
    tokenizer = tokenizer8Bit[idx]
    for prompt in prompts:
        # Build the chat-formatted prompt from the system and user messages.
        translatedPrompt = tokenizer.apply_chat_template([
            {"role": "system", "content": systemPrompt},
            {"role": "user", "content": prompt}
        ], tokenize=False, add_generation_prompt=True)

        # Tokenize and move the tensors to the device the model lives on;
        # leaving them on the CPU mismatches a model that was placed on the GPU.
        inputs = tokenizer(translatedPrompt, return_tensors="pt").to(model.device)
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]

        # Only the generate call is timed; decoding happens after endTime.
        startTime = time.time()
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=1024)
        endTime = time.time()

        # Decode the generated output.
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(response)
        # Write the result to the CSV.
        write_to_CSV(modelNames[idx], startTime, endTime, action, method, response)

    # Drop the loop references to this model and tokenizer.
    del model
    del tokenizer

# Remove the lists themselves so nothing keeps the models or tokenizers alive.
del tokenizer8Bit
del models8Bit


In [None]:
# Garbage-collect and release cached CUDA memory after the 8-bit models were deleted above.
clean()

## Self quantization (BE AWARE: THIS TAKES A LONG TIME)

### GPTQConfig

In [22]:
# Quantize the first model in modelNames to 4 bits with GPTQ while loading it.
# The recorded output of this cell is a PackageNotFoundError for auto-gptq,
# so that package has to be installed before this cell can succeed.
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

# Only the first configured model is quantized.
model_id = modelNames[0]

tokenizer = AutoTokenizer.from_pretrained(model_id)

# 4-bit GPTQ configuration using the "c4" dataset and the model's own tokenizer.
quantization_config = GPTQConfig(bits=4, dataset = "c4", tokenizer=tokenizer)

# Passing the GPTQ config to from_pretrained; device_map="auto" handles device placement.
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=quantization_config)

PackageNotFoundError: No package metadata was found for auto-gptq

In [None]:
# Run every prompt through the GPTQ-quantized model and record the time.
for prompt in prompts:
    # Templating and tokenisation happen before the timer starts.
    translatedPrompt = tokenizer.apply_chat_template([
        {"role": "system", "content": systemPrompt},
        {"role": "user", "content": prompt}
    ], tokenize=False, add_generation_prompt=True)
    # The model was already placed by device_map="auto" when it was loaded, so
    # the inputs are moved to model.device instead of pushing the whole model
    # to "cuda" again on every iteration.
    inputs = tokenizer(translatedPrompt, return_tensors='pt', padding=True).to(model.device)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    startTime = time.time()
    outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=1024)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # The end timestamp is taken here, so decoding is included in the duration.
    # NOTE(review): the method label is spelled "quanitzed"; it is kept
    # unchanged so the label stays the same as any rows already in the CSV.
    write_to_CSV(modelNames[0], startTime, time.time(), "Inference", "self-quanitzed-GPTQ", response)
    print(response)
