# LLM TIMER

In [9]:
import pandas as pd
import os

def write_to_CSV(model, starttime, endtime, type, method):
    """Append one timing record to ``modelTimings.csv`` in the working directory.

    The file is created on first use. Each call adds a single row with the
    columns ``Model``, ``starttime``, ``endtime``, ``duration``, ``type`` and
    ``method``; rows already in the file are kept.

    Args:
        model: Identifier of the model that was timed.
        starttime: Start of the measured span.
        endtime: End of the measured span, in the same unit as ``starttime``
            (``duration`` is stored as ``endtime - starttime``).
        type: Label for what was measured. The name shadows the builtin but is
            kept so existing keyword-argument callers continue to work.
        method: Label for the library or API used to run the model.
    """
    new_data = pd.DataFrame({
        "Model": [model],  # Wrapping scalar values in lists
        "starttime": [starttime],
        "endtime": [endtime],
        "duration": [endtime - starttime],
        "type": [type],
        "method": [method]
    })

    file_path = "modelTimings.csv"

    existing_data = None
    if os.path.exists(file_path):
        try:
            existing_data = pd.read_csv(file_path)
        except pd.errors.EmptyDataError:
            # A zero-byte / whitespace-only file has no header row to parse.
            # Treat it like a missing file instead of failing the whole
            # timing run.
            existing_data = None

    if existing_data is None:
        # Nothing usable on disk yet: the new row becomes the whole table
        updated_data = new_data
    else:
        # Keep the earlier rows and add the new one at the end
        updated_data = pd.concat([existing_data, new_data], ignore_index=True)

    updated_data.to_csv(file_path, index=False)


In [10]:
import time
from transformers import pipeline
# Labels written to the timing CSV for the transformers-pipeline runs.
method = "transformers-pipeline"
action = "Load"
modelName = "BramVanroy/fietje-2-chat"

# time.time() reports seconds, so the logged duration is in seconds.
startTime = time.time()
generator = pipeline(model=modelName)

write_to_CSV(
    model=modelName,
    starttime=startTime,
    endtime=time.time(),
    type=action,
    method=method,
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:26<00:00, 13.27s/it]
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [11]:
# Time one short chat completion through `generator` (sampling disabled,
# at most two new tokens) and log it under the "Inference" label.
action = "Inference"
startTime = time.time()
generator(
    [{"role": "user", "content": "What is the capital of France? Answer in one word."}],
    do_sample=False,
    max_new_tokens=2,
)
write_to_CSV(
    model=modelName,
    starttime=startTime,
    endtime=time.time(),
    type=action,
    method=method,
)

## vLLM

In [15]:
from vllm import LLM, SamplingParams
# Time how long vLLM takes to construct the engine for `modelName`.
# `action` is set explicitly here: without it this cell logs whatever label a
# previously executed cell left behind (e.g. "Inference"), which mislabels the
# load timing when the notebook is run top to bottom.
action = "Load"
modelName = "facebook/opt-125m"

# Generation inputs, prepared before the timer starts so they are not counted
# as part of the load duration.
prompt = "What is the capital of France?"
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

startTime = time.time()
llm = LLM(model=modelName)

write_to_CSV(modelName, startTime, time.time(), action, "vllm")

INFO 10-16 11:10:45 importing.py:10] Triton not installed; certain GPU-related functions will not be available.


2024-10-16 11:10:45,494	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


RuntimeError: Failed to infer device type

In [16]:
# Time the vLLM generation call and log it, in the same way the other
# backends in this notebook record their "Inference" runs.
action = "Inference"
startTime = time.time()
outputs = llm.generate(prompt, sampling_params)
write_to_CSV(modelName, startTime, time.time(), action, "vllm")

NameError: name 'llm' is not defined

## Causal LM Inference

In [12]:
from transformers import AutoModelForCausalLM, AutoTokenizer
# Labels written to the timing CSV for the AutoModelForCausalLM runs.
action = "Load"
method = "AutoModelForCausalLM"
modelName = "PrunaAI/BramVanroy-fietje-2-bnb-4bit-smashed"

startTime = time.time()
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run locally -- confirm the repository is trusted.
model = AutoModelForCausalLM.from_pretrained(modelName, trust_remote_code=True, device_map='auto')
# The tokenizer is loaded from a different repository id than the model.
tokenizer = AutoTokenizer.from_pretrained("BramVanroy/fietje-2")
# Log right after loading so the "Load" duration covers only the model and
# tokenizer, not the prompt tokenization below.
write_to_CSV(modelName, startTime, time.time(), action, method)

# Tokenize the prompt and move it to the model's device for the next cell.
input_ids = tokenizer("What is the color of prunes?,", return_tensors='pt').to(model.device)["input_ids"]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


ImportError: Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`

In [None]:
# Time generation plus decoding for the prompt tokenized as `input_ids`.
action = "Inference"
startTime = time.time()

outputs = model.generate(input_ids, max_new_tokens=216)
# Decoding stays inside the timed span, as before; the result is now kept
# in a variable instead of being discarded.
decodedText = tokenizer.decode(outputs[0])
write_to_CSV(modelName, startTime, time.time(), action, method)

# Last expression of the cell, so the notebook displays the generated text.
decodedText


## Ollama

In [13]:
%pip install ollama

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [14]:
import ollama
# Time a single chat request made through the ollama client and log it.
startTime = time.time()
ollamaMessages = [{'role': 'user', 'content': 'Why is the sky blue?'}]
response = ollama.chat(model='llama3.1', messages=ollamaMessages)
write_to_CSV(
    model="llama3.1",
    starttime=startTime,
    endtime=time.time(),
    type="Inference",
    method="ollama",
)