In [None]:
# Written for Google Colab.

!pip install -q -U \
  git+https://github.com/huggingface/transformers.git \
  git+https://github.com/huggingface/accelerate.git

!pip install -q \
  datasets \
  bitsandbytes \
  einops \
  wandb \
  contexttimer \
  ray \
  pandas \
  tenacity \
  black[jupyter] \
  openai

# The -DLLAMA_CUBLAS=on build will fail if no GPU is present.
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python

In [None]:
# Point the Hugging Face cache at a Drive location so that models persist across instances.

from google.colab import drive
import os

# Mount Google Drive at /drive; the cache directory set below lives inside that mount.
drive.mount("/drive")
# NOTE(review): presumably HF_HOME has to be set before transformers is imported
# for the cache location to take effect — confirm.
os.environ["HF_HOME"] = "/drive/MyDrive/HFCache"

In [None]:
# Code formatting (https://stackoverflow.com/questions/63076002/code-formatter-like-nb-black-for-google-colab).
# !black "/drive/MyDrive/Colab Notebooks/InferenceLatencyAnalyzer.ipynb"

In [None]:
from transformers import (
    Pipeline,
    pipeline,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoModelForCausalLM,
)
import contexttimer
import torch
import ray
import itertools
import pandas as pd

from typing import Callable, List, Tuple
from collections.abc import Iterator


def evaluate_model_latency(
    model: Callable[[str, int], str], repeat: int, prompt: str, max_seq_length: int
) -> Iterator[Tuple[str, float]]:
    """Lazily run ``model`` on ``prompt`` ``repeat`` times, timing every call.

    Args:
        model: Callable invoked as ``model(prompt, max_length=max_seq_length)``.
        repeat: Number of timed invocations to perform.
        prompt: Text handed to the model on every invocation.
        max_seq_length: Value forwarded to the model as ``max_length``.

    Yields:
        ``(output, elapsed)`` pairs, one per invocation, where ``elapsed`` is the
        value reported by the ``contexttimer.Timer`` wrapped around that call.
    """
    run_index = 0
    while run_index < len(range(repeat)):
        # Only the model call sits inside the timed region.
        with contexttimer.Timer() as stopwatch:
            generated = model(prompt, max_length=max_seq_length)
        run_index += 1
        yield generated, stopwatch.elapsed


@ray.remote(num_gpus=1, max_calls=1)
def _load_and_evaluate_model(parameter_info: pd.DataFrame) -> pd.DataFrame:
    """Load one model inside a Ray task and time it on every experiment row.

    The loader is picked from LOADER_LOOKUP using the "model" value of the first
    row, so every row of ``parameter_info`` is evaluated with that single model.

    Args:
        parameter_info: Experiment rows providing the "model", "repeat", "prompt"
            and "max_tokens" columns.

    Returns:
        ``parameter_info`` with an added "results" column holding, per row, the
        list of ``(output, elapsed)`` pairs produced by evaluate_model_latency.
    """
    first_model_name = parameter_info.model.values[0]
    loaded_model = LOADER_LOOKUP[first_model_name]()

    def time_row(row: pd.Series) -> List[Tuple[str, float]]:
        # Drain the generator here so all timed runs happen inside this task.
        timed_runs = evaluate_model_latency(
            loaded_model, row["repeat"], row["prompt"], row["max_tokens"]
        )
        return list(timed_runs)

    per_row_results = parameter_info.apply(time_row, axis=1)
    parameter_info["results"] = per_row_results
    return parameter_info


def load_and_evaluate_model(parameter_info: pd.DataFrame) -> pd.DataFrame:
    """Run ``_load_and_evaluate_model`` as a Ray task and wait for its result.

    Args:
        parameter_info: Experiment rows for a single model; forwarded unchanged
            to the remote task.

    Returns:
        The DataFrame returned by the remote task.

    Raises:
        BaseException: Whatever submitting or waiting on the task raises
            (including KeyboardInterrupt) is re-raised unchanged; if the task
            was already submitted it is force-cancelled first.
    """
    # Submit outside the try block: if submission itself fails there is no task
    # to cancel, and touching an unbound `ref` would mask the real error.
    ref = _load_and_evaluate_model.remote(parameter_info)
    try:
        return ray.get(ref)
    except BaseException:
        # Needed to prevent task retry after keyboard interrupt.
        ray.cancel(ref, force=True)
        raise

In [None]:
from llama_cpp import Llama
from tenacity import retry, stop_after_attempt, wait_fixed
import openai


def get_pipeline_wrapper(pipeline: Pipeline) -> Callable[[str, int], str]:
    """Adapt a text-generation pipeline to the ``(prompt, max_length) -> str`` shape.

    Args:
        pipeline: Callable invoked as ``pipeline(prompt, max_length=...)`` whose
            first result is a mapping with a ``generated_text`` entry.

    Returns:
        A function that returns the generated text with the first
        ``len(prompt)`` characters removed.
    """

    def wrapper(prompt: str, max_length: int) -> str:
        generations = pipeline(prompt, max_length=max_length)
        full_text = generations[0]["generated_text"]
        # Drop the leading prompt-sized prefix so only the continuation remains.
        # NOTE(review): assumes the generated text starts with the prompt verbatim — confirm.
        prompt_length = len(prompt)
        return full_text[prompt_length:]

    return wrapper


def falcon7b_default_loader() -> Callable[[str, int], str]:
    """Build the ``tiiuae/falcon-7b`` text-generation pipeline in bfloat16.

    Returns:
        The pipeline wrapped by get_pipeline_wrapper.
    """
    checkpoint = "tiiuae/falcon-7b"
    pipeline_options = dict(
        model=checkpoint,
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto",
    )
    generator = pipeline("text-generation", **pipeline_options)
    return get_pipeline_wrapper(generator)


def falcon7b_quantized_loader() -> Callable[[str, int], str]:
    """Build a 4-bit (NF4) quantized Falcon-7B text-generation pipeline.

    Loads the tokenizer and model from ``ybelkada/falcon-7b-sharded-bf16`` with a
    BitsAndBytes 4-bit configuration, then wraps the resulting pipeline with
    get_pipeline_wrapper.

    Returns:
        A ``(prompt, max_length) -> str`` callable backed by the quantized model.
    """
    model_name = "ybelkada/falcon-7b-sharded-bf16"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # NOTE(review): the 4-bit compute dtype is float16 here while the pipeline
    # below is given torch_dtype=bfloat16 — confirm the mismatch is intended.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name, quantization_config=bnb_config, trust_remote_code=True
    )
    # NOTE(review): use_cache=False presumably disables the key/value cache during
    # generation, which would inflate measured latency — confirm this is wanted
    # for a latency benchmark.
    model.config.use_cache = False
    # The already-instantiated model object (not its name) is passed to the pipeline.
    # NOTE(review): torch_dtype/trust_remote_code/device_map may have no effect when
    # a loaded model is supplied — verify against the transformers pipeline docs.
    return get_pipeline_wrapper(
        pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            device_map="auto",
        )
    )


def llama_loader() -> Callable[[str, int], str]:
    """Build the ``huggyllama/llama-7b`` text-generation pipeline.

    Returns:
        The pipeline wrapped by get_pipeline_wrapper.
    """
    # Meta's version on HF is intentionally broken.
    checkpoint = "huggyllama/llama-7b"
    generator = pipeline("text-generation", model=checkpoint, device_map="auto")
    return get_pipeline_wrapper(generator)


def llama_cpp_loader() -> Callable[[str, int], str]:
    """Load the GGML model from Drive with llama-cpp-python and wrap it.

    Returns:
        A function ``wrapper(prompt, max_length)`` that calls the model with
        ``max_tokens=max_length`` and ``echo=False`` and returns the text of the
        first choice.
    """
    llama_settings = {
        "model_path": "/drive/MyDrive/Colab Datasets/ggml/ggml-model-f16.bin",
        "n_gpu_layers": 35,
        "n_ctx": 2048,
    }
    llm = Llama(**llama_settings)

    def wrapper(prompt: str, max_length: int) -> str:
        completion = llm(prompt, max_tokens=max_length, echo=False)
        first_choice = completion["choices"][0]
        return first_choice["text"]

    return wrapper


def open_ai_loader() -> Callable[[str, int], str]:
    """Build a callable that queries the OpenAI chat completion API.

    The API key is read from the ``OPENAI_API_KEY`` environment variable rather
    than being hard-coded in the notebook.

    Returns:
        A function ``invoke(prompt, max_length)`` that sends ``prompt`` as a
        single user message to ``gpt-3.5-turbo`` with ``max_tokens=max_length``
        and returns the content of the first choice. The call is wrapped in a
        tenacity retry (``stop_after_attempt(3)``, ``wait_fixed(5)``).

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is unset or empty.
    """
    import os  # Local import keeps this cell independent of other cells' imports.

    # Fail fast with an actionable message instead of retrying doomed requests.
    # NOTE(review): this loader is invoked from a Ray task, so the variable
    # presumably has to be visible to the Ray worker process — confirm.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the OPENAI_API_KEY environment variable before using the open_ai model."
        )

    @retry(stop=stop_after_attempt(3), wait=wait_fixed(5))
    def invoke(prompt, max_length):
        # Payload with the conversation messages
        messages = [{"role": "user", "content": prompt}]
        openai.api_key = api_key
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
            max_tokens=max_length,
        )
        return response["choices"][0]["message"]["content"]

    return invoke

In [None]:
PROMPT_LIBRARY = {
    "HOWDY": "Howdy! Tell me a bit about Louisiana.",
    "EXTRACT": 'Input JSON with key "text".\n\nYour job is to take text "text", extract all information conveyed by the text into a list of complete sentences, and provide a short title describing the content of the text.\n\nYour output should be a JSON with a key "title" pointing to the string title and a key "info_list" pointing to a list of strings representing the result of your job.\n\nInput:\n        {\n            "text": "WASHINGTON (CNN)  -- A pair of tornadoes struck suburban Washington on Sunday, mangling trees and stripping siding off several homes, the National Weather Service confirmed. No injuries were immediately reported. The first tornado hit St. Charles, Maryland -- about 30 miles south of Washington -- just after 2 p.m. It uprooted several trees, many of which fell onto cars and homes. The strongest wind from that touchdown was 80 mph -- enough force to blow out windows. A second tornado followed about 30 minutes later outside Hyattsville, Maryland -- about 10 miles northeast of the capital. The high-speed winds, peaking at 100 mph, hit the George E. Peters Adventist School especially hard, tearing off a portion of the roof and flinging it and mounds of debris into the parking lot. A nearby construction trailer was also knocked over. E-mail to a friend ."\n        }\n        ',
    "EXTRACT_LONG": '\n        Input JSON with key "text".\n\nYour job is to take text "text", extract all information conveyed by the text into a list of complete sentences, and provide a short title describing the content of the text.\n\nYour output should be a JSON with a key "title" pointing to the string title and a key "info_list" pointing to a list of strings representing the result of your job.\n\nInput:\n        {\n            "text": "FARGO, North Dakota (CNN)  -- Forecasters issued flash flood warnings for Bismarck and surrounding areas Wednesday, as volunteers rushed to fill sandbags ahead of expected record floods in the flat state of North Dakota. Explosives are set off in the Missouri River on Wednesday to break up ice jams. Areas of three counties -- Morton, Emmons and Burleigh, which includes the North Dakota capital of Bismarck -- were under a flash flood warning until 12:30 p.m. CT (1:30 p.m. ET), the National Weather Service said. In an effort to alleviate the flooding, demolition crews blew up an ice jam Wednesday evening south of  Bismarck, according to CNN affiliate KXMB. Mayor John Warford said that water appeared to be moving more freely in the Missouri River after the explosives were set off, KXMB reported. The plan is make sure water continues flow through the river channel and not spread out over land. Ice jams in rivers have been a major factor in the flooding there. Most of the state, which endured a particularly harsh winter, remained under a flood warning Wednesday, with forecasters predicting possibly record flood levels on several rivers. Snow, which continued to fall Wednesday, complicated preparations, city officials said. "I woke up this morning and looked outside, I guess I thought of the same thing everybody else did. ... [What] came to mind is what a revolting development this is," said Mark Voxland, the mayor of Moorhead, Minnesota, a city just outside of Fargo.  Watch flooded fields of snow » . 
More than 1,000 people were evacuated from an area near Bismarck on Tuesday night as the Missouri River flooded, Rick Robinson of the North Dakota Department of Emergency Services said Wednesday.  See a map of the affected area » . Emergency officials said they were particularly concerned about the Red River, which snakes through eastern portions of North and South Dakota and western Minnesota. The river is expected to crest between 39 and 41 feet in Fargo on Friday, according to Cecily Fong of the North Dakota Department of Emergency Services. The record for the Red River at Fargo was set in 1897 at 40.1 feet, according to Pat Slattery of the National Oceanic and Atmospheric Administration. The threat of flooding prompted authorities to ask for volunteers to fill sandbags either to build temporary dikes or to bolster existing ones. In some areas, even at 3:30 a.m., hundreds of volunteers packed into individual sandbagging centers, an organizer said.  See images of flooding, preparation » . "There have been so many volunteers that we had to turn people away," said Ryan McEwan, a supervisor at one volunteer coordinating center. "It is very busy. They are filling sandbags as fast as they can." As of late Tuesday, Fargo residents and out-of-town volunteers had filled more than 1 million sandbags out of the needed 2 million. Fargo Deputy Mayor Tim Mahoney said he hoped that goal would be met by Thursday. Fargo\'s mayor, Dennis Walaker, said Wednesday that his city was about 95 percent prepared for the flooding, which is expected later in the week. "I went and looked at the dikes this morning, and they\'re significant, absolutely significant," he said in a briefing Wednesday morning. However, he said, "We have some areas we need to shore up." Just south of Fargo, authorities said they had rescued several people in Oxbow, a town of about 238 people, after a residential dike gave way. 
In some places, water had reached halfway up residents\' basement stairs, and in others, it had reached the main level of homes, Sgt. Gail Wichmann said. CNN\'s Chris Welch contributed to this report."\n        }\n        {"title": "Flash Flood Warning in North Dakota", "info_list": ["Flash flood warnings were issued for Bismarck and surrounding areas due to expected record floods in the flat state of North Dakota.", "Volunteers rushed to fill sandbags to prepare for the floods.", "Areas of three counties, including the North Dakota capital of Bismarck, were under a flash flood warning.", "Demolition crews blew up an ice jam south of Bismarck to alleviate the flooding.", "Most of the state remained under a flood warning, with possibly record flood levels predicted on several rivers.", "The Red River, which snakes through eastern portions of North and South Dakota and western Minnesota, was expected to crest between 39 and 41 feet in Fargo on Friday, threatening the area with flooding.", "Emergency officials asked for volunteers to fill sandbags to build temporary dikes or bolster existing ones.", "Residents and out-of-town volunteers had filled more than 1 million sandbags out of the needed 2 million as of late Tuesday.", "Fargo\'s mayor reported that the city was about 95% prepared, but there were still areas that needed to be shored up.", "An ice jam in rivers was a significant factor in the flooding.", "Snowfall on Wednesday complicated preparations.", "Evacuations were conducted, and several people in Oxbow were rescued when a residential dike gave way."]}'
}
# Maps each model key used in the experiments to the zero-argument loader that builds it.
LOADER_LOOKUP = {
    "ybelkada/falcon-7b-sharded-bf16": falcon7b_quantized_loader,
    "tiiuae/falcon-7b": falcon7b_default_loader,
    "llama": llama_loader,
    "llama_cpp": llama_cpp_loader,
    "open_ai": open_ai_loader,
}
# Model keys to benchmark in this run; commented-out entries are skipped.
EVALUATE_MODELS = [
    # "tiiuae/falcon-7b",
    # "ybelkada/falcon-7b-sharded-bf16",
    "open_ai",
    "llama_cpp",
    # "llama",
]
# Alternative, smaller sweep kept for reference:
# REPEAT = 5
# PROMPT_NAMES = ["HOWDY", "EXTRACT"]
# MAX_TOKENS = [64, 1024]
# Number of timed generations per (model, prompt, max_tokens) combination.
REPEAT = 5
# Keys into PROMPT_LIBRARY.
# NOTE(review): "EXTRACT_LONG" is listed twice, which duplicates its rows in the
# experiment grid — confirm whether that is intentional.
PROMPT_NAMES = ["HOWDY", "EXTRACT", "EXTRACT_LONG", "EXTRACT_LONG"]
# Powers of two from 32 to 2048.
MAX_TOKENS = [2**i for i in range(5, 12, 1)]

In [None]:
# Every (prompt_name, max_tokens) combination to try.
parameters = pd.DataFrame(
    list(itertools.product(PROMPT_NAMES, MAX_TOKENS)),
    columns=["prompt_name", "max_tokens"],
)

# Cross every model with every parameter combination, attach the prompt text, and
# keep only rows whose prompt length in characters is at most 3.75 * max_tokens.
# NOTE(review): 3.75 is presumably a characters-per-token estimate — confirm.
experiments = (
    pd.DataFrame({"model": EVALUATE_MODELS, "repeat": REPEAT})
    .merge(parameters, how="cross")
    .assign(prompt=lambda grid: grid["prompt_name"].map(PROMPT_LIBRARY))
    .loc[lambda grid: grid["prompt"].str.len() <= (grid["max_tokens"] * 3.75)]
    .reset_index(drop=True)
)

experiments

In [None]:
# Evaluate each model's experiment rows in its own call to load_and_evaluate_model.
# Groups are iterated explicitly instead of using GroupBy.apply: apply operating on
# the grouping column is deprecated in recent pandas, and the evaluation needs the
# "model" column to be present in every group frame. Iterating a GroupBy yields the
# full sub-frames, grouping column included.
results = pd.concat(
    [
        load_and_evaluate_model(model_experiments)
        for _, model_experiments in experiments.groupby("model")
    ]
).reindex(experiments.index)  # Restore the original row order of `experiments`.
results

In [None]:
def get_raw_per_character_seconds(r):
    """Mean elapsed time per generated character across a row's timed runs.

    Args:
        r: Row-like object whose ``results`` attribute is a sequence of
            ``(output, elapsed)`` pairs.

    Returns:
        The average of ``elapsed / len(output)`` over the runs that produced a
        non-empty output. Runs with an empty output are skipped because their
        per-character latency is undefined; ``nan`` is returned when no run
        produced any output (including when there are no runs at all).
    """
    per_character = [
        elapsed / len(output) for output, elapsed in r.results if len(output) > 0
    ]
    if not per_character:
        return float("nan")
    return sum(per_character) / len(per_character)


def get_avg_output_length(r):
    """Mean output length, in characters, across a row's timed runs.

    Args:
        r: Row-like object whose ``results`` attribute is a sequence of
            ``(output, elapsed)`` pairs.

    Returns:
        The average ``len(output)`` over all runs, or ``nan`` when there are no
        runs (instead of raising ZeroDivisionError).
    """
    run_count = len(r.results)
    if run_count == 0:
        return float("nan")
    return sum(len(output) for output, _ in r.results) / run_count


# Attach the per-row summary metrics derived from the "results" column.
results = results.assign(
    raw_per_character_latency_seconds=results.apply(
        get_raw_per_character_seconds, axis=1
    ),
    get_avg_output_length_chars=results.apply(get_avg_output_length, axis=1),
)

results

In [None]:
gpu_info = !nvidia-smi -L
gpu_info = "\n".join(gpu_info)

results["gpu_type"] = gpu_info.split(":")[1].split("(")[0]

In [None]:
import os
import time

# Write the summary (without the raw per-run "results" column) to a timestamped CSV.
output_dir = "/drive/MyDrive/Colab Datasets/inference_latency_analyzer_results"
# Create the target directory first: to_csv fails on a missing directory, which
# would otherwise discard the outcome of a long benchmark run.
os.makedirs(output_dir, exist_ok=True)

now = int(time.time())
results.drop(columns=["results"]).to_csv(
    f"{output_dir}/results_{now}.csv",
    index=False,
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Plotting frame: friendlier column name plus per-character latency in milliseconds.
df = (
    results.copy()
    .rename(columns={"get_avg_output_length_chars": "avg_output_length_in_chars"})
    .assign(
        per_character_ms=lambda frame: frame["raw_per_character_latency_seconds"]
        * 1000
    )
)

# Regression plot of per-character latency against output length, with one
# color/fit per value of the "model" column and no confidence-interval shading.
plot = sns.lmplot(
    data=df,
    x="avg_output_length_in_chars",
    y="per_character_ms",
    hue="model",
    ci=None,
    legend=True,
)

# Title the current axes, then render the figure.
plt.title("Average Character Latency vs Average Output Length by Model")
plt.show()